waterfall 0.1.7__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in the public registry.
waterfall/WatermarkerBase.py CHANGED
@@ -6,19 +6,24 @@ from collections import defaultdict
  from functools import partial
  from multiprocessing import Pool
  from typing import List, Tuple, Optional
+ from itertools import repeat

  import numpy as np
  import torch
  from scipy.sparse import csr_matrix, vstack
  from tqdm import tqdm
+ from transformers import AutoTokenizer, AutoModelForCausalLM
  from transformers.modeling_utils import PreTrainedModel
- from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+ from transformers.tokenization_utils_base import PreTrainedTokenizerBase, BatchEncoding
  from transformers.generation.logits_process import LogitsProcessor, TopKLogitsWarper, TopPLogitsWarper
+ from transformers.generation.configuration_utils import GenerationConfig

  from waterfall.permute import Permute
  from waterfall.WatermarkingFn import WatermarkingFn
  from waterfall.WatermarkingFnFourier import WatermarkingFnFourier

+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
  class PerturbationProcessor(LogitsProcessor):
  def __init__(self,
  N : int = 32000, # Vocab size
@@ -28,7 +33,7 @@ class PerturbationProcessor(LogitsProcessor):
  self.id = id
  self.N = N
  self.init_token_count = None
- self.phi = np.ones(N)
+ self.phi = torch.zeros(N)
  self.n_gram = 2

  self.skip_watermark = False
@@ -38,14 +43,14 @@ class PerturbationProcessor(LogitsProcessor):
  def reset(self, n_gram : int = 2) -> None:
  self.n_gram = n_gram
  self.init_token_count = None
- if np.allclose(self.phi,np.median(self.phi)):
+ if torch.allclose(self.phi,torch.median(self.phi)):
  self.skip_watermark = True
  logging.warning(f"Generating without watermark as watermarking function is flat")
  else:
  self.skip_watermark = False

  def set_phi(self, phi : np.ndarray) -> None:
- self.phi = phi
+ self.phi = torch.from_numpy(phi)

  def __call__(self, input_ids: torch.LongTensor,
  scores: torch.FloatTensor) -> torch.FloatTensor:
@@ -60,12 +65,17 @@ class PerturbationProcessor(LogitsProcessor):
  if self.init_token_count + self.n_gram - 1 > input_ids.shape[1]:
  return scores

+ # using numpy as PyTorch tensors doesn't hash properly for rng and dict key
  prev_tokens = input_ids[:,-self.n_gram+1:].cpu().numpy()
- permutations = [self.permute.get_permutation(prev_tokens[i,:], self.id, cache=True) for i in range(prev_tokens.shape[0])]

- scores[:,:self.N] += torch.tensor(self.phi[permutations],
- device=scores.device,
- dtype=scores.dtype)
+ permutations = (
+ self.permute.get_permutation(prev_tokens[i,:], self.id, cache=True)
+ for i in range(prev_tokens.shape[0])
+ )
+ perturbations = torch.stack([
+ self.phi[permutation] for permutation in permutations
+ ])
+ scores[:,:self.N] += perturbations.to(device=scores.device, dtype=scores.dtype)
  return scores

  def indices_to_counts(N : int, dtype : np.dtype, indices : np.ndarray) -> csr_matrix:
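Editor's note: the rewritten block above gathers one perturbation per vocabulary slot by fancy-indexing the 1-D phi with a permutation, now done directly on the torch tensor instead of converting a numpy result. A minimal illustrative sketch (not package code; N, phi, and the permutation are placeholder values) showing that the two indexing paths agree:

    import numpy as np
    import torch

    N = 8
    phi_np = np.arange(N, dtype=np.float32)          # stand-in watermarking values
    phi_t = torch.from_numpy(phi_np)                 # 0.2.1 keeps phi as a torch tensor
    perm = np.random.RandomState(0).permutation(N)   # stand-in vocabulary permutation

    # Fancy indexing gathers one perturbation per vocabulary slot in both cases.
    assert np.allclose(phi_np[perm], phi_t[perm].numpy())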
@@ -74,99 +84,258 @@ def indices_to_counts(N : int, dtype : np.dtype, indices : np.ndarray) -> csr_ma

  class Watermarker:
  def __init__(self,
- tokenizer : PreTrainedTokenizerBase,
- model : Optional[PreTrainedModel] = None,
+ tokenizer : Optional[PreTrainedTokenizerBase | str] = None,
+ model : Optional[PreTrainedModel | str] = None,
  id : int = 0,
  kappa : float = 6,
  k_p : int = 1,
  n_gram : int = 2,
- watermarkingFnClass = WatermarkingFnFourier
+ watermarkingFnClass = WatermarkingFnFourier,
+ device = None,
  ) -> None:
  assert kappa >= 0, f"kappa must be >= 0, value provided is {kappa}"

- assert (model is None) or isinstance(model, PreTrainedModel), f"model must be a transformers model, value provided is {type(model)}" # argument order for tokenizer and model were swapped since the original code
-
- self.tokenizer = tokenizer
- self.model = model
  self.id = id
  self.k_p = k_p
  self.n_gram = n_gram
  self.kappa = kappa

+ if tokenizer is None:
+ if isinstance(model, str):
+ self.tokenizer = AutoTokenizer.from_pretrained(model)
+ elif isinstance(model, PreTrainedModel):
+ self.tokenizer = AutoTokenizer.from_pretrained(model.config._name_or_path)
+ else:
+ raise NotImplementedError("tokenizer must be provided or model must be a string or PreTrainedModel")
+ elif isinstance(tokenizer, str):
+ self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
+ else:
+ self.tokenizer = tokenizer
+ self.tokenizer.pad_token = self.tokenizer.eos_token
+ self.tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
+
  self.N = self.tokenizer.vocab_size
- self.logits_processor = PerturbationProcessor(N = self.N, id = id)
+
+ self.logits_processor = PerturbationProcessor(N = self.N, id = self.id)
+
+ if isinstance(model, str):
+ self.load_model(model, device_map=device)
+ else:
+ self.model = model
+
+ assert (self.model is None) or isinstance(self.model, PreTrainedModel), f"model must be a transformers model, value provided is {type(self.model)}" # argument order for tokenizer and model were swapped since the original code

  self.compute_phi(watermarkingFnClass)

+ def load_model(self, model_name_or_path : str, device_map : str = "auto"):
+ self.model = AutoModelForCausalLM.from_pretrained(
+ model_name_or_path,
+ device_map=device_map,
+ )
+
  def compute_phi(self, watermarkingFnClass = WatermarkingFnFourier) -> None:
- self.watermarking_fn: WatermarkingFn = watermarkingFnClass(id = id, k_p = self.k_p, N = self.N, kappa = self.kappa)
+ self.watermarking_fn: WatermarkingFn = watermarkingFnClass(id = self.id, k_p = self.k_p, N = self.N, kappa = self.kappa)
  self.phi = self.watermarking_fn.phi

  self.logits_processor.set_phi(self.phi)

+ # Format prompt(s) into chat template
+ def format_prompt(
+ self,
+ T_os : str | List[str],
+ system_prompt : Optional[str] = None,
+ assistant_prefill : Optional[str | List[str]] = "",
+ ) -> str | List[str]:
+ if isinstance(system_prompt, str):
+ _system_prompt = {"role":"system", "content":system_prompt}
+ is_single = isinstance(T_os, str)
+ if is_single:
+ T_os = [T_os]
+ if not isinstance(assistant_prefill, list):
+ assistant_prefill = repeat(assistant_prefill, len(T_os))
+ else:
+ assert len(assistant_prefill) == len(T_os), "Length of assistant_prefill must match length of T_os"
+ formatted_prompts = []
+ for T_o, prefill in zip(T_os, assistant_prefill):
+ formatted_prompt : str = self.tokenizer.apply_chat_template(
+ [
+ _system_prompt,
+ {"role":"user", "content":T_o},
+ ], tokenize=False, add_generation_prompt = True)
+ if prefill is not None:
+ formatted_prompt += prefill
+ formatted_prompts.append(formatted_prompt)
+ if is_single:
+ return formatted_prompts[0]
+ return formatted_prompts
+
+ # Find the largest batch size that fits in GPU memory
+ def find_largest_batch_size(
+ self,
+ tokd_inputs : List[BatchEncoding],
+ logits_processor : List[LogitsProcessor] = [],
+ **kwargs,
+ ):
+ longest_idx = np.argmax([tokd_input["input_ids"].shape[-1] for tokd_input in tokd_inputs])
+ if "generation_config" in kwargs:
+ generation_config = GenerationConfig(**kwargs["generation_config"].to_dict()) # copy
+ max_new_tokens = generation_config.max_new_tokens
+ else:
+ generation_config = GenerationConfig(**kwargs)
+ max_new_tokens = kwargs.get("max_new_tokens", 2048)
+ generation_config.update(max_new_tokens=1)
+ input_ids = tokd_inputs[longest_idx]["input_ids"]
+ input_ids = torch.zeros(
+ (1, max_new_tokens + input_ids.shape[-1] - 1),
+ dtype=input_ids.dtype,
+ device=self.model.device
+ )
+ max_batch_size = 1
+ with torch.no_grad():
+ while max_batch_size < min(16, len(tokd_inputs)):
+ torch.cuda.empty_cache()
+ try:
+ _ = self.model.generate(
+ input_ids=input_ids,
+ attention_mask=torch.ones_like(input_ids),
+ logits_processor=logits_processor,
+ generation_config=generation_config,
+ pad_token_id=self.tokenizer.eos_token_id,
+ tokenizer=self.tokenizer,
+ )
+ max_batch_size = input_ids.shape[0]
+ except RuntimeError as e:
+ if "CUDA out of memory" in str(e):
+ break
+ else:
+ raise e
+ input_ids = torch.cat([input_ids, input_ids], dim=0)
+ torch.cuda.empty_cache()
+ return max_batch_size
+
  def generate(
  self,
- prompt : Optional[str] = None,
- tokd_input : Optional[torch.Tensor] = None,
+ prompts : Optional[str | List[str]] = None,
+ tokd_inputs : Optional[torch.Tensor | List[torch.Tensor] | BatchEncoding | List[BatchEncoding]] = None,
  n_gram : Optional[int] = None,
- max_new_tokens : int = 1000,
- return_text : bool =True,
- return_tokens : bool =False,
- return_scores : bool =False,
- do_sample : bool =True,
- **kwargs
- ) -> List[str] | dict:
+ return_text : bool = True,
+ return_tokens : bool = False,
+ return_scores : bool = False,
+ use_tqdm : bool = False,
+ batched_generate : bool = True,
+ **kwargs # Other generate parameters
+ ) -> List[str] | dict: # Returns flattened list of query x beam

  assert self.model is not None, "Model is not loaded. Please load the model before generating text."

+ is_single = isinstance(prompts, str) or isinstance(tokd_inputs, torch.Tensor)
+ if is_single:
+ prompts = [prompts] if prompts is not None else None
+ tokd_inputs = [tokd_inputs] if tokd_inputs is not None else None
+
  if n_gram is None:
  n_gram = self.n_gram
- if tokd_input is None:
- assert prompt is not None, "Either prompt or tokd_input must be provided."
- tokd_input = self.tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
- tokd_input = tokd_input.to(self.model.device)
+ if tokd_inputs is None:
+ assert prompts is not None, "Either prompt or tokd_input must be provided."
+ tokd_inputs = [self.tokenizer(prompt, return_tensors="pt", add_special_tokens=False) for prompt in prompts]
+
+ # If tokd_input is a tensor, convert it to a BatchEncoding
+ squeezed_tokd_inputs = []
+ for tokd_input in tokd_inputs:
+ if isinstance(tokd_input, torch.Tensor):
+ input_ids = tokd_input
+ attention_mask = torch.ones_like(tokd_input)
+ else:
+ input_ids = tokd_input["input_ids"]
+ attention_mask = tokd_input["attention_mask"]
+ if input_ids.ndim == 2:
+ input_ids = input_ids.squeeze()
+ attention_mask = attention_mask.squeeze()
+ squeezed_tokd_inputs.append(BatchEncoding({"input_ids": input_ids, "attention_mask": attention_mask}))
+ tokd_inputs = squeezed_tokd_inputs
+
  logits_processor = []
- if "top_k" in kwargs and kwargs["top_k"] is not None and kwargs["top_k"] != 0:
- logits_processor.append(TopKLogitsWarper(kwargs.pop("top_k")))
- if "top_p" in kwargs and kwargs["top_p"] is not None and kwargs["top_p"] < 1.0:
- logits_processor.append(TopPLogitsWarper(kwargs.pop("top_p")))
+ # Ensure top_k and top_p happens before watermarking
+ if "generation_config" in kwargs:
+ generation_config: GenerationConfig = kwargs["generation_config"]
+ top_k = generation_config.top_k
+ top_p = generation_config.top_p
+ generation_config.update(top_p=1.0)
+ else:
+ top_k = kwargs.pop("top_k", None)
+ top_p = kwargs.pop("top_p", None)
+
+ if top_k is not None and top_k != 0:
+ logits_processor.append(TopKLogitsWarper(top_k))
+ if top_p is not None and top_p < 1.0:
+ logits_processor.append(TopPLogitsWarper(top_p))
  if self.kappa != 0:
  logits_processor.append(self.logits_processor)

+ if batched_generate and len(tokd_inputs) >= 8:
+ max_batch_size = self.find_largest_batch_size(tokd_inputs, logits_processor=logits_processor, **kwargs)
+ else:
+ max_batch_size = 1
+
+ # Group inputs by token length
+ if max_batch_size > 1:
+ tokd_inputs_order = sorted(range(len(tokd_inputs)), key=lambda i: tokd_inputs[i]["input_ids"].shape[-1])
+ tokd_inputs = [tokd_inputs[i] for i in tokd_inputs_order]
+ else:
+ tokd_inputs_order = range(len(tokd_inputs))
+ tokd_input_batches = []
+ for i in range(0, len(tokd_inputs), max_batch_size):
+ batch = self.tokenizer.pad(tokd_inputs[i:i+max_batch_size], padding=True, padding_side="left").to(self.model.device, non_blocking=True)
+ tokd_input_batches.append(batch)
+ torch.cuda.synchronize()
+
+ outputs = []
  with torch.no_grad():
- self.logits_processor.reset(n_gram)
- output = self.model.generate(
- **tokd_input,
- max_new_tokens=max_new_tokens,
- do_sample=do_sample,
- logits_processor=logits_processor,
- pad_token_id=self.tokenizer.eos_token_id,
- tokenizer=self.tokenizer,
- **kwargs
- )
- output = output[:,tokd_input["input_ids"].shape[-1]:].cpu()
+ bar = tqdm(total=len(tokd_inputs), desc="Generating text", disable=not use_tqdm)
+ for tokd_input_batch in tokd_input_batches:
+ self.logits_processor.reset(n_gram)
+ output = self.model.generate(
+ **tokd_input_batch,
+ logits_processor=logits_processor,
+ pad_token_id=self.tokenizer.eos_token_id,
+ tokenizer=self.tokenizer,
+ **kwargs
+ )
+ output = output[:,tokd_input_batch["input_ids"].shape[-1]:].to("cpu", non_blocking=True)
+ outputs.append(output)
+ bar.update(tokd_input_batch["input_ids"].shape[0])
+ torch.cuda.synchronize()
+ outputs = [j for i in outputs for j in i] # Flatten the list of outputs
+
+ # Restore original ordering
+ if max_batch_size > 1:
+ reordered_outputs = [None] * len(outputs)
+ num_return_sequences = len(outputs) // len(tokd_inputs)
+ for i, idx in enumerate(tokd_inputs_order):
+ reordered_outputs[idx * num_return_sequences:(idx + 1) * num_return_sequences] = outputs[i * num_return_sequences:(i + 1) * num_return_sequences]
+ outputs = reordered_outputs

  return_dict = {}

  if return_scores:
- cumulative_token_count = self.get_cumulative_token_count(self.id, output, n_gram = n_gram, return_dense=False)
+ cumulative_token_count = self.get_cumulative_token_count(self.id, outputs, n_gram = n_gram, return_dense=False)
  cumulative_token_count = vstack([i[0] for i in cumulative_token_count], format="csr")
  q_score, _, _ = self.watermarking_fn.q(cumulative_token_count, k_p = [self.k_p], use_tqdm=False)
- return_dict["q_score"] = q_score[:,0]
+ return_dict["q_score"] = q_score

  if return_tokens:
- return_dict["tokens"] = output
+ return_dict["tokens"] = outputs

  if return_text:
- decoded_output = self.tokenizer.batch_decode(output, skip_special_tokens=True)
+ decoded_output = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
  decoded_output = [i.strip() for i in decoded_output]
  return_dict["text"] = decoded_output

- if len(output) == 1:
+ if is_single:
  for k, v in return_dict.items():
  return_dict[k] = v[0]

- if return_text and len(return_dict) == 0:
+ if return_text and len(return_dict) == 1:
  return decoded_output

  return return_dict
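Editor's note: with the WatermarkerBase.py changes above, a Watermarker can be built directly from a model name (the tokenizer is resolved automatically) and generate accepts a list of prompts that is padded and batched internally. A hedged usage sketch based on the new signatures; the watermark id and the prompt are placeholders, and the model name is the default used elsewhere in this package:

    from waterfall.WatermarkerBase import Watermarker

    # model may now be a Hugging Face repo id string instead of a loaded PreTrainedModel
    wm = Watermarker(model="meta-llama/Llama-3.1-8B-Instruct", id=42, kappa=6, device="auto")

    out = wm.generate(
        prompts=["Paraphrase this sentence about watermarking."],
        return_text=True,
        return_scores=True,   # adds "q_score" alongside "text" in the returned dict
        use_tqdm=True,        # progress bar over generation batches
    )
    print(out["text"][0], out["q_score"][0])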
waterfall/permute.py CHANGED
@@ -49,16 +49,23 @@ class Permute:
  size_per_permutation_in_bytes = N * self.dtype.itemsize
  cache_size = int(psutil.virtual_memory().total * 0.02 / size_per_permutation_in_bytes) # 2% of total memory
  self.permutations.capacity = cache_size
+ self.no_permutation = np.arange(self.N, dtype=self.dtype)
+
+ def _permute(self, key):
+ return np.random.RandomState(key).permutation(self.N).astype(self.dtype)

  def get_permutation(self, prev_tok, id : int, cache : bool = False) -> np.ndarray:
+ # Skip special tokens
+ if any((i >= self.N for i in prev_tok)):
+ return self.no_permutation
  key = (id, *prev_tok)
  if cache:
  permutation = self.permutations.get(key)
  if permutation is None:
- permutation = np.random.RandomState(key).permutation(self.N).astype(self.dtype)
+ permutation = self._permute(key)
  self.permutations.put(key, permutation)
  else:
- permutation = np.random.RandomState(key).permutation(self.N).astype(self.dtype)
+ permutation = self._permute(key)
  return permutation

  def get_unshuffled_indices(self, ids, args) -> dict[int, np.ndarray]:
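Editor's note: the permutation in _permute is seeded with the tuple (id, *prev_tok), so the same watermark id and preceding n-gram always map to the same vocabulary shuffle. A standalone sketch of that seeding behaviour (not package code; the key values are placeholders):

    import numpy as np

    N = 32000
    key = (42, 17, 93)                        # (id, *prev_tok), placeholder values
    perm_a = np.random.RandomState(key).permutation(N)
    perm_b = np.random.RandomState(key).permutation(N)
    assert (perm_a == perm_b).all()           # deterministic for the same key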
waterfall/watermark.py CHANGED
@@ -8,15 +8,13 @@ from typing import List, Literal, Optional, Tuple

  from transformers import AutoTokenizer, AutoModelForCausalLM
  from transformers.modeling_utils import PreTrainedModel
+ from transformers.generation.configuration_utils import GenerationConfig
  from sentence_transformers import SentenceTransformer
- from tqdm.auto import tqdm

  from waterfall.WatermarkingFnFourier import WatermarkingFnFourier
  from waterfall.WatermarkingFnSquare import WatermarkingFnSquare
  from waterfall.WatermarkerBase import Watermarker

- os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
  PROMPT = (
  "Paraphrase the user provided text while preserving semantic similarity. "
  "Do not include any other sentences in the response, such as explanations of the paraphrasing. "
@@ -24,7 +22,7 @@ PROMPT = (
  )
  PRE_PARAPHRASED = "Here is a paraphrased version of the text while preserving the semantic similarity:\n\n"

- waterfall_cached_watermarking_model = None # Global variable to cache the watermarking model
+ waterfall_cached_watermarking_model: PreTrainedModel | None = None # Global variable to cache the watermarking model

  def detect_gpu() -> str:
  """
@@ -41,47 +39,137 @@ def detect_gpu() -> str:
  else:
  return 'cpu'

- def watermark(
- T_o: str,
- watermarker: Watermarker,
- sts_model: SentenceTransformer,
+ def watermark_texts(
+ T_os: List[str],
+ id: Optional[int] = None,
+ k_p: int = 1,
+ kappa: float = 2.0,
+ model_path: Optional[str] = "meta-llama/Llama-3.1-8B-Instruct",
+ sts_model_path: Optional[str] = "sentence-transformers/all-mpnet-base-v2",
+ watermark_fn: Literal["fourier", "square"] = "fourier",
+ watermarker: Optional[Watermarker] = None,
+ sts_model: Optional[SentenceTransformer] = None,
+ device: str = detect_gpu(),
+ STS_scale: float = 2.0,
+ use_tqdm: bool = False,
+ do_sample: bool = False,
+ temperature: Optional[float] = None,
+ top_p: Optional[float] = None,
+ max_new_tokens: Optional[int] = None,
  num_beam_groups: int = 4,
  beams_per_group: int = 2,
- STS_scale: float = 2.0,
  diversity_penalty: float = 0.5,
- max_new_tokens: Optional[int] = None,
- **kwargs
- ) -> str:
- paraphrasing_prompt = watermarker.tokenizer.apply_chat_template(
- [
- {"role":"system", "content":PROMPT},
- {"role":"user", "content":T_o},
- ], tokenize=False, add_generation_prompt = True) + PRE_PARAPHRASED
+ stop_at_double_newline: bool = True, # if True, will stop generation at the first double newline. Prevent repeated paraphrasing of the same text.
+ ) -> List[str]:
+ if watermark_fn == 'fourier':
+ watermarkingFnClass = WatermarkingFnFourier
+ elif watermark_fn == 'square':
+ watermarkingFnClass = WatermarkingFnSquare
+ else:
+ raise ValueError("Invalid watermarking function")
+
+ # Check if watermarker/model/tokenizer are loaded
+ if watermarker is None:
+ assert model_path is not None, "model_path must be provided if watermarker is not passed"
+ assert id is not None, "id must be provided if watermarker is not passed"
+ global waterfall_cached_watermarking_model
+
+ if isinstance(waterfall_cached_watermarking_model, PreTrainedModel) and waterfall_cached_watermarking_model.name_or_path != model_path:
+ device = waterfall_cached_watermarking_model.device.type
+ waterfall_cached_watermarking_model = None
+ gc.collect()
+ if device == "cuda":
+ torch.cuda.empty_cache()
+ elif device == "mps":
+ torch.mps.empty_cache()
+
+ if waterfall_cached_watermarking_model is None:
+ model = model_path
+ else:
+ model = waterfall_cached_watermarking_model
+
+ watermarker = Watermarker(model=model, id=id, kappa=kappa, k_p=k_p, watermarkingFnClass=watermarkingFnClass)
+ else:
+ device = watermarker.model.device.type
+ id = watermarker.id
+ waterfall_cached_watermarking_model = watermarker.model
+
+ # Check if sts model is loaded
+ if sts_model is None:
+ assert sts_model_path is not None, "sts_model_path must be provided if sts_model is not passed"
+ sts_model = SentenceTransformer(sts_model_path, device=device)
+
+ # Replace all \n\n in source text if stop_at_double_newline is True
+ # Models tend to generate \n\n before endlessly repeating itself, so we want to stop the model from doing that
+ if stop_at_double_newline:
+ for i in range(len(T_os)):
+ if "\n\n" in T_os[i]:
+ logging.warning(f"Text idx {i} contains \\n\\n and stop_at_double_newline is set to True, replacing all \\n\\n in text.")
+ T_os[i] = T_os[i].replace("\n\n", " ") # replace double newlines with space
+
+ # Add system prompt and prefill, and format into appropriate chat format
+ formatted_T_os = watermarker.format_prompt(
+ T_os,
+ system_prompt=PROMPT,
+ assistant_prefill=PRE_PARAPHRASED,
+ )
+
+ if max_new_tokens is None:
+ max_input_len = max(len(p) for p in formatted_T_os)
+ max_new_tokens = max_input_len
+
+ if do_sample:
+ assert (do_sample and temperature is not None and top_p is not None and num_beam_groups == 1 and beams_per_group == 1), \
+ "do_sample=True requires temperature, top_p, num_beam_groups=1 and beams_per_group=1"
+ else: # Using beam search
+ assert (not do_sample and temperature is None and top_p is None and num_beam_groups >= 1 and beams_per_group >= 1), \
+ "do_sample=False requires temperature=None, top_p=None, num_beam_groups>=1 and beams_per_group>=1"
+
+ eos_token_id = watermarker.tokenizer.eos_token_id
+ # add "\n\n" tokens to eos_token_id list
+ if stop_at_double_newline:
+ eos_token_id = [eos_token_id]
+ # llama tokenizer's .vocab() has weird symbols and doesn't work with GenerationConfig's stop_strings, so we have to brute force check all tokens
+ for token_id,string in enumerate(watermarker.tokenizer.batch_decode(torch.arange(watermarker.tokenizer.vocab_size).unsqueeze(1))):
+ if "\n\n" in string:
+ eos_token_id.append(token_id)
+
+ generation_config = GenerationConfig(
+ max_new_tokens=max_new_tokens,
+ do_sample=do_sample,
+ temperature=temperature,
+ top_p=top_p,
+ num_beam_groups=num_beam_groups,
+ num_beams=num_beam_groups * beams_per_group,
+ diversity_penalty=diversity_penalty,
+ eos_token_id=eos_token_id,
+ num_return_sequences=num_beam_groups * beams_per_group,
+ )

  watermarked = watermarker.generate(
- paraphrasing_prompt,
- return_scores = True,
- max_new_tokens = int(len(paraphrasing_prompt) * 1.5) if max_new_tokens is None else max_new_tokens,
- do_sample = False, temperature=None, top_p=None,
- num_beams = num_beam_groups * beams_per_group,
- num_beam_groups = num_beam_groups,
- num_return_sequences = num_beam_groups * beams_per_group,
- diversity_penalty = diversity_penalty,
- **kwargs,
- )
+ prompts=formatted_T_os,
+ return_text=True,
+ return_scores=True,
+ use_tqdm=use_tqdm,
+ generation_config=generation_config,
+ )
+ T_ws = watermarked["text"]
+ # Reshape T_ws to Queries X Beams
+ num_beams = num_beam_groups * beams_per_group
+ T_ws = [T_ws[i * num_beams:(i + 1) * num_beams] for i in range(len(T_os))]

  # Select best paraphrasing based on q_score and semantic similarity
- sts_scores = STS_scorer(T_o, watermarked["text"], sts_model)
- selection_score = sts_scores * STS_scale + torch.from_numpy(watermarked["q_score"])
- selection = torch.argmax(selection_score)
+ sts_scores = STS_scorer_batch(T_os, T_ws, sts_model)
+ selection_scores = sts_scores * STS_scale + torch.from_numpy(watermarked["q_score"]).reshape(-1, num_beams)
+ selections = torch.argmax(selection_scores, dim = -1)

- T_w = watermarked["text"][selection]
+ T_ws = [T_w[selection] for T_w, selection in zip(T_ws, selections)]

- return T_w
+ return T_ws

- def verify_texts(texts: List[str], id: int,
- watermarker: Optional[Watermarker] = None,
- k_p: Optional[int] = None,
+ def verify_texts(texts: List[str], id: int,
+ watermarker: Optional[Watermarker] = None,
+ k_p: Optional[int] = None,
  model_path: Optional[str] = "meta-llama/Llama-3.1-8B-Instruct",
  return_extracted_k_p: bool = False
  ) -> np.ndarray | Tuple[np.ndarray,np.ndarray]:
@@ -89,9 +177,8 @@ def verify_texts(texts: List[str], id: int,

  if watermarker is None:
  assert model_path is not None, "model_path must be provided if watermarker is not passed"
- tokenizer = AutoTokenizer.from_pretrained(model_path)
- watermarker = Watermarker(tokenizer=tokenizer)
-
+ watermarker = Watermarker(tokenizer=model_path)
+
  if k_p is None:
  k_p = watermarker.k_p

@@ -135,87 +222,6 @@ def STS_scorer(
  cos_sim = cos_sim.item()
  return cos_sim

- def watermark_texts(
- T_os: List[str],
- id: Optional[int] = None,
- k_p: int = 1,
- kappa: float = 2.0,
- model_path: str = "meta-llama/Llama-3.1-8B-Instruct",
- torch_dtype: torch.dtype = torch.bfloat16,
- sts_model_path: str = "sentence-transformers/all-mpnet-base-v2",
- watermark_fn: Literal["fourier", "square"] = "fourier",
- watermarker: Optional[Watermarker] = None,
- sts_model: Optional[SentenceTransformer] = None,
- device: str = detect_gpu(),
- num_beam_groups: int = 4,
- beams_per_group: int = 2,
- diversity_penalty: float = 0.5,
- STS_scale:float = 2.0,
- use_tqdm: bool = False,
- stop_at_double_newline: bool = True, # if True, will stop generation at the first double newline. Prevent repeated paraphrasing of the same text.
- ) -> List[str]:
- if watermark_fn == 'fourier':
- watermarkingFnClass = WatermarkingFnFourier
- elif watermark_fn == 'square':
- watermarkingFnClass = WatermarkingFnSquare
- else:
- raise ValueError("Invalid watermarking function")
-
- if watermarker is None:
- assert model_path is not None, "model_path must be provided if watermarker is not passed"
- global waterfall_cached_watermarking_model
-
- if isinstance(waterfall_cached_watermarking_model, PreTrainedModel) and waterfall_cached_watermarking_model.name_or_path != model_path:
- device = waterfall_cached_watermarking_model.device.type
- waterfall_cached_watermarking_model = None
- gc.collect()
- if device == "cuda":
- torch.cuda.empty_cache()
- elif device == "mps":
- torch.mps.empty_cache()
-
- if waterfall_cached_watermarking_model is None:
- waterfall_cached_watermarking_model = AutoModelForCausalLM.from_pretrained(
- model_path,
- torch_dtype=torch_dtype,
- device_map=device,
- )
- model = waterfall_cached_watermarking_model
- tokenizer = AutoTokenizer.from_pretrained(model_path)
-
- watermarker = Watermarker(tokenizer=tokenizer, model=model, id=id, kappa=kappa, k_p=k_p, watermarkingFnClass=watermarkingFnClass)
- else:
- tokenizer = watermarker.tokenizer
- device = watermarker.model.device
- id = watermarker.id
-
- if id is None:
- raise Exception("ID or Watermarker class must be passed to watermark_texts.")
-
- if sts_model is None:
- assert sts_model_path is not None, "sts_model_path must be provided if sts_model is not passed"
- sts_model = SentenceTransformer(sts_model_path, device=device)
-
- T_ws = []
-
- for T_o in tqdm(T_os, desc="Watermarking texts", disable=not use_tqdm):
- if stop_at_double_newline and "\n\n" in T_o:
- logging.warning("Text contains \\n\\n and stop_at_double_newline is set to True, replacing all \\n\\n in text.")
- T_o = T_o.replace("\n\n", " ") # replace double newlines with space
- T_w = watermark(
- T_o,
- watermarker = watermarker,
- sts_model = sts_model,
- num_beam_groups = num_beam_groups,
- beams_per_group = beams_per_group,
- diversity_penalty = diversity_penalty,
- STS_scale = STS_scale,
- stop_strings=["\n\n"] if stop_at_double_newline else None,
- )
- T_ws.append(T_w)
-
- return T_ws
-
  def pretty_print(
  T_o: str, T_w: str,
  sts_score: float,
@@ -303,13 +309,15 @@ def main():
  sts_model = SentenceTransformer(sts_model_name, device=device)

  T_ws = watermark_texts(
- T_os, id, k_p, kappa,
- watermarker=watermarker, sts_model=sts_model,
+ T_os,
+ id=id, k_p=k_p, kappa=kappa,
+ watermarker=watermarker,
+ sts_model=sts_model,
  beams_per_group=beams_per_group,
  num_beam_groups=num_beam_groups,
  diversity_penalty=diversity_penalty,
  STS_scale=STS_scale,
- use_tqdm=True
+ use_tqdm=True,
  )

  # watermarker = Watermarker(tokenizer=tokenizer, model=None, id=id, k_p=k_p, watermarkingFnClass=watermarkingFnClass) # If only verifying the watermark, do not need to instantiate the model
@@ -320,7 +328,7 @@
  # in an IDE or something else without terminal size
  try:
  column_size = os.get_terminal_size().columns
- except OSError as ose:
+ except OSError:
  column_size = 80

  print("=" * column_size)
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: waterfall
- Version: 0.1.7
+ Version: 0.2.1
  Summary: Scalable Framework for Robust Text Watermarking and Provenance for LLMs
  Project-URL: Homepage, https://github.com/aoi3142/Waterfall
  Project-URL: Issues, https://github.com/aoi3142/Waterfall/issues
@@ -15,7 +15,7 @@ Requires-Dist: numpy>=2.0.0
  Requires-Dist: scipy>=1.13.0
  Requires-Dist: sentence-transformers>=3.0.0
  Requires-Dist: torch>=2.3.0
- Requires-Dist: transformers>=4.43.1
+ Requires-Dist: transformers<4.55.0,>=4.43.1
  Description-Content-Type: text/markdown

  # Waterfall: Scalable Framework for Robust Text Watermarking and Provenance for LLMs [EMNLP 2024 Main Long]
@@ -0,0 +1,12 @@
+ waterfall/WatermarkerBase.py,sha256=A2VRfsnBfz6-8DSL2NKQZdM1OLI0sQ73qjYaV6rIgJ0,20822
+ waterfall/WatermarkingFn.py,sha256=-b-kGRdL0a7eKRqJmcHPAR_rCjxQYnsg1Ne6bTwBc1I,1931
+ waterfall/WatermarkingFnFourier.py,sha256=QYayAQYwi1dQkDIyqmvhU568VhrVYTVy47HkI8F8SZs,1358
+ waterfall/WatermarkingFnSquare.py,sha256=2PAO05DdKT02npo7GDf_82D520nP7kGAWK6H4E4JMt4,1638
+ waterfall/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ waterfall/permute.py,sha256=uYKdmn4pGvjB6hInInLGxFIF6vt507lqJ_qe-ST1PFE,2783
+ waterfall/watermark.py,sha256=IbH5r3oqjtKztDVryfDTr_NDn-CLZHow0S8nAEtZmdc,14420
+ waterfall-0.2.1.dist-info/METADATA,sha256=Mzyp7Nw395RLCN3wnzp2StEpKZEN2erb5BvCOd5Z-4I,8722
+ waterfall-0.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ waterfall-0.2.1.dist-info/entry_points.txt,sha256=XXnUzuWXu2nc9j4WAll9tq6HyodN_8WJLjeG0O4Y2Gw,60
+ waterfall-0.2.1.dist-info/licenses/LICENSE,sha256=zAtaO-k41Q-Q4Etl4bzuh7pgNJsPH-dYfzvznRa0OvM,11341
+ waterfall-0.2.1.dist-info/RECORD,,
@@ -1,12 +0,0 @@
- waterfall/WatermarkerBase.py,sha256=NrDo4yJ4gnliTHH3LZemALpU_L-MCaPapevV1YnRHuE,12999
- waterfall/WatermarkingFn.py,sha256=-b-kGRdL0a7eKRqJmcHPAR_rCjxQYnsg1Ne6bTwBc1I,1931
- waterfall/WatermarkingFnFourier.py,sha256=QYayAQYwi1dQkDIyqmvhU568VhrVYTVy47HkI8F8SZs,1358
- waterfall/WatermarkingFnSquare.py,sha256=2PAO05DdKT02npo7GDf_82D520nP7kGAWK6H4E4JMt4,1638
- waterfall/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- waterfall/permute.py,sha256=RwxOHFhx_VSOhhFwy5s79YgwTUBkfW2-LCCXYR3VT2o,2582
- waterfall/watermark.py,sha256=W5jYGqYGOXXO-KLPKzJoin5zC_Xb6Xk9BzsAA9-LKXA,13494
- waterfall-0.1.7.dist-info/METADATA,sha256=-QVkPeyZWXdPHr_SvhvIFyCNy3G2GuzHKPmg9w8Z1-I,8714
- waterfall-0.1.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- waterfall-0.1.7.dist-info/entry_points.txt,sha256=XXnUzuWXu2nc9j4WAll9tq6HyodN_8WJLjeG0O4Y2Gw,60
- waterfall-0.1.7.dist-info/licenses/LICENSE,sha256=zAtaO-k41Q-Q4Etl4bzuh7pgNJsPH-dYfzvznRa0OvM,11341
- waterfall-0.1.7.dist-info/RECORD,,