wisent 0.1.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wisent might be problematic. Click here for more details.

Files changed (237) hide show
  1. wisent/__init__.py +1 -8
  2. wisent/benchmarks/__init__.py +0 -0
  3. wisent/benchmarks/coding/__init__.py +0 -0
  4. wisent/benchmarks/coding/metrics/__init__.py +0 -0
  5. wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  6. wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  7. wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  8. wisent/benchmarks/coding/metrics/passk.py +66 -0
  9. wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  10. wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  11. wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  12. wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  13. wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  14. wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  15. wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  16. wisent/benchmarks/coding/providers/__init__.py +18 -0
  17. wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  18. wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  19. wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  20. wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  21. wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  22. wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  23. wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  24. wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  25. wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  26. wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  27. wisent/classifiers/__init__.py +0 -0
  28. wisent/classifiers/core/__init__.py +0 -0
  29. wisent/classifiers/core/atoms.py +747 -0
  30. wisent/classifiers/models/__init__.py +0 -0
  31. wisent/classifiers/models/logistic.py +29 -0
  32. wisent/classifiers/models/mlp.py +47 -0
  33. wisent/cli/__init__.py +0 -0
  34. wisent/cli/classifiers/__init__.py +0 -0
  35. wisent/cli/classifiers/classifier_rotator.py +137 -0
  36. wisent/cli/cli_logger.py +142 -0
  37. wisent/cli/data_loaders/__init__.py +0 -0
  38. wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  39. wisent/cli/evaluators/__init__.py +0 -0
  40. wisent/cli/evaluators/evaluator_rotator.py +148 -0
  41. wisent/cli/steering_methods/__init__.py +0 -0
  42. wisent/cli/steering_methods/steering_rotator.py +110 -0
  43. wisent/cli/wisent_cli/__init__.py +0 -0
  44. wisent/cli/wisent_cli/commands/__init__.py +0 -0
  45. wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  46. wisent/cli/wisent_cli/commands/listing.py +154 -0
  47. wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  48. wisent/cli/wisent_cli/main.py +93 -0
  49. wisent/cli/wisent_cli/shell.py +80 -0
  50. wisent/cli/wisent_cli/ui.py +69 -0
  51. wisent/cli/wisent_cli/util/__init__.py +0 -0
  52. wisent/cli/wisent_cli/util/aggregations.py +43 -0
  53. wisent/cli/wisent_cli/util/parsing.py +126 -0
  54. wisent/cli/wisent_cli/version.py +4 -0
  55. wisent/core/__init__.py +27 -0
  56. wisent/core/activations/__init__.py +0 -0
  57. wisent/core/activations/activations_collector.py +338 -0
  58. wisent/core/activations/core/__init__.py +0 -0
  59. wisent/core/activations/core/atoms.py +216 -0
  60. wisent/core/agent/__init__.py +18 -0
  61. wisent/core/agent/budget.py +638 -0
  62. wisent/core/agent/device_benchmarks.py +685 -0
  63. wisent/core/agent/diagnose/__init__.py +55 -0
  64. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  65. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  66. wisent/core/agent/diagnose/create_classifier.py +1154 -0
  67. wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  68. wisent/core/agent/diagnose/select_classifiers.py +506 -0
  69. wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  70. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  71. wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  72. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  73. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  74. wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  75. wisent/core/agent/diagnose.py +242 -0
  76. wisent/core/agent/steer.py +212 -0
  77. wisent/core/agent/timeout.py +134 -0
  78. wisent/core/autonomous_agent.py +1234 -0
  79. wisent/core/bigcode_integration.py +583 -0
  80. wisent/core/contrastive_pairs/__init__.py +15 -0
  81. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  82. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  83. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  84. wisent/core/contrastive_pairs/core/pair.py +178 -0
  85. wisent/core/contrastive_pairs/core/response.py +152 -0
  86. wisent/core/contrastive_pairs/core/serialization.py +300 -0
  87. wisent/core/contrastive_pairs/core/set.py +133 -0
  88. wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  89. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  90. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  91. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  92. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  93. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  94. wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  95. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  96. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  102. wisent/core/data_loaders/__init__.py +0 -0
  103. wisent/core/data_loaders/core/__init__.py +0 -0
  104. wisent/core/data_loaders/core/atoms.py +98 -0
  105. wisent/core/data_loaders/loaders/__init__.py +0 -0
  106. wisent/core/data_loaders/loaders/custom.py +120 -0
  107. wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  108. wisent/core/detection_handling.py +257 -0
  109. wisent/core/download_full_benchmarks.py +1386 -0
  110. wisent/core/evaluators/__init__.py +0 -0
  111. wisent/core/evaluators/oracles/__init__.py +0 -0
  112. wisent/core/evaluators/oracles/interactive.py +73 -0
  113. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  114. wisent/core/evaluators/oracles/user_specified.py +67 -0
  115. wisent/core/hyperparameter_optimizer.py +429 -0
  116. wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  117. wisent/core/log_likelihoods_evaluator.py +321 -0
  118. wisent/core/managed_cached_benchmarks.py +595 -0
  119. wisent/core/mixed_benchmark_sampler.py +364 -0
  120. wisent/core/model_config_manager.py +330 -0
  121. wisent/core/model_persistence.py +317 -0
  122. wisent/core/models/__init__.py +0 -0
  123. wisent/core/models/core/__init__.py +0 -0
  124. wisent/core/models/core/atoms.py +460 -0
  125. wisent/core/models/wisent_model.py +727 -0
  126. wisent/core/multi_steering.py +316 -0
  127. wisent/core/optuna/__init__.py +57 -0
  128. wisent/core/optuna/classifier/__init__.py +25 -0
  129. wisent/core/optuna/classifier/activation_generator.py +349 -0
  130. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  131. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  132. wisent/core/optuna/steering/__init__.py +0 -0
  133. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  134. wisent/core/optuna/steering/data_utils.py +342 -0
  135. wisent/core/optuna/steering/metrics.py +474 -0
  136. wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  137. wisent/core/optuna/steering/steering_optimization.py +1111 -0
  138. wisent/core/parser.py +1668 -0
  139. wisent/core/prompts/__init__.py +0 -0
  140. wisent/core/prompts/core/__init__.py +0 -0
  141. wisent/core/prompts/core/atom.py +57 -0
  142. wisent/core/prompts/core/prompt_formater.py +157 -0
  143. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  144. wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  145. wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  146. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  147. wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  148. wisent/core/representation.py +5 -0
  149. wisent/core/sample_size_optimizer.py +648 -0
  150. wisent/core/sample_size_optimizer_v2.py +355 -0
  151. wisent/core/save_results.py +277 -0
  152. wisent/core/steering.py +652 -0
  153. wisent/core/steering_method.py +26 -0
  154. wisent/core/steering_methods/__init__.py +0 -0
  155. wisent/core/steering_methods/core/__init__.py +0 -0
  156. wisent/core/steering_methods/core/atoms.py +153 -0
  157. wisent/core/steering_methods/methods/__init__.py +0 -0
  158. wisent/core/steering_methods/methods/caa.py +44 -0
  159. wisent/core/steering_optimizer.py +1297 -0
  160. wisent/core/task_interface.py +132 -0
  161. wisent/core/task_selector.py +189 -0
  162. wisent/core/tasks/__init__.py +175 -0
  163. wisent/core/tasks/aime_task.py +141 -0
  164. wisent/core/tasks/file_task.py +211 -0
  165. wisent/core/tasks/hle_task.py +180 -0
  166. wisent/core/tasks/hmmt_task.py +119 -0
  167. wisent/core/tasks/livecodebench_task.py +201 -0
  168. wisent/core/tasks/livemathbench_task.py +158 -0
  169. wisent/core/tasks/lm_eval_task.py +455 -0
  170. wisent/core/tasks/math500_task.py +84 -0
  171. wisent/core/tasks/polymath_task.py +146 -0
  172. wisent/core/tasks/supergpqa_task.py +220 -0
  173. wisent/core/time_estimator.py +149 -0
  174. wisent/core/timing_calibration.py +174 -0
  175. wisent/core/tracking/__init__.py +54 -0
  176. wisent/core/tracking/latency.py +618 -0
  177. wisent/core/tracking/memory.py +359 -0
  178. wisent/core/trainers/__init__.py +0 -0
  179. wisent/core/trainers/core/__init__.py +11 -0
  180. wisent/core/trainers/core/atoms.py +45 -0
  181. wisent/core/trainers/steering_trainer.py +271 -0
  182. wisent/core/user_model_config.py +158 -0
  183. wisent/opti/__init__.py +0 -0
  184. wisent/opti/core/__init__.py +0 -0
  185. wisent/opti/core/atoms.py +175 -0
  186. wisent/opti/methods/__init__.py +0 -0
  187. wisent/opti/methods/opti_classificator.py +172 -0
  188. wisent/opti/methods/opti_steering.py +138 -0
  189. wisent/synthetic/__init__.py +0 -0
  190. wisent/synthetic/cleaners/__init__.py +0 -0
  191. wisent/synthetic/cleaners/core/__init__.py +0 -0
  192. wisent/synthetic/cleaners/core/atoms.py +58 -0
  193. wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  194. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  195. wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  196. wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  197. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  198. wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  199. wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  200. wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  201. wisent/synthetic/db_instructions/__init__.py +0 -0
  202. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  203. wisent/synthetic/db_instructions/core/atoms.py +25 -0
  204. wisent/synthetic/db_instructions/mini_dp.py +37 -0
  205. wisent/synthetic/generators/__init__.py +0 -0
  206. wisent/synthetic/generators/core/__init__.py +0 -0
  207. wisent/synthetic/generators/core/atoms.py +73 -0
  208. wisent/synthetic/generators/diversities/__init__.py +0 -0
  209. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  210. wisent/synthetic/generators/diversities/core/core.py +68 -0
  211. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  212. wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  213. wisent/synthetic/generators/pairs_generator.py +179 -0
  214. wisent-0.5.2.dist-info/METADATA +67 -0
  215. wisent-0.5.2.dist-info/RECORD +218 -0
  216. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/WHEEL +1 -1
  217. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info/licenses}/LICENSE +2 -2
  218. wisent/activations/__init__.py +0 -9
  219. wisent/activations/client.py +0 -97
  220. wisent/activations/extractor.py +0 -251
  221. wisent/activations/models.py +0 -95
  222. wisent/client.py +0 -45
  223. wisent/control_vector/__init__.py +0 -9
  224. wisent/control_vector/client.py +0 -85
  225. wisent/control_vector/manager.py +0 -168
  226. wisent/control_vector/models.py +0 -70
  227. wisent/inference/__init__.py +0 -9
  228. wisent/inference/client.py +0 -103
  229. wisent/inference/inferencer.py +0 -250
  230. wisent/inference/models.py +0 -66
  231. wisent/utils/__init__.py +0 -3
  232. wisent/utils/auth.py +0 -30
  233. wisent/utils/http.py +0 -228
  234. wisent/version.py +0 -3
  235. wisent-0.1.1.dist-info/METADATA +0 -142
  236. wisent-0.1.1.dist-info/RECORD +0 -23
  237. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,58 @@
1
+ from __future__ import annotations
2
+ from typing import TYPE_CHECKING
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import field, dataclass
5
+
6
+ if TYPE_CHECKING:
7
+ from wisent.core.contrastive_pairs.core.set import ContrastivePairSet
8
+
9
+ __all__ = [
10
+ "CleanStep",
11
+ "Cleaner",
12
+ "CleanStepStats",
13
+ "CleanerStats",
14
+ ]
15
+
16
@dataclass(frozen=True)
class CleanStepStats:
    """
    Immutable counters reported by a single cleaning step.

    attributes:
        total_items:
            Number of items counted for the run (defaults to 0).
        removed_items:
            Number of items removed during the run (defaults to 0).
        modified_items:
            Number of items modified during the run (defaults to 0).
            NOTE(review): which steps populate this is not visible here.
    """
    # Every counter defaults to zero, so `CleanStepStats()` means "nothing recorded".
    total_items: int = 0
    removed_items: int = 0
    modified_items: int = 0
21
+
22
@dataclass(frozen=True)
class CleanerStats:
    """
    Immutable container of per-step statistics for a cleaning run.

    attributes:
        step_stats:
            Mapping of string key -> `CleanStepStats`; presumably keyed by the
            step's `name` (confirm against the concrete cleaners).
    """
    # default_factory gives every instance its own dict instead of a shared one.
    step_stats: dict[str, CleanStepStats] = field(default_factory=dict)
25
+
26
+
27
class CleanStep(ABC):
    """
    One stage of a cleaning pipeline.

    attributes:
        name:
            Name of the step, used in stats and logging.

    methods:
        apply(items):
            Run the step on a `ContrastivePairSet` and return the cleaned set.
            Abstract; every concrete step must implement it.
        stats():
            Return a `CleanStepStats` describing the last run of `apply()`.
    """
    name: str = "step"

    @abstractmethod
    def apply(self, items: ContrastivePairSet) -> ContrastivePairSet:
        """
        Apply the cleaning step to the given items.

        arguments:
            items:
                ContrastivePairSet to clean.

        returns:
            The cleaned ContrastivePairSet.
        """

    def stats(self) -> CleanStepStats:
        """
        Return statistics about the last run of `apply()`.

        returns:
            An all-default (all-zero) `CleanStepStats`; steps that track
            counts override this method.
        """
        return CleanStepStats()
49
+
50
class Cleaner(ABC):
    """
    Cleaning pipeline composed of multiple `CleanStep`s.

    methods:
        clean(items):
            Run the pipeline and return the cleaned set together with its stats.
            Abstract; every concrete cleaner must implement it.
    """

    @abstractmethod
    def clean(self, items: ContrastivePairSet) -> tuple[ContrastivePairSet, CleanerStats]:
        """
        Clean the given items.

        arguments:
            items:
                ContrastivePairSet to clean.

        returns:
            Tuple of (cleaned ContrastivePairSet, CleanerStats for the run).
        """
@@ -0,0 +1,53 @@
1
+ from wisent.synthetic.cleaners.core.atoms import CleanStep
2
+ from wisent.synthetic.cleaners.core.atoms import CleanStepStats
3
+
4
+ from wisent.core.contrastive_pairs.core.set import ContrastivePairSet
5
+ from wisent.synthetic.cleaners.methods.core.atoms import Deduper
6
+
7
+
8
+ __all__ = [
9
+ "DeduperCleaner",
10
+ ]
11
+
12
class DeduperCleaner(CleanStep):
    """
    Cleaning step that removes duplicate items by delegating to a `Deduper`.

    attributes:
        name:
            Step name, "deduper_cleaner".
        deduper:
            Deduper instance to use for deduplication (stored as `_deduper`).
    """
    name = "deduper_cleaner"

    def __init__(self, deduper: Deduper) -> None:
        self._deduper = deduper
        # Counters for the most recent `apply()`; both are 0 until it has run.
        # `_last_stats` holds the number of items removed by that run.
        self._last_stats = 0
        self._last_total = 0

    def stats(self) -> CleanStepStats:
        """
        Return statistics about the last run of `apply()`.

        returns:
            CleanStepStats with total and removed items from the last deduplication run.
        """
        total, removed = self._last_total, self._last_stats
        return CleanStepStats(total_items=total, removed_items=removed)

    def apply(self, items: ContrastivePairSet) -> ContrastivePairSet:
        """
        Apply the deduplication step to the given ContrastivePairSet.

        arguments:
            items:
                ContrastivePairSet to deduplicate.

        returns:
            Deduplicated ContrastivePairSet.
        """
        # Record the input size first so it is updated even if dedupe() raises.
        size_before = len(items)
        self._last_total = size_before
        survivors = self._deduper.dedupe(items)
        self._last_stats = size_before - len(survivors)
        return survivors
File without changes
@@ -0,0 +1,320 @@
1
+ import re
2
+ import unicodedata
3
+ import hashlib
4
+ from collections import Counter, defaultdict
5
+ from typing import Mapping, Sequence, Callable
6
+
7
+ from wisent.synthetic.cleaners.methods.core.atoms import Deduper
8
+ from wisent.core.contrastive_pairs.core.set import ContrastivePairSet
9
+
10
+ __all__ = [
11
+ "SimHashDeduper",
12
+ ]
13
+
14
class SimHashDeduper(Deduper):
    """
    Deduplicate items based on near-duplicate similarity of selected fields.
    Uses SimHash + banded LSH for efficient near-duplicate detection.

    attributes:
        threshold_bits:
            Maximum Hamming distance (in bits) at which two fingerprints are
            treated as duplicates.
        fields_to_hash:
            Item fields whose text contributes to the fingerprint.
        field_weights:
            Optional per-field multiplier for feature weights (default 1.0).
        tokenizer:
            "auto" (char shingles if CJK characters are present, else word
            shingles), "word", or "char".
        word_ngram, char_ngram:
            Shingle sizes for word and char mode.
        strip_accents:
            Whether combining marks are removed during normalization.
        stopwords:
            Tokens dropped in word mode.
        num_bands, band_size:
            The 64-bit fingerprint is split into `num_bands` bands of
            `band_size` bits for LSH bucketing.
        exact_keys:
            Fields that form the exact-duplicate key.
        key_fn:
            Optional callable mapping an item to the text to fingerprint;
            when set it replaces `fields_to_hash` / `field_weights`.
    """

    # Fingerprint width in bits; `_hash64` produces hashes of the same width.
    _FP_BITS = 64

    def __init__(
        self,
        threshold_bits: int = 3,
        fields_to_hash: Sequence[str] = ("prompt",),
        field_weights: Mapping[str, float] | None = None,
        tokenizer: str = "auto",  # "auto" | "word" | "char"
        word_ngram: int = 3,
        char_ngram: int = 4,
        strip_accents: bool = True,
        stopwords: set[str] | None = None,
        num_bands: int = 8,  # 64 must be divisible by num_bands; band_size = 64/num_bands
        exact_keys: Sequence[str] = ("prompt", "positive", "negative"),
        key_fn: Callable[[Mapping[str, str]], str] | None = None,
    ) -> None:
        """
        Validate and store the configuration.

        raises:
            ValueError: if `num_bands` is not a positive divisor of 64, if
                `tokenizer` is not one of the supported modes, or if an
                n-gram size is smaller than 1.
        """
        # `num_bands < 1` is rejected explicitly: 0 would raise ZeroDivisionError
        # in the modulo, and negative divisors (e.g. -8) would pass it and only
        # fail later on a negative shift.
        if num_bands < 1 or self._FP_BITS % num_bands != 0:
            raise ValueError("num_bands must divide 64 (e.g., 4, 8, 16, 32).")
        if tokenizer not in {"auto", "word", "char"}:
            raise ValueError("tokenizer must be 'auto', 'word', or 'char'.")
        if word_ngram < 1 or char_ngram < 1:
            raise ValueError("n-gram sizes must be >= 1.")

        self.threshold_bits = threshold_bits
        self.fields_to_hash = tuple(fields_to_hash)
        self.field_weights = dict(field_weights or {})
        self.tokenizer = tokenizer
        self.word_ngram = int(word_ngram)
        self.char_ngram = int(char_ngram)
        self.strip_accents = bool(strip_accents)
        # Only `None` selects the defaults; an explicit empty set disables
        # stop-word filtering instead of being silently replaced.
        self.stopwords = set(self._default_stopwords() if stopwords is None else stopwords)
        self.num_bands = int(num_bands)
        self.band_size = self._FP_BITS // self.num_bands
        self.exact_keys = tuple(exact_keys)
        self.key_fn = key_fn

        # Precompute band masks/shifts
        self._band_masks = [(1 << self.band_size) - 1 for _ in range(self.num_bands)]
        self._band_shifts = [i * self.band_size for i in range(self.num_bands)]

        # Simple CJK detection regex for "auto" tokenizer
        self._re_cjk = re.compile(r"[\u3400-\u9FFF\uF900-\uFAFF\u3040-\u30FF\uAC00-\uD7AF]")

    def dedupe(self, items: ContrastivePairSet) -> ContrastivePairSet:
        """
        Deduplicate items based on near-duplicate similarity of selected fields.

        arguments:
            items: ContrastivePairSet to deduplicate.

        returns:
            deduplicated ContrastivePairSet (first occurrence kept)

        the processing steps are:
            1) Exact dedup by canonical tuple of exact_keys (e.g., prompt+positive+negative).
            2) For each item, compute 64-bit SimHash fingerprint of selected fields.
            3) Use banded LSH to find candidate near-duplicates.
            4) For candidates, compute exact Hamming distance; if within threshold, treat as duplicate.
            5) Keep first item in each near-duplicate cluster; discard others.
        """
        out: ContrastivePairSet = ContrastivePairSet(
            name=items.name,
            task_type=items.task_type,
        )
        kept_fps: list[int] = []
        exact_seen: set[tuple[tuple[str, str], ...]] = set()
        buckets: list[defaultdict[int, list[int]]] = [
            defaultdict(list) for _ in range(self.num_bands)
        ]

        # Pigeonhole: two fingerprints differing in fewer bits than there are
        # bands must agree on at least one whole band, so the bucket lookup
        # finds every match and no full scan is needed. Otherwise banding can
        # miss matches and every kept fingerprint has to be compared.
        lsh_is_exhaustive = self.threshold_bits < self.num_bands

        for pair in items.pairs:
            fields = {
                "prompt": pair.prompt,
                "positive": pair.positive_response.model_response,
                "negative": pair.negative_response.model_response,
            }
            ex_key = self._exact_key(fields)
            if ex_key in exact_seen:
                continue
            # Recorded for dropped items too: an exact copy of a dropped item
            # has the same fingerprint and would be dropped again anyway.
            exact_seen.add(ex_key)

            fp = self._simhash64_for_item(fields)
            band_vals = self._band_values(fp)

            if lsh_is_exhaustive:
                candidates: set[int] = set()
                for bucket, band_val in zip(buckets, band_vals):
                    # .get() avoids creating empty buckets on lookup.
                    candidates.update(bucket.get(band_val, ()))
                to_check = candidates
            else:
                to_check = range(len(kept_fps))

            if any(
                self._hamming_distance(fp, kept_fps[idx]) <= self.threshold_bits
                for idx in to_check
            ):
                continue

            # Index into kept_fps, which grows in lockstep with `out`.
            idx = len(kept_fps)
            out.add(pair)
            kept_fps.append(fp)
            for bucket, band_val in zip(buckets, band_vals):
                bucket[band_val].append(idx)

        return out

    def _band_values(self, fp: int) -> list[int]:
        """
        Split a fingerprint into its per-band values.

        arguments:
            fp: 64-bit fingerprint

        returns:
            list of `num_bands` integers, lowest bits first
        """
        return [
            (fp >> shift) & mask
            for shift, mask in zip(self._band_shifts, self._band_masks)
        ]

    def _simhash64_for_item(self, item: Mapping[str, str]) -> int:
        """
        Compute 64-bit SimHash fingerprint for the given item.

        arguments:
            item: mapping of field -> text

        returns:
            64-bit integer SimHash fingerprint

        notes:
            If `key_fn` is set, only its output is hashed; otherwise features of
            every field in `fields_to_hash` are accumulated, scaled by the
            field's weight. Fields that are missing, empty, or weighted 0 are
            skipped.

        example:
            >>> deduper = SimHashDeduper(fields_to_hash=("prompt","positive"), field_weights={"prompt":2.0})
            >>> item = {"prompt":"Tell me a joke.","positive":"Here's a joke.","negative":"I can't help."}
            >>> 0 <= deduper._simhash64_for_item(item) < 2**64
            True
        """
        feats: Counter[str] = Counter()
        if self.key_fn:
            feats.update(self._extract_features(self.key_fn(item)))
            return self._simhash64(feats)

        for field in self.fields_to_hash:
            text = item.get(field, "") or ""
            weight = float(self.field_weights.get(field, 1.0))
            if not text or weight == 0.0:
                continue
            # Accumulate into `feats` directly rather than rewriting the
            # extracted Counter while iterating over it.
            for shingle, count in self._extract_features(text).items():
                feats[shingle] += count * weight
        return self._simhash64(feats)

    def _simhash64(self, features: Mapping[str, float]) -> int:
        """
        Compute 64-bit SimHash fingerprint from weighted features.

        arguments:
            features: mapping of feature -> weight (e.g., shingle -> count or tf-idf)

        returns:
            64-bit integer SimHash fingerprint; bit i is set when the weighted
            vote for that bit is >= 0, so an empty mapping yields all 64 bits set.

        example:
            >>> d = SimHashDeduper()
            >>> d._simhash64({"cat": 1.0}) == d._hash64("cat")
            True
            >>> d._simhash64({}) == 2**64 - 1
            True
        """
        bits = self._FP_BITS
        votes = [0.0] * bits
        for feat, weight in features.items():
            h = self._hash64(feat)
            for i in range(bits):
                # A set bit votes +weight for that position, a clear bit -weight.
                if (h >> i) & 1:
                    votes[i] += weight
                else:
                    votes[i] -= weight

        fp = 0
        for i, score in enumerate(votes):
            if score >= 0:
                fp |= 1 << i
        return fp

    def _extract_features(self, text: str) -> Counter[str]:
        """
        Extract features (shingles) from text based on tokenizer mode.

        arguments:
            text: input string

        returns:
            Counter of features (shingle -> count). Text too short to form one
            full n-gram contributes a single feature made of all its tokens
            (word mode) or characters (char mode), so short texts keep distinct
            fingerprints instead of all hashing like the empty string. Only
            text with no usable tokens/characters yields an empty Counter.

        example:
            >>> SimHashDeduper()._extract_features("The cat sat on the mat.")
            Counter({'cat sat mat': 1})
            >>> SimHashDeduper()._extract_features("Hello world")
            Counter({'hello world': 1})
            >>> SimHashDeduper(tokenizer="char", char_ngram=3)._extract_features("hello")
            Counter({'hel': 1, 'ell': 1, 'llo': 1})
        """
        t = self._normalize(text)

        if self._pick_mode(t) == "word":
            toks = [tok for tok in re.findall(r"\w+", t) if tok not in self.stopwords]
            n = self.word_ngram
            if n == 1:
                return Counter(toks)
            if len(toks) < n:
                # Too few tokens for a full shingle: keep what is there.
                return Counter([" ".join(toks)]) if toks else Counter()
            return Counter(" ".join(toks[i:i + n]) for i in range(len(toks) - n + 1))

        n = self.char_ngram
        if n == 1:
            return Counter(t.replace(" ", ""))
        # Spaces become a visible marker so word boundaries survive in shingles.
        s = re.sub(r"\s+", " ", t).replace(" ", "␠")
        if len(s) < n:
            # Too few characters for a full shingle: keep what is there.
            return Counter([s]) if s else Counter()
        return Counter(s[i:i + n] for i in range(len(s) - n + 1))

    def _pick_mode(self, text: str) -> str:
        """
        Decide tokenizer mode based on text and config.

        arguments:
            text: input string

        returns:
            "word" or "char"; in "auto" mode, "char" is chosen when the text
            contains a character matched by the CJK regex.
        """
        if self.tokenizer != "auto":
            return self.tokenizer
        return "char" if self._re_cjk.search(text) else "word"

    def _normalize(self, text: str) -> str:
        """
        Unicode NFKC normalization, casefold, optional accent strip, URL/email strip, whitespace

        arguments:
            text: input string

        returns:
            normalized string

        example:
            >>> SimHashDeduper()._normalize("Café at https://example.com!")
            'cafe at <url>'
            >>> SimHashDeduper(strip_accents=False)._normalize("The cat sat on the mat.")
            'the cat sat on the mat.'

        the processing steps are:
            1) Replace URLs with <URL> token (trailing punctuation attached to
               the URL is consumed with it)
            2) Replace emails with <EMAIL> token
            3) Unicode NFKC normalization
            4) Casefold (this also lowercases the placeholder tokens)
            5) Optional accent strip (NFKD + remove combining marks)
            6) Collapse whitespace to single spaces, trim leading/trailing
        """
        text = re.sub(r"https?://\S+", " <URL> ", text)
        text = re.sub(r"\b\S+@\S+\b", " <EMAIL> ", text)

        text = unicodedata.normalize("NFKC", text).casefold()

        if self.strip_accents:
            decomposed = unicodedata.normalize("NFKD", text)
            text = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

        return re.sub(r"\s+", " ", text).strip()

    def _hash64(self, s: str) -> int:
        """
        Stable 64-bit hash of a string.

        arguments:
            s: input string

        returns:
            64-bit unsigned integer: the 8-byte BLAKE2b digest of the UTF-8
            encoded string, read big-endian

        example:
            >>> 0 <= SimHashDeduper()._hash64("wisent_guard") < 2**64
            True
        """
        digest = hashlib.blake2b(s.encode("utf-8"), digest_size=8).digest()
        return int.from_bytes(digest, "big", signed=False)

    def _hamming_distance(self, a: int, b: int) -> int:
        """
        Compute Hamming distance between two 64-bit integers.

        arguments:
            a, b: 64-bit integers

        returns:
            Hamming distance (number of differing bits)

        intuition:
            XOR the two integers; the number of set bits in the result is the Hamming distance
            For example, let word_1 = "hause" and word_2 = "mause", then
                a = hash64("hause") = 0b110100101011... (64 bits)
                b = hash64("mause") = 0b110100111011... (64 bits)
                a ^ b = 0b000000110000... (64 bits)
            The number of 1s in a ^ b is the Hamming distance, so here it is 2.
        """
        # int.bit_count() exists on every Python that can import this module:
        # the `X | None` annotations above are evaluated at definition time and
        # already require 3.10+.
        return (a ^ b).bit_count()

    def _exact_key(self, item: Mapping[str, str]) -> tuple[tuple[str, str], ...]:
        """
        Build the exact-duplicate key for an item.

        arguments:
            item: mapping of field -> text

        returns:
            tuple of (field, text) pairs for every field in `exact_keys`,
            sorted so the key does not depend on the order of `exact_keys`;
            missing fields contribute an empty string.
        """
        return tuple(sorted((k, item.get(k, "")) for k in self.exact_keys))

    @staticmethod
    def _default_stopwords() -> set[str]:
        """Return a fresh set of the built-in English stop words."""
        return {
            "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
            "if", "in", "into", "is", "it", "no", "not", "of", "on", "or",
            "such", "that", "the", "their", "then", "there", "these", "they",
            "this", "to", "was", "will", "with", "i", "you", "he", "she", "we",
        }