wisent 0.1.1__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wisent might be problematic. Click here for more details.
- wisent/__init__.py +1 -8
- wisent/benchmarks/__init__.py +0 -0
- wisent/benchmarks/coding/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
- wisent/benchmarks/coding/metrics/evaluator.py +275 -0
- wisent/benchmarks/coding/metrics/passk.py +66 -0
- wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
- wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
- wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
- wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
- wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
- wisent/benchmarks/coding/providers/__init__.py +18 -0
- wisent/benchmarks/coding/providers/core/__init__.py +0 -0
- wisent/benchmarks/coding/providers/core/atoms.py +31 -0
- wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
- wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
- wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
- wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
- wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
- wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
- wisent/classifiers/__init__.py +0 -0
- wisent/classifiers/core/__init__.py +0 -0
- wisent/classifiers/core/atoms.py +747 -0
- wisent/classifiers/models/__init__.py +0 -0
- wisent/classifiers/models/logistic.py +29 -0
- wisent/classifiers/models/mlp.py +47 -0
- wisent/cli/__init__.py +0 -0
- wisent/cli/classifiers/__init__.py +0 -0
- wisent/cli/classifiers/classifier_rotator.py +137 -0
- wisent/cli/cli_logger.py +142 -0
- wisent/cli/data_loaders/__init__.py +0 -0
- wisent/cli/data_loaders/data_loader_rotator.py +96 -0
- wisent/cli/evaluators/__init__.py +0 -0
- wisent/cli/evaluators/evaluator_rotator.py +148 -0
- wisent/cli/steering_methods/__init__.py +0 -0
- wisent/cli/steering_methods/steering_rotator.py +110 -0
- wisent/cli/wisent_cli/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
- wisent/cli/wisent_cli/commands/listing.py +154 -0
- wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
- wisent/cli/wisent_cli/main.py +93 -0
- wisent/cli/wisent_cli/shell.py +80 -0
- wisent/cli/wisent_cli/ui.py +69 -0
- wisent/cli/wisent_cli/util/__init__.py +0 -0
- wisent/cli/wisent_cli/util/aggregations.py +43 -0
- wisent/cli/wisent_cli/util/parsing.py +126 -0
- wisent/cli/wisent_cli/version.py +4 -0
- wisent/core/__init__.py +27 -0
- wisent/core/activations/__init__.py +0 -0
- wisent/core/activations/activations_collector.py +338 -0
- wisent/core/activations/core/__init__.py +0 -0
- wisent/core/activations/core/atoms.py +216 -0
- wisent/core/agent/__init__.py +18 -0
- wisent/core/agent/budget.py +638 -0
- wisent/core/agent/device_benchmarks.py +685 -0
- wisent/core/agent/diagnose/__init__.py +55 -0
- wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
- wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
- wisent/core/agent/diagnose/create_classifier.py +1154 -0
- wisent/core/agent/diagnose/response_diagnostics.py +268 -0
- wisent/core/agent/diagnose/select_classifiers.py +506 -0
- wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
- wisent/core/agent/diagnose/tasks/__init__.py +33 -0
- wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
- wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
- wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
- wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
- wisent/core/agent/diagnose.py +242 -0
- wisent/core/agent/steer.py +212 -0
- wisent/core/agent/timeout.py +134 -0
- wisent/core/autonomous_agent.py +1234 -0
- wisent/core/bigcode_integration.py +583 -0
- wisent/core/contrastive_pairs/__init__.py +15 -0
- wisent/core/contrastive_pairs/core/__init__.py +0 -0
- wisent/core/contrastive_pairs/core/atoms.py +45 -0
- wisent/core/contrastive_pairs/core/buliders.py +59 -0
- wisent/core/contrastive_pairs/core/pair.py +178 -0
- wisent/core/contrastive_pairs/core/response.py +152 -0
- wisent/core/contrastive_pairs/core/serialization.py +300 -0
- wisent/core/contrastive_pairs/core/set.py +133 -0
- wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
- wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
- wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
- wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
- wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
- wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
- wisent/core/data_loaders/__init__.py +0 -0
- wisent/core/data_loaders/core/__init__.py +0 -0
- wisent/core/data_loaders/core/atoms.py +98 -0
- wisent/core/data_loaders/loaders/__init__.py +0 -0
- wisent/core/data_loaders/loaders/custom.py +120 -0
- wisent/core/data_loaders/loaders/lm_loader.py +218 -0
- wisent/core/detection_handling.py +257 -0
- wisent/core/download_full_benchmarks.py +1386 -0
- wisent/core/evaluators/__init__.py +0 -0
- wisent/core/evaluators/oracles/__init__.py +0 -0
- wisent/core/evaluators/oracles/interactive.py +73 -0
- wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
- wisent/core/evaluators/oracles/user_specified.py +67 -0
- wisent/core/hyperparameter_optimizer.py +429 -0
- wisent/core/lm_eval_harness_ground_truth.py +1396 -0
- wisent/core/log_likelihoods_evaluator.py +321 -0
- wisent/core/managed_cached_benchmarks.py +595 -0
- wisent/core/mixed_benchmark_sampler.py +364 -0
- wisent/core/model_config_manager.py +330 -0
- wisent/core/model_persistence.py +317 -0
- wisent/core/models/__init__.py +0 -0
- wisent/core/models/core/__init__.py +0 -0
- wisent/core/models/core/atoms.py +460 -0
- wisent/core/models/wisent_model.py +727 -0
- wisent/core/multi_steering.py +316 -0
- wisent/core/optuna/__init__.py +57 -0
- wisent/core/optuna/classifier/__init__.py +25 -0
- wisent/core/optuna/classifier/activation_generator.py +349 -0
- wisent/core/optuna/classifier/classifier_cache.py +509 -0
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
- wisent/core/optuna/steering/__init__.py +0 -0
- wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
- wisent/core/optuna/steering/data_utils.py +342 -0
- wisent/core/optuna/steering/metrics.py +474 -0
- wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
- wisent/core/optuna/steering/steering_optimization.py +1111 -0
- wisent/core/parser.py +1668 -0
- wisent/core/prompts/__init__.py +0 -0
- wisent/core/prompts/core/__init__.py +0 -0
- wisent/core/prompts/core/atom.py +57 -0
- wisent/core/prompts/core/prompt_formater.py +157 -0
- wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
- wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
- wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
- wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
- wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
- wisent/core/representation.py +5 -0
- wisent/core/sample_size_optimizer.py +648 -0
- wisent/core/sample_size_optimizer_v2.py +355 -0
- wisent/core/save_results.py +277 -0
- wisent/core/steering.py +652 -0
- wisent/core/steering_method.py +26 -0
- wisent/core/steering_methods/__init__.py +0 -0
- wisent/core/steering_methods/core/__init__.py +0 -0
- wisent/core/steering_methods/core/atoms.py +153 -0
- wisent/core/steering_methods/methods/__init__.py +0 -0
- wisent/core/steering_methods/methods/caa.py +44 -0
- wisent/core/steering_optimizer.py +1297 -0
- wisent/core/task_interface.py +132 -0
- wisent/core/task_selector.py +189 -0
- wisent/core/tasks/__init__.py +175 -0
- wisent/core/tasks/aime_task.py +141 -0
- wisent/core/tasks/file_task.py +211 -0
- wisent/core/tasks/hle_task.py +180 -0
- wisent/core/tasks/hmmt_task.py +119 -0
- wisent/core/tasks/livecodebench_task.py +201 -0
- wisent/core/tasks/livemathbench_task.py +158 -0
- wisent/core/tasks/lm_eval_task.py +455 -0
- wisent/core/tasks/math500_task.py +84 -0
- wisent/core/tasks/polymath_task.py +146 -0
- wisent/core/tasks/supergpqa_task.py +220 -0
- wisent/core/time_estimator.py +149 -0
- wisent/core/timing_calibration.py +174 -0
- wisent/core/tracking/__init__.py +54 -0
- wisent/core/tracking/latency.py +618 -0
- wisent/core/tracking/memory.py +359 -0
- wisent/core/trainers/__init__.py +0 -0
- wisent/core/trainers/core/__init__.py +11 -0
- wisent/core/trainers/core/atoms.py +45 -0
- wisent/core/trainers/steering_trainer.py +271 -0
- wisent/core/user_model_config.py +158 -0
- wisent/opti/__init__.py +0 -0
- wisent/opti/core/__init__.py +0 -0
- wisent/opti/core/atoms.py +175 -0
- wisent/opti/methods/__init__.py +0 -0
- wisent/opti/methods/opti_classificator.py +172 -0
- wisent/opti/methods/opti_steering.py +138 -0
- wisent/synthetic/__init__.py +0 -0
- wisent/synthetic/cleaners/__init__.py +0 -0
- wisent/synthetic/cleaners/core/__init__.py +0 -0
- wisent/synthetic/cleaners/core/atoms.py +58 -0
- wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
- wisent/synthetic/cleaners/methods/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
- wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
- wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
- wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
- wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
- wisent/synthetic/db_instructions/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/atoms.py +25 -0
- wisent/synthetic/db_instructions/mini_dp.py +37 -0
- wisent/synthetic/generators/__init__.py +0 -0
- wisent/synthetic/generators/core/__init__.py +0 -0
- wisent/synthetic/generators/core/atoms.py +73 -0
- wisent/synthetic/generators/diversities/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/core.py +68 -0
- wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
- wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
- wisent/synthetic/generators/pairs_generator.py +179 -0
- wisent-0.5.1.dist-info/METADATA +67 -0
- wisent-0.5.1.dist-info/RECORD +218 -0
- {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/WHEEL +1 -1
- {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info/licenses}/LICENSE +2 -2
- wisent/activations/__init__.py +0 -9
- wisent/activations/client.py +0 -97
- wisent/activations/extractor.py +0 -251
- wisent/activations/models.py +0 -95
- wisent/client.py +0 -45
- wisent/control_vector/__init__.py +0 -9
- wisent/control_vector/client.py +0 -85
- wisent/control_vector/manager.py +0 -168
- wisent/control_vector/models.py +0 -70
- wisent/inference/__init__.py +0 -9
- wisent/inference/client.py +0 -103
- wisent/inference/inferencer.py +0 -250
- wisent/inference/models.py +0 -66
- wisent/utils/__init__.py +0 -3
- wisent/utils/auth.py +0 -30
- wisent/utils/http.py +0 -228
- wisent/version.py +0 -3
- wisent-0.1.1.dist-info/METADATA +0 -142
- wisent-0.1.1.dist-info/RECORD +0 -23
- {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import TYPE_CHECKING
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from dataclasses import field, dataclass
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
from wisent_guard.core.contrastive_pairs.core.set import ContrastivePairSet
|
|
8
|
+
|
|
9
|
+
# Public API of this module: the step/pipeline interfaces and their stats records.
__all__ = ["CleanStep", "Cleaner", "CleanStepStats", "CleanerStats"]
|
|
15
|
+
|
|
16
|
+
@dataclass(frozen=True)
class CleanStepStats:
    """
    Immutable counters describing one run of a cleaning step.

    attributes:
        total_items:
            Item count reported by the step (defaults to 0).
        removed_items:
            Count of items the step reports as removed (defaults to 0).
        modified_items:
            Count of items the step reports as modified (defaults to 0).
    """
    total_items: int = field(default=0)
    removed_items: int = field(default=0)
    modified_items: int = field(default=0)
|
|
21
|
+
|
|
22
|
+
@dataclass(frozen=True)
class CleanerStats:
    """
    Statistics for a whole cleaning pipeline.

    attributes:
        step_stats:
            Mapping of a string key to the `CleanStepStats` recorded for it; each instance
            gets its own empty dict by default.
    """
    # NOTE: `frozen=True` only blocks re-assigning the attribute; the dict itself stays mutable.
    step_stats: dict[str, CleanStepStats] = field(default_factory=dict)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class CleanStep(ABC):
    """
    Abstract base for one stage of a cleaning pipeline.

    attributes:
        name:
            Label of the step; "step" unless a subclass overrides it.

    methods:
        stats():
            Report statistics for the step; the base implementation returns
            an all-default `CleanStepStats`.
        apply(items):
            Run the step on a `ContrastivePairSet` and return the resulting set.
            Must be implemented by subclasses.
    """
    name: str = "step"

    def stats(self) -> CleanStepStats:
        """Return step statistics; subclasses override this to report real counters."""
        return CleanStepStats()

    @abstractmethod
    def apply(self, items: ContrastivePairSet) -> ContrastivePairSet:
        """Apply the step to `items` and return the cleaned set (abstract)."""
|
|
49
|
+
|
|
50
|
+
class Cleaner(ABC):
    """
    Abstract cleaning pipeline built from `CleanStep`s.

    methods:
        clean(items):
            Clean a `ContrastivePairSet` and return the cleaned set together with
            the `CleanerStats` of the run. Must be implemented by subclasses.
    """

    @abstractmethod
    def clean(self, items: ContrastivePairSet) -> tuple[ContrastivePairSet, CleanerStats]:
        """Clean `items`; return `(cleaned_items, stats)` (abstract)."""
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from wisent_guard.synthetic.cleaners.core.atoms import CleanStep
|
|
2
|
+
from wisent_guard.synthetic.cleaners.core.atoms import CleanStepStats
|
|
3
|
+
|
|
4
|
+
from wisent_guard.core.contrastive_pairs.core.set import ContrastivePairSet
|
|
5
|
+
from wisent_guard.synthetic.cleaners.methods.core.atoms import Deduper
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Public API of this module.
__all__ = ["DeduperCleaner"]
|
|
11
|
+
|
|
12
|
+
class DeduperCleaner(CleanStep):
    """
    Pipeline step that delegates duplicate removal to a `Deduper`.

    attributes:
        deduper:
            Deduper instance whose `dedupe()` performs the actual deduplication.
    """
    name = "deduper_cleaner"

    def __init__(self, deduper: Deduper) -> None:
        self._deduper = deduper
        # Counters describing the most recent `apply()` call; both are 0 until it runs.
        self._last_total = 0
        self._last_stats = 0  # number of items removed by the last run

    def stats(self) -> CleanStepStats:
        """
        Report what the most recent `apply()` call did.

        returns:
            CleanStepStats whose `total_items` is the input size and whose `removed_items`
            is the number of items dropped during the last deduplication run.
        """
        return CleanStepStats(total_items=self._last_total, removed_items=self._last_stats)

    def apply(self, items: ContrastivePairSet) -> ContrastivePairSet:
        """
        Deduplicate `items` and remember the counters for `stats()`.

        arguments:
            items:
                ContrastivePairSet to deduplicate.

        returns:
            The ContrastivePairSet produced by the deduper.
        """
        size_before = len(items)
        # Recorded before deduplicating, so the input size is kept even if dedupe() raises.
        self._last_total = size_before
        unique_items = self._deduper.dedupe(items)
        self._last_stats = size_before - len(unique_items)
        return unique_items
|
|
File without changes
|
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import unicodedata
|
|
3
|
+
import hashlib
|
|
4
|
+
from collections import Counter, defaultdict
|
|
5
|
+
from typing import Mapping, Sequence, Callable
|
|
6
|
+
|
|
7
|
+
from wisent_guard.synthetic.cleaners.methods.core.atoms import Deduper
|
|
8
|
+
from wisent_guard.core.contrastive_pairs.core.set import ContrastivePairSet
|
|
9
|
+
|
|
10
|
+
# Public API of this module.
__all__ = ["SimHashDeduper"]
|
|
13
|
+
|
|
14
|
+
class SimHashDeduper(Deduper):
    """
    Remove exact and near-duplicate contrastive pairs.

    Every pair is reduced to a 64-bit SimHash fingerprint of selected text fields. Two pairs
    are near-duplicates when their fingerprints differ in at most `threshold_bits` bits.
    Fingerprints are cut into `num_bands` equal bands that act as LSH bucket keys, so only
    pairs sharing at least one band have to be compared.

    attributes:
        threshold_bits:
            Maximum Hamming distance at which two fingerprints count as duplicates.
        fields_to_hash:
            Item fields ("prompt", "positive", "negative") that feed the fingerprint.
        field_weights:
            Optional per-field multiplier for feature weights (missing fields weigh 1.0).
        tokenizer:
            "word", "char", or "auto" (char shingles when CJK/kana/hangul is found, else words).
        word_ngram, char_ngram:
            Shingle sizes for the word and char tokenizers.
        strip_accents:
            Whether combining marks are removed during normalization.
        stopwords:
            Words dropped by the word tokenizer.
        num_bands, band_size:
            Number of LSH bands and the number of bits in each (num_bands * band_size == 64).
        exact_keys:
            Item fields whose exact values identify an exact duplicate.
        key_fn:
            Optional callable that maps an item to the single text to fingerprint,
            replacing `fields_to_hash` / `field_weights`.
    """

    def __init__(
        self,
        threshold_bits: int = 3,
        fields_to_hash: Sequence[str] = ("prompt",),
        field_weights: Mapping[str, float] | None = None,
        tokenizer: str = "auto",  # "auto" | "word" | "char"
        word_ngram: int = 3,
        char_ngram: int = 4,
        strip_accents: bool = True,
        stopwords: set[str] | None = None,
        num_bands: int = 8,  # must be a positive divisor of 64; band_size = 64 / num_bands
        exact_keys: Sequence[str] = ("prompt", "positive", "negative"),
        key_fn: Callable[[Mapping[str, str]], str] | None = None,
    ) -> None:
        """
        Validate and store the configuration.

        arguments:
            stopwords:
                None selects the built-in English list; an empty set disables
                stopword filtering. Other arguments are described on the class.

        raises:
            ValueError: if `num_bands` is not a positive divisor of 64, `tokenizer` is
                unknown, or an n-gram size is below 1.
        """
        # Reject 0 and negative values up front: 0 would raise ZeroDivisionError in the
        # modulo below and negatives would only fail later when building the band masks.
        if num_bands <= 0 or 64 % num_bands != 0:
            raise ValueError("num_bands must divide 64 (e.g., 4, 8, 16, 32).")
        if tokenizer not in {"auto", "word", "char"}:
            raise ValueError("tokenizer must be 'auto', 'word', or 'char'.")
        if word_ngram < 1 or char_ngram < 1:
            raise ValueError("n-gram sizes must be >= 1.")

        self.threshold_bits = threshold_bits
        self.fields_to_hash = tuple(fields_to_hash)
        self.field_weights = dict(field_weights or {})
        self.tokenizer = tokenizer
        self.word_ngram = int(word_ngram)
        self.char_ngram = int(char_ngram)
        self.strip_accents = bool(strip_accents)
        # `is None` (not truthiness) so that an explicit empty set really disables filtering.
        self.stopwords = set(self._default_stopwords() if stopwords is None else stopwords)
        self.num_bands = int(num_bands)
        self.band_size = 64 // self.num_bands
        self.exact_keys = tuple(exact_keys)
        self.key_fn = key_fn

        # Band b of a fingerprint is (fp >> _band_shifts[b]) & _band_masks[b].
        self._band_masks = [(1 << self.band_size) - 1 for _ in range(self.num_bands)]
        self._band_shifts = [i * self.band_size for i in range(self.num_bands)]

        # Han, CJK compatibility ideographs, kana and hangul trigger the char tokenizer in "auto" mode.
        self._re_cjk = re.compile(r"[\u3400-\u9FFF\uF900-\uFAFF\u3040-\u30FF\uAC00-\uD7AF]")

    def dedupe(self, items: ContrastivePairSet) -> ContrastivePairSet:
        """
        Deduplicate items based on exact and near-duplicate similarity of selected fields.

        arguments:
            items: ContrastivePairSet to deduplicate.

        returns:
            New ContrastivePairSet (same name and task_type) holding the first occurrence of
            every duplicate cluster, in input order.

        the processing steps are:
            1) Skip items whose `exact_keys` values match an already kept item.
            2) Compute the 64-bit SimHash fingerprint of the item.
            3) Collect candidate kept items via banded LSH (or all kept items when the
               threshold is too large for LSH to be exhaustive).
            4) Drop the item if any candidate is within `threshold_bits` Hamming distance.
            5) Otherwise keep it and index its fingerprint.
        """
        out: ContrastivePairSet = ContrastivePairSet(
            name=items.name,
            task_type=items.task_type,
        )
        out_fps: list[int] = []
        exact_seen: set[tuple[tuple[str, str], ...]] = set()
        buckets: list[defaultdict[int, list[int]]] = [defaultdict(list) for _ in range(self.num_bands)]

        # Pigeonhole: fingerprints differing in d bits differ in at most d bands, so with
        # d <= threshold_bits < num_bands at least one band is identical and the bucket
        # lookup cannot miss a duplicate. With a larger threshold that guarantee is gone
        # and every kept fingerprint has to be checked.
        lsh_is_exhaustive = self.threshold_bits < self.num_bands

        for it in items.pairs:
            it_dict = {
                "prompt": it.prompt,
                "positive": it.positive_response.model_response,
                "negative": it.negative_response.model_response,
            }
            ex_key = self._exact_key(it_dict)
            if ex_key in exact_seen:
                continue

            fp = self._simhash64_for_item(it_dict)

            if lsh_is_exhaustive:
                band_vals = [
                    (fp >> shift) & mask
                    for shift, mask in zip(self._band_shifts, self._band_masks)
                ]
                candidates: set[int] = set()
                for bucket, band_val in zip(buckets, band_vals):
                    # .get() so that lookups do not create empty defaultdict entries
                    candidates.update(bucket.get(band_val, ()))
                candidate_fps = [out_fps[i] for i in candidates]
            else:
                band_vals = []
                candidate_fps = out_fps

            if any(self._hamming_distance(fp, other) <= self.threshold_bits for other in candidate_fps):
                continue

            idx = len(out_fps)  # position of this item among the kept fingerprints
            out.add(it)
            out_fps.append(fp)
            exact_seen.add(ex_key)
            for bucket, band_val in zip(buckets, band_vals):
                bucket[band_val].append(idx)

        return out

    def _simhash64_for_item(self, item: Mapping[str, str]) -> int:
        """
        Compute the 64-bit SimHash fingerprint for the given item.

        arguments:
            item: mapping of field -> text

        returns:
            64-bit integer SimHash fingerprint. When `key_fn` is set, the text it returns
            is fingerprinted; otherwise the features of every non-empty field in
            `fields_to_hash` are summed, each scaled by its entry in `field_weights`
            (fields with weight 0 are ignored).
        """
        feats: Counter[str] = Counter()
        if self.key_fn:
            feats.update(self._extract_features(self.key_fn(item)))
            return self._simhash64(feats)

        for name in self.fields_to_hash:
            text = item.get(name, "") or ""
            weight = float(self.field_weights.get(name, 1.0))
            if not text or weight == 0.0:
                continue
            for shingle, count in self._extract_features(text).items():
                feats[shingle] += count * weight
        return self._simhash64(feats)

    def _simhash64(self, features: Mapping[str, float]) -> int:
        """
        Compute a 64-bit SimHash fingerprint from weighted features.

        arguments:
            features: mapping of feature -> weight (e.g., shingle -> count)

        returns:
            64-bit integer fingerprint. Each feature votes +weight on the bit positions set
            in its 64-bit hash and -weight on the others; a fingerprint bit is 1 when its
            tally is >= 0. An empty mapping therefore yields all 64 bits set.
        """
        tallies = [0.0] * 64
        for feat, weight in features.items():
            h = self._hash64(feat)
            for bit in range(64):
                if (h >> bit) & 1:
                    tallies[bit] += weight
                else:
                    tallies[bit] -= weight

        fp = 0
        for bit, tally in enumerate(tallies):
            if tally >= 0:
                fp |= 1 << bit
        return fp

    def _extract_features(self, text: str) -> Counter[str]:
        """
        Extract features (shingles) from text based on tokenizer mode.

        arguments:
            text: input string

        returns:
            Counter of features (shingle -> count). Text shorter than the n-gram size
            yields one shingle made of the whole text, and text consisting only of
            stopwords keeps its words; the Counter is empty only when the normalized
            text has no tokens/characters at all. Without this, every short text would
            share the fingerprint of the empty feature set and be dropped as a duplicate.

        example:
            >>> SimHashDeduper()._extract_features("The cat sat on the mat.")
            Counter({'cat sat mat': 1})
            >>> SimHashDeduper()._extract_features("Hello world")
            Counter({'hello world': 1})
            >>> SimHashDeduper(tokenizer="char", char_ngram=3)._extract_features("hello")
            Counter({'hel': 1, 'ell': 1, 'llo': 1})
        """
        t = self._normalize(text)

        if self._pick_mode(t) == "word":
            all_toks = re.findall(r"\w+", t)
            toks = [tok for tok in all_toks if tok not in self.stopwords]
            if not toks:
                # Stopword-only text: fall back to the unfiltered words.
                toks = all_toks
            if not toks:
                return Counter()
            # Clamp so that short texts still produce a single shingle.
            n = min(self.word_ngram, len(toks))
            return Counter(" ".join(toks[i:i + n]) for i in range(len(toks) - n + 1))

        if self.char_ngram == 1:
            return Counter(t.replace(" ", ""))
        # Make spaces visible so that word boundaries are part of the shingles.
        s = re.sub(r"\s+", " ", t).replace(" ", "␠")
        if not s:
            return Counter()
        n = min(self.char_ngram, len(s))
        return Counter(s[i:i + n] for i in range(len(s) - n + 1))

    def _pick_mode(self, text: str) -> str:
        """
        Decide tokenizer mode based on text and config.

        arguments:
            text: input string

        returns:
            "word" or "char"; in "auto" mode "char" is chosen when the text contains a
            character matched by the CJK/kana/hangul pattern.
        """
        if self.tokenizer == "auto":
            return "char" if self._re_cjk.search(text) else "word"
        return self.tokenizer

    def _normalize(self, text: str) -> str:
        """
        Canonicalize text before shingling.

        arguments:
            text: input string

        returns:
            normalized string

        example:
            >>> SimHashDeduper()._normalize("Café at https://example.com!")
            'cafe at <url>'
            >>> SimHashDeduper(strip_accents=False)._normalize("The cat sat on the mat.")
            'the cat sat on the mat.'

        the processing steps are:
            1) Replace URLs with a <URL> token (the match runs to the next whitespace,
               so trailing punctuation is swallowed)
            2) Replace emails with an <EMAIL> token
            3) Unicode NFKC normalization
            4) Casefold (this also lower-cases the tokens inserted in steps 1 and 2)
            5) Optional accent strip (NFKD + remove combining marks)
            6) Collapse whitespace to single spaces, trim leading/trailing
        """
        text = re.sub(r"https?://\S+", " <URL> ", text)
        text = re.sub(r"\b\S+@\S+\b", " <EMAIL> ", text)

        text = unicodedata.normalize("NFKC", text).casefold()

        if self.strip_accents:
            text = unicodedata.normalize("NFKD", text)
            text = "".join(ch for ch in text if not unicodedata.combining(ch))

        return re.sub(r"\s+", " ", text).strip()

    def _hash64(self, s: str) -> int:
        """
        Stable 64-bit hash of a string.

        arguments:
            s: input string

        returns:
            Unsigned 64-bit integer: the 8-byte BLAKE2b digest of the UTF-8 encoded
            string, read big-endian. Unlike the builtin `hash()`, the value does not
            change between interpreter runs.
        """
        digest = hashlib.blake2b(s.encode("utf-8"), digest_size=8).digest()
        return int.from_bytes(digest, "big", signed=False)

    def _hamming_distance(self, a: int, b: int) -> int:
        """
        Compute Hamming distance between two 64-bit integers.

        arguments:
            a, b: 64-bit integers

        returns:
            Hamming distance (number of differing bits)

        intuition:
            XOR leaves a 1 exactly where the two inputs disagree, so the distance is the
            number of set bits in `a ^ b`; e.g. 0b1011 ^ 0b1110 == 0b0101, distance 2.
        """
        x = a ^ b
        # int.bit_count() exists from Python 3.10; keep the string-based fallback for older runtimes.
        return x.bit_count() if hasattr(int, "bit_count") else bin(x).count("1")

    def _exact_key(self, item: Mapping[str, str]) -> tuple[tuple[str, str], ...]:
        """
        Build the hashable key used for exact-duplicate detection.

        arguments:
            item: mapping of field -> text

        returns:
            Sorted tuple of (field, value) pairs for every field in `exact_keys`;
            fields missing from the item contribute an empty string.
        """
        kv = [(k, item.get(k, "")) for k in self.exact_keys]
        return tuple(sorted(kv))

    @staticmethod
    def _default_stopwords() -> set[str]:
        """Return a fresh copy of the built-in English stopword set."""
        return {
            "a", "an", "and", "are", "as", "at", "be", "but", "by", "for",
            "if", "in", "into", "is", "it", "no", "not", "of", "on", "or",
            "such", "that", "the", "their", "then", "there", "these", "they",
            "this", "to", "was", "will", "with", "i", "you", "he", "she", "we",
        }
|