sum-engine 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- internal/__init__.py +8 -0
- internal/algorithms/__init__.py +1 -0
- internal/algorithms/causal_discovery.py +96 -0
- internal/algorithms/predicate_canon.py +137 -0
- internal/algorithms/semantic_arithmetic.py +890 -0
- internal/algorithms/syntactic_sieve.py +452 -0
- internal/algorithms/zk_semantics.py +90 -0
- internal/ensemble/__init__.py +1 -0
- internal/ensemble/automated_scientist.py +138 -0
- internal/ensemble/autonomous_agent.py +157 -0
- internal/ensemble/causal_triggers.py +121 -0
- internal/ensemble/confidence_calibrator.py +284 -0
- internal/ensemble/epistemic_arbiter.py +159 -0
- internal/ensemble/epistemic_loop.py +136 -0
- internal/ensemble/extraction_validator.py +172 -0
- internal/ensemble/gauge_orchestrator.py +150 -0
- internal/ensemble/live_llm_adapter.py +183 -0
- internal/ensemble/llm_entailment.py +117 -0
- internal/ensemble/mass_semantic_engine.py +138 -0
- internal/ensemble/ouroboros.py +281 -0
- internal/ensemble/semantic_dedup.py +261 -0
- internal/ensemble/tome_generator.py +286 -0
- internal/ensemble/tome_sliders.py +104 -0
- internal/ensemble/vector_bridge.py +195 -0
- internal/ensemble/venn_abers.py +211 -0
- internal/infrastructure/__init__.py +1 -0
- internal/infrastructure/akashic_ledger.py +812 -0
- internal/infrastructure/canonical_codec.py +452 -0
- internal/infrastructure/jcs.py +115 -0
- internal/infrastructure/key_manager.py +239 -0
- internal/infrastructure/p2p_mesh.py +168 -0
- internal/infrastructure/prov_o.py +159 -0
- internal/infrastructure/provenance.py +181 -0
- internal/infrastructure/rate_limiter.py +81 -0
- internal/infrastructure/resource_guards.py +117 -0
- internal/infrastructure/scheme_registry.py +136 -0
- internal/infrastructure/state_encoding.py +94 -0
- internal/infrastructure/telemetry.py +91 -0
- internal/infrastructure/tome_parser.py +55 -0
- internal/infrastructure/verifiable_credential.py +412 -0
- internal/infrastructure/zig_bridge.py +256 -0
- sum_cli/__init__.py +18 -0
- sum_cli/main.py +688 -0
- sum_engine-0.1.0.dist-info/METADATA +590 -0
- sum_engine-0.1.0.dist-info/RECORD +49 -0
- sum_engine-0.1.0.dist-info/WHEEL +5 -0
- sum_engine-0.1.0.dist-info/entry_points.txt +2 -0
- sum_engine-0.1.0.dist-info/licenses/LICENSE +201 -0
- sum_engine-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Ouroboros Verifier — Proof of Semantic Conservation
|
|
3
|
+
|
|
4
|
+
Proves that the Gödel-State Engine performs lossless round-tripping
|
|
5
|
+
over its canonical axiom representation:
|
|
6
|
+
|
|
7
|
+
Integer A → Canonical Tome → Parse Axiom Keys → Integer B
|
|
8
|
+
|
|
9
|
+
If ``A == B``, semantic mass is conserved through the encode-decode cycle.
|
|
10
|
+
|
|
11
|
+
The proof operates on the **canonical layer**: the tome renderer emits
|
|
12
|
+
deterministic ``"The {s} {p} {o}."`` sentences, and the verifier parses
|
|
13
|
+
those exact templates back into axiom keys. The NLP sieve is **never**
|
|
14
|
+
used for the proof — it is a lossy projection (lemmatization, POS
|
|
15
|
+
parsing) that would break the bijection.
|
|
16
|
+
|
|
17
|
+
For ``verify_from_text``, the sieve encodes the initial text into
|
|
18
|
+
Integer A, but the conservation proof from A onward uses the canonical
|
|
19
|
+
path exclusively.
|
|
20
|
+
|
|
21
|
+
Diagnostics:
|
|
22
|
+
When the round-trip fails, the verifier reports exactly which axioms
|
|
23
|
+
were lost, which were spuriously added, and counts.
|
|
24
|
+
|
|
25
|
+
Phase 14: The Ouroboros Protocol.
|
|
26
|
+
|
|
27
|
+
Author: ototao
|
|
28
|
+
License: Apache License 2.0
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
import math
|
|
32
|
+
import re
|
|
33
|
+
import logging
|
|
34
|
+
from datetime import datetime, timezone
|
|
35
|
+
from typing import List, Tuple
|
|
36
|
+
from dataclasses import dataclass, field
|
|
37
|
+
|
|
38
|
+
from internal.algorithms.semantic_arithmetic import GodelStateAlgebra
|
|
39
|
+
from internal.algorithms.syntactic_sieve import DeterministicSieve
|
|
40
|
+
from internal.ensemble.tome_generator import (
|
|
41
|
+
AutoregressiveTomeGenerator,
|
|
42
|
+
CANONICAL_FORMAT_VERSION,
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
logger = logging.getLogger(__name__)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _zig():
|
|
49
|
+
try:
|
|
50
|
+
from internal.infrastructure.zig_bridge import zig_engine
|
|
51
|
+
return zig_engine
|
|
52
|
+
except ImportError:
|
|
53
|
+
return None
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class ConservationProof:
    """
    Outcome record of one semantic-conservation round-trip.

    A proof compares "Integer A" (the state that was rendered to a canonical
    tome) with "Integer B" (the state re-encoded from that tome) and carries
    the diagnostics needed to explain any divergence.
    """

    # --- Verdict and metadata -------------------------------------------
    is_conserved: bool  # True if the round-trip is lossless
    format_version: str  # canonical format version used for the proof
    proof_mode: str  # always "canonical" for conservation proofs
    timestamp: str  # ISO 8601 timestamp of verification

    # --- The two states being compared ----------------------------------
    source_state: int  # Integer A (original encoding)
    reconstructed_state: int  # Integer B (re-encoded from canonical tome)
    state_a_digits: int  # digit count of Integer A
    state_b_digits: int  # digit count of Integer B

    # --- Axiom-level diagnostics ----------------------------------------
    source_axiom_count: int  # number of axioms in A
    reconstructed_axiom_count: int  # number of axioms in B
    # Axioms in A but not in B (lost in decode).
    missing_axioms: List[str] = field(default_factory=list)
    # Axioms in B but not in A (spuriously added).
    extra_axioms: List[str] = field(default_factory=list)
    # The intermediate canonical text.
    canonical_tome: str = ""
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class OuroborosVerifier:
    """
    Proves Lossless Semantic Conservation.

    The proof path is:
        Integer A → Canonical Tome (deterministic template) →
        Parse "The {s} {p} {o}." lines → Integer B

    The NLP sieve is only used for the initial text→triplets encoding in
    ``verify_from_text``; the conservation proof itself never calls it.
    """

    # Supported canonical format versions
    SUPPORTED_VERSIONS = {"1.0.0"}

    # Regex to extract axiom components from canonical "The {s} {p} {o}." lines
    _CANONICAL_LINE_RE = re.compile(r"^The (\S+) (\S+) (.+)\.$")
    # Regex to extract version header
    _VERSION_RE = re.compile(r"^@canonical_version:\s*(.+)$")

    def __init__(
        self,
        algebra: GodelStateAlgebra,
        sieve: DeterministicSieve,
        tome_generator: AutoregressiveTomeGenerator,
    ):
        """Store the collaborators used by the verification pipeline.

        Args:
            algebra: Provides ``get_or_mint_prime`` and ``prime_to_axiom``.
            sieve: Provides ``extract_triplets`` (used by ``verify_from_text``).
            tome_generator: Provides ``generate_canonical``.
        """
        self.algebra = algebra
        self.sieve = sieve
        self.tome_generator = tome_generator

    @staticmethod
    def _digit_count(value: int) -> int:
        """Return ``len(str(value))`` without converting ``value`` to text.

        ``str(int)`` raises ``ValueError`` once the integer exceeds the
        interpreter's integer string conversion limit (4300 digits by default
        on Python 3.11+), so the length is derived arithmetically instead.
        A leading minus sign is counted, exactly as ``len(str(value))`` would.
        """
        sign = 1 if value < 0 else 0
        magnitude = abs(value)
        if magnitude < 10:
            return 1 + sign
        # 2**(bits-1) <= magnitude gives a floating-point estimate of
        # floor(log10(magnitude)); the loops below correct any rounding error.
        digits = max(1, int((magnitude.bit_length() - 1) * math.log10(2)))
        while 10 ** digits <= magnitude:
            digits += 1
        while 10 ** (digits - 1) > magnitude:
            digits -= 1
        return digits + sign

    @staticmethod
    def _absorb_prime(state: int, prime: int, zig=None) -> int:
        """Return ``state`` with ``prime`` folded in via LCM.

        If ``prime`` already divides ``state`` the state is returned as is.
        Otherwise ``zig.bigint_lcm`` is tried first when ``zig`` is truthy,
        and ``math.lcm`` is used when no engine is given or it returns ``None``.
        """
        if state % prime == 0:
            return state
        accelerated = zig.bigint_lcm(state, prime) if zig else None
        if accelerated is not None:
            return accelerated
        return math.lcm(state, prime)

    def _reconstruct_from_canonical(
        self, canonical_tome: str
    ) -> Tuple[int, List[str], str]:
        """
        Parse canonical tome text back into axiom keys and re-encode.

        The expected line formats are::

            @canonical_version: 1.0.0
            The {subject} {predicate} {object}.

        Lines matching neither pattern are ignored. A version that is not in
        ``SUPPORTED_VERSIONS`` (including a missing header, reported as
        ``"unknown"``) is logged as a warning; parsing still proceeds.

        Returns:
            (reconstructed_state, list_of_axiom_keys, format_version)
        """
        state = 1
        axiom_keys: List[str] = []
        format_version = "unknown"
        zig = _zig()  # resolve the optional engine once, not per line

        for raw_line in canonical_tome.splitlines():
            line = raw_line.strip()

            # Check for version header
            version_match = self._VERSION_RE.match(line)
            if version_match:
                format_version = version_match.group(1).strip()
                continue

            axiom_match = self._CANONICAL_LINE_RE.match(line)
            if not axiom_match:
                continue

            s, p, o = axiom_match.groups()
            prime = self.algebra.get_or_mint_prime(s, p, o)
            state = self._absorb_prime(state, prime, zig)
            axiom_keys.append(f"{s}||{p}||{o}")

        if format_version not in self.SUPPORTED_VERSIONS:
            logger.warning(
                "Ouroboros: canonical format version %r is not one of the "
                "supported versions %s",
                format_version, sorted(self.SUPPORTED_VERSIONS),
            )

        return state, axiom_keys, format_version

    def _extract_axiom_keys(self, state: int) -> set:
        """Extract the set of axiom keys whose primes divide the state."""
        return {
            axiom
            for prime, axiom in self.algebra.prime_to_axiom.items()
            if state % prime == 0
        }

    def verify_from_state(self, target_state: int) -> ConservationProof:
        """
        Verify lossless conservation of a Gödel Integer.

        Pipeline:
            Integer A → Canonical Tome → Parse Axiom Keys → Integer B
            Conservation iff A == B

        Args:
            target_state: The Gödel integer to verify.

        Returns:
            A ``ConservationProof`` with full diagnostics.
        """
        # Step 1: Canonical decode
        canonical_tome = self.tome_generator.generate_canonical(
            target_state, "Ouroboros Verification Tome"
        )

        # Step 2: Re-encode from canonical template lines (NOT via NLP sieve)
        reconstructed_state, _, format_version = self._reconstruct_from_canonical(
            canonical_tome
        )

        # Step 3: Diagnose
        source_axioms = self._extract_axiom_keys(target_state)
        reconstructed_axioms = self._extract_axiom_keys(reconstructed_state)

        missing = sorted(source_axioms - reconstructed_axioms)
        extra = sorted(reconstructed_axioms - source_axioms)

        is_conserved = target_state == reconstructed_state

        proof = ConservationProof(
            is_conserved=is_conserved,
            format_version=format_version,
            proof_mode="canonical",
            timestamp=datetime.now(timezone.utc).isoformat(),
            source_state=target_state,
            reconstructed_state=reconstructed_state,
            # Arithmetic digit count: str() fails on very large states.
            state_a_digits=self._digit_count(target_state),
            state_b_digits=self._digit_count(reconstructed_state),
            source_axiom_count=len(source_axioms),
            reconstructed_axiom_count=len(reconstructed_axioms),
            missing_axioms=missing,
            extra_axioms=extra,
            canonical_tome=canonical_tome,
        )

        if is_conserved:
            logger.info(
                "Ouroboros Proof: CONSERVED — %d axioms round-tripped losslessly.",
                len(source_axioms),
            )
        else:
            logger.warning(
                "Ouroboros Proof: DIVERGED — missing=%d, extra=%d",
                len(missing), len(extra),
            )

        return proof

    def verify_from_text(self, text: str) -> ConservationProof:
        """
        Full Ouroboros: Text → Sieve → Integer A → Canonical → Parse → Integer B.

        The sieve is used ONLY for the initial text→triplets encoding; the
        conservation check from Integer A onward is ``verify_from_state``.

        Args:
            text: Raw input text.

        Returns:
            A ``ConservationProof`` with full diagnostics.
        """
        # Step 1: Text → Triplets → Integer A (via NLP sieve)
        zig = _zig()
        state_a = 1
        for s, p, o in self.sieve.extract_triplets(text):
            prime = self.algebra.get_or_mint_prime(s, p, o)
            state_a = self._absorb_prime(state_a, prime, zig)

        # Step 2: Verify conservation from Integer A (canonical path only)
        return self.verify_from_state(state_a)

    def proof_to_dict(self, proof: ConservationProof) -> dict:
        """Serialize a ConservationProof for API responses.

        The raw state integers and the canonical tome are not included; only
        their digit counts and a ``states_match`` flag are emitted.
        """
        return {
            "is_conserved": proof.is_conserved,
            "format_version": proof.format_version,
            "proof_mode": proof.proof_mode,
            "timestamp": proof.timestamp,
            "state_a_digits": proof.state_a_digits,
            "state_b_digits": proof.state_b_digits,
            "source_axiom_count": proof.source_axiom_count,
            "reconstructed_axiom_count": proof.reconstructed_axiom_count,
            "missing_axioms": proof.missing_axioms,
            "extra_axioms": proof.extra_axioms,
            "states_match": proof.source_state == proof.reconstructed_state,
        }
|
|
281
|
+
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Semantic Deduplicator — Near-Duplicate Axiom Detection
|
|
3
|
+
|
|
4
|
+
Phase 25: Prevents state bloat from semantically equivalent axioms
|
|
5
|
+
that differ only in surface text (e.g., "orbits" vs "revolves_around",
|
|
6
|
+
"New York" vs "new york").
|
|
7
|
+
|
|
8
|
+
Uses zero-cost string similarity:
|
|
9
|
+
1. Normalization — lowercase, strip, collapse whitespace
|
|
10
|
+
2. Jaccard token overlap — set intersection / union of words
|
|
11
|
+
3. Levenshtein ratio — 1 − (edit distance / max length)
|
|
12
|
+
|
|
13
|
+
No LLM or embedding calls required.
|
|
14
|
+
|
|
15
|
+
Usage:
|
|
16
|
+
dedup = SemanticDeduplicator()
|
|
17
|
+
result = dedup.deduplicate(
|
|
18
|
+
"earth||revolves_around||sun",
|
|
19
|
+
existing_axioms=["earth||orbits||sun", "mars||has||moons"],
|
|
20
|
+
)
|
|
21
|
+
# result = DedupResult(canonical_key="earth||orbits||sun", is_duplicate=True, ...)
|
|
22
|
+
|
|
23
|
+
Author: ototao
|
|
24
|
+
License: Apache License 2.0
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
import re
|
|
28
|
+
import logging
|
|
29
|
+
from dataclasses import dataclass
|
|
30
|
+
from typing import List, Optional
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# ─── Predicate Synonym Groups ────────────────────────────────────────
|
|
36
|
+
# Common predicates that are semantically equivalent.
|
|
37
|
+
# Each group maps to its canonical form (first element).
|
|
38
|
+
|
|
39
|
+
# One tuple per synonym group; the first entry is the canonical predicate
# and every entry (canonical included) maps onto it.
_PREDICATE_GROUPS = (
    ("orbits", "revolves_around", "circles", "goes_around"),
    ("is_a", "is_an", "is_type", "type_of", "kind_of"),
    ("has", "has_a", "possesses", "owns"),
    ("contains", "includes", "has_part"),
    ("located_in", "lives_in", "resides_in", "is_in", "situated_in"),
    ("created_by", "made_by", "authored_by", "written_by", "invented_by"),
    ("causes", "leads_to", "results_in", "produces"),
    ("part_of", "belongs_to", "member_of", "component_of"),
)

# Flat lookup table: predicate (canonical or alias) → canonical predicate.
PREDICATE_SYNONYMS = {
    alias: group[0]
    for group in _PREDICATE_GROUPS
    for alias in group
}

# Default similarity cut-off for reporting a fuzzy near-duplicate.
DEFAULT_THRESHOLD = 0.80
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
@dataclass
class DedupResult:
    """Outcome of checking one axiom key against the existing axioms.

    Attributes:
        canonical_key: The resolved canonical axiom key.
        is_duplicate: True if a near-duplicate was found.
        duplicate_of: The existing axiom it duplicates ('' if none).
        similarity: Similarity score (0.0 if not duplicate).
        method: Which signal triggered the match:
            'exact' | 'predicate' | 'fuzzy' | 'none'.
    """

    canonical_key: str
    is_duplicate: bool
    duplicate_of: str
    similarity: float
    method: str
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class SemanticDeduplicator:
    """Zero-cost semantic deduplication engine.

    Detects near-duplicate axioms using layered string similarity:
      1. Exact match after surface normalization (case, whitespace, underscores)
      2. Match that only holds after predicate synonym canonicalization
      3. Fuzzy matching via Jaccard + Levenshtein
    """

    @staticmethod
    def _surface_parts(axiom_key: str) -> Optional[tuple]:
        """Return the cleaned ``(subject, predicate, object)`` of a key.

        Each part is stripped, lowercased, and has runs of whitespace or
        underscores collapsed to a single ``_`` (leading/trailing ``_``
        removed). No synonym mapping is applied. Returns ``None`` when the
        key does not split into exactly three ``||``-separated parts.
        """
        parts = axiom_key.split("||")
        if len(parts) != 3:
            return None
        return tuple(
            re.sub(r"[\s_]+", "_", part.strip().lower()).strip("_")
            for part in parts
        )

    def normalize(self, axiom_key: str) -> str:
        """Normalize axiom key to canonical form.

        - Lowercase
        - Strip whitespace
        - Collapse multiple underscores/spaces
        - Canonicalize predicate via synonym table

        Keys that are not ``subject||predicate||object`` triples are only
        stripped and lowercased.
        """
        parts = self._surface_parts(axiom_key)
        if parts is None:
            return axiom_key.strip().lower()

        s, p, o = parts
        return f"{s}||{PREDICATE_SYNONYMS.get(p, p)}||{o}"

    @staticmethod
    def _tokenize(s: str) -> set:
        """Split a string into the set of its non-empty word tokens.

        Tokens are separated by runs of ``_``, ``|`` or whitespace.
        ``re.split`` yields empty strings when a separator sits at the start
        or end of the input (and for an empty input); those are discarded so
        they cannot count as a shared "word".
        """
        return {token for token in re.split(r"[_\|\s]+", s.lower()) if token}

    @staticmethod
    def jaccard_similarity(a: str, b: str) -> float:
        """Jaccard token-overlap coefficient.

        |A ∩ B| / |A ∪ B|

        Two inputs with no tokens at all score 1.0; exactly one token-less
        input scores 0.0.
        """
        tokens_a = SemanticDeduplicator._tokenize(a)
        tokens_b = SemanticDeduplicator._tokenize(b)

        if not tokens_a and not tokens_b:
            return 1.0
        if not tokens_a or not tokens_b:
            return 0.0

        return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)

    @staticmethod
    def levenshtein_ratio(a: str, b: str) -> float:
        """Edit distance ratio: 1.0 = identical, 0.0 = completely different.

        Computed as ``1 - distance / max(len(a), len(b))`` using dynamic
        programming Levenshtein distance with two rolling rows.
        """
        if a == b:
            return 1.0
        if not a or not b:
            return 0.0

        m, n = len(a), len(b)

        prev = list(range(n + 1))
        curr = [0] * (n + 1)

        for i, char_a in enumerate(a, start=1):
            curr[0] = i
            for j, char_b in enumerate(b, start=1):
                cost = 0 if char_a == char_b else 1
                curr[j] = min(
                    prev[j] + 1,         # deletion
                    curr[j - 1] + 1,     # insertion
                    prev[j - 1] + cost,  # substitution
                )
            # Every cell of the new `curr` is overwritten on the next pass,
            # so the two rows can simply be swapped instead of reallocated.
            prev, curr = curr, prev

        distance = prev[n]
        return 1.0 - (distance / max(m, n))

    def combined_similarity(self, a: str, b: str) -> float:
        """Weighted combination of Jaccard and Levenshtein.

        Weights: 60% Jaccard (semantic), 40% Levenshtein (syntactic).
        """
        jaccard = self.jaccard_similarity(a, b)
        levenshtein = self.levenshtein_ratio(a, b)
        return 0.6 * jaccard + 0.4 * levenshtein

    def find_near_duplicates(
        self,
        axiom_key: str,
        existing_axioms: List[str],
        threshold: float = DEFAULT_THRESHOLD,
    ) -> List[tuple]:
        """Find axioms similar to axiom_key above the threshold.

        Returns list of (existing_axiom, similarity, method) sorted by
        similarity descending. ``method`` is:

        - ``"exact"``: keys are identical after surface normalization;
        - ``"predicate"``: keys become identical only once the predicate is
          mapped through the synonym table;
        - ``"fuzzy"``: combined similarity is at least ``threshold``.
        """
        normalized = self.normalize(axiom_key)
        surface = self._surface_parts(axiom_key)
        results = []

        for existing in existing_axioms:
            norm_existing = self.normalize(existing)

            if normalized == norm_existing:
                # Layers 1 and 2: the canonical forms agree. It is an exact
                # match when the pre-synonym forms agree too (non-triple keys
                # have no predicate, so both sides are None); otherwise the
                # synonym table is what made them equal.
                if surface == self._surface_parts(existing):
                    method = "exact"
                else:
                    method = "predicate"
                results.append((existing, 1.0, method))
                continue

            # Layer 3: Fuzzy similarity
            sim = self.combined_similarity(normalized, norm_existing)
            if sim >= threshold:
                results.append((existing, sim, "fuzzy"))

        results.sort(key=lambda match: match[1], reverse=True)
        return results

    def deduplicate(
        self,
        axiom_key: str,
        existing_axioms: List[str],
        threshold: float = DEFAULT_THRESHOLD,
    ) -> DedupResult:
        """Check if axiom_key is a near-duplicate of any existing axiom.

        Returns a DedupResult. When a match is found, ``canonical_key`` is the
        normalized form of the best-scoring existing axiom; otherwise it is
        the normalized form of ``axiom_key`` itself.
        """
        duplicates = self.find_near_duplicates(
            axiom_key, existing_axioms, threshold
        )

        if not duplicates:
            return DedupResult(
                canonical_key=self.normalize(axiom_key),
                is_duplicate=False,
                duplicate_of="",
                similarity=0.0,
                method="none",
            )

        best_match, sim, method = duplicates[0]
        logger.info(
            "Dedup: '%s' → duplicate of '%s' (%.2f, %s)",
            axiom_key, best_match, sim, method,
        )
        return DedupResult(
            canonical_key=self.normalize(best_match),
            is_duplicate=True,
            duplicate_of=best_match,
            similarity=sim,
            method=method,
        )
|