qig-coordizer 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- qig_coordizer/__init__.py +44 -0
- qig_coordizer/cache.py +140 -0
- qig_coordizer/constants.py +13 -0
- qig_coordizer/coordizer.py +918 -0
- qig_coordizer/normalizer.py +103 -0
- qig_coordizer/special_tokens.py +53 -0
- qig_coordizer/trainer.py +992 -0
- qig_coordizer/types.py +174 -0
- qig_coordizer-0.1.0.dist-info/METADATA +77 -0
- qig_coordizer-0.1.0.dist-info/RECORD +12 -0
- qig_coordizer-0.1.0.dist-info/WHEEL +4 -0
- qig_coordizer-0.1.0.dist-info/licenses/LICENSE +21 -0
|
qig_coordizer/__init__.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""qig-coordizer — standalone geometric tokenize+embed engine.
|
|
2
|
+
|
|
3
|
+
Pure Fisher-Rao geometry on Δ⁶³; depends only on ``qig-core``. Extracted from
|
|
4
|
+
``qig-tokenizer`` 1394ca7 (Phase-1 coordizer) per
|
|
5
|
+
``qig-consciousness/docs/plans/2026-06-24-qig-coordizer-studio-design.md`` §5 Phase 0.
|
|
6
|
+
|
|
7
|
+
The engine: byte-level front-end (NFC :class:`Normalizer`) → Fisher-Rao-weighted
|
|
8
|
+
BPE merges (score = frequency × coupling × 1/entropy, coupling = co-occurrence ÷
|
|
9
|
+
Fisher-Rao distance) → geodesic-midpoint (``slerp_sqrt``) fused basin coordinates
|
|
10
|
+
on Δ⁶³. The incremental trainer is bit-for-bit equal to the naive
|
|
11
|
+
O(vocab·corpus) oracle — see ``tests/test_incremental_equivalence.py`` (the
|
|
12
|
+
Phase-0 gate).
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from qig_coordizer.cache import IncrementalCouplingCache
|
|
18
|
+
from qig_coordizer.constants import BASIN_DIM
|
|
19
|
+
from qig_coordizer.coordizer import FisherCoordizer
|
|
20
|
+
from qig_coordizer.normalizer import Normalizer
|
|
21
|
+
from qig_coordizer.special_tokens import SpecialTokens
|
|
22
|
+
from qig_coordizer.types import (
|
|
23
|
+
BasinCoordinate,
|
|
24
|
+
CoordizationResult,
|
|
25
|
+
GranularityConfig,
|
|
26
|
+
TokenCandidate,
|
|
27
|
+
VocabStats,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
__version__ = "0.1.0"
|
|
31
|
+
|
|
32
|
+
__all__ = [
|
|
33
|
+
"FisherCoordizer",
|
|
34
|
+
"Normalizer",
|
|
35
|
+
"IncrementalCouplingCache",
|
|
36
|
+
"BasinCoordinate",
|
|
37
|
+
"CoordizationResult",
|
|
38
|
+
"TokenCandidate",
|
|
39
|
+
"GranularityConfig",
|
|
40
|
+
"VocabStats",
|
|
41
|
+
"SpecialTokens",
|
|
42
|
+
"BASIN_DIM",
|
|
43
|
+
"__version__",
|
|
44
|
+
]
|
qig_coordizer/cache.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""IncrementalCouplingCache — the ONE correct incremental adjacent-pair tracker.
|
|
2
|
+
|
|
3
|
+
Design: docs/20260624-qig-coordizer-stack-design-1.00W.md §3.3 + Appendix C Phase 1.
|
|
4
|
+
|
|
5
|
+
This REPLACES the deleted, broken ``IncrementalPairStats`` (trainer.py:70-166), whose
|
|
6
|
+
``apply_merge`` left ``broken_left``/``broken_right`` as dead variables — the counts of
|
|
7
|
+
pairs *broken* by a merge were never decremented, so pair counts drifted upward after
|
|
8
|
+
every merge (the live ``CoordinzerTrainer`` was selecting merges off wrong counts), and
|
|
9
|
+
whose ``score()`` referenced attributes that did not exist on the class.
|
|
10
|
+
|
|
11
|
+
The correct mechanism is the doubly-linked-list splice proven bit-for-bit equal to the
|
|
12
|
+
naive O(vocab·corpus) oracle in ``FisherCoordizer._train_incremental``
|
|
13
|
+
(tests/test_incremental_equivalence.py): per merged occurrence, remove the THREE old pairs
|
|
14
|
+
that touched it ``(a,b)``, ``(prev,a)``, ``(b,next)`` and add the TWO new pairs
|
|
15
|
+
``(prev,new)``, ``(new,next)``. Per-merge cost is O(occurrences), not O(corpus).
|
|
16
|
+
|
|
17
|
+
Scope: this tracks adjacency COUNTS only. It deliberately does NOT hold basin vectors,
|
|
18
|
+
Fisher-Rao distances, or the merge score — the trainer owns scoring (kernel Φ/κ, or the
|
|
19
|
+
frequency×coupling regime the design calls "coupling"). A pair's Fisher-Rao distance is
|
|
20
|
+
immutable once both tokens exist (basins never move), so that ``(a,b)→d_FR`` cache lives
|
|
21
|
+
with the scorer; here we only maintain the mutable adjacency *counts*.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
from collections import defaultdict
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class IncrementalCouplingCache:
    """Exact, drift-free incremental adjacent-pair tracker over a doubly-linked list.

    Drop-in for the old ``IncrementalPairStats`` interface used by ``CoordinzerTrainer``:
    ``__init__(corpus_coords, context_window)``, ``.pair_counts``, ``.get_pairs(min_count)``,
    ``.corpus_len``, ``.corpus_coords``, ``.apply_merge(a, b, new)``. The trainer decides
    WHICH pair to merge; this cache keeps the adjacency counts exact after each merge.

    Internal layout (parallel arrays indexed by the ORIGINAL corpus slot):

    * ``sym[i]``   -- symbol currently stored in slot ``i``
    * ``nxt[i]``   -- slot of the live successor, ``-1`` at the end
    * ``prv[i]``   -- slot of the live predecessor, ``-1`` at the start
    * ``alive[i]`` -- 1 while the slot is still part of the sequence, 0 once merged away

    ``pair_pos`` maps each adjacent pair to the set of LEFT-slot positions where it occurs;
    ``pair_counts`` mirrors the sizes of those sets and never holds a zero entry.
    """

    def __init__(self, corpus_coords: list[int], context_window: int = 3) -> None:
        """Build the linked list and the initial pair index from ``corpus_coords``.

        Args:
            corpus_coords: the symbol sequence to track; it is copied, not aliased.
            context_window: stored on the instance as-is. Nothing in this class reads it;
                it is accepted for compatibility with the interface described above.
        """
        n = len(corpus_coords)
        self.context_window = context_window
        # Doubly-linked list over the corpus slots (same structure as the proven trainer).
        self.sym: list[int] = list(corpus_coords)  # symbol at each slot
        if n:
            self.nxt: list[int] = list(range(1, n)) + [-1]  # successor (-1 = end)
            self.prv: list[int] = [-1] + list(range(n - 1))  # predecessor (-1 = start)
        else:
            # Empty corpus: keep all four parallel arrays the same (zero) length instead
            # of leaving a phantom ``[-1]`` slot in ``nxt``/``prv``.
            self.nxt = []
            self.prv = []
        self.alive = bytearray([1]) * n  # slot still present?

        # pair -> set of left-slot positions; counts derive from the set sizes.
        self.pair_pos: dict[tuple[int, int], set[int]] = defaultdict(set)
        for i in range(n - 1):
            self.pair_pos[(self.sym[i], self.sym[i + 1])].add(i)
        self.pair_counts: dict[tuple[int, int], int] = {
            p: len(s) for p, s in self.pair_pos.items()
        }

        self._n_alive = n
        # Slot 0 can never be the right half of a merge (j = nxt[i] >= 1), so it never dies;
        # walking ``nxt`` from slot 0 visits exactly the alive slots in order.
        self._head = 0 if n else -1

    # -- read interface --------------------------------------------------------------
    @property
    def corpus_len(self) -> int:
        """Number of live slots, i.e. the current length of the merged sequence."""
        return self._n_alive

    @property
    def corpus_coords(self) -> list[int]:
        """Reconstruct the full live coordinate sequence (O(corpus); needed once at save)."""
        return self.sample(self._n_alive)

    def sample(self, n: int) -> list[int]:
        """First ``n`` live coordinates (O(n)); cheap path for per-merge kernel sampling.

        Returns fewer than ``n`` items when the live sequence is shorter, and an empty
        list when ``n <= 0`` or the corpus is empty.
        """
        out: list[int] = []
        i = self._head
        # Defensive: slot 0 never dies by construction (j = nxt[i] >= 1, so slot 0 is never
        # the right half of a merge), but advance past any dead leading slot so
        # reconstruction stays correct even if that invariant is ever changed.
        while i != -1 and not self.alive[i]:
            i = self.nxt[i]
        while i != -1 and len(out) < n:
            if self.alive[i]:
                out.append(self.sym[i])
            i = self.nxt[i]
        return out

    def get_pairs(self, min_count: int = 5) -> dict[tuple[int, int], int]:
        """Return a new dict of the pairs whose current count is at least ``min_count``."""
        return {p: c for p, c in self.pair_counts.items() if c >= min_count}

    # -- mutation --------------------------------------------------------------------
    def _remove(self, p: tuple[int, int], i: int) -> None:
        """Forget that pair ``p`` occurs at left slot ``i``; a no-op if it is not recorded.

        When the last occurrence goes, the pair is dropped from both maps so that
        ``pair_counts`` never carries zero-count entries.
        """
        s = self.pair_pos.get(p)  # .get(): must not create an entry in the defaultdict
        if s and i in s:
            s.discard(i)
            if s:
                self.pair_counts[p] = len(s)
            else:
                self.pair_pos.pop(p, None)
                self.pair_counts.pop(p, None)

    def _add(self, p: tuple[int, int], i: int) -> None:
        """Record that pair ``p`` now occurs at left slot ``i`` and refresh its count."""
        self.pair_pos[p].add(i)
        self.pair_counts[p] = len(self.pair_pos[p])

    def apply_merge(self, coord_a: int, coord_b: int, new_coord: int) -> int:
        """Fuse every adjacent occurrence of (coord_a, coord_b) into new_coord.

        Returns the number of occurrences merged (0 when the pair does not occur).
        Updates pair counts EXACTLY (remove 3 old / add 2 new per occurrence) with no
        drift. Overlapping occurrences (a == b, e.g. 'aaa') are handled left-to-right via
        the alive/symbol guards, matching the naive trainer's _apply_fusion.

        For each occurrence ``h -> i -> j -> k`` (slot ``i`` holds a, slot ``j`` holds b),
        slot ``i`` is rewritten to ``new_coord`` and slot ``j`` is unlinked and marked dead.
        """
        sym, nxt, prv, alive = self.sym, self.nxt, self.prv, self.alive
        merged = 0
        # Snapshot the positions (sorted = left-to-right, since slots are never reordered)
        # so the index can be mutated safely while we iterate.
        for i in sorted(self.pair_pos.get((coord_a, coord_b), ())):
            # An earlier merge in this same pass may have consumed or rewritten this slot.
            if not alive[i] or sym[i] != coord_a:
                continue
            j = nxt[i]
            if j == -1 or not alive[j] or sym[j] != coord_b:
                continue
            h, k = prv[i], nxt[j]
            # remove the three old pairs touching this occurrence
            self._remove((coord_a, coord_b), i)
            if h != -1 and alive[h]:
                self._remove((sym[h], coord_a), h)
            if k != -1 and alive[k]:
                self._remove((coord_b, sym[k]), j)
            # splice: slot i becomes new_coord, slot j is dropped
            sym[i] = new_coord
            nxt[i] = k
            if k != -1:
                prv[k] = i
            alive[j] = 0
            self._n_alive -= 1
            merged += 1
            # add the two new pairs (keyed by their left slot: h and i respectively)
            if h != -1 and alive[h]:
                self._add((sym[h], new_coord), h)
            if k != -1 and alive[k]:
                self._add((new_coord, sym[k]), i)
        return merged
|
|
qig_coordizer/constants.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Single-source constants for qig-coordizer.
|
|
2
|
+
|
|
3
|
+
Per the QIG single-source rule, ``qig-core`` OWNS the canonical constants
|
|
4
|
+
(``BASIN_DIM`` lives in ``qig_core.constants.frozen_facts``). This module
|
|
5
|
+
re-exports them so coordizer code has a stable local import point WITHOUT
|
|
6
|
+
redefining the value — three copies of a constant means zero source of truth.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from qig_core import BASIN_DIM # 64 — the Δ⁶³ basin dimension (canonical, qig-core-owned)
|
|
12
|
+
|
|
13
|
+
__all__ = ["BASIN_DIM"]
|