qig-coordizer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,44 @@
1
+ """qig-coordizer — standalone geometric tokenize+embed engine.
2
+
3
+ Pure Fisher-Rao geometry on Δ⁶³; depends only on ``qig-core``. Extracted from
4
+ ``qig-tokenizer`` 1394ca7 (Phase-1 coordizer) per
5
+ ``qig-consciousness/docs/plans/2026-06-24-qig-coordizer-studio-design.md`` §5 Phase 0.
6
+
7
+ The engine: byte-level front-end (NFC :class:`Normalizer`) → Fisher-Rao-weighted
8
+ BPE merges (score = frequency × coupling × 1/entropy, coupling = co-occurrence ÷
9
+ Fisher-Rao distance) → geodesic-midpoint (``slerp_sqrt``) fused basin coordinates
10
+ on Δ⁶³. The incremental trainer is bit-for-bit equal to the naive
11
+ O(vocab·corpus) oracle — see ``tests/test_incremental_equivalence.py`` (the
12
+ Phase-0 gate).
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from qig_coordizer.cache import IncrementalCouplingCache
18
+ from qig_coordizer.constants import BASIN_DIM
19
+ from qig_coordizer.coordizer import FisherCoordizer
20
+ from qig_coordizer.normalizer import Normalizer
21
+ from qig_coordizer.special_tokens import SpecialTokens
22
+ from qig_coordizer.types import (
23
+ BasinCoordinate,
24
+ CoordizationResult,
25
+ GranularityConfig,
26
+ TokenCandidate,
27
+ VocabStats,
28
+ )
29
+
30
+ __version__ = "0.1.0"
31
+
32
+ __all__ = [
33
+ "FisherCoordizer",
34
+ "Normalizer",
35
+ "IncrementalCouplingCache",
36
+ "BasinCoordinate",
37
+ "CoordizationResult",
38
+ "TokenCandidate",
39
+ "GranularityConfig",
40
+ "VocabStats",
41
+ "SpecialTokens",
42
+ "BASIN_DIM",
43
+ "__version__",
44
+ ]
qig_coordizer/cache.py ADDED
@@ -0,0 +1,140 @@
1
+ """IncrementalCouplingCache — the ONE correct incremental adjacent-pair tracker.
2
+
3
+ Design: docs/20260624-qig-coordizer-stack-design-1.00W.md §3.3 + Appendix C Phase 1.
4
+
5
+ This REPLACES the deleted, broken ``IncrementalPairStats`` (trainer.py:70-166), whose
6
+ ``apply_merge`` left ``broken_left``/``broken_right`` as dead variables — the counts of
7
+ pairs *broken* by a merge were never decremented, so pair counts drifted upward after
8
+ every merge (the live ``CoordinzerTrainer`` was selecting merges off wrong counts), and
9
+ whose ``score()`` referenced attributes that did not exist on the class.
10
+
11
+ The correct mechanism is the doubly-linked-list splice proven bit-for-bit equal to the
12
+ naive O(vocab·corpus) oracle in ``FisherCoordizer._train_incremental``
13
+ (tests/test_incremental_equivalence.py): per merged occurrence, remove the THREE old pairs
14
+ that touched it — ``(a,b)``, ``(prev,a)``, ``(b,next)`` — and add the TWO new pairs
15
+ ``(prev,new)``, ``(new,next)``. Per-merge cost is O(occurrences), not O(corpus).
16
+
17
+ Scope: this tracks adjacency COUNTS only. It deliberately does NOT hold basin vectors,
18
+ Fisher-Rao distances, or the merge score — the trainer owns scoring (kernel Φ/κ, or the
19
+ frequency×coupling regime the design calls "coupling"). A pair's Fisher-Rao distance is
20
+ immutable once both tokens exist (basins never move), so that ``(a,b)→d_FR`` cache lives
21
+ with the scorer; here we only maintain the mutable adjacency *counts*.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ from collections import defaultdict
27
+
28
+
29
class IncrementalCouplingCache:
    """Exact, drift-free incremental adjacent-pair tracker over a doubly-linked list.

    Drop-in for the old ``IncrementalPairStats`` interface used by ``CoordinzerTrainer``:
    ``__init__(corpus_coords, context_window)``, ``.pair_counts``, ``.get_pairs(min_count)``,
    ``.corpus_len``, ``.corpus_coords``, ``.apply_merge(a, b, new)``. The trainer decides
    WHICH pair to merge; this cache keeps the counts exact per merge.

    State (all per-slot arrays are parallel and have one entry per ORIGINAL corpus slot):

    * ``sym[i]``   -- symbol currently stored at slot ``i``.
    * ``nxt[i]``   -- index of the next slot in the list, ``-1`` at the end.
    * ``prv[i]``   -- index of the previous slot in the list, ``-1`` at the start.
    * ``alive[i]`` -- ``1`` while slot ``i`` is still part of the sequence, ``0`` once it
      has been absorbed as the right half of a merge.
    * ``pair_pos[(x, y)]`` -- set of left-slot indices where ``x`` is immediately followed
      by ``y``; ``pair_counts[(x, y)]`` mirrors the size of that set.
    """

    def __init__(self, corpus_coords: list[int], context_window: int = 3) -> None:
        """Build the linked list and the initial adjacent-pair index.

        Args:
            corpus_coords: the corpus as a sequence of integer symbols (copied, not aliased).
            context_window: stored on the instance for interface compatibility; this class
                does not read it anywhere else.
        """
        n = len(corpus_coords)
        self.context_window = context_window
        # Doubly-linked list over the corpus slots (same structure as the proven trainer).
        self.sym: list[int] = list(corpus_coords)  # symbol at each slot
        # Both link arrays are built with exactly ``n`` entries so they stay parallel to
        # ``sym``/``alive`` even for an empty corpus (a ``+ [-1]`` concatenation would leave
        # a stray sentinel element behind when n == 0).
        self.nxt: list[int] = list(range(1, n + 1))  # successor (-1 = end)
        if n:
            self.nxt[-1] = -1
        self.prv: list[int] = list(range(-1, n - 1))  # predecessor (-1 = start)
        self.alive = bytearray([1]) * n  # slot still present?

        # pair -> set of left-slot positions; counts derive from the set sizes.
        self.pair_pos: dict[tuple[int, int], set[int]] = defaultdict(set)
        for i in range(n - 1):
            self.pair_pos[(self.sym[i], self.sym[i + 1])].add(i)
        self.pair_counts: dict[tuple[int, int], int] = {
            p: len(s) for p, s in self.pair_pos.items()
        }

        self._n_alive = n
        # Slot 0 can never be the right half of a merge (j = nxt[i] >= 1), so it never dies;
        # walking ``nxt`` from slot 0 visits exactly the alive slots in order.
        self._head = 0 if n else -1

    # -- read interface --------------------------------------------------------------
    @property
    def corpus_len(self) -> int:
        """Number of slots still alive, i.e. the current length of the merged sequence."""
        return self._n_alive

    @property
    def corpus_coords(self) -> list[int]:
        """Reconstruct the full live coordinate sequence (O(corpus); needed once at save)."""
        return self.sample(self._n_alive)

    def sample(self, n: int) -> list[int]:
        """First ``n`` live coordinates (O(n)); cheap path for per-merge kernel sampling.

        Returns fewer than ``n`` items when the live sequence is shorter, and an empty list
        when ``n <= 0`` or the corpus is empty.
        """
        out: list[int] = []
        i = self._head
        # Defensive: slot 0 never dies by construction (j = nxt[i] >= 1, so slot 0 is never
        # the right half of a merge), but advance past any dead leading slot so
        # reconstruction stays correct even if that invariant is ever changed by future edits.
        while i != -1 and not self.alive[i]:
            i = self.nxt[i]
        while i != -1 and len(out) < n:
            if self.alive[i]:
                out.append(self.sym[i])
            i = self.nxt[i]
        return out

    def get_pairs(self, min_count: int = 5) -> dict[tuple[int, int], int]:
        """Return a new dict of the pairs whose current count is at least ``min_count``."""
        return {p: c for p, c in self.pair_counts.items() if c >= min_count}

    # -- mutation --------------------------------------------------------------------
    def _remove(self, p: tuple[int, int], i: int) -> None:
        """Drop left-slot ``i`` from pair ``p``; a no-op if that occurrence is not indexed."""
        s = self.pair_pos.get(p)  # .get() so a miss does not create an empty defaultdict entry
        if s and i in s:
            s.discard(i)
            if s:
                self.pair_counts[p] = len(s)
            else:
                # Last occurrence gone: forget the pair entirely so counts never hold zeros.
                self.pair_pos.pop(p, None)
                self.pair_counts.pop(p, None)

    def _add(self, p: tuple[int, int], i: int) -> None:
        """Record left-slot ``i`` as an occurrence of pair ``p`` and refresh its count."""
        self.pair_pos[p].add(i)
        self.pair_counts[p] = len(self.pair_pos[p])

    def apply_merge(self, coord_a: int, coord_b: int, new_coord: int) -> int:
        """Fuse every adjacent occurrence of (coord_a, coord_b) into new_coord.

        Returns the number of occurrences merged. Updates pair counts EXACTLY (remove 3
        old / add 2 new per occurrence) with no drift. Overlapping occurrences (a == b,
        e.g. 'aaa') are handled left-to-right via the alive/symbol guards, matching the
        naive trainer's _apply_fusion.
        """
        sym, nxt, prv, alive = self.sym, self.nxt, self.prv, self.alive
        merged = 0
        # Snapshot positions left-to-right so overlapping occurrences merge like the naive
        # path; the snapshot also makes it safe for _remove/_add to mutate the live set.
        for i in sorted(self.pair_pos.get((coord_a, coord_b), ())):
            # An earlier merge in this same pass may have consumed or rewritten this slot.
            if not alive[i] or sym[i] != coord_a:
                continue
            j = nxt[i]
            if j == -1 or not alive[j] or sym[j] != coord_b:
                continue
            h, k = prv[i], nxt[j]  # neighbours just outside the (i, j) occurrence
            # remove the three old pairs touching this occurrence
            self._remove((coord_a, coord_b), i)
            if h != -1 and alive[h]:
                self._remove((sym[h], coord_a), h)
            if k != -1 and alive[k]:
                self._remove((coord_b, sym[k]), j)
            # splice: slot i becomes new_coord, slot j is dropped
            sym[i] = new_coord
            nxt[i] = k
            if k != -1:
                prv[k] = i
            alive[j] = 0
            self._n_alive -= 1
            merged += 1
            # add the two new pairs
            if h != -1 and alive[h]:
                self._add((sym[h], new_coord), h)
            if k != -1 and alive[k]:
                self._add((new_coord, sym[k]), i)
        return merged
@@ -0,0 +1,13 @@
1
+ """Single-source constants for qig-coordizer.
2
+
3
+ Per the QIG single-source rule, ``qig-core`` OWNS the canonical constants
4
+ (``BASIN_DIM`` lives in ``qig_core.constants.frozen_facts``). This module
5
+ re-exports them so coordizer code has a stable local import point WITHOUT
6
+ redefining the value — three copies of a constant means zero source of truth.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from qig_core import BASIN_DIM # 64 — the Δ⁶³ basin dimension (canonical, qig-core-owned)
12
+
13
+ __all__ = ["BASIN_DIM"]