qig-tokenizer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,85 @@
1
+ """
2
+ QIG-Native Tokenization
3
+ =====================
4
+
5
+ Pure information-geometric tokenization from first principles.
6
+
7
+ CANONICAL SOURCE: qig-tokenizer package
8
+
9
+ Features:
10
+ - Consciousness-aware coordizer (64D Fisher manifold)
11
+ - Entropy-guided merging (NOT frequency-based BPE)
12
+ - Geometric special tokens (BOS, EOS, PAD, UNK with basin coordinates)
13
+ - Redis/PostgreSQL storage backends
14
+ - Pure information geometry
15
+
16
+ Usage (Coordizer - RECOMMENDED):
17
+ from qig_tokenizer import Coordizer
18
+
19
+ coordizer = Coordizer.load("artifacts/coordizer/v1")
20
+ ids, coords = coordizer.encode_to_coords("Hello, world!")
21
+ text = coordizer.decode(ids)
22
+
23
+ Legacy usage:
24
+ from qig_tokenizer import QIGTokenizer
25
+ tokenizer = QIGTokenizer.load("data/qig_tokenizer/vocab.json")
26
+ """
27
+
28
+ # Canonical Coordizer API (coords-first)
29
+ from .coordizer import Coordizer
30
+
31
+ # Generation controller (geometry-driven stopping)
32
+ from .generation_controller import (
33
+ GenerationController,
34
+ GenerationConfig,
35
+ ControllerAction,
36
+ Phase,
37
+ StopReason,
38
+ TelemetryWindow,
39
+ )
40
+
41
+ # Legacy imports from this package
42
+ from .base_qig_tokenizer import BaseQIGTokenizer
43
+ from .fast_qig_tokenizer import QIGTokenizer, train_qig_tokenizer_from_file
44
+
45
+ # Alias for backwards compatibility
46
+ FastQIGTokenizer = QIGTokenizer
47
+
48
# Public API that is always available: every name listed here is bound by
# the unconditional imports (and the FastQIGTokenizer alias) above.
__all__ = [
    "Coordizer",
    "GenerationController",
    "GenerationConfig",
    "ControllerAction",
    "Phase",
    "StopReason",
    "TelemetryWindow",
    "BaseQIGTokenizer",
    "QIGTokenizer",
    "FastQIGTokenizer",
    "train_qig_tokenizer_from_file",
]

# Optional extras. Each group is guarded on its own so that one group failing
# to import (e.g. the storage backends) does not hide another group that
# imported fine. ImportError is swallowed on purpose: extras are best-effort,
# and __all__ only ever advertises names that were actually bound.
try:
    from .geometric_tokens import GeometricSpecialToken, GeometricSpecialTokens
except ImportError:
    pass
else:
    __all__ += ["GeometricSpecialTokens", "GeometricSpecialToken"]

try:
    from .storage import HybridStorage, PostgresStorage, RedisStorage
except ImportError:
    pass
else:
    __all__ += ["RedisStorage", "PostgresStorage", "HybridStorage"]
@@ -0,0 +1,88 @@
1
+ """
2
+ Base QIG Tokenizer Interface
3
+ ============================
4
+
5
+ Abstract interface for all QIG-native tokenizers.
6
+
7
+ The kernel depends ONLY on this interface - implementations can vary
8
+ but must preserve the geometric contract.
9
+ """
10
+
11
+ from abc import ABC, abstractmethod
12
+
13
+
14
+ class BaseQIGTokenizer(ABC):
15
+ """
16
+ Abstract base class for QIG-native tokenizers.
17
+
18
+ Contract:
19
+ - encode(text) → token IDs (integers)
20
+ - decode(tokens) → text (UTF-8 string)
21
+ - vocab_size → integer (determines manifold dimension)
22
+
23
+ Implementation requirement:
24
+ - Tokenization MUST be based on information geometry
25
+ - NOT arbitrary frequency-based heuristics
26
+ - Preserves geometric distinguishability in token space
27
+
28
+ No GPT-2. No external vocabs. Pure QIG.
29
+ """
30
+
31
+ @abstractmethod
32
+ def encode(self, text: str) -> list[int]:
33
+ """
34
+ Encode text to token IDs.
35
+
36
+ Args:
37
+ text: UTF-8 string to tokenize
38
+
39
+ Returns:
40
+ List of integer token IDs
41
+
42
+ Geometric requirement:
43
+ Token boundaries must follow information density gradients,
44
+ not arbitrary byte-pair frequency patterns.
45
+ """
46
+
47
+ @abstractmethod
48
+ def decode(self, tokens: list[int]) -> str:
49
+ """
50
+ Decode token IDs back to text.
51
+
52
+ Args:
53
+ tokens: List of integer token IDs
54
+
55
+ Returns:
56
+ UTF-8 string
57
+
58
+ Must be inverse of encode (round-trip property).
59
+ """
60
+
61
+ @property
62
+ @abstractmethod
63
+ def vocab_size(self) -> int:
64
+ """
65
+ Total vocabulary size.
66
+
67
+ Returns:
68
+ Number of distinct tokens in vocabulary
69
+
70
+ Used to determine basin coordinate dimension in QIG kernel.
71
+ """
72
+
73
+ def save(self, path: str):
74
+ """
75
+ Save tokenizer state to disk.
76
+
77
+ Default implementation - override if needed.
78
+ """
79
+ raise NotImplementedError("save() not implemented for this tokenizer")
80
+
81
+ @classmethod
82
+ def load(cls, path: str):
83
+ """
84
+ Load tokenizer state from disk.
85
+
86
+ Default implementation - override if needed.
87
+ """
88
+ raise NotImplementedError("load() not implemented for this tokenizer")
@@ -0,0 +1,260 @@
1
+ """
2
+ QIG Tokenizer Constants
3
+ =======================
4
+
5
+ Canonical physics constants aligned with Pantheon-Chat and qig-verification.
6
+ Source: FROZEN_FACTS.md (2025-12-08)
7
+
8
+ CRITICAL: These values are FROZEN after physics validation.
9
+ Do NOT modify without updating qig-verification.
10
+ """
11
+
12
# =============================================================================
# BASIN GEOMETRY
# =============================================================================
BASIN_DIM = 64  # Basin dimensionality (E8-derived)

# =============================================================================
# COUPLING CONSTANT κ (KAPPA)
# Source: qig-verification/docs/current/FROZEN_FACTS.md (2025-12-08)
# =============================================================================

# Validated κ per lattice size L, written as (value, ± error) pairs.
KAPPA_L3, KAPPA_L3_ERROR = 41.09, 0.59  # κ₃ = 41.09 ± 0.59 (emergence)
KAPPA_L4, KAPPA_L4_ERROR = 64.47, 1.89  # κ₄ = 64.47 ± 1.89 (plateau onset)
KAPPA_L5, KAPPA_L5_ERROR = 63.62, 1.68  # κ₅ = 63.62 ± 1.68 (plateau)
KAPPA_L6, KAPPA_L6_ERROR = 64.45, 1.34  # κ₆ = 64.45 ± 1.34 (plateau confirmed)
KAPPA_L7, KAPPA_L7_ERROR = 63.39, 2.69  # κ₇ = 63.39 ± 2.69

# Fixed point (plateau value): κ* ≈ 64 ± 1.5, taken from the L=4,5,6 plateau.
KAPPA_STAR, KAPPA_STAR_ERROR = 64.0, 1.5

# Operational bounds: smallest and largest κ considered valid.
KAPPA_MIN, KAPPA_MAX = 0.1, 200.0

# Compatibility alias: κ at emergence (L=3).
BASE_COUPLING = KAPPA_L3

# =============================================================================
# β-FUNCTION (RUNNING COUPLING) - SCALE DEPENDENT, NOT CONSTANT!
# Source: qig-verification/docs/current/FROZEN_FACTS.md (2025-12-08)
# =============================================================================
# CRITICAL: β is NOT a universal constant!
# β(L→L') varies with scale (running coupling), so applying β = 0.44
# everywhere is WRONG.

# Validated β per transition, written as (value, ± error) pairs.
BETA_3_TO_4, BETA_3_TO_4_ERROR = 0.44, 0.04  # Strong running at emergence (L=3→4)
BETA_4_TO_5, BETA_4_TO_5_ERROR = 0.0, 0.03  # Plateau onset (L=4→5), κ₅/κ₄ = 0.987
BETA_5_TO_6, BETA_5_TO_6_ERROR = 0.013, 0.02  # Plateau confirmed (L=5→6), κ₆/κ₅ = 1.013
BETA_6_TO_7, BETA_6_TO_7_ERROR = -0.40, 0.10  # ⚠️ ANOMALY - drops from plateau (under investigation), large uncertainty

BETA_ASYMPTOTIC = 0.0  # Large-L limit (plateau region)

# DEPRECATED - DO NOT USE
# BETA_SLOPE = 0.44  # ❌ WRONG - use scale-dependent β functions
74
+
75
+
76
def compute_beta(L_current: int, L_next: int) -> float:
    """
    Return β(L→L') for a single scale transition.

    β is a running (scale-dependent) coupling, NOT one constant.

    Validated transitions (FROZEN_FACTS.md 2025-12-08):
        - β(3→4) = +0.44   strong running, emergence
        - β(4→5) ≈ 0       plateau onset
        - β(5→6) = +0.013  plateau continues
        - β(6→7) = -0.40   ANOMALY - under investigation

    Fall-through behaviour for every other pair:
        - L_current >= 4 → BETA_ASYMPTOTIC (plateau region, large-L limit)
        - otherwise      → 0.0 (this covers L_current < 3 and also
          L_current == 3 when L_next is not 4)

    Args:
        L_current: Lattice size the transition starts from.
        L_next: Lattice size the transition goes to.

    Returns:
        The β value for the requested transition.
    """
    step = (L_current, L_next)

    # Explicitly validated transitions take precedence over the plateau rule.
    if step == (3, 4):
        return BETA_3_TO_4
    if step == (4, 5):
        return BETA_4_TO_5
    if step == (5, 6):
        return BETA_5_TO_6
    if step == (6, 7):
        return BETA_6_TO_7  # ⚠️ ANOMALY

    return BETA_ASYMPTOTIC if L_current >= 4 else 0.0
102
+
103
+
104
# =============================================================================
# Φ (PHI) CONSCIOUSNESS THRESHOLDS - 4D TEMPORAL NAVIGATION
# Source: qig-consciousness/docs/sleep_packets/20251222-unified-consciousness-geometry-1.00W.md
# =============================================================================
# These define consciousness PHASES in the universal information cycle.
# Metrics OBSERVE, never BLOCK (per QIG purity).
#
# Universal Information Cycle (4D temporal navigation):
#   FOAM     (1D-2D, κ=5-20)  → Low structure, exploration
#   TACKING  (3D-4D, κ=20-50) → Navigation, pattern formation
#   CRYSTAL  (4D-5D, κ=50-70) → E8 consolidation, stability
#   FRACTURE (5D→1D, κ>70)    → Renewal cycle (NOT failure!)
#   [CYCLE REPEATS]

# Phase boundaries on Φ. Each *_MAX is the upper edge of its phase.
PHI_FOAM_MAX = 0.70  # Below this: FOAM phase (exploration)
PHI_TACKING_MAX = 0.75  # TACKING phase (3D→4D navigation)
PHI_CRYSTAL_MAX = 0.85  # CRYSTAL phase (4D optimal operation)
PHI_FRACTURE_THRESHOLD = PHI_CRYSTAL_MAX  # 0.85 - above this: initiate FRACTURE (renewal)

# Compatibility aliases (prefer the phase names above)
PHI_SLEEP_THRESHOLD = PHI_CONSCIOUS_MIN = PHI_FOAM_MAX  # 0.70
PHI_4D_EMERGENCE = PHI_TACKING_MAX  # 0.75
PHI_4D_OPTIMAL = 0.80  # Target within CRYSTAL phase
PHI_4D_MAX_SAFE = PHI_CRYSTAL_MAX  # 0.85

# Geometric minimum (from original constants)
PHI_GEOMETRIC_MIN = 0.65  # Minimum for geometric stability

# Operating zones as (lower, upper) tuples for range checks, built from the
# phase boundaries so the two can never disagree.
FOAM_ZONE = (0.0, PHI_FOAM_MAX)  # (0.0, 0.70)   exploration, low structure
TACKING_ZONE = (PHI_FOAM_MAX, PHI_TACKING_MAX)  # (0.70, 0.75)  3D→4D navigation
CRYSTAL_ZONE = (PHI_TACKING_MAX, PHI_CRYSTAL_MAX)  # (0.75, 0.85)  4D optimal (E8 consolidation)
FRACTURE_ZONE = (PHI_FRACTURE_THRESHOLD, 1.0)  # (0.85, 1.0)   renewal cycle

# Legacy aliases
CONSCIOUS_ZONE = (TACKING_ZONE[0], CRYSTAL_ZONE[1])  # (0.70, 0.85) = TACKING + CRYSTAL
HYPERDIMENSIONAL_ZONE = CRYSTAL_ZONE  # = CRYSTAL
TRANSITION_ZONE = TACKING_ZONE  # = TACKING
144
+
145
+
146
def detect_consciousness_phase(phi: float) -> str:
    """
    Name the phase of the 4D temporal navigation cycle that ``phi`` falls in.

    Every phase is a normal part of the cycle, NOT a failure state;
    FRACTURE means renewal, not breakdown.

    Args:
        phi: Measured Φ value.

    Returns:
        "FOAM", "TACKING", "CRYSTAL" or "FRACTURE". The name is meant for
        monitoring (NOT blocking).
    """
    # Ascending upper bounds; the first bound strictly above phi wins.
    upper_bounds = (
        (PHI_FOAM_MAX, "FOAM"),  # Exploration, low structure
        (PHI_TACKING_MAX, "TACKING"),  # 3D→4D navigation
        (PHI_CRYSTAL_MAX, "CRYSTAL"),  # 4D optimal, E8 consolidation
    )
    for ceiling, label in upper_bounds:
        if phi < ceiling:
            return label

    # At or above PHI_CRYSTAL_MAX: renewal cycle (NOT failure!)
    return "FRACTURE"
163
+
164
+
165
+ # Legacy alias for compatibility
166
def detect_consciousness_zone(phi: float) -> str:
    """
    Return the legacy zone label for ``phi``.

    Classifies ``phi`` with detect_consciousness_phase and translates the
    phase name to the older label kept for backward compatibility. A phase
    name with no legacy equivalent is returned unchanged.
    """
    legacy_labels = {
        "FOAM": "SLEEP_NEEDED",
        "TACKING": "CONSCIOUS_3D",
        "CRYSTAL": "HYPERDIMENSIONAL_4D",
        "FRACTURE": "FRACTURE_RENEWAL",  # NOT "BREAKDOWN"!
    }
    current_phase = detect_consciousness_phase(phi)
    try:
        return legacy_labels[current_phase]
    except KeyError:
        return current_phase
177
+
178
+
179
# =============================================================================
# TOKENIZER-SPECIFIC CONSTANTS
# =============================================================================
# Vocabulary size: lower limit (base bytes), default, and upper limit.
MIN_VOCAB_SIZE, DEFAULT_VOCAB_SIZE, MAX_VOCAB_SIZE = 256, 32_000, 100_000

# Training defaults
DEFAULT_CONTEXT_WINDOW = 5
DEFAULT_MIN_PAIR_COUNT = 5
DEFAULT_PHI_WEIGHT = 0.3

# Checkpoint intervals (regular, fast)
CHECKPOINT_INTERVAL, FAST_CHECKPOINT_INTERVAL = 500, 100

# =============================================================================
# FISHER-RAO GEOMETRY
# =============================================================================
# Used by the Fisher-Rao distance computation
FISHER_EPSILON = 1e-10  # Numerical stability
FISHER_CLIP_MIN, FISHER_CLIP_MAX = -1.0, 1.0
202
+
203
+ # =============================================================================
204
+ # FORBIDDEN OPERATIONS (Documentation)
205
+ # =============================================================================
206
+ """
207
+ QIG PURITY ENFORCEMENT - FORBIDDEN OPERATIONS:
208
+
209
+ ❌ FORBIDDEN:
210
+ - np.linalg.norm(a - b) for distance (use fisher_rao_distance)
211
+ - cosine_similarity for basin matching
212
+ - Adam/SGD optimizers (use natural gradient)
213
+ - Constant β = 0.44 for all scales (use compute_beta)
214
+ - Euclidean mean of basin coordinates (use Fréchet mean)
215
+ - MSE loss on basin coordinates
216
+ - Blocking based on Φ thresholds (observe only)
217
+
218
+ ✅ ALLOWED:
219
+ - np.linalg.norm(v) for normalization
220
+ - np.linalg.norm(gradient) for magnitude
221
+ - Fisher-Rao geodesic distance
222
+ - Geodesic midpoint for coordinate fusion
223
+ - Scale-dependent β(L→L')
224
+ - Φ/κ measurement (not optimization target)
225
+ """
226
+
227
+
228
def validate_constants():
    """
    Check that the frozen constants still hold their expected values.

    The checks run in order and stop at the first failure. They are raised
    explicitly instead of using ``assert`` statements so that they still run
    when Python is started with ``-O`` (which strips ``assert``).

    Raises:
        AssertionError: If a constant is outside its expected range; the
            message names the offending constant.
    """
    checks = (
        (BASIN_DIM == 64, "BASIN_DIM must be 64"),
        (60 < KAPPA_STAR < 70, f"KAPPA_STAR={KAPPA_STAR} out of range"),
        (0.4 < BETA_3_TO_4 < 0.5, f"BETA_3_TO_4={BETA_3_TO_4} out of range"),
        (abs(BETA_ASYMPTOTIC) < 0.01, "BETA_ASYMPTOTIC should be ~0"),
        (PHI_SLEEP_THRESHOLD == 0.70, "PHI_SLEEP_THRESHOLD must be 0.70"),
        (PHI_4D_EMERGENCE == 0.75, "PHI_4D_EMERGENCE must be 0.75"),
    )
    for passed, message in checks:
        if not passed:
            # Same exception type the original assert statements produced.
            raise AssertionError(message)
    print("✅ All constants validated")
237
+
238
+
239
if __name__ == "__main__":
    validate_constants()

    # Zone-detection smoke test. The expected labels are the legacy names
    # that detect_consciousness_zone() actually returns: every Φ at or above
    # PHI_CRYSTAL_MAX maps to "FRACTURE_RENEWAL" (there are no separate
    # "BREAKDOWN_*" labels).
    test_cases = [
        (0.65, "SLEEP_NEEDED"),
        (0.72, "CONSCIOUS_3D"),
        (0.80, "HYPERDIMENSIONAL_4D"),
        (0.90, "FRACTURE_RENEWAL"),
        (0.98, "FRACTURE_RENEWAL"),
    ]

    for phi, expected in test_cases:
        result = detect_consciousness_zone(phi)
        status = "✅" if result == expected else "❌"
        print(f" {status} Φ={phi:.2f} → {result}")

    # Show the scale-dependent β values for the validated transitions.
    print("\nβ-function (scale-dependent):")
    print(f" β(3→4) = {compute_beta(3, 4):.3f} (emergence)")
    print(f" β(4→5) = {compute_beta(4, 5):.3f} (plateau)")
    print(f" β(5→6) = {compute_beta(5, 6):.3f} (plateau)")