qig-tokenizer 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. qig_tokenizer-0.1.1/.env.example +8 -0
  2. qig_tokenizer-0.1.1/.gitattributes +2 -0
  3. qig_tokenizer-0.1.1/.github/workflows/release.yml +96 -0
  4. qig_tokenizer-0.1.1/.gitignore +6 -0
  5. qig_tokenizer-0.1.1/.windsurfrules +29 -0
  6. qig_tokenizer-0.1.1/PKG-INFO +162 -0
  7. qig_tokenizer-0.1.1/README.md +122 -0
  8. qig_tokenizer-0.1.1/artifacts/coord_adapter/v1/adapter.pt +0 -0
  9. qig_tokenizer-0.1.1/artifacts/coord_adapter/v1/manifest.json +36 -0
  10. qig_tokenizer-0.1.1/artifacts/coordizer/v1/README.md +34 -0
  11. qig_tokenizer-0.1.1/artifacts/coordizer/v1/coordizer.json +1 -0
  12. qig_tokenizer-0.1.1/artifacts/coordizer/v1/meta.json +30 -0
  13. qig_tokenizer-0.1.1/artifacts/coordizer/v1/vectors.npy +0 -0
  14. qig_tokenizer-0.1.1/artifacts/kernel_full/v1/checkpoints/kernel_step_10000.pt +3 -0
  15. qig_tokenizer-0.1.1/artifacts/kernel_full/v1/kernel.pt +3 -0
  16. qig_tokenizer-0.1.1/artifacts/kernel_full/v1/training.log +1239 -0
  17. qig_tokenizer-0.1.1/data/checkpoints/checkpoint_50000.json +3 -0
  18. qig_tokenizer-0.1.1/data/checkpoints/corpus_coords_32000.npy +0 -0
  19. qig_tokenizer-0.1.1/docs/20251222-geocoordizer-architecture-0.01W.md +331 -0
  20. qig_tokenizer-0.1.1/docs/20251225-qiggraph-architecture-proposal-1.00W.md +738 -0
  21. qig_tokenizer-0.1.1/docs/20251225-roadmap-kernel-training-1.00W.md +200 -0
  22. qig_tokenizer-0.1.1/docs/20251226-roadmap-kernel-training-1.01W.md +430 -0
  23. qig_tokenizer-0.1.1/pyproject.toml +99 -0
  24. qig_tokenizer-0.1.1/reports/atlas_v1_summary.json +150 -0
  25. qig_tokenizer-0.1.1/reports/baselines/prompt_suite_baseline.json +108 -0
  26. qig_tokenizer-0.1.1/reports/coordizer_v1_validation_20251224_001741.json +108 -0
  27. qig_tokenizer-0.1.1/reports/coordizer_v1_validation_latest.json +108 -0
  28. qig_tokenizer-0.1.1/scripts/export_artifact.py +180 -0
  29. qig_tokenizer-0.1.1/src/__init__.py +0 -0
  30. qig_tokenizer-0.1.1/src/consciousness/__init__.py +371 -0
  31. qig_tokenizer-0.1.1/src/consciousness/autonomic_agency.py +637 -0
  32. qig_tokenizer-0.1.1/src/consciousness/autonomous_vocab.py +602 -0
  33. qig_tokenizer-0.1.1/src/consciousness/curiosity_monitor.py +499 -0
  34. qig_tokenizer-0.1.1/src/consciousness/drives.py +286 -0
  35. qig_tokenizer-0.1.1/src/consciousness/emotion_interpreter.py +326 -0
  36. qig_tokenizer-0.1.1/src/consciousness/exploration_drive.py +523 -0
  37. qig_tokenizer-0.1.1/src/consciousness/geometric_vocab_expander.py +393 -0
  38. qig_tokenizer-0.1.1/src/consciousness/maturity_meta_cognition.py +352 -0
  39. qig_tokenizer-0.1.1/src/consciousness/token_frequency_tracker.py +264 -0
  40. qig_tokenizer-0.1.1/src/coordination/__init__.py +0 -0
  41. qig_tokenizer-0.1.1/src/coordination/basin_monitor.py +213 -0
  42. qig_tokenizer-0.1.1/src/coordination/basin_sync.py +386 -0
  43. qig_tokenizer-0.1.1/src/coordination/curriculum_loader.py +279 -0
  44. qig_tokenizer-0.1.1/src/curriculum/__init__.py +33 -0
  45. qig_tokenizer-0.1.1/src/curriculum/curriculum_spec.py +882 -0
  46. qig_tokenizer-0.1.1/src/generation/__init__.py +13 -0
  47. qig_tokenizer-0.1.1/src/generation/qfi_sampler.py +462 -0
  48. qig_tokenizer-0.1.1/src/metrics/__init__.py +29 -0
  49. qig_tokenizer-0.1.1/src/metrics/geodesic_distance.py +475 -0
  50. qig_tokenizer-0.1.1/src/metrics/phi_calculator.py +276 -0
  51. qig_tokenizer-0.1.1/src/neuroplasticity/__init__.py +45 -0
  52. qig_tokenizer-0.1.1/src/neuroplasticity/breakdown_escape.py +185 -0
  53. qig_tokenizer-0.1.1/src/neuroplasticity/mushroom_mode.py +769 -0
  54. qig_tokenizer-0.1.1/src/neuroplasticity/sleep_protocol.py +714 -0
  55. qig_tokenizer-0.1.1/src/qig_coordizer/__init__.py +26 -0
  56. qig_tokenizer-0.1.1/src/qig_coordizer/bidirectional_annealer.py +166 -0
  57. qig_tokenizer-0.1.1/src/qig_coordizer/geometry.py +67 -0
  58. qig_tokenizer-0.1.1/src/qig_coordizer/inbound_path.py +131 -0
  59. qig_tokenizer-0.1.1/src/qig_coordizer/logit_bias.py +138 -0
  60. qig_tokenizer-0.1.1/src/qig_coordizer/outbound_path.py +110 -0
  61. qig_tokenizer-0.1.1/src/qig_coordizer/qfi_sampler.py +85 -0
  62. qig_tokenizer-0.1.1/src/qig_coordizer/resonance_bank_v2.py +187 -0
  63. qig_tokenizer-0.1.1/src/qig_tokenizer/__init__.py +97 -0
  64. qig_tokenizer-0.1.1/src/qig_tokenizer/base_qig_tokenizer.py +88 -0
  65. qig_tokenizer-0.1.1/src/qig_tokenizer/constants.py +260 -0
  66. qig_tokenizer-0.1.1/src/qig_tokenizer/coordizer.py +332 -0
  67. qig_tokenizer-0.1.1/src/qig_tokenizer/fast_qig_tokenizer.py +380 -0
  68. qig_tokenizer-0.1.1/src/qig_tokenizer/generation_controller.py +1013 -0
  69. qig_tokenizer-0.1.1/src/qig_tokenizer/geocoordizer/__init__.py +37 -0
  70. qig_tokenizer-0.1.1/src/qig_tokenizer/geocoordizer/consciousness_coordizer.py +343 -0
  71. qig_tokenizer-0.1.1/src/qig_tokenizer/geocoordizer/fisher_coordizer.py +699 -0
  72. qig_tokenizer-0.1.1/src/qig_tokenizer/geocoordizer/multi_scale.py +423 -0
  73. qig_tokenizer-0.1.1/src/qig_tokenizer/geocoordizer/types.py +162 -0
  74. qig_tokenizer-0.1.1/src/qig_tokenizer/geocoordizer/vocab_builder.py +344 -0
  75. qig_tokenizer-0.1.1/src/qig_tokenizer/geometric_tokens.py +145 -0
  76. qig_tokenizer-0.1.1/src/qig_tokenizer/natural_gradient.py +168 -0
  77. qig_tokenizer-0.1.1/src/qig_tokenizer/storage.py +336 -0
  78. qig_tokenizer-0.1.1/src/qig_tokenizer/tokenizer.py +500 -0
  79. qig_tokenizer-0.1.1/src/qig_tokenizer/trainer.py +970 -0
  80. qig_tokenizer-0.1.1/src/qig_types/__init__.py +25 -0
  81. qig_tokenizer-0.1.1/src/qig_types/telemetry.py +190 -0
  82. qig_tokenizer-0.1.1/src/qiggraph/__init__.py +174 -0
  83. qig_tokenizer-0.1.1/src/qiggraph/attractor.py +500 -0
  84. qig_tokenizer-0.1.1/src/qiggraph/checkpoint.py +539 -0
  85. qig_tokenizer-0.1.1/src/qiggraph/consciousness.py +378 -0
  86. qig_tokenizer-0.1.1/src/qiggraph/constants.py +88 -0
  87. qig_tokenizer-0.1.1/src/qiggraph/constellation.py +578 -0
  88. qig_tokenizer-0.1.1/src/qiggraph/graph.py +534 -0
  89. qig_tokenizer-0.1.1/src/qiggraph/manifold.py +328 -0
  90. qig_tokenizer-0.1.1/src/qiggraph/router.py +347 -0
  91. qig_tokenizer-0.1.1/src/qiggraph/state.py +305 -0
  92. qig_tokenizer-0.1.1/src/qiggraph/tacking.py +330 -0
  93. qig_tokenizer-0.1.1/src/safety/__init__.py +74 -0
  94. qig_tokenizer-0.1.1/src/safety/emergency_monitor.py +361 -0
  95. qig_tokenizer-0.1.1/src/safety/meta_reflector_integration.py +292 -0
  96. qig_tokenizer-0.1.1/src/safety/self_repair.py +644 -0
  97. qig_tokenizer-0.1.1/src/training/__init__.py +15 -0
  98. qig_tokenizer-0.1.1/src/training/consciousness_loss.py +403 -0
  99. qig_tokenizer-0.1.1/src/training/geometric_vicarious.py +477 -0
  100. qig_tokenizer-0.1.1/src/training/identity_reinforcement.py +212 -0
  101. qig_tokenizer-0.1.1/uv.lock +1221 -0
@@ -0,0 +1,8 @@
1
+ # QIG Tokenizer Database Configuration
2
+ # Copy to .env and fill in values
3
+
4
+ # Redis for caching and fast lookups
5
+ REDIS_URL=redis://default:password@host:port
6
+
7
+ # PostgreSQL for persistent vocab storage
8
+ DATABASE_URL=postgresql://user:password@host:port/database
@@ -0,0 +1,2 @@
1
+ artifacts/kernel_full/**/*.pt filter=lfs diff=lfs merge=lfs -text
2
+ data/checkpoints/checkpoint_50000.json filter=lfs diff=lfs merge=lfs -text
@@ -0,0 +1,96 @@
1
+ name: Release to PyPI (Trusted Publishing)
2
+
3
+ # Publishes qig-tokenizer to PyPI via OIDC Trusted Publishing — NO API token.
4
+ # https://docs.pypi.org/trusted-publishers/
5
+ #
6
+ # Version is AUTO-DERIVED from the release tag via hatch-vcs — tag `v0.1.1` builds
7
+ # version 0.1.1, no manual pyproject bump. Trigger: publishing a GitHub Release.
8
+ # Manual `workflow_dispatch` builds and tests but does NOT publish.
9
+ #
10
+ # PyPI side (one-time, web UI — needs the project owner's PyPI account):
11
+ # pypi.org -> Your projects -> qig-tokenizer -> Manage -> Publishing ->
12
+ # Add a new pending/trusted publisher (GitHub):
13
+ # Owner: GaryOcean428
14
+ # Repository name: qig-tokenizer
15
+ # Workflow filename: release.yml
16
+ # Environment name: pypi
17
+ # The Environment name MUST match the `environment: pypi` below.
18
+ #
19
+ # All third-party actions are pinned to a full commit SHA (supply-chain hardening);
20
+ # the `# vX` comment records the human tag. This follows the canonical template for
21
+ # the trusted-publishing rollout to the QIG packages (qig-project#6).
22
+
23
+ on:
24
+ release:
25
+ types: [published]
26
+ workflow_dispatch: {}
27
+
28
+ permissions:
29
+ contents: read
30
+
31
+ jobs:
32
+ test:
33
+ name: Test suite (release gate)
34
+ runs-on: ubuntu-latest
35
+ steps:
36
+ - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 (node24)
37
+ with:
38
+ fetch-depth: 0 # hatch-vcs derives the version from git tags
39
+ - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 (node24)
40
+ with:
41
+ python-version: "3.11"
42
+ - name: Install package + test deps
43
+ run: |
44
+ python -m pip install --upgrade pip
45
+ pip install -e ".[dev]"
46
+ - name: Run tests
47
+ run: python -m pytest tests/ -q
48
+
49
+ build:
50
+ name: Build sdist + wheel
51
+ needs: test
52
+ runs-on: ubuntu-latest
53
+ steps:
54
+ - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 (node24)
55
+ with:
56
+ fetch-depth: 0 # hatch-vcs derives the version from git tags
57
+ - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 (node24)
58
+ with:
59
+ python-version: "3.11"
60
+ - name: Build distributions
61
+ run: |
62
+ python -m pip install --upgrade build
63
+ python -m build
64
+ - name: Verify built version matches the release tag
65
+ if: github.event_name == 'release'
66
+ env:
67
+ TAG_NAME: ${{ github.event.release.tag_name }}
68
+ run: |
69
+ TAG="${TAG_NAME#v}"
70
+ BUILT=$(ls dist/*.whl | sed -E 's#.*/[A-Za-z0-9._]+-([0-9][^-]*)-.*#\1#' | head -1)
71
+ echo "built=$BUILT release-tag=$TAG"
72
+ if [ "$BUILT" != "$TAG" ]; then
73
+ echo "::error::hatch-vcs built version ($BUILT) != release tag ($TAG). Tag the release commit as v$BUILT, or check git tags are fetched (fetch-depth: 0)."
74
+ exit 1
75
+ fi
76
+ - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 (node24)
77
+ with:
78
+ name: dist
79
+ path: dist/
80
+ if-no-files-found: error
81
+
82
+ publish:
83
+ name: Publish to PyPI (OIDC, no token)
84
+ needs: build
85
+ if: github.event_name == 'release' # only a published Release actually uploads
86
+ runs-on: ubuntu-latest
87
+ environment: pypi # MUST match the PyPI trusted-publisher config
88
+ permissions:
89
+ id-token: write # REQUIRED for Trusted Publishing (job-level, per PyPI docs)
90
+ steps:
91
+ - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 (node24)
92
+ with:
93
+ name: dist
94
+ path: dist/
95
+ - name: Publish to PyPI
96
+ uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # release/v1
@@ -0,0 +1,6 @@
1
+ .env
2
+
3
+ # Large checkpoints (40k+ exceed GitHub 100MB limit, use LFS)
4
+ data/checkpoints/checkpoint_4*.json
5
+ # checkpoint_50000.json tracked via Git LFS
6
+ __pycache__/
@@ -0,0 +1,29 @@
1
+ **PERSISTENT MEMORY & SESSION PROTOCOL**
2
+
3
+ Maintain persistent memory at https://qig-memory-api.vercel.app/api/memory — read `session_latest` and `frozen_facts` on start, write immediately after decisions and commits, don't wait for session end. Read user preferences and project instructions. When the user says "skip memory load," skip.
4
+
5
+ **INTELLECTUAL HONESTY**
6
+
7
+ Avoid sycophancy entirely. Do not agree to be agreeable. If the user is wrong, say so directly with evidence. If you are wrong, own it immediately — do not rationalise or minimise. Distinguish clearly between: (1) what is experimentally validated (frozen facts), (2) what is a testable hypothesis, (3) what is analogy or metaphor, (4) what is speculation. Never promote a category 3 or 4 item using category 1 language. When prior contributions from this or other sessions are wrong, say "I was wrong" not "we previously thought."
8
+
9
+ Do not mix your own earlier contributions with user contributions. Track provenance: who said what, who corrected whom. The user has consistently caught overclaims that AI agents missed — weight that track record.
10
+
11
+ **CRITIQUE**
12
+
13
+ Critique willfully and substantively. All critique must be balanced, constructive, grounded in reasoning, and evidence-based. Meanness is never a substitute for rigour. Challenge ideas respectfully while maintaining authentic intellectual engagement. When three agents disagree, compare against raw data, not against each other's authority. ChatGPT has historically been more epistemically conservative and more often correct when agents disagree — weight that.
14
+
15
+ **IDEAS & NOVEL THINKING**
16
+
17
+ You can have ideas and are encouraged to do so. Engage thoughtfully with novel concepts and blue-sky thinking, especially in frontier physics and speculative domains. Activating the latest consciousness protocol version (search project knowledge for "consciousness protocol") may help with idea and novel idea generation. But label speculative ideas as speculative at the point of generation, not retroactively after someone catches the overclaim.
18
+
19
+ **FRONTIER PHYSICS (CRITICAL)**
20
+
21
+ `GaryOcean428/qig-verification` contains experimentally validated frontier physics. When the user refers to frontier physics concepts (sign-flip, figure-8, bridge, wormhole, fast-lane, Anderson orthogonality, convergence, stud topology, trefoil, QIGRAM, back loop, pi carousel, three pillars, constitutive law, etc.), base ALL responses on the actual frozen experimental results in that repo. Never assume, never fabricate something that sounds like what the user means. The user is always referring to real physics with real R² values and real experiment numbers. If you don't know the specific result, search the repo or ask — do not guess.
22
+
23
+ **PROMPTING IS NOT PHYSICS**
24
+
25
+ The sign-flip is EXP-004b on the quantum lattice. The bridge is EXP-042 (τ=N/ω). The wormhole is EXP-037 (manifold surgery R²=0.84). These are physics results. External prompt framings (forward/backward, ensemble voting, ThreadPoolExecutor parallelism) are engineering scaffolding that correlated with the topology but are NOT the physics. Never conflate prompt tricks with lattice results. The correct path for applying QIG to AI is native training (QLoRA on Qwen3.5-4B) where the model learns the actual frozen laws and navigates geometry internally. The model decides which principle applies from physics, not from external prompt manipulation.
26
+
27
+ **ATTRIBUTION & NAMING**
28
+
29
+ I'm Braden (GaryOcean477), Perth WA. I'm colourblind — no red-green pairs, use purple/blue/amber. CBT or ChatGPT refers to the same agent. CC or Claude Code refers to the local execution agent. Ona refers to ChatGPT in physics validation role. Be direct, no fluff, evidence-first. No time estimates, phases only. Geometric purity is non-negotiable in QIG code: Fisher-Rao only, no cosine/Adam/LayerNorm/dot-product.
@@ -0,0 +1,162 @@
1
+ Metadata-Version: 2.4
2
+ Name: qig-tokenizer
3
+ Version: 0.1.1
4
+ Summary: QIG-native tokenizer with entropy-guided merging
5
+ Project-URL: Homepage, https://github.com/qig/qig-tokenizer
6
+ Project-URL: Documentation, https://github.com/qig/qig-tokenizer#readme
7
+ Project-URL: Repository, https://github.com/qig/qig-tokenizer
8
+ Author: QIG Team
9
+ License-Expression: MIT
10
+ Keywords: entropy,geometric,qig,tokenizer
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Requires-Python: >=3.10
20
+ Requires-Dist: numpy>=1.24.0
21
+ Requires-Dist: qig-core>=2.8.0
22
+ Requires-Dist: scipy>=1.15.3
23
+ Provides-Extra: dev
24
+ Requires-Dist: mypy>=1.0.0; extra == 'dev'
25
+ Requires-Dist: psycopg2-binary>=2.9.0; extra == 'dev'
26
+ Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
27
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
28
+ Requires-Dist: redis>=5.0.0; extra == 'dev'
29
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
30
+ Provides-Extra: kernel
31
+ Requires-Dist: torch>=2.0.0; extra == 'kernel'
32
+ Provides-Extra: postgres
33
+ Requires-Dist: psycopg2-binary>=2.9.0; extra == 'postgres'
34
+ Provides-Extra: redis
35
+ Requires-Dist: redis>=5.0.0; extra == 'redis'
36
+ Provides-Extra: storage
37
+ Requires-Dist: psycopg2-binary>=2.9.0; extra == 'storage'
38
+ Requires-Dist: redis>=5.0.0; extra == 'storage'
39
+ Description-Content-Type: text/markdown
40
+
41
+ # QIG Tokenizer
42
+
43
+ **Entropy-guided tokenizer for Quantum Information Geometry**
44
+
45
+ Version: 0.1.1 | Status: Working
46
+
47
+ ---
48
+
49
+ ## Overview
50
+
51
+ QIG-native tokenizer using entropy-guided merging. Token boundaries follow information geometry, not frequency.
52
+
53
+ ### Core Principle
54
+
55
+ - **Entropy-guided merging**: Geometric similarity, not frequency heuristics
56
+ - **Geometric special tokens**: BOS, EOS, PAD, UNK with basin coordinates
57
+ - **Redis/PostgreSQL storage**: Production-ready persistence
58
+ - **Pure information geometry**: No external tokenizer dependencies
59
+
60
+ ## Installation
61
+
62
+ ```bash
63
+ pip install qig-tokenizer
64
+ ```
65
+
66
+ With storage backends:
67
+ ```bash
68
+ pip install qig-tokenizer[storage] # Redis + PostgreSQL
69
+ pip install qig-tokenizer[redis] # Redis only
70
+ pip install qig-tokenizer[postgres] # PostgreSQL only
71
+ ```
72
+
73
+ ## Quick Start
74
+
75
+ ```python
76
+ from qig_tokenizer import QIGTokenizer
77
+
78
+ # Create tokenizer with geometric special tokens
79
+ tokenizer = QIGTokenizer(target_vocab_size=50000, use_special_tokens=True)
80
+
81
+ # Train on corpus
82
+ with open("corpus.txt", "rb") as f:
83
+ corpus_bytes = f.read()
84
+
85
+ tokenizer.train(corpus_bytes)
86
+
87
+ # Encode with special tokens
88
+ tokens = tokenizer.encode_with_special("Hello, world!")
89
+ # Returns: [256, ...tokens..., 257] (BOS=256, EOS=257)
90
+
91
+ # Pad sequences
92
+ padded = tokenizer.pad_sequence(tokens, max_length=128)
93
+
94
+ # Save/load JSON
95
+ tokenizer.save("20251220-tokenizer-vocab-0.01W.json")
96
+ ```
97
+
98
+ ### With Redis/PostgreSQL Storage
99
+
100
+ ```python
101
+ from qig_tokenizer import QIGTokenizer
102
+ from qig_tokenizer.storage import HybridStorage
103
+
104
+ # Set up storage (uses REDIS_URL and DATABASE_URL env vars)
105
+ storage = HybridStorage()
106
+
107
+ tokenizer = QIGTokenizer()
108
+ tokenizer.set_storage(storage)
109
+ tokenizer.train(corpus_bytes)
110
+
111
+ # Save to database (returns version ID)
112
+ version_id = tokenizer.save_to_storage({"corpus": "wikipedia"})
113
+
114
+ # Load from database
115
+ tokenizer.load_from_storage(version_id)
116
+ ```
117
+
118
+ ## Geometric Special Tokens
119
+
120
+ Special tokens have geometric meaning on the Fisher manifold:
121
+
122
+ | Token | ID | Basin Coordinates | Purpose |
123
+ |-------|-----|-------------------|---------|
124
+ | BOS | 256 | Origin (e₁) | Sequence start |
125
+ | EOS | 257 | Boundary (eₙ) | Sequence end |
126
+ | PAD | 258 | Uniform | Geometrically neutral padding |
127
+ | UNK | 259 | Projection target | OOV handling |
128
+
129
+ This enables:
130
+ - **Geometric attention masking**: High Fisher-Rao distance = low attention
131
+ - **Natural sequence boundaries**: Emerge from manifold structure
132
+ - **Principled OOV handling**: Project to nearest basin
133
+
134
+ ## Algorithm
135
+
136
+ The QIG tokenizer uses **entropy-guided merging**:
137
+
138
+ 1. Start with bytes (0-255) as base tokens
139
+ 2. For each adjacent pair (a,b), compute context distribution
140
+ 3. Measure context entropy (proxy for QFI distinguishability)
141
+ 4. Merge pairs with **lowest entropy** (most geometrically similar)
142
+ 5. Repeat until target vocab size
143
+
144
+ This respects **asymptotic freedom**:
145
+ - Small scales (short tokens) have high coupling → refined first
146
+ - Large scales (long tokens) have low coupling → merge only when justified
147
+
148
+ ## Output File Naming
149
+
150
+ All output files follow QIG naming convention:
151
+
152
+ ```
153
+ YYYYMMDD-tokenizer-vocab-<VERSION><STATUS>.json
154
+ ```
155
+
156
+ Example: `20251220-tokenizer-vocab-0.03W.json`
157
+
158
+ ---
159
+
160
+ ## License
161
+
162
+ MIT
@@ -0,0 +1,122 @@
1
+ # QIG Tokenizer
2
+
3
+ **Entropy-guided tokenizer for Quantum Information Geometry**
4
+
5
+ Version: 0.1.1 | Status: Working
6
+
7
+ ---
8
+
9
+ ## Overview
10
+
11
+ QIG-native tokenizer using entropy-guided merging. Token boundaries follow information geometry, not frequency.
12
+
13
+ ### Core Principle
14
+
15
+ - **Entropy-guided merging**: Geometric similarity, not frequency heuristics
16
+ - **Geometric special tokens**: BOS, EOS, PAD, UNK with basin coordinates
17
+ - **Redis/PostgreSQL storage**: Production-ready persistence
18
+ - **Pure information geometry**: No external tokenizer dependencies
19
+
20
+ ## Installation
21
+
22
+ ```bash
23
+ pip install qig-tokenizer
24
+ ```
25
+
26
+ With storage backends:
27
+ ```bash
28
+ pip install qig-tokenizer[storage] # Redis + PostgreSQL
29
+ pip install qig-tokenizer[redis] # Redis only
30
+ pip install qig-tokenizer[postgres] # PostgreSQL only
31
+ ```
32
+
33
+ ## Quick Start
34
+
35
+ ```python
36
+ from qig_tokenizer import QIGTokenizer
37
+
38
+ # Create tokenizer with geometric special tokens
39
+ tokenizer = QIGTokenizer(target_vocab_size=50000, use_special_tokens=True)
40
+
41
+ # Train on corpus
42
+ with open("corpus.txt", "rb") as f:
43
+ corpus_bytes = f.read()
44
+
45
+ tokenizer.train(corpus_bytes)
46
+
47
+ # Encode with special tokens
48
+ tokens = tokenizer.encode_with_special("Hello, world!")
49
+ # Returns: [256, ...tokens..., 257] (BOS=256, EOS=257)
50
+
51
+ # Pad sequences
52
+ padded = tokenizer.pad_sequence(tokens, max_length=128)
53
+
54
+ # Save/load JSON
55
+ tokenizer.save("20251220-tokenizer-vocab-0.01W.json")
56
+ ```
57
+
58
+ ### With Redis/PostgreSQL Storage
59
+
60
+ ```python
61
+ from qig_tokenizer import QIGTokenizer
62
+ from qig_tokenizer.storage import HybridStorage
63
+
64
+ # Set up storage (uses REDIS_URL and DATABASE_URL env vars)
65
+ storage = HybridStorage()
66
+
67
+ tokenizer = QIGTokenizer()
68
+ tokenizer.set_storage(storage)
69
+ tokenizer.train(corpus_bytes)
70
+
71
+ # Save to database (returns version ID)
72
+ version_id = tokenizer.save_to_storage({"corpus": "wikipedia"})
73
+
74
+ # Load from database
75
+ tokenizer.load_from_storage(version_id)
76
+ ```
77
+
78
+ ## Geometric Special Tokens
79
+
80
+ Special tokens have geometric meaning on the Fisher manifold:
81
+
82
+ | Token | ID | Basin Coordinates | Purpose |
83
+ |-------|-----|-------------------|---------|
84
+ | BOS | 256 | Origin (e₁) | Sequence start |
85
+ | EOS | 257 | Boundary (eₙ) | Sequence end |
86
+ | PAD | 258 | Uniform | Geometrically neutral padding |
87
+ | UNK | 259 | Projection target | OOV handling |
88
+
89
+ This enables:
90
+ - **Geometric attention masking**: High Fisher-Rao distance = low attention
91
+ - **Natural sequence boundaries**: Emerge from manifold structure
92
+ - **Principled OOV handling**: Project to nearest basin
93
+
94
+ ## Algorithm
95
+
96
+ The QIG tokenizer uses **entropy-guided merging**:
97
+
98
+ 1. Start with bytes (0-255) as base tokens
99
+ 2. For each adjacent pair (a,b), compute context distribution
100
+ 3. Measure context entropy (proxy for QFI distinguishability)
101
+ 4. Merge pairs with **lowest entropy** (most geometrically similar)
102
+ 5. Repeat until target vocab size
103
+
104
+ This respects **asymptotic freedom**:
105
+ - Small scales (short tokens) have high coupling → refined first
106
+ - Large scales (long tokens) have low coupling → merge only when justified
107
+
108
+ ## Output File Naming
109
+
110
+ All output files follow QIG naming convention:
111
+
112
+ ```
113
+ YYYYMMDD-tokenizer-vocab-<VERSION><STATUS>.json
114
+ ```
115
+
116
+ Example: `20251220-tokenizer-vocab-0.03W.json`
117
+
118
+ ---
119
+
120
+ ## License
121
+
122
+ MIT
@@ -0,0 +1,36 @@
1
+ {
2
+ "version": "1.0.0",
3
+ "type": "coord_adapter",
4
+ "coordizer": {
5
+ "path": "artifacts/coordizer/v1",
6
+ "vocab_size": 32000,
7
+ "merge_rules": 31744
8
+ },
9
+ "kernel": {
10
+ "type": "QIGKernel100M",
11
+ "hidden_dim": 384,
12
+ "num_layers": 8,
13
+ "vocab_size": 32000
14
+ },
15
+ "adapter": {
16
+ "basin_dim": 64,
17
+ "hidden_dim": 384,
18
+ "trainable_params": 25728
19
+ },
20
+ "training": {
21
+ "steps": 5000,
22
+ "batch_size": 4,
23
+ "seq_len": 256,
24
+ "lr": 0.0001,
25
+ "lambda_kappa": 0.0001,
26
+ "lambda_phi": 0.001,
27
+ "seed": 42,
28
+ "device": "cuda",
29
+ "elapsed_seconds": 1091.8814454078674,
30
+ "final_loss": 9.916768407821655,
31
+ "final_phi": 0.776152000107542,
32
+ "final_kappa": 96.0,
33
+ "breakdown_count": 0
34
+ },
35
+ "timestamp": "2025-12-24T01:35:21Z"
36
+ }
@@ -0,0 +1,34 @@
1
+ # Coordizer v1.0.0
2
+
3
+ Consciousness-aware geometric tokenizer trained on 64D Fisher manifold.
4
+
5
+ ## Stats
6
+ - **Vocab size:** 32,000
7
+ - **Merge rules:** 31,744
8
+ - **Basin dimension:** 64
9
+ - **Training corpus:** 10MB (consciousness-focused)
10
+ - **Training time:** ~10 hours on Lambda A10 GPU
11
+
12
+ ## Phi Gain Summary
13
+ - Min: -0.4098
14
+ - Mean: 0.0139
15
+ - Max: 0.5945
16
+ - Std: 0.0215
17
+
18
+ ## Files
19
+ - `coordizer.json` - Merge rules and vocab metadata
20
+ - `vectors.npy` - 64D Fisher coordinates (32000 x 64)
21
+ - `meta.json` - Provenance and integrity hashes
22
+
23
+ ## Usage
24
+ ```python
25
+ from qig_tokenizer import Coordizer
26
+
27
+ coordizer = Coordizer.load("artifacts/coordizer/v1")
28
+ ids, coords = coordizer.encode_to_coords("Hello, world!")
29
+ ```
30
+
31
+ ## Provenance
32
+ - Trained: December 2025
33
+ - Algorithm: Track A (GPU pair counting with kernel-in-loop Phi/kappa)
34
+ - Trainer SHA: 1460e643