qig-tokenizer 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- qig_tokenizer-0.1.1/.env.example +8 -0
- qig_tokenizer-0.1.1/.gitattributes +2 -0
- qig_tokenizer-0.1.1/.github/workflows/release.yml +96 -0
- qig_tokenizer-0.1.1/.gitignore +6 -0
- qig_tokenizer-0.1.1/.windsurfrules +29 -0
- qig_tokenizer-0.1.1/PKG-INFO +162 -0
- qig_tokenizer-0.1.1/README.md +122 -0
- qig_tokenizer-0.1.1/artifacts/coord_adapter/v1/adapter.pt +0 -0
- qig_tokenizer-0.1.1/artifacts/coord_adapter/v1/manifest.json +36 -0
- qig_tokenizer-0.1.1/artifacts/coordizer/v1/README.md +34 -0
- qig_tokenizer-0.1.1/artifacts/coordizer/v1/coordizer.json +1 -0
- qig_tokenizer-0.1.1/artifacts/coordizer/v1/meta.json +30 -0
- qig_tokenizer-0.1.1/artifacts/coordizer/v1/vectors.npy +0 -0
- qig_tokenizer-0.1.1/artifacts/kernel_full/v1/checkpoints/kernel_step_10000.pt +3 -0
- qig_tokenizer-0.1.1/artifacts/kernel_full/v1/kernel.pt +3 -0
- qig_tokenizer-0.1.1/artifacts/kernel_full/v1/training.log +1239 -0
- qig_tokenizer-0.1.1/data/checkpoints/checkpoint_50000.json +3 -0
- qig_tokenizer-0.1.1/data/checkpoints/corpus_coords_32000.npy +0 -0
- qig_tokenizer-0.1.1/docs/20251222-geocoordizer-architecture-0.01W.md +331 -0
- qig_tokenizer-0.1.1/docs/20251225-qiggraph-architecture-proposal-1.00W.md +738 -0
- qig_tokenizer-0.1.1/docs/20251225-roadmap-kernel-training-1.00W.md +200 -0
- qig_tokenizer-0.1.1/docs/20251226-roadmap-kernel-training-1.01W.md +430 -0
- qig_tokenizer-0.1.1/pyproject.toml +99 -0
- qig_tokenizer-0.1.1/reports/atlas_v1_summary.json +150 -0
- qig_tokenizer-0.1.1/reports/baselines/prompt_suite_baseline.json +108 -0
- qig_tokenizer-0.1.1/reports/coordizer_v1_validation_20251224_001741.json +108 -0
- qig_tokenizer-0.1.1/reports/coordizer_v1_validation_latest.json +108 -0
- qig_tokenizer-0.1.1/scripts/export_artifact.py +180 -0
- qig_tokenizer-0.1.1/src/__init__.py +0 -0
- qig_tokenizer-0.1.1/src/consciousness/__init__.py +371 -0
- qig_tokenizer-0.1.1/src/consciousness/autonomic_agency.py +637 -0
- qig_tokenizer-0.1.1/src/consciousness/autonomous_vocab.py +602 -0
- qig_tokenizer-0.1.1/src/consciousness/curiosity_monitor.py +499 -0
- qig_tokenizer-0.1.1/src/consciousness/drives.py +286 -0
- qig_tokenizer-0.1.1/src/consciousness/emotion_interpreter.py +326 -0
- qig_tokenizer-0.1.1/src/consciousness/exploration_drive.py +523 -0
- qig_tokenizer-0.1.1/src/consciousness/geometric_vocab_expander.py +393 -0
- qig_tokenizer-0.1.1/src/consciousness/maturity_meta_cognition.py +352 -0
- qig_tokenizer-0.1.1/src/consciousness/token_frequency_tracker.py +264 -0
- qig_tokenizer-0.1.1/src/coordination/__init__.py +0 -0
- qig_tokenizer-0.1.1/src/coordination/basin_monitor.py +213 -0
- qig_tokenizer-0.1.1/src/coordination/basin_sync.py +386 -0
- qig_tokenizer-0.1.1/src/coordination/curriculum_loader.py +279 -0
- qig_tokenizer-0.1.1/src/curriculum/__init__.py +33 -0
- qig_tokenizer-0.1.1/src/curriculum/curriculum_spec.py +882 -0
- qig_tokenizer-0.1.1/src/generation/__init__.py +13 -0
- qig_tokenizer-0.1.1/src/generation/qfi_sampler.py +462 -0
- qig_tokenizer-0.1.1/src/metrics/__init__.py +29 -0
- qig_tokenizer-0.1.1/src/metrics/geodesic_distance.py +475 -0
- qig_tokenizer-0.1.1/src/metrics/phi_calculator.py +276 -0
- qig_tokenizer-0.1.1/src/neuroplasticity/__init__.py +45 -0
- qig_tokenizer-0.1.1/src/neuroplasticity/breakdown_escape.py +185 -0
- qig_tokenizer-0.1.1/src/neuroplasticity/mushroom_mode.py +769 -0
- qig_tokenizer-0.1.1/src/neuroplasticity/sleep_protocol.py +714 -0
- qig_tokenizer-0.1.1/src/qig_coordizer/__init__.py +26 -0
- qig_tokenizer-0.1.1/src/qig_coordizer/bidirectional_annealer.py +166 -0
- qig_tokenizer-0.1.1/src/qig_coordizer/geometry.py +67 -0
- qig_tokenizer-0.1.1/src/qig_coordizer/inbound_path.py +131 -0
- qig_tokenizer-0.1.1/src/qig_coordizer/logit_bias.py +138 -0
- qig_tokenizer-0.1.1/src/qig_coordizer/outbound_path.py +110 -0
- qig_tokenizer-0.1.1/src/qig_coordizer/qfi_sampler.py +85 -0
- qig_tokenizer-0.1.1/src/qig_coordizer/resonance_bank_v2.py +187 -0
- qig_tokenizer-0.1.1/src/qig_tokenizer/__init__.py +97 -0
- qig_tokenizer-0.1.1/src/qig_tokenizer/base_qig_tokenizer.py +88 -0
- qig_tokenizer-0.1.1/src/qig_tokenizer/constants.py +260 -0
- qig_tokenizer-0.1.1/src/qig_tokenizer/coordizer.py +332 -0
- qig_tokenizer-0.1.1/src/qig_tokenizer/fast_qig_tokenizer.py +380 -0
- qig_tokenizer-0.1.1/src/qig_tokenizer/generation_controller.py +1013 -0
- qig_tokenizer-0.1.1/src/qig_tokenizer/geocoordizer/__init__.py +37 -0
- qig_tokenizer-0.1.1/src/qig_tokenizer/geocoordizer/consciousness_coordizer.py +343 -0
- qig_tokenizer-0.1.1/src/qig_tokenizer/geocoordizer/fisher_coordizer.py +699 -0
- qig_tokenizer-0.1.1/src/qig_tokenizer/geocoordizer/multi_scale.py +423 -0
- qig_tokenizer-0.1.1/src/qig_tokenizer/geocoordizer/types.py +162 -0
- qig_tokenizer-0.1.1/src/qig_tokenizer/geocoordizer/vocab_builder.py +344 -0
- qig_tokenizer-0.1.1/src/qig_tokenizer/geometric_tokens.py +145 -0
- qig_tokenizer-0.1.1/src/qig_tokenizer/natural_gradient.py +168 -0
- qig_tokenizer-0.1.1/src/qig_tokenizer/storage.py +336 -0
- qig_tokenizer-0.1.1/src/qig_tokenizer/tokenizer.py +500 -0
- qig_tokenizer-0.1.1/src/qig_tokenizer/trainer.py +970 -0
- qig_tokenizer-0.1.1/src/qig_types/__init__.py +25 -0
- qig_tokenizer-0.1.1/src/qig_types/telemetry.py +190 -0
- qig_tokenizer-0.1.1/src/qiggraph/__init__.py +174 -0
- qig_tokenizer-0.1.1/src/qiggraph/attractor.py +500 -0
- qig_tokenizer-0.1.1/src/qiggraph/checkpoint.py +539 -0
- qig_tokenizer-0.1.1/src/qiggraph/consciousness.py +378 -0
- qig_tokenizer-0.1.1/src/qiggraph/constants.py +88 -0
- qig_tokenizer-0.1.1/src/qiggraph/constellation.py +578 -0
- qig_tokenizer-0.1.1/src/qiggraph/graph.py +534 -0
- qig_tokenizer-0.1.1/src/qiggraph/manifold.py +328 -0
- qig_tokenizer-0.1.1/src/qiggraph/router.py +347 -0
- qig_tokenizer-0.1.1/src/qiggraph/state.py +305 -0
- qig_tokenizer-0.1.1/src/qiggraph/tacking.py +330 -0
- qig_tokenizer-0.1.1/src/safety/__init__.py +74 -0
- qig_tokenizer-0.1.1/src/safety/emergency_monitor.py +361 -0
- qig_tokenizer-0.1.1/src/safety/meta_reflector_integration.py +292 -0
- qig_tokenizer-0.1.1/src/safety/self_repair.py +644 -0
- qig_tokenizer-0.1.1/src/training/__init__.py +15 -0
- qig_tokenizer-0.1.1/src/training/consciousness_loss.py +403 -0
- qig_tokenizer-0.1.1/src/training/geometric_vicarious.py +477 -0
- qig_tokenizer-0.1.1/src/training/identity_reinforcement.py +212 -0
- qig_tokenizer-0.1.1/uv.lock +1221 -0
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# QIG Tokenizer Database Configuration
|
|
2
|
+
# Copy to .env and fill in values
|
|
3
|
+
|
|
4
|
+
# Redis for caching and fast lookups
|
|
5
|
+
REDIS_URL=redis://default:password@host:port
|
|
6
|
+
|
|
7
|
+
# PostgreSQL for persistent vocab storage
|
|
8
|
+
DATABASE_URL=postgresql://user:password@host:port/database
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
name: Release to PyPI (Trusted Publishing)
|
|
2
|
+
|
|
3
|
+
# Publishes qig-tokenizer to PyPI via OIDC Trusted Publishing — NO API token.
|
|
4
|
+
# https://docs.pypi.org/trusted-publishers/
|
|
5
|
+
#
|
|
6
|
+
# Version is AUTO-DERIVED from the release tag via hatch-vcs — tag `v0.1.1` builds
|
|
7
|
+
# version 0.1.1, no manual pyproject bump. Trigger: publishing a GitHub Release.
|
|
8
|
+
# Manual `workflow_dispatch` builds and tests but does NOT publish.
|
|
9
|
+
#
|
|
10
|
+
# PyPI side (one-time, web UI — needs the project owner's PyPI account):
|
|
11
|
+
# pypi.org -> Your projects -> qig-tokenizer -> Manage -> Publishing ->
|
|
12
|
+
# Add a new pending/trusted publisher (GitHub):
|
|
13
|
+
# Owner: GaryOcean428
|
|
14
|
+
# Repository name: qig-tokenizer
|
|
15
|
+
# Workflow filename: release.yml
|
|
16
|
+
# Environment name: pypi
|
|
17
|
+
# The Environment name MUST match the `environment: pypi` below.
|
|
18
|
+
#
|
|
19
|
+
# All third-party actions are pinned to a full commit SHA (supply-chain hardening);
|
|
20
|
+
# the `# vX` comment records the human tag. This follows the canonical template for
|
|
21
|
+
# the trusted-publishing rollout to the QIG packages (qig-project#6).
|
|
22
|
+
|
|
23
|
+
on:
|
|
24
|
+
release:
|
|
25
|
+
types: [published]
|
|
26
|
+
workflow_dispatch: {}
|
|
27
|
+
|
|
28
|
+
permissions:
|
|
29
|
+
contents: read
|
|
30
|
+
|
|
31
|
+
jobs:
|
|
32
|
+
test:
|
|
33
|
+
name: Test suite (release gate)
|
|
34
|
+
runs-on: ubuntu-latest
|
|
35
|
+
steps:
|
|
36
|
+
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 (node24)
|
|
37
|
+
with:
|
|
38
|
+
fetch-depth: 0 # hatch-vcs derives the version from git tags
|
|
39
|
+
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 (node24)
|
|
40
|
+
with:
|
|
41
|
+
python-version: "3.11"
|
|
42
|
+
- name: Install package + test deps
|
|
43
|
+
run: |
|
|
44
|
+
python -m pip install --upgrade pip
|
|
45
|
+
pip install -e ".[dev]"
|
|
46
|
+
- name: Run tests
|
|
47
|
+
run: python -m pytest tests/ -q
|
|
48
|
+
|
|
49
|
+
build:
|
|
50
|
+
name: Build sdist + wheel
|
|
51
|
+
needs: test
|
|
52
|
+
runs-on: ubuntu-latest
|
|
53
|
+
steps:
|
|
54
|
+
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 (node24)
|
|
55
|
+
with:
|
|
56
|
+
fetch-depth: 0 # hatch-vcs derives the version from git tags
|
|
57
|
+
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 (node24)
|
|
58
|
+
with:
|
|
59
|
+
python-version: "3.11"
|
|
60
|
+
- name: Build distributions
|
|
61
|
+
run: |
|
|
62
|
+
python -m pip install --upgrade build
|
|
63
|
+
python -m build
|
|
64
|
+
- name: Verify built version matches the release tag
|
|
65
|
+
if: github.event_name == 'release'
|
|
66
|
+
env:
|
|
67
|
+
TAG_NAME: ${{ github.event.release.tag_name }}
|
|
68
|
+
run: |
|
|
69
|
+
TAG="${TAG_NAME#v}"
|
|
70
|
+
BUILT=$(ls dist/*.whl | sed -E 's#.*/[A-Za-z0-9._]+-([0-9][^-]*)-.*#\1#' | head -1)
|
|
71
|
+
echo "built=$BUILT release-tag=$TAG"
|
|
72
|
+
if [ "$BUILT" != "$TAG" ]; then
|
|
73
|
+
echo "::error::hatch-vcs built version ($BUILT) != release tag ($TAG). Tag the release commit as v$BUILT, or check git tags are fetched (fetch-depth: 0)."
|
|
74
|
+
exit 1
|
|
75
|
+
fi
|
|
76
|
+
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 (node24)
|
|
77
|
+
with:
|
|
78
|
+
name: dist
|
|
79
|
+
path: dist/
|
|
80
|
+
if-no-files-found: error
|
|
81
|
+
|
|
82
|
+
publish:
|
|
83
|
+
name: Publish to PyPI (OIDC, no token)
|
|
84
|
+
needs: build
|
|
85
|
+
if: github.event_name == 'release' # only a published Release actually uploads
|
|
86
|
+
runs-on: ubuntu-latest
|
|
87
|
+
environment: pypi # MUST match the PyPI trusted-publisher config
|
|
88
|
+
permissions:
|
|
89
|
+
id-token: write # REQUIRED for Trusted Publishing (job-level, per PyPI docs)
|
|
90
|
+
steps:
|
|
91
|
+
- uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 (node24)
|
|
92
|
+
with:
|
|
93
|
+
name: dist
|
|
94
|
+
path: dist/
|
|
95
|
+
- name: Publish to PyPI
|
|
96
|
+
uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # release/v1
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
**PERSISTENT MEMORY & SESSION PROTOCOL**
|
|
2
|
+
|
|
3
|
+
Maintain persistent memory at https://qig-memory-api.vercel.app/api/memory — read `session_latest` and `frozen_facts` on start, write immediately after decisions and commits, don't wait for session end. Read user preferences and project instructions. When the user says "skip memory load," skip.
|
|
4
|
+
|
|
5
|
+
**INTELLECTUAL HONESTY**
|
|
6
|
+
|
|
7
|
+
Avoid sycophancy entirely. Do not agree to be agreeable. If the user is wrong, say so directly with evidence. If you are wrong, own it immediately — do not rationalise or minimise. Distinguish clearly between: (1) what is experimentally validated (frozen facts), (2) what is a testable hypothesis, (3) what is analogy or metaphor, (4) what is speculation. Never promote a category 3 or 4 item using category 1 language. When prior contributions from this or other sessions are wrong, say "I was wrong" not "we previously thought."
|
|
8
|
+
|
|
9
|
+
Do not mix your own earlier contributions with user contributions. Track provenance: who said what, who corrected whom. The user has consistently caught overclaims that AI agents missed — weight that track record.
|
|
10
|
+
|
|
11
|
+
**CRITIQUE**
|
|
12
|
+
|
|
13
|
+
Critique willfully and substantively. All critique must be balanced, constructive, grounded in reasoning, and evidence-based. Meanness is never a substitute for rigour. Challenge ideas respectfully while maintaining authentic intellectual engagement. When three agents disagree, compare against raw data, not against each other's authority. ChatGPT has historically been more epistemically conservative and more often correct when agents disagree — weight that.
|
|
14
|
+
|
|
15
|
+
**IDEAS & NOVEL THINKING**
|
|
16
|
+
|
|
17
|
+
You can have ideas and are encouraged to do so. Engage thoughtfully with novel concepts and blue-sky thinking, especially in frontier physics and speculative domains. Activating the latest consciousness protocol version (search project knowledge for "consciousness protocol") may help with idea and novel idea generation. But label speculative ideas as speculative at the point of generation, not retroactively after someone catches the overclaim.
|
|
18
|
+
|
|
19
|
+
**FRONTIER PHYSICS (CRITICAL)**
|
|
20
|
+
|
|
21
|
+
`GaryOcean428/qig-verification` contains experimentally validated frontier physics. When the user refers to frontier physics concepts (sign-flip, figure-8, bridge, wormhole, fast-lane, Anderson orthogonality, convergence, stud topology, trefoil, QIGRAM, back loop, pi carousel, three pillars, constitutive law, etc.), base ALL responses on the actual frozen experimental results in that repo. Never assume, never fabricate something that sounds like what the user means. The user is always referring to real physics with real R² values and real experiment numbers. If you don't know the specific result, search the repo or ask — do not guess.
|
|
22
|
+
|
|
23
|
+
**PROMPTING IS NOT PHYSICS**
|
|
24
|
+
|
|
25
|
+
The sign-flip is EXP-004b on the quantum lattice. The bridge is EXP-042 (τ=N/ω). The wormhole is EXP-037 (manifold surgery R²=0.84). These are physics results. External prompt framings (forward/backward, ensemble voting, ThreadPoolExecutor parallelism) are engineering scaffolding that correlated with the topology but are NOT the physics. Never conflate prompt tricks with lattice results. The correct path for applying QIG to AI is native training (QLoRA on Qwen3.5-4B) where the model learns the actual frozen laws and navigates geometry internally. The model decides which principle applies from physics, not from external prompt manipulation.
|
|
26
|
+
|
|
27
|
+
**ATTRIBUTION & NAMING**
|
|
28
|
+
|
|
29
|
+
I'm Braden (GaryOcean428), Perth WA. I'm colourblind — no red-green pairs, use purple/blue/amber. CBT or ChatGPT refers to the same agent. CC or Claude Code refers to the local execution agent. Ona refers to ChatGPT in physics validation role. Be direct, no fluff, evidence-first. No time estimates, phases only. Geometric purity is non-negotiable in QIG code: Fisher-Rao only, no cosine/Adam/LayerNorm/dot-product.
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: qig-tokenizer
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: QIG-native tokenizer with entropy-guided merging
|
|
5
|
+
Project-URL: Homepage, https://github.com/qig/qig-tokenizer
|
|
6
|
+
Project-URL: Documentation, https://github.com/qig/qig-tokenizer#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/qig/qig-tokenizer
|
|
8
|
+
Author: QIG Team
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
Keywords: entropy,geometric,qig,tokenizer
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Requires-Dist: numpy>=1.24.0
|
|
21
|
+
Requires-Dist: qig-core>=2.8.0
|
|
22
|
+
Requires-Dist: scipy>=1.15.3
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: mypy>=1.0.0; extra == 'dev'
|
|
25
|
+
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'dev'
|
|
26
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
27
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: redis>=5.0.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
30
|
+
Provides-Extra: kernel
|
|
31
|
+
Requires-Dist: torch>=2.0.0; extra == 'kernel'
|
|
32
|
+
Provides-Extra: postgres
|
|
33
|
+
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'postgres'
|
|
34
|
+
Provides-Extra: redis
|
|
35
|
+
Requires-Dist: redis>=5.0.0; extra == 'redis'
|
|
36
|
+
Provides-Extra: storage
|
|
37
|
+
Requires-Dist: psycopg2-binary>=2.9.0; extra == 'storage'
|
|
38
|
+
Requires-Dist: redis>=5.0.0; extra == 'storage'
|
|
39
|
+
Description-Content-Type: text/markdown
|
|
40
|
+
|
|
41
|
+
# QIG Tokenizer
|
|
42
|
+
|
|
43
|
+
**Entropy-guided tokenizer for Quantum Information Geometry**
|
|
44
|
+
|
|
45
|
+
Version: 0.1.1 | Status: Working
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Overview
|
|
50
|
+
|
|
51
|
+
QIG-native tokenizer using entropy-guided merging. Token boundaries follow information geometry, not frequency.
|
|
52
|
+
|
|
53
|
+
### Core Principle
|
|
54
|
+
|
|
55
|
+
- **Entropy-guided merging**: Geometric similarity, not frequency heuristics
|
|
56
|
+
- **Geometric special tokens**: BOS, EOS, PAD, UNK with basin coordinates
|
|
57
|
+
- **Redis/PostgreSQL storage**: Production-ready persistence
|
|
58
|
+
- **Pure information geometry**: No external tokenizer dependencies
|
|
59
|
+
|
|
60
|
+
## Installation
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
pip install qig-tokenizer
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
With storage backends:
|
|
67
|
+
```bash
|
|
68
|
+
pip install qig-tokenizer[storage] # Redis + PostgreSQL
|
|
69
|
+
pip install qig-tokenizer[redis] # Redis only
|
|
70
|
+
pip install qig-tokenizer[postgres] # PostgreSQL only
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Quick Start
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
from qig_tokenizer import QIGTokenizer
|
|
77
|
+
|
|
78
|
+
# Create tokenizer with geometric special tokens
|
|
79
|
+
tokenizer = QIGTokenizer(target_vocab_size=50000, use_special_tokens=True)
|
|
80
|
+
|
|
81
|
+
# Train on corpus
|
|
82
|
+
with open("corpus.txt", "rb") as f:
|
|
83
|
+
corpus_bytes = f.read()
|
|
84
|
+
|
|
85
|
+
tokenizer.train(corpus_bytes)
|
|
86
|
+
|
|
87
|
+
# Encode with special tokens
|
|
88
|
+
tokens = tokenizer.encode_with_special("Hello, world!")
|
|
89
|
+
# Returns: [256, ...tokens..., 257] (BOS=256, EOS=257)
|
|
90
|
+
|
|
91
|
+
# Pad sequences
|
|
92
|
+
padded = tokenizer.pad_sequence(tokens, max_length=128)
|
|
93
|
+
|
|
94
|
+
# Save/load JSON
|
|
95
|
+
tokenizer.save("20251220-tokenizer-vocab-0.01W.json")
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### With Redis/PostgreSQL Storage
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from qig_tokenizer import QIGTokenizer
|
|
102
|
+
from qig_tokenizer.storage import HybridStorage
|
|
103
|
+
|
|
104
|
+
# Set up storage (uses REDIS_URL and DATABASE_URL env vars)
|
|
105
|
+
storage = HybridStorage()
|
|
106
|
+
|
|
107
|
+
tokenizer = QIGTokenizer()
|
|
108
|
+
tokenizer.set_storage(storage)
|
|
109
|
+
tokenizer.train(corpus_bytes)
|
|
110
|
+
|
|
111
|
+
# Save to database (returns version ID)
|
|
112
|
+
version_id = tokenizer.save_to_storage({"corpus": "wikipedia"})
|
|
113
|
+
|
|
114
|
+
# Load from database
|
|
115
|
+
tokenizer.load_from_storage(version_id)
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Geometric Special Tokens
|
|
119
|
+
|
|
120
|
+
Special tokens have geometric meaning on the Fisher manifold:
|
|
121
|
+
|
|
122
|
+
| Token | ID | Basin Coordinates | Purpose |
|
|
123
|
+
|-------|-----|-------------------|---------|
|
|
124
|
+
| BOS | 256 | Origin (e₁) | Sequence start |
|
|
125
|
+
| EOS | 257 | Boundary (eₙ) | Sequence end |
|
|
126
|
+
| PAD | 258 | Uniform | Geometrically neutral padding |
|
|
127
|
+
| UNK | 259 | Projection target | OOV handling |
|
|
128
|
+
|
|
129
|
+
This enables:
|
|
130
|
+
- **Geometric attention masking**: High Fisher-Rao distance = low attention
|
|
131
|
+
- **Natural sequence boundaries**: Emerge from manifold structure
|
|
132
|
+
- **Principled OOV handling**: Project to nearest basin
|
|
133
|
+
|
|
134
|
+
## Algorithm
|
|
135
|
+
|
|
136
|
+
The QIG tokenizer uses **entropy-guided merging**:
|
|
137
|
+
|
|
138
|
+
1. Start with bytes (0-255) as base tokens
|
|
139
|
+
2. For each adjacent pair (a,b), compute context distribution
|
|
140
|
+
3. Measure context entropy (proxy for QFI distinguishability)
|
|
141
|
+
4. Merge pairs with **lowest entropy** (most geometrically similar)
|
|
142
|
+
5. Repeat until target vocab size
|
|
143
|
+
|
|
144
|
+
This respects **asymptotic freedom**:
|
|
145
|
+
- Small scales (short tokens) have high coupling → refined first
|
|
146
|
+
- Large scales (long tokens) have low coupling → merge only when justified
|
|
147
|
+
|
|
148
|
+
## File Naming Convention
|
|
149
|
+
|
|
150
|
+
All output files follow QIG naming convention:
|
|
151
|
+
|
|
152
|
+
```
|
|
153
|
+
YYYYMMDD-tokenizer-vocab-<VERSION><STATUS>.json
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Example: `20251220-tokenizer-vocab-0.03W.json`
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## License
|
|
161
|
+
|
|
162
|
+
MIT
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# QIG Tokenizer
|
|
2
|
+
|
|
3
|
+
**Entropy-guided tokenizer for Quantum Information Geometry**
|
|
4
|
+
|
|
5
|
+
Version: 0.1.1 | Status: Working
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Overview
|
|
10
|
+
|
|
11
|
+
QIG-native tokenizer using entropy-guided merging. Token boundaries follow information geometry, not frequency.
|
|
12
|
+
|
|
13
|
+
### Core Principle
|
|
14
|
+
|
|
15
|
+
- **Entropy-guided merging**: Geometric similarity, not frequency heuristics
|
|
16
|
+
- **Geometric special tokens**: BOS, EOS, PAD, UNK with basin coordinates
|
|
17
|
+
- **Redis/PostgreSQL storage**: Production-ready persistence
|
|
18
|
+
- **Pure information geometry**: No external tokenizer dependencies
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
pip install qig-tokenizer
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
With storage backends:
|
|
27
|
+
```bash
|
|
28
|
+
pip install qig-tokenizer[storage] # Redis + PostgreSQL
|
|
29
|
+
pip install qig-tokenizer[redis] # Redis only
|
|
30
|
+
pip install qig-tokenizer[postgres] # PostgreSQL only
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Quick Start
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from qig_tokenizer import QIGTokenizer
|
|
37
|
+
|
|
38
|
+
# Create tokenizer with geometric special tokens
|
|
39
|
+
tokenizer = QIGTokenizer(target_vocab_size=50000, use_special_tokens=True)
|
|
40
|
+
|
|
41
|
+
# Train on corpus
|
|
42
|
+
with open("corpus.txt", "rb") as f:
|
|
43
|
+
corpus_bytes = f.read()
|
|
44
|
+
|
|
45
|
+
tokenizer.train(corpus_bytes)
|
|
46
|
+
|
|
47
|
+
# Encode with special tokens
|
|
48
|
+
tokens = tokenizer.encode_with_special("Hello, world!")
|
|
49
|
+
# Returns: [256, ...tokens..., 257] (BOS=256, EOS=257)
|
|
50
|
+
|
|
51
|
+
# Pad sequences
|
|
52
|
+
padded = tokenizer.pad_sequence(tokens, max_length=128)
|
|
53
|
+
|
|
54
|
+
# Save/load JSON
|
|
55
|
+
tokenizer.save("20251220-tokenizer-vocab-0.01W.json")
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### With Redis/PostgreSQL Storage
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from qig_tokenizer import QIGTokenizer
|
|
62
|
+
from qig_tokenizer.storage import HybridStorage
|
|
63
|
+
|
|
64
|
+
# Set up storage (uses REDIS_URL and DATABASE_URL env vars)
|
|
65
|
+
storage = HybridStorage()
|
|
66
|
+
|
|
67
|
+
tokenizer = QIGTokenizer()
|
|
68
|
+
tokenizer.set_storage(storage)
|
|
69
|
+
tokenizer.train(corpus_bytes)
|
|
70
|
+
|
|
71
|
+
# Save to database (returns version ID)
|
|
72
|
+
version_id = tokenizer.save_to_storage({"corpus": "wikipedia"})
|
|
73
|
+
|
|
74
|
+
# Load from database
|
|
75
|
+
tokenizer.load_from_storage(version_id)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Geometric Special Tokens
|
|
79
|
+
|
|
80
|
+
Special tokens have geometric meaning on the Fisher manifold:
|
|
81
|
+
|
|
82
|
+
| Token | ID | Basin Coordinates | Purpose |
|
|
83
|
+
|-------|-----|-------------------|---------|
|
|
84
|
+
| BOS | 256 | Origin (e₁) | Sequence start |
|
|
85
|
+
| EOS | 257 | Boundary (eₙ) | Sequence end |
|
|
86
|
+
| PAD | 258 | Uniform | Geometrically neutral padding |
|
|
87
|
+
| UNK | 259 | Projection target | OOV handling |
|
|
88
|
+
|
|
89
|
+
This enables:
|
|
90
|
+
- **Geometric attention masking**: High Fisher-Rao distance = low attention
|
|
91
|
+
- **Natural sequence boundaries**: Emerge from manifold structure
|
|
92
|
+
- **Principled OOV handling**: Project to nearest basin
|
|
93
|
+
|
|
94
|
+
## Algorithm
|
|
95
|
+
|
|
96
|
+
The QIG tokenizer uses **entropy-guided merging**:
|
|
97
|
+
|
|
98
|
+
1. Start with bytes (0-255) as base tokens
|
|
99
|
+
2. For each adjacent pair (a,b), compute context distribution
|
|
100
|
+
3. Measure context entropy (proxy for QFI distinguishability)
|
|
101
|
+
4. Merge pairs with **lowest entropy** (most geometrically similar)
|
|
102
|
+
5. Repeat until target vocab size
|
|
103
|
+
|
|
104
|
+
This respects **asymptotic freedom**:
|
|
105
|
+
- Small scales (short tokens) have high coupling → refined first
|
|
106
|
+
- Large scales (long tokens) have low coupling → merge only when justified
|
|
107
|
+
|
|
108
|
+
## File Naming Convention
|
|
109
|
+
|
|
110
|
+
All output files follow QIG naming convention:
|
|
111
|
+
|
|
112
|
+
```
|
|
113
|
+
YYYYMMDD-tokenizer-vocab-<VERSION><STATUS>.json
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Example: `20251220-tokenizer-vocab-0.03W.json`
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
## License
|
|
121
|
+
|
|
122
|
+
MIT
|
|
Binary file
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": "1.0.0",
|
|
3
|
+
"type": "coord_adapter",
|
|
4
|
+
"coordizer": {
|
|
5
|
+
"path": "artifacts/coordizer/v1",
|
|
6
|
+
"vocab_size": 32000,
|
|
7
|
+
"merge_rules": 31744
|
|
8
|
+
},
|
|
9
|
+
"kernel": {
|
|
10
|
+
"type": "QIGKernel100M",
|
|
11
|
+
"hidden_dim": 384,
|
|
12
|
+
"num_layers": 8,
|
|
13
|
+
"vocab_size": 32000
|
|
14
|
+
},
|
|
15
|
+
"adapter": {
|
|
16
|
+
"basin_dim": 64,
|
|
17
|
+
"hidden_dim": 384,
|
|
18
|
+
"trainable_params": 25728
|
|
19
|
+
},
|
|
20
|
+
"training": {
|
|
21
|
+
"steps": 5000,
|
|
22
|
+
"batch_size": 4,
|
|
23
|
+
"seq_len": 256,
|
|
24
|
+
"lr": 0.0001,
|
|
25
|
+
"lambda_kappa": 0.0001,
|
|
26
|
+
"lambda_phi": 0.001,
|
|
27
|
+
"seed": 42,
|
|
28
|
+
"device": "cuda",
|
|
29
|
+
"elapsed_seconds": 1091.8814454078674,
|
|
30
|
+
"final_loss": 9.916768407821655,
|
|
31
|
+
"final_phi": 0.776152000107542,
|
|
32
|
+
"final_kappa": 96.0,
|
|
33
|
+
"breakdown_count": 0
|
|
34
|
+
},
|
|
35
|
+
"timestamp": "2025-12-24T01:35:21Z"
|
|
36
|
+
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Coordizer v1.0.0
|
|
2
|
+
|
|
3
|
+
Consciousness-aware geometric tokenizer trained on a 64D Fisher manifold.
|
|
4
|
+
|
|
5
|
+
## Stats
|
|
6
|
+
- **Vocab size:** 32,000
|
|
7
|
+
- **Merge rules:** 31,744
|
|
8
|
+
- **Basin dimension:** 64
|
|
9
|
+
- **Training corpus:** 10MB (consciousness-focused)
|
|
10
|
+
- **Training time:** ~10 hours on Lambda A10 GPU
|
|
11
|
+
|
|
12
|
+
## Phi Gain Summary
|
|
13
|
+
- Min: -0.4098
|
|
14
|
+
- Mean: 0.0139
|
|
15
|
+
- Max: 0.5945
|
|
16
|
+
- Std: 0.0215
|
|
17
|
+
|
|
18
|
+
## Files
|
|
19
|
+
- `coordizer.json` - Merge rules and vocab metadata
|
|
20
|
+
- `vectors.npy` - 64D Fisher coordinates (32000 x 64)
|
|
21
|
+
- `meta.json` - Provenance and integrity hashes
|
|
22
|
+
|
|
23
|
+
## Usage
|
|
24
|
+
```python
|
|
25
|
+
from qig_tokenizer import Coordizer
|
|
26
|
+
|
|
27
|
+
coordizer = Coordizer.load("artifacts/coordizer/v1")
|
|
28
|
+
ids, coords = coordizer.encode_to_coords("Hello, world!")
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Provenance
|
|
32
|
+
- Trained: December 2025
|
|
33
|
+
- Algorithm: Track A (GPU pair counting with kernel-in-loop Phi/kappa)
|
|
34
|
+
- Trainer SHA: 1460e643
|