PyPI - kvcache-simulator - Versions diffs - 0.1.0__tar.gz - Mend

kvcache-simulator 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

kvcache_simulator-0.1.0/LICENSE.md ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2016-present George Cushen (https://georgecushen.com)
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

kvcache_simulator-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,129 @@
+Metadata-Version: 2.4
+Name: kvcache-simulator
+Version: 0.1.0
+Summary: Local KV cache hit-rate simulator for JSONL traces.
+Author: KVCache.AI
+License-Expression: MIT
+Project-URL: Homepage, https://kvcache.ai/
+Project-URL: Repository, https://github.com/kvcache-ai/kvcache-blog
+Project-URL: Issues, https://github.com/kvcache-ai/kvcache-blog/issues
+Keywords: kv-cache,llm,inference,simulator,prefix-cache
+Classifier: Development Status :: 4 - Beta
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Scientific/Engineering
+Classifier: Topic :: System :: Benchmark
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE.md
+Dynamic: license-file
+# KV Cache Simulator
+`kvcache-simulator` runs the KVCache.AI hit-rate simulator locally on JSONL traces. It uses the same model accounting formulas as the KV Cache Size Calculator and the same prefix-aware hit-rate semantics as the KV Cache Hit Rate Simulator.
+## Installation
+```bash
+pip install kvcache-simulator
+```
+The default simulation backend compiles and runs a bundled C++ replay core on first use. You need a local C++ compiler such as `c++` or `clang++`. If a compiler is not available, use `--backend python`.
+## Quick Start
+```bash
+kvcache-simulator run \
+  --trace trace.jsonl.gz \
+  --model glm-5.2 \
+  --kv-precision fp8_int8 \
+  --indexer-precision fp4_int4
+```
+The default output is a readable table. Use `--format json` when another script needs to consume the result.
+```bash
+kvcache-simulator run \
+  --trace trace.jsonl.gz \
+  --model deepseek-v4-pro \
+  --kv-precision fp8_int8 \
+  --indexer-precision fp4_int4 \
+  --format json \
+  --output result.json
+```
+List supported model ids:
+```bash
+kvcache-simulator list-models
+```
+`python -m kvcache_sim ...` also works, but the installed CLI command is preferred.
+`run` is the main command. It evaluates the selected trace across a set of KV cache memory budgets. `sweep` is kept as an alias for users who prefer benchmark terminology.
+## Input Trace Format
+Input is JSONL or JSONL.GZ, one request per line. The minimal accepted format is:
+```json
+{"block_size":64,"hash_ids":[2001,2002],"input_length":128}
+```
+Required fields:
+- `hash_ids`: cache block identities in request-prefix order.
+- `input_length`: prefill input token count for this request.
+- `block_size`: source-native block size. It can be omitted only when `--block-size` is provided.
+Optional fields:
+- `timestamp`: ignored by the simulator. Requests are replayed in file order, so sort production traces by timestamp before running the command.
+- `output_length`: not counted in the hit-rate denominator. Generated output matters only if it appears later in another request's `hash_ids`.
+- `block_tokens`: advanced field for exact per-block token weights. If present, it must be a positive integer list with the same length as `hash_ids`.
+`--block-size` is only a fallback for traces whose records omit `block_size`. If any record declares `block_size`, the trace-declared value is used for the whole trace and the CLI fallback is ignored.
+## Options
+| Option | Meaning |
+| --- | --- |
+| `--trace PATH` | JSONL/JSONL.GZ trace path, or `-` for stdin. |
+| `--model ID` | Model id from the bundled KV Cache Size Calculator model catalog. Use `kvcache-simulator list-models` to list ids. |
+| `--kv-precision ID` | KV cache precision: usually `bf16_fp16`, `fp8_int8`, or `fp4_int4`. Defaults follow the web calculator. |
+| `--indexer-precision ID` | Indexer cache precision for models with an indexer cache, such as DeepSeek V4 / GLM / MiniMax M3. |
+| `--include-draft-kv-cache` | Include draft/MTP KV layers when the selected model defines them. Default is off. |
+| `--block-size N` | Fallback block size when trace records omit `block_size`; trace-declared `block_size` overrides it. |
+| `--estimate-tokens N` | Override the token count used for token-dependent bytes/token formulas. By default the trace average input length is used. |
+| `--budgets-gib A,B,C` | Comma-separated KV cache memory budgets in GiB. Default matches the web budget sweep: `1,2,4,...,16384`. |
+| `--policies fifo,lru,optimal` | Eviction policies to simulate. Defaults to all three. |
+| `--backend cpp\|python` | Simulation backend. Default is `cpp`; use `python` for debugging or machines without a compiler. |
+| `--jobs N` | Number of worker processes for the Python backend. The C++ backend runs one batch process and ignores this option. |
+| `--no-progress` | Disable terminal progress output. Progress is written to stderr only when stderr is interactive, so JSON stdout stays valid. |
+| `--format table\|json` | Output format. Default is `table`. |
+| `--output PATH` | Write output to a file. Default `-` prints to stdout. |
+| `--max-records N` | Debug/testing limit: stop after N valid requests. |
+| `--max-events N` | Debug/testing limit: stop after N trace blocks. |
+## Output Semantics
+- Hit rate is measured over the last 50% of requests.
+- Budget points that do not fill the cache before that measurement window are omitted, because they are not under memory pressure yet.
+- Hit tokens count only the longest continuous cached prefix of each request. If a middle block misses, later blocks in that same request do not count as prefill hits even if their ids are already cached.
+- `speedup` is an ideal prefill-only upper bound: `1 / (1 - hit_rate)`. `1.0x` means no-cache prefill throughput where every prefill input token is computed. It does not include decode, KV lookup, network, batching, scheduling, or memory bandwidth overhead.
+## Performance Notes
+The default C++ backend runs all cache-budget points in one batch after loading the trace and building the prefix trie once. This is usually faster than the Python backend, especially on large traces.
+`--jobs` applies only to the Python backend. It parallelizes independent `(policy, cache budget)` simulation tasks. More jobs are not always faster: the default budget sweep has only a small number of budget points, tasks have uneven runtimes, and large traces can become memory-bandwidth limited.
+## Limits
+The browser version caps uploads to protect the UI. This local package does not use that browser cap, but the C++ backend currently stores trace events and prefix node indexes in 32-bit arrays, so it supports at most `2^32 - 1` block events. In practice, memory and runtime are usually the real limits before that.

kvcache_simulator-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,103 @@
+# KV Cache Simulator
+`kvcache-simulator` runs the KVCache.AI hit-rate simulator locally on JSONL traces. It uses the same model accounting formulas as the KV Cache Size Calculator and the same prefix-aware hit-rate semantics as the KV Cache Hit Rate Simulator.
+## Installation
+```bash
+pip install kvcache-simulator
+```
+The default simulation backend compiles and runs a bundled C++ replay core on first use. You need a local C++ compiler such as `c++` or `clang++`. If a compiler is not available, use `--backend python`.
+## Quick Start
+```bash
+kvcache-simulator run \
+  --trace trace.jsonl.gz \
+  --model glm-5.2 \
+  --kv-precision fp8_int8 \
+  --indexer-precision fp4_int4
+```
+The default output is a readable table. Use `--format json` when another script needs to consume the result.
+```bash
+kvcache-simulator run \
+  --trace trace.jsonl.gz \
+  --model deepseek-v4-pro \
+  --kv-precision fp8_int8 \
+  --indexer-precision fp4_int4 \
+  --format json \
+  --output result.json
+```
+List supported model ids:
+```bash
+kvcache-simulator list-models
+```
+`python -m kvcache_sim ...` also works, but the installed CLI command is preferred.
+`run` is the main command. It evaluates the selected trace across a set of KV cache memory budgets. `sweep` is kept as an alias for users who prefer benchmark terminology.
+## Input Trace Format
+Input is JSONL or JSONL.GZ, one request per line. The minimal accepted format is:
+```json
+{"block_size":64,"hash_ids":[2001,2002],"input_length":128}
+```
+Required fields:
+- `hash_ids`: cache block identities in request-prefix order.
+- `input_length`: prefill input token count for this request.
+- `block_size`: source-native block size. It can be omitted only when `--block-size` is provided.
+Optional fields:
+- `timestamp`: ignored by the simulator. Requests are replayed in file order, so sort production traces by timestamp before running the command.
+- `output_length`: not counted in the hit-rate denominator. Generated output matters only if it appears later in another request's `hash_ids`.
+- `block_tokens`: advanced field for exact per-block token weights. If present, it must be a positive integer list with the same length as `hash_ids`.
+`--block-size` is only a fallback for traces whose records omit `block_size`. If any record declares `block_size`, the trace-declared value is used for the whole trace and the CLI fallback is ignored.
+## Options
+| Option | Meaning |
+| --- | --- |
+| `--trace PATH` | JSONL/JSONL.GZ trace path, or `-` for stdin. |
+| `--model ID` | Model id from the bundled KV Cache Size Calculator model catalog. Use `kvcache-simulator list-models` to list ids. |
+| `--kv-precision ID` | KV cache precision: usually `bf16_fp16`, `fp8_int8`, or `fp4_int4`. Defaults follow the web calculator. |
+| `--indexer-precision ID` | Indexer cache precision for models with an indexer cache, such as DeepSeek V4 / GLM / MiniMax M3. |
+| `--include-draft-kv-cache` | Include draft/MTP KV layers when the selected model defines them. Default is off. |
+| `--block-size N` | Fallback block size when trace records omit `block_size`; trace-declared `block_size` overrides it. |
+| `--estimate-tokens N` | Override the token count used for token-dependent bytes/token formulas. By default the trace average input length is used. |
+| `--budgets-gib A,B,C` | Comma-separated KV cache memory budgets in GiB. Default matches the web budget sweep: `1,2,4,...,16384`. |
+| `--policies fifo,lru,optimal` | Eviction policies to simulate. Defaults to all three. |
+| `--backend cpp\|python` | Simulation backend. Default is `cpp`; use `python` for debugging or machines without a compiler. |
+| `--jobs N` | Number of worker processes for the Python backend. The C++ backend runs one batch process and ignores this option. |
+| `--no-progress` | Disable terminal progress output. Progress is written to stderr only when stderr is interactive, so JSON stdout stays valid. |
+| `--format table\|json` | Output format. Default is `table`. |
+| `--output PATH` | Write output to a file. Default `-` prints to stdout. |
+| `--max-records N` | Debug/testing limit: stop after N valid requests. |
+| `--max-events N` | Debug/testing limit: stop after N trace blocks. |
+## Output Semantics
+- Hit rate is measured over the last 50% of requests.
+- Budget points that do not fill the cache before that measurement window are omitted, because they are not under memory pressure yet.
+- Hit tokens count only the longest continuous cached prefix of each request. If a middle block misses, later blocks in that same request do not count as prefill hits even if their ids are already cached.
+- `speedup` is an ideal prefill-only upper bound: `1 / (1 - hit_rate)`. `1.0x` means no-cache prefill throughput where every prefill input token is computed. It does not include decode, KV lookup, network, batching, scheduling, or memory bandwidth overhead.
+## Performance Notes
+The default C++ backend runs all cache-budget points in one batch after loading the trace and building the prefix trie once. This is usually faster than the Python backend, especially on large traces.
+`--jobs` applies only to the Python backend. It parallelizes independent `(policy, cache budget)` simulation tasks. More jobs are not always faster: the default budget sweep has only a small number of budget points, tasks have uneven runtimes, and large traces can become memory-bandwidth limited.
+## Limits
+The browser version caps uploads to protect the UI. This local package does not use that browser cap, but the C++ backend currently stores trace events and prefix node indexes in 32-bit arrays, so it supports at most `2^32 - 1` block events. In practice, memory and runtime are usually the real limits before that.

kvcache_simulator-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,43 @@
+[build-system]
+requires = ["setuptools>=77", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "kvcache-simulator"
+version = "0.1.0"
+description = "Local KV cache hit-rate simulator for JSONL traces."
+readme = "README.md"
+requires-python = ">=3.10"
+license = "MIT"
+license-files = ["LICENSE.md"]
+authors = [
+  { name = "KVCache.AI" }
+]
+keywords = ["kv-cache", "llm", "inference", "simulator", "prefix-cache"]
+classifiers = [
+  "Development Status :: 4 - Beta",
+  "Environment :: Console",
+  "Intended Audience :: Developers",
+  "Intended Audience :: Science/Research",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
+  "Topic :: Scientific/Engineering",
+  "Topic :: System :: Benchmark",
+]
+[project.urls]
+Homepage = "https://kvcache.ai/"
+Repository = "https://github.com/kvcache-ai/kvcache-blog"
+Issues = "https://github.com/kvcache-ai/kvcache-blog/issues"
+[project.scripts]
+kvcache-simulator = "kvcache_sim.cli:main"
+[tool.setuptools.packages.find]
+where = ["src"]
+[tool.setuptools.package-data]
+"kvcache_sim.resources" = ["models.yaml", "kv-cache-lab-native-sim.cc"]

kvcache_simulator-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

kvcache_simulator-0.1.0/src/kvcache_sim/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+"""Local KV cache hit-rate simulator aligned with KVCache.AI web tools."""
+from .calculator import BYTES_PER_GIB, calculate_cache_size, load_models_data
+from .simulator import run_sweep
+__version__ = "0.1.0"  # package version string; same value as `version` in pyproject.toml
+__all__ = [  # names exported by `from kvcache_sim import *`
+    "BYTES_PER_GIB",
+    "__version__",
+    "calculate_cache_size",
+    "load_models_data",
+    "run_sweep",
+]

kvcache_simulator-0.1.0/src/kvcache_sim/__main__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from .cli import main
+if __name__ == "__main__":  # true when the package is executed as a script, e.g. `python -m kvcache_sim`
+    raise SystemExit(main())  # hand main()'s return value to the interpreter as the exit status

kvcache_simulator-0.1.0/src/kvcache_sim/_resources.py ADDED Viewed

@@ -0,0 +1,27 @@
+from __future__ import annotations
+from importlib import resources
+from pathlib import Path
+import hashlib
+import re
+import tempfile
+RESOURCE_PACKAGE = "kvcache_sim.resources"  # package that package_resource_path() searches for bundled data files
+def package_resource_path(name: str) -> Path:  # Return a filesystem path for bundled resource `name`; copies it into the temp directory when it is not already a real file.
+    resource = resources.files(RESOURCE_PACKAGE).joinpath(name)  # locate `name` inside the resource package
+    try:
+        path = Path(resource)  # raises TypeError when the resource object is not path-like
+        if path.exists():
+            return path  # the resource is already a real file on disk; use it directly
+    except TypeError:
+        pass  # deliberate: fall through and materialise a copy below (also reached when the path does not exist)
+    payload = resource.read_bytes()
+    digest = hashlib.sha256(payload).hexdigest()[:16]  # first 16 hex chars of the content's SHA-256, so the file name depends on the content
+    safe_name = re.sub(r"[^A-Za-z0-9_.-]+", "-", name)  # replace each run of other characters (including path separators) with a single dash
+    target = Path(tempfile.gettempdir()) / f"kvcache-simulator-{digest}-{safe_name}"
+    if not target.exists() or target.read_bytes() != payload:  # skip the write when an identical copy is already present
+        target.write_bytes(payload)  # NOTE(review): plain non-atomic write to a predictable name in the shared temp dir; concurrent callers could presumably see a partial file — confirm
+    return target