kvcache-simulator 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. kvcache_simulator-0.1.0/LICENSE.md +21 -0
  2. kvcache_simulator-0.1.0/PKG-INFO +129 -0
  3. kvcache_simulator-0.1.0/README.md +103 -0
  4. kvcache_simulator-0.1.0/pyproject.toml +43 -0
  5. kvcache_simulator-0.1.0/setup.cfg +4 -0
  6. kvcache_simulator-0.1.0/src/kvcache_sim/__init__.py +14 -0
  7. kvcache_simulator-0.1.0/src/kvcache_sim/__main__.py +5 -0
  8. kvcache_simulator-0.1.0/src/kvcache_sim/_resources.py +27 -0
  9. kvcache_simulator-0.1.0/src/kvcache_sim/calculator.py +442 -0
  10. kvcache_simulator-0.1.0/src/kvcache_sim/cli.py +115 -0
  11. kvcache_simulator-0.1.0/src/kvcache_sim/cpp_backend.py +203 -0
  12. kvcache_simulator-0.1.0/src/kvcache_sim/formatting.py +63 -0
  13. kvcache_simulator-0.1.0/src/kvcache_sim/plan.py +78 -0
  14. kvcache_simulator-0.1.0/src/kvcache_sim/policies.py +286 -0
  15. kvcache_simulator-0.1.0/src/kvcache_sim/progress.py +36 -0
  16. kvcache_simulator-0.1.0/src/kvcache_sim/resources/__init__.py +1 -0
  17. kvcache_simulator-0.1.0/src/kvcache_sim/resources/kv-cache-lab-native-sim.cc +710 -0
  18. kvcache_simulator-0.1.0/src/kvcache_sim/resources/models.yaml +934 -0
  19. kvcache_simulator-0.1.0/src/kvcache_sim/simulator.py +226 -0
  20. kvcache_simulator-0.1.0/src/kvcache_sim/trace.py +226 -0
  21. kvcache_simulator-0.1.0/src/kvcache_simulator.egg-info/PKG-INFO +129 -0
  22. kvcache_simulator-0.1.0/src/kvcache_simulator.egg-info/SOURCES.txt +24 -0
  23. kvcache_simulator-0.1.0/src/kvcache_simulator.egg-info/dependency_links.txt +1 -0
  24. kvcache_simulator-0.1.0/src/kvcache_simulator.egg-info/entry_points.txt +2 -0
  25. kvcache_simulator-0.1.0/src/kvcache_simulator.egg-info/top_level.txt +1 -0
  26. kvcache_simulator-0.1.0/tests/test_kvcache_sim.py +327 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2016-present George Cushen (https://georgecushen.com)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,129 @@
1
+ Metadata-Version: 2.4
2
+ Name: kvcache-simulator
3
+ Version: 0.1.0
4
+ Summary: Local KV cache hit-rate simulator for JSONL traces.
5
+ Author: KVCache.AI
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://kvcache.ai/
8
+ Project-URL: Repository, https://github.com/kvcache-ai/kvcache-blog
9
+ Project-URL: Issues, https://github.com/kvcache-ai/kvcache-blog/issues
10
+ Keywords: kv-cache,llm,inference,simulator,prefix-cache
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Scientific/Engineering
21
+ Classifier: Topic :: System :: Benchmark
22
+ Requires-Python: >=3.10
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE.md
25
+ Dynamic: license-file
26
+
27
+ # KV Cache Simulator
28
+
29
+ `kvcache-simulator` runs the KVCache.AI hit-rate simulator locally on JSONL traces. It uses the same model accounting formulas as the KV Cache Size Calculator and the same prefix-aware hit-rate semantics as the KV Cache Hit Rate Simulator.
30
+
31
+ ## Installation
32
+
33
+ ```bash
34
+ pip install kvcache-simulator
35
+ ```
36
+
37
+ The default simulation backend compiles and runs a bundled C++ replay core on first use. You need a local C++ compiler such as `c++` or `clang++`. If a compiler is not available, use `--backend python`.
38
+
39
+ ## Quick Start
40
+
41
+ ```bash
42
+ kvcache-simulator run \
43
+ --trace trace.jsonl.gz \
44
+ --model glm-5.2 \
45
+ --kv-precision fp8_int8 \
46
+ --indexer-precision fp4_int4
47
+ ```
48
+
49
+ The default output is a readable table. Use `--format json` when another script needs to consume the result.
50
+
51
+ ```bash
52
+ kvcache-simulator run \
53
+ --trace trace.jsonl.gz \
54
+ --model deepseek-v4-pro \
55
+ --kv-precision fp8_int8 \
56
+ --indexer-precision fp4_int4 \
57
+ --format json \
58
+ --output result.json
59
+ ```
60
+
61
+ List supported model ids:
62
+
63
+ ```bash
64
+ kvcache-simulator list-models
65
+ ```
66
+
67
+ `python -m kvcache_sim ...` also works, but the installed CLI command is preferred.
68
+
69
+ `run` is the main command. It evaluates the selected trace across a set of KV cache memory budgets. `sweep` is kept as an alias for users who prefer benchmark terminology.
70
+
71
+ ## Input Trace Format
72
+
73
+ Input is JSONL or JSONL.GZ, one request per line. The minimal accepted format is:
74
+
75
+ ```json
76
+ {"block_size":64,"hash_ids":[2001,2002],"input_length":128}
77
+ ```
78
+
79
+ Required fields:
80
+
81
+ - `hash_ids`: cache block identities in request-prefix order.
82
+ - `input_length`: prefill input token count for this request.
83
+ - `block_size`: source-native block size. It can be omitted only when `--block-size` is provided.
84
+
85
+ Optional fields:
86
+
87
+ - `timestamp`: ignored by the simulator. Requests are replayed in file order, so sort production traces by timestamp before running the command.
88
+ - `output_length`: ignored by the hit-rate denominator. Generated output matters only if it appears later in another request's `hash_ids`.
89
+ - `block_tokens`: advanced field for exact per-block token weights. If present, it must be a positive integer list with the same length as `hash_ids`.
90
+
91
+ `--block-size` is only a fallback for records that omit `block_size`. If any record declares `block_size`, the trace-declared value is used and overrides the CLI fallback for the whole trace.
92
+
93
+ ## Options
94
+
95
+ | Option | Meaning |
96
+ | --- | --- |
97
+ | `--trace PATH` | JSONL/JSONL.GZ trace path, or `-` for stdin. |
98
+ | `--model ID` | Model id from the bundled KV Cache Size Calculator model catalog. Use `kvcache-simulator list-models` to list ids. |
99
+ | `--kv-precision ID` | KV cache precision: usually `bf16_fp16`, `fp8_int8`, or `fp4_int4`. Defaults follow the web calculator. |
100
+ | `--indexer-precision ID` | Indexer cache precision for models with an indexer cache, such as DeepSeek V4 / GLM / MiniMax M3. |
101
+ | `--include-draft-kv-cache` | Include draft/MTP KV layers when the selected model defines them. Default is off. |
102
+ | `--block-size N` | Fallback block size when trace records omit `block_size`; trace-declared `block_size` overrides it. |
103
+ | `--estimate-tokens N` | Override the token count used for token-dependent bytes/token formulas. By default the trace average input length is used. |
104
+ | `--budgets-gib A,B,C` | Comma-separated KV cache memory budgets in GiB. Default matches the web budget sweep: `1,2,4,...,16384`. |
105
+ | `--policies fifo,lru,optimal` | Eviction policies to simulate. Defaults to all three. |
106
+ | `--backend cpp\|python` | Simulation backend. Default is `cpp`; use `python` for debugging or machines without a compiler. |
107
+ | `--jobs N` | Number of worker processes for the Python backend. The C++ backend runs one batch process and ignores this option. |
108
+ | `--no-progress` | Disable terminal progress output. Progress is written to stderr only when stderr is interactive, so JSON stdout stays valid. |
109
+ | `--format table\|json` | Output format. Default is `table`. |
110
+ | `--output PATH` | Write output to a file. Default `-` prints to stdout. |
111
+ | `--max-records N` | Debug/testing limit: stop after N valid requests. |
112
+ | `--max-events N` | Debug/testing limit: stop after N trace blocks. |
113
+
114
+ ## Output Semantics
115
+
116
+ - Hit rate is measured over the last 50% of requests.
117
+ - Budget points that do not fill the cache before that measurement window are omitted, because they are not under memory pressure yet.
118
+ - Hit tokens count only the longest continuous cached prefix of each request. If a middle block misses, later blocks in that same request do not count as prefill hits even if their ids are already cached.
119
+ - `speedup` is an ideal prefill-only upper bound: `1 / (1 - hit_rate)`. `1.0x` is the no-cache baseline, in which every prefill input token is computed. The figure does not include decode, KV lookup, network, batching, scheduling, or memory bandwidth overhead.
120
+
121
+ ## Performance Notes
122
+
123
+ The default C++ backend runs all cache-budget points in one batch after loading the trace and building the prefix trie once. This is usually faster than the Python backend, especially on large traces.
124
+
125
+ `--jobs` applies only to the Python backend. It parallelizes independent `(policy, cache budget)` simulation tasks. More jobs are not always faster: the default budget sweep has only a small number of budget points, tasks have uneven runtimes, and large traces can become memory-bandwidth limited.
126
+
127
+ ## Limits
128
+
129
+ The browser version caps uploads to protect the UI. This local package does not use that browser cap, but the C++ backend currently stores trace events and prefix node indexes in 32-bit arrays, so it supports at most `2^32 - 1` block events. In practice, memory and runtime are usually the real limits before that.
@@ -0,0 +1,103 @@
1
+ # KV Cache Simulator
2
+
3
+ `kvcache-simulator` runs the KVCache.AI hit-rate simulator locally on JSONL traces. It uses the same model accounting formulas as the KV Cache Size Calculator and the same prefix-aware hit-rate semantics as the KV Cache Hit Rate Simulator.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install kvcache-simulator
9
+ ```
10
+
11
+ The default simulation backend compiles and runs a bundled C++ replay core on first use. You need a local C++ compiler such as `c++` or `clang++`. If a compiler is not available, use `--backend python`.
12
+
13
+ ## Quick Start
14
+
15
+ ```bash
16
+ kvcache-simulator run \
17
+ --trace trace.jsonl.gz \
18
+ --model glm-5.2 \
19
+ --kv-precision fp8_int8 \
20
+ --indexer-precision fp4_int4
21
+ ```
22
+
23
+ The default output is a readable table. Use `--format json` when another script needs to consume the result.
24
+
25
+ ```bash
26
+ kvcache-simulator run \
27
+ --trace trace.jsonl.gz \
28
+ --model deepseek-v4-pro \
29
+ --kv-precision fp8_int8 \
30
+ --indexer-precision fp4_int4 \
31
+ --format json \
32
+ --output result.json
33
+ ```
34
+
35
+ List supported model ids:
36
+
37
+ ```bash
38
+ kvcache-simulator list-models
39
+ ```
40
+
41
+ `python -m kvcache_sim ...` also works, but the installed CLI command is preferred.
42
+
43
+ `run` is the main command. It evaluates the selected trace across a set of KV cache memory budgets. `sweep` is kept as an alias for users who prefer benchmark terminology.
44
+
45
+ ## Input Trace Format
46
+
47
+ Input is JSONL or JSONL.GZ, one request per line. The minimal accepted format is:
48
+
49
+ ```json
50
+ {"block_size":64,"hash_ids":[2001,2002],"input_length":128}
51
+ ```
52
+
53
+ Required fields:
54
+
55
+ - `hash_ids`: cache block identities in request-prefix order.
56
+ - `input_length`: prefill input token count for this request.
57
+ - `block_size`: source-native block size. It can be omitted only when `--block-size` is provided.
58
+
59
+ Optional fields:
60
+
61
+ - `timestamp`: ignored by the simulator. Requests are replayed in file order, so sort production traces by timestamp before running the command.
62
+ - `output_length`: ignored by the hit-rate denominator. Generated output matters only if it appears later in another request's `hash_ids`.
63
+ - `block_tokens`: advanced field for exact per-block token weights. If present, it must be a positive integer list with the same length as `hash_ids`.
64
+
65
+ `--block-size` is only a fallback for records that omit `block_size`. If any record declares `block_size`, the trace-declared value is used and overrides the CLI fallback for the whole trace.
66
+
67
+ ## Options
68
+
69
+ | Option | Meaning |
70
+ | --- | --- |
71
+ | `--trace PATH` | JSONL/JSONL.GZ trace path, or `-` for stdin. |
72
+ | `--model ID` | Model id from the bundled KV Cache Size Calculator model catalog. Use `kvcache-simulator list-models` to list ids. |
73
+ | `--kv-precision ID` | KV cache precision: usually `bf16_fp16`, `fp8_int8`, or `fp4_int4`. Defaults follow the web calculator. |
74
+ | `--indexer-precision ID` | Indexer cache precision for models with an indexer cache, such as DeepSeek V4 / GLM / MiniMax M3. |
75
+ | `--include-draft-kv-cache` | Include draft/MTP KV layers when the selected model defines them. Default is off. |
76
+ | `--block-size N` | Fallback block size when trace records omit `block_size`; trace-declared `block_size` overrides it. |
77
+ | `--estimate-tokens N` | Override the token count used for token-dependent bytes/token formulas. By default the trace average input length is used. |
78
+ | `--budgets-gib A,B,C` | Comma-separated KV cache memory budgets in GiB. Default matches the web budget sweep: `1,2,4,...,16384`. |
79
+ | `--policies fifo,lru,optimal` | Eviction policies to simulate. Defaults to all three. |
80
+ | `--backend cpp\|python` | Simulation backend. Default is `cpp`; use `python` for debugging or machines without a compiler. |
81
+ | `--jobs N` | Number of worker processes for the Python backend. The C++ backend runs one batch process and ignores this option. |
82
+ | `--no-progress` | Disable terminal progress output. Progress is written to stderr only when stderr is interactive, so JSON stdout stays valid. |
83
+ | `--format table\|json` | Output format. Default is `table`. |
84
+ | `--output PATH` | Write output to a file. Default `-` prints to stdout. |
85
+ | `--max-records N` | Debug/testing limit: stop after N valid requests. |
86
+ | `--max-events N` | Debug/testing limit: stop after N trace blocks. |
87
+
88
+ ## Output Semantics
89
+
90
+ - Hit rate is measured over the last 50% of requests.
91
+ - Budget points that do not fill the cache before that measurement window are omitted, because they are not under memory pressure yet.
92
+ - Hit tokens count only the longest continuous cached prefix of each request. If a middle block misses, later blocks in that same request do not count as prefill hits even if their ids are already cached.
93
+ - `speedup` is an ideal prefill-only upper bound: `1 / (1 - hit_rate)`. `1.0x` is the no-cache baseline, in which every prefill input token is computed. The figure does not include decode, KV lookup, network, batching, scheduling, or memory bandwidth overhead.
94
+
95
+ ## Performance Notes
96
+
97
+ The default C++ backend runs all cache-budget points in one batch after loading the trace and building the prefix trie once. This is usually faster than the Python backend, especially on large traces.
98
+
99
+ `--jobs` applies only to the Python backend. It parallelizes independent `(policy, cache budget)` simulation tasks. More jobs are not always faster: the default budget sweep has only a small number of budget points, tasks have uneven runtimes, and large traces can become memory-bandwidth limited.
100
+
101
+ ## Limits
102
+
103
+ The browser version caps uploads to protect the UI. This local package does not use that browser cap, but the C++ backend currently stores trace events and prefix node indexes in 32-bit arrays, so it supports at most `2^32 - 1` block events. In practice, memory and runtime are usually the real limits before that.
@@ -0,0 +1,43 @@
1
+ [build-system]
2
+ requires = ["setuptools>=77", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "kvcache-simulator"
7
+ version = "0.1.0"
8
+ description = "Local KV cache hit-rate simulator for JSONL traces."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = "MIT"
12
+ license-files = ["LICENSE.md"]
13
+ authors = [
14
+ { name = "KVCache.AI" }
15
+ ]
16
+ keywords = ["kv-cache", "llm", "inference", "simulator", "prefix-cache"]
17
+ classifiers = [
18
+ "Development Status :: 4 - Beta",
19
+ "Environment :: Console",
20
+ "Intended Audience :: Developers",
21
+ "Intended Audience :: Science/Research",
22
+ "Programming Language :: Python :: 3",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
+ "Programming Language :: Python :: 3.12",
26
+ "Programming Language :: Python :: 3.13",
27
+ "Topic :: Scientific/Engineering",
28
+ "Topic :: System :: Benchmark",
29
+ ]
30
+
31
+ [project.urls]
32
+ Homepage = "https://kvcache.ai/"
33
+ Repository = "https://github.com/kvcache-ai/kvcache-blog"
34
+ Issues = "https://github.com/kvcache-ai/kvcache-blog/issues"
35
+
36
+ [project.scripts]
37
+ kvcache-simulator = "kvcache_sim.cli:main"
38
+
39
+ [tool.setuptools.packages.find]
40
+ where = ["src"]
41
+
42
+ [tool.setuptools.package-data]
43
+ "kvcache_sim.resources" = ["models.yaml", "kv-cache-lab-native-sim.cc"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,14 @@
1
+ """Local KV cache hit-rate simulator aligned with KVCache.AI web tools."""
2
+
3
+ from .calculator import BYTES_PER_GIB, calculate_cache_size, load_models_data
4
+ from .simulator import run_sweep
5
+
6
+ __version__ = "0.1.0"
7
+
8
+ __all__ = [
9
+ "BYTES_PER_GIB",
10
+ "__version__",
11
+ "calculate_cache_size",
12
+ "load_models_data",
13
+ "run_sweep",
14
+ ]
@@ -0,0 +1,5 @@
1
+ from .cli import main
2
+
3
+
4
+ if __name__ == "__main__":
5
+ raise SystemExit(main())
@@ -0,0 +1,27 @@
1
+ from __future__ import annotations
2
+
3
+ from importlib import resources
4
+ from pathlib import Path
5
+ import hashlib
6
+ import re
7
+ import tempfile
8
+
9
+ RESOURCE_PACKAGE = "kvcache_sim.resources"
10
+
11
+
12
def _file_has_payload(path: Path, payload: bytes) -> bool:
    """Return True when *path* is readable and holds exactly *payload*."""
    try:
        return path.read_bytes() == payload
    except OSError:
        # Missing, unreadable, or not a regular file: treat as "no match".
        return False


def package_resource_path(name: str) -> Path:
    """Return a real filesystem path for the bundled resource *name*.

    If the resource already lives on disk, its own path is returned.
    Otherwise the resource bytes are materialised in the system temporary
    directory under a name derived from a SHA-256 prefix of the content
    and a sanitised form of *name*, and that path is returned.

    The temporary copy is written to a uniquely named staging file and then
    renamed over the final name, so concurrent callers never observe a
    partially written file and a pre-existing symlink at the final name is
    replaced rather than followed.

    Raises:
        OSError: if the resource cannot be read or the temporary copy
            cannot be created.
    """
    resource = resources.files(RESOURCE_PACKAGE).joinpath(name)
    try:
        path = Path(resource)
        if path.exists():
            return path
    except TypeError:
        # The traversable is not path-like, so it has no on-disk location;
        # fall through and extract a copy.
        pass

    payload = resource.read_bytes()
    digest = hashlib.sha256(payload).hexdigest()[:16]
    safe_name = re.sub(r"[^A-Za-z0-9_.-]+", "-", name)
    target = Path(tempfile.gettempdir()) / f"kvcache-simulator-{digest}-{safe_name}"
    if _file_has_payload(target, payload):
        return target

    # Stage in the same directory so the final rename stays on one filesystem.
    fd, staging_name = tempfile.mkstemp(
        prefix=f"{target.name}.", suffix=".tmp", dir=target.parent
    )
    staging = Path(staging_name)
    try:
        with open(fd, "wb") as handle:
            handle.write(payload)
        # mkstemp creates the file owner-only; restore the world-readable mode
        # a plain write would normally produce so other users can reuse it.
        staging.chmod(0o644)
        try:
            staging.replace(target)
        except OSError:
            # Replacing can fail when another process holds the target open
            # or has just installed it; that is fine if the content matches.
            if not _file_has_payload(target, payload):
                raise
    finally:
        # No-op after a successful rename; removes the staging file otherwise.
        staging.unlink(missing_ok=True)
    return target