squish-ai 9.32.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- squish_ai-9.32.0/LICENSE +104 -0
- squish_ai-9.32.0/PKG-INFO +290 -0
- squish_ai-9.32.0/README.md +225 -0
- squish_ai-9.32.0/pyproject.toml +167 -0
- squish_ai-9.32.0/setup.cfg +4 -0
- squish_ai-9.32.0/squish/__init__.py +437 -0
- squish_ai-9.32.0/squish/_fast_imports.py +149 -0
- squish_ai-9.32.0/squish/_term.py +257 -0
- squish_ai-9.32.0/squish/agent/__init__.py +5 -0
- squish_ai-9.32.0/squish/agent/builtin_tools.py +745 -0
- squish_ai-9.32.0/squish/agent/tool_name_map.py +90 -0
- squish_ai-9.32.0/squish/agent/tool_registry.py +339 -0
- squish_ai-9.32.0/squish/api/__init__.py +6 -0
- squish_ai-9.32.0/squish/api/v1_router.py +447 -0
- squish_ai-9.32.0/squish/backend.py +413 -0
- squish_ai-9.32.0/squish/catalog.py +1139 -0
- squish_ai-9.32.0/squish/cli.py +7361 -0
- squish_ai-9.32.0/squish/compressed_loader_torch.py +193 -0
- squish_ai-9.32.0/squish/config.py +167 -0
- squish_ai-9.32.0/squish/context/__init__.py +1 -0
- squish_ai-9.32.0/squish/context/lazy_llm.py +334 -0
- squish_ai-9.32.0/squish/context/prompt_compressor.py +189 -0
- squish_ai-9.32.0/squish/convert.py +1259 -0
- squish_ai-9.32.0/squish/daemon/__init__.py +32 -0
- squish_ai-9.32.0/squish/daemon/client.py +151 -0
- squish_ai-9.32.0/squish/daemon/launchagent.py +226 -0
- squish_ai-9.32.0/squish/daemon/squishd.py +654 -0
- squish_ai-9.32.0/squish/experimental/__init__.py +0 -0
- squish_ai-9.32.0/squish/experimental/_eval_torch.py +409 -0
- squish_ai-9.32.0/squish/experimental/astc_loader.py +420 -0
- squish_ai-9.32.0/squish/experimental/convert_coreml.py +450 -0
- squish_ai-9.32.0/squish/experimental/coreml_loader.py +347 -0
- squish_ai-9.32.0/squish/experimental/hqq_quant.py +23 -0
- squish_ai-9.32.0/squish/experimental/jacobi_decode.py +344 -0
- squish_ai-9.32.0/squish/experimental/layer_overlap_loader.py +351 -0
- squish_ai-9.32.0/squish/experimental/lm_studio_bridge.py +252 -0
- squish_ai-9.32.0/squish/experimental/localai_compat.py +77 -0
- squish_ai-9.32.0/squish/experimental/spin_quant.py +440 -0
- squish_ai-9.32.0/squish/experimental/structured_sparsity.py +245 -0
- squish_ai-9.32.0/squish/experimental/torch_ops.py +173 -0
- squish_ai-9.32.0/squish/grammar/__init__.py +1 -0
- squish_ai-9.32.0/squish/grammar/grammar_cache.py +439 -0
- squish_ai-9.32.0/squish/grammar/grammar_engine.py +578 -0
- squish_ai-9.32.0/squish/grammar/schema_gen.py +488 -0
- squish_ai-9.32.0/squish/hardware/__init__.py +1 -0
- squish_ai-9.32.0/squish/hardware/capability_probe.py +269 -0
- squish_ai-9.32.0/squish/hardware/chip_detector.py +383 -0
- squish_ai-9.32.0/squish/hardware/fused_kernels.py +438 -0
- squish_ai-9.32.0/squish/hardware/fused_sampler.py +315 -0
- squish_ai-9.32.0/squish/hardware/production_profiler.py +266 -0
- squish_ai-9.32.0/squish/integrations/__init__.py +6 -0
- squish_ai-9.32.0/squish/integrations/hf.py +366 -0
- squish_ai-9.32.0/squish/io/__init__.py +1 -0
- squish_ai-9.32.0/squish/io/entropy.py +362 -0
- squish_ai-9.32.0/squish/io/gguf_loader.py +484 -0
- squish_ai-9.32.0/squish/io/loader_utils.py +401 -0
- squish_ai-9.32.0/squish/io/model_shard_loader.py +386 -0
- squish_ai-9.32.0/squish/io/split_loader.py +480 -0
- squish_ai-9.32.0/squish/io/weight_decompress_stream.py +350 -0
- squish_ai-9.32.0/squish/kv/__init__.py +1 -0
- squish_ai-9.32.0/squish/kv/block_kv_cache.py +589 -0
- squish_ai-9.32.0/squish/kv/delta.py +314 -0
- squish_ai-9.32.0/squish/kv/head_importance.py +242 -0
- squish_ai-9.32.0/squish/kv/kv_cache.py +3740 -0
- squish_ai-9.32.0/squish/kv/mmap_cache.py +363 -0
- squish_ai-9.32.0/squish/kv/prompt_kv_cache.py +499 -0
- squish_ai-9.32.0/squish/kv/radix_cache.py +495 -0
- squish_ai-9.32.0/squish/loaders/__init__.py +12 -0
- squish_ai-9.32.0/squish/logging_config.py +118 -0
- squish_ai-9.32.0/squish/platform/__init__.py +30 -0
- squish_ai-9.32.0/squish/platform/ane_router.py +326 -0
- squish_ai-9.32.0/squish/platform/detector.py +347 -0
- squish_ai-9.32.0/squish/platform/feature_registry.py +249 -0
- squish_ai-9.32.0/squish/platform/platform_router.py +388 -0
- squish_ai-9.32.0/squish/quant/__init__.py +1 -0
- squish_ai-9.32.0/squish/quant/aqlm.py +579 -0
- squish_ai-9.32.0/squish/quant/awq.py +840 -0
- squish_ai-9.32.0/squish/quant/compressed_loader.py +2083 -0
- squish_ai-9.32.0/squish/quant/hqq.py +337 -0
- squish_ai-9.32.0/squish/quant/int3_linear.py +256 -0
- squish_ai-9.32.0/squish/quant/int3_runtime.py +245 -0
- squish_ai-9.32.0/squish/quant/milo_quant.py +623 -0
- squish_ai-9.32.0/squish/quant/quantizer.py +810 -0
- squish_ai-9.32.0/squish/quant/sqint2.py +1454 -0
- squish_ai-9.32.0/squish/quant/sqint2_linear.py +423 -0
- squish_ai-9.32.0/squish/reasoning/__init__.py +1 -0
- squish_ai-9.32.0/squish/reasoning/coconut.py +256 -0
- squish_ai-9.32.0/squish/reasoning/self_consistency.py +163 -0
- squish_ai-9.32.0/squish/runtime/__init__.py +8 -0
- squish_ai-9.32.0/squish/runtime/auto_profile.py +399 -0
- squish_ai-9.32.0/squish/runtime/format_validator.py +394 -0
- squish_ai-9.32.0/squish/runtime/squish_runtime.py +501 -0
- squish_ai-9.32.0/squish/semantic_cache.py +10 -0
- squish_ai-9.32.0/squish/server.py +5514 -0
- squish_ai-9.32.0/squish/serving/__init__.py +1 -0
- squish_ai-9.32.0/squish/serving/backend_router.py +156 -0
- squish_ai-9.32.0/squish/serving/blazing.py +145 -0
- squish_ai-9.32.0/squish/serving/feature_state.py +103 -0
- squish_ai-9.32.0/squish/serving/kernel_cache.py +148 -0
- squish_ai-9.32.0/squish/serving/local_model_scanner.py +701 -0
- squish_ai-9.32.0/squish/serving/mcp_client.py +367 -0
- squish_ai-9.32.0/squish/serving/memory_governor.py +345 -0
- squish_ai-9.32.0/squish/serving/obs_report.py +156 -0
- squish_ai-9.32.0/squish/serving/ollama_compat.py +523 -0
- squish_ai-9.32.0/squish/serving/quality_monitor.py +440 -0
- squish_ai-9.32.0/squish/serving/router.py +411 -0
- squish_ai-9.32.0/squish/serving/scheduler.py +1373 -0
- squish_ai-9.32.0/squish/serving/startup_profiler.py +195 -0
- squish_ai-9.32.0/squish/serving/tool_calling.py +410 -0
- squish_ai-9.32.0/squish/speculative/__init__.py +22 -0
- squish_ai-9.32.0/squish/speculative/eagle3.py +463 -0
- squish_ai-9.32.0/squish/speculative/prompt_lookup.py +314 -0
- squish_ai-9.32.0/squish/speculative/speculative.py +2111 -0
- squish_ai-9.32.0/squish/streaming/__init__.py +1 -0
- squish_ai-9.32.0/squish/streaming/chunked_prefill.py +110 -0
- squish_ai-9.32.0/squish/streaming/streaming_sink.py +269 -0
- squish_ai-9.32.0/squish/telemetry.py +568 -0
- squish_ai-9.32.0/squish/ui.py +684 -0
- squish_ai-9.32.0/squish_ai.egg-info/PKG-INFO +290 -0
- squish_ai-9.32.0/squish_ai.egg-info/SOURCES.txt +223 -0
- squish_ai-9.32.0/squish_ai.egg-info/dependency_links.txt +1 -0
- squish_ai-9.32.0/squish_ai.egg-info/entry_points.txt +5 -0
- squish_ai-9.32.0/squish_ai.egg-info/requires.txt +53 -0
- squish_ai-9.32.0/squish_ai.egg-info/top_level.txt +2 -0
- squish_ai-9.32.0/tests/test_auto_calibrate.py +402 -0
- squish_ai-9.32.0/tests/test_backend_unit.py +634 -0
- squish_ai-9.32.0/tests/test_bench.py +198 -0
- squish_ai-9.32.0/tests/test_block_kv_cache.py +237 -0
- squish_ai-9.32.0/tests/test_catalog_branches.py +220 -0
- squish_ai-9.32.0/tests/test_catalog_extended.py +234 -0
- squish_ai-9.32.0/tests/test_catalog_hash.py +171 -0
- squish_ai-9.32.0/tests/test_catalog_ssl.py +180 -0
- squish_ai-9.32.0/tests/test_catalog_unit.py +289 -0
- squish_ai-9.32.0/tests/test_cli_eval.py +346 -0
- squish_ai-9.32.0/tests/test_cli_extras.py +163 -0
- squish_ai-9.32.0/tests/test_cli_sbom.py +158 -0
- squish_ai-9.32.0/tests/test_cli_unit.py +438 -0
- squish_ai-9.32.0/tests/test_compressed_loader_torch_unit.py +219 -0
- squish_ai-9.32.0/tests/test_config_unit.py +309 -0
- squish_ai-9.32.0/tests/test_convert_unit.py +356 -0
- squish_ai-9.32.0/tests/test_demo_server.py +312 -0
- squish_ai-9.32.0/tests/test_docker_entrypoint_unit.py +397 -0
- squish_ai-9.32.0/tests/test_eval_binder.py +289 -0
- squish_ai-9.32.0/tests/test_fast_imports.py +213 -0
- squish_ai-9.32.0/tests/test_governor_middleware.py +290 -0
- squish_ai-9.32.0/tests/test_grammar_independent_mask.py +304 -0
- squish_ai-9.32.0/tests/test_helm_chart_unit.py +422 -0
- squish_ai-9.32.0/tests/test_kitty_channel_sensitivity.py +482 -0
- squish_ai-9.32.0/tests/test_kv_budget.py +359 -0
- squish_ai-9.32.0/tests/test_kv_int2.py +379 -0
- squish_ai-9.32.0/tests/test_kv_int4.py +441 -0
- squish_ai-9.32.0/tests/test_kv_p1.py +408 -0
- squish_ai-9.32.0/tests/test_lazy_load_modes.py +370 -0
- squish_ai-9.32.0/tests/test_load_mlx_model_parallel.py +205 -0
- squish_ai-9.32.0/tests/test_logging_config_unit.py +213 -0
- squish_ai-9.32.0/tests/test_model_pipeline_unit.py +522 -0
- squish_ai-9.32.0/tests/test_oms_signer.py +80 -0
- squish_ai-9.32.0/tests/test_openai_compat.py +259 -0
- squish_ai-9.32.0/tests/test_overnight_bench_unit.py +299 -0
- squish_ai-9.32.0/tests/test_predownload_scan.py +483 -0
- squish_ai-9.32.0/tests/test_prompt_kv_cache.py +404 -0
- squish_ai-9.32.0/tests/test_quality_monitor.py +374 -0
- squish_ai-9.32.0/tests/test_quant_aqlm.py +580 -0
- squish_ai-9.32.0/tests/test_radix_kv_reuse_integration.py +197 -0
- squish_ai-9.32.0/tests/test_router.py +394 -0
- squish_ai-9.32.0/tests/test_rust_matmul.py +207 -0
- squish_ai-9.32.0/tests/test_sbom_builder.py +298 -0
- squish_ai-9.32.0/tests/test_spaces_demo.py +349 -0
- squish_ai-9.32.0/tests/test_sparsity_trim.py +337 -0
- squish_ai-9.32.0/tests/test_sqint2.py +920 -0
- squish_ai-9.32.0/tests/test_sqint2_compress.py +455 -0
- squish_ai-9.32.0/tests/test_sqint2_linear.py +605 -0
- squish_ai-9.32.0/tests/test_sqint2_loader.py +329 -0
- squish_ai-9.32.0/tests/test_sqint2_residual_gemv.py +377 -0
- squish_ai-9.32.0/tests/test_sqint2_router.py +437 -0
- squish_ai-9.32.0/tests/test_squishd_unit.py +469 -0
- squish_ai-9.32.0/tests/test_stop_token_suppression.py +269 -0
- squish_ai-9.32.0/tests/test_synthetic_model_fixture.py +168 -0
- squish_ai-9.32.0/tests/test_telemetry_unit.py +689 -0
- squish_ai-9.32.0/tests/test_term_unit.py +279 -0
- squish_ai-9.32.0/tests/test_tool_choice_unit.py +158 -0
- squish_ai-9.32.0/tests/test_torch_ops_unit.py +201 -0
- squish_ai-9.32.0/tests/test_ui_unit.py +278 -0
- squish_ai-9.32.0/tests/test_version.py +128 -0
- squish_ai-9.32.0/tests/test_wave108_calculator.py +323 -0
- squish_ai-9.32.0/tests/test_wave114_rep_loop.py +187 -0
- squish_ai-9.32.0/tests/test_wave119_dead_stub_purge.py +158 -0
- squish_ai-9.32.0/tests/test_wave120_dead_global_purge.py +108 -0
- squish_ai-9.32.0/tests/test_wave121_dead_flag_purge.py +165 -0
- squish_ai-9.32.0/tests/test_wave122_dead_const_purge.py +149 -0
- squish_ai-9.32.0/tests/test_wave123_empty_section_purge.py +100 -0
- squish_ai-9.32.0/tests/test_wave124_orphan_global_purge.py +78 -0
- squish_ai-9.32.0/tests/test_wave125_stale_comment_purge.py +48 -0
- squish_ai-9.32.0/tests/test_wave126_empty_header_purge.py +42 -0
- squish_ai-9.32.0/tests/test_wave64a_trace_endpoint.py +293 -0
- squish_ai-9.32.0/tests/test_wave70_squish_runtime.py +627 -0
- squish_ai-9.32.0/tests/test_wave72_quantize_fix.py +919 -0
- squish_ai-9.32.0/tests/test_wave72_resquish.py +393 -0
- squish_ai-9.32.0/tests/test_wave74_run_polish.py +250 -0
- squish_ai-9.32.0/tests/test_wave74_web_ui.py +39 -0
- squish_ai-9.32.0/tests/test_wave75_perf_foundations.py +411 -0
- squish_ai-9.32.0/tests/test_wave76_agent_tools.py +526 -0
- squish_ai-9.32.0/tests/test_wave78_perf_quality.py +576 -0
- squish_ai-9.32.0/tests/test_wave79_auto_profile.py +461 -0
- squish_ai-9.32.0/tests/test_wave79_startup_inference.py +283 -0
- squish_ai-9.32.0/tests/test_wave80_chunk_fingerprint.py +336 -0
- squish_ai-9.32.0/tests/test_wave81_blazing_m3.py +429 -0
- squish_ai-9.32.0/tests/test_wave81_orjson_sse.py +366 -0
- squish_ai-9.32.0/tests/test_wave82_autoload_eagle3.py +473 -0
- squish_ai-9.32.0/tests/test_wave82_ux_polish.py +343 -0
- squish_ai-9.32.0/tests/test_wave85_color_dedup.py +381 -0
- squish_ai-9.32.0/tests/test_wave86_observability.py +312 -0
- squish_ai-9.32.0/tests/test_wave87_agent_tools.py +208 -0
- squish_ai-9.32.0/tests/test_wave88_api_compat.py +311 -0
- squish_ai-9.32.0/tests/test_wave89_local_model_scan.py +448 -0
- squish_ai-9.32.0/tests/test_wave90_startup_lean.py +296 -0
- squish_ai-9.32.0/tests/test_wave91_performance.py +266 -0
- squish_ai-9.32.0/tests/test_wave92_presquish.py +244 -0
- squish_ai-9.32.0/tests/test_wave93_squishbar.py +238 -0
- squish_ai-9.32.0/tests/test_wave95_ps_logs.py +418 -0
- squish_ai-9.32.0/tests/test_wave95_release.py +261 -0
- squish_ai-9.32.0/tests/test_wave96_lm_studio.py +404 -0
- squish_ai-9.32.0/tests/test_wave97_inference_fixes.py +258 -0
- squish_ai-9.32.0/tests/test_wave98_lean_server.py +251 -0
- squish_ai-9.32.0/tests/test_wave99_speed_restore.py +294 -0
squish_ai-9.32.0/LICENSE
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
Business Source License 1.1
|
|
2
|
+
|
|
3
|
+
Parameters
|
|
4
|
+
|
|
5
|
+
Licensor: squishai
|
|
6
|
+
Licensed Work: squish
|
|
7
|
+
The Licensed Work is (c) 2025 squishai
|
|
8
|
+
Additional Use Grant: You may make production use of the Licensed Work,
|
|
9
|
+
provided that such use is for any of the following
|
|
10
|
+
purposes: (i) personal or household use; (ii)
|
|
11
|
+
non-commercial research, academic study, or
|
|
12
|
+
educational purposes; (iii) evaluation or testing of
|
|
13
|
+
the Licensed Work for potential adoption, provided
|
|
14
|
+
that such evaluation does not itself constitute a
|
|
15
|
+
production commercial deployment. You may also make
|
|
16
|
+
any non-production use of the Licensed Work (including
|
|
17
|
+
development, staging, and internal testing).
|
|
18
|
+
Change Date: 2030-01-01
|
|
19
|
+
Change License: MIT
|
|
20
|
+
|
|
21
|
+
For information about alternative commercial licensing arrangements for the
|
|
22
|
+
Licensed Work, please contact: wes@squish.run
|
|
23
|
+
|
|
24
|
+
-----------------------------------------------------------------------------
|
|
25
|
+
|
|
26
|
+
Notice
|
|
27
|
+
|
|
28
|
+
The Business Source License (this document, or the "License") is not an Open
|
|
29
|
+
Source license. However, the Licensed Work will eventually be made available
|
|
30
|
+
under an Open Source License, as stated in this License.
|
|
31
|
+
|
|
32
|
+
License text copyright (c) 2017 MariaDB Corporation Ab, All Rights Reserved.
|
|
33
|
+
"Business Source License" is a trademark of MariaDB Corporation Ab.
|
|
34
|
+
|
|
35
|
+
-----------------------------------------------------------------------------
|
|
36
|
+
|
|
37
|
+
Business Source License 1.1
|
|
38
|
+
|
|
39
|
+
Terms
|
|
40
|
+
|
|
41
|
+
The Licensor hereby grants you the right to copy, modify, create derivative
|
|
42
|
+
works, redistribute, and make non-production use of the Licensed Work. The
|
|
43
|
+
Licensor may make an Additional Use Grant, above, permitting limited
|
|
44
|
+
production use.
|
|
45
|
+
|
|
46
|
+
Effective on the Change Date, or the fourth anniversary of the first publicly
|
|
47
|
+
available distribution of a specific version of the Licensed Work under this
|
|
48
|
+
License, whichever comes first, the Licensor hereby grants you rights under
|
|
49
|
+
the terms of the Change License, and the rights granted in the paragraph
|
|
50
|
+
above terminate.
|
|
51
|
+
|
|
52
|
+
If your use of the Licensed Work does not comply with the requirements
|
|
53
|
+
currently in effect as described in this License, you must purchase a
|
|
54
|
+
commercial license from the Licensor, its affiliated entities, or authorized
|
|
55
|
+
resellers, or you must refrain from using the Licensed Work.
|
|
56
|
+
|
|
57
|
+
All copies of the original and modified Licensed Work, and derivative works
|
|
58
|
+
of the Licensed Work, are subject to this License. This License applies
|
|
59
|
+
separately for each version of the Licensed Work and the Change Date may vary
|
|
60
|
+
for each version of the Licensed Work released by Licensor.
|
|
61
|
+
|
|
62
|
+
You must conspicuously display this License on each original or modified copy
|
|
63
|
+
of the Licensed Work. If you receive the Licensed Work in original or modified
|
|
64
|
+
form from a third party, the terms and conditions set forth in this License
|
|
65
|
+
apply to your use of that work.
|
|
66
|
+
|
|
67
|
+
Any use of the Licensed Work in violation of this License will automatically
|
|
68
|
+
terminate your rights under this License for the current and all other
|
|
69
|
+
versions of the Licensed Work.
|
|
70
|
+
|
|
71
|
+
This License does not grant you any right in any trademark or logo of
|
|
72
|
+
Licensor or its affiliates (provided that you may use a trademark or logo of
|
|
73
|
+
Licensor as expressly required by this License).
|
|
74
|
+
|
|
75
|
+
TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON
|
|
76
|
+
AN "AS IS" BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS,
|
|
77
|
+
EXPRESS OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF
|
|
78
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND
|
|
79
|
+
TITLE.
|
|
80
|
+
|
|
81
|
+
MariaDB hereby grants you permission to use this License's text to license
|
|
82
|
+
your works, and to refer to it using the trademark "Business Source License",
|
|
83
|
+
as long as you comply with the Covenants of Licensor below.
|
|
84
|
+
|
|
85
|
+
Covenants of Licensor
|
|
86
|
+
|
|
87
|
+
In consideration of the right to use this License's text and the "Business
|
|
88
|
+
Source License" name and trademark, Licensor covenants to MariaDB, and to
|
|
89
|
+
all recipients of the licensed work to be provided by Licensor:
|
|
90
|
+
|
|
91
|
+
1. To specify as the Change License the GPL Version 2.0 or any later
|
|
92
|
+
version, or a license that is compatible with GPL Version 2.0 or a later
|
|
93
|
+
version, where "compatible" means that software provided under the Change
|
|
94
|
+
License can be included in a program with software provided under GPL
|
|
95
|
+
Version 2.0 or a later version. Licensor may specify additional Change
|
|
96
|
+
Licenses as an alternative to the Change License.
|
|
97
|
+
|
|
98
|
+
2. To either: (a) specify an additional grant of rights to use that does not
|
|
99
|
+
impose any additional restriction on the right granted in this License,
|
|
100
|
+
as the Additional Use Grant; or (b) insert the text "None".
|
|
101
|
+
|
|
102
|
+
3. To specify a Change Date.
|
|
103
|
+
|
|
104
|
+
4. Not to modify this License in any other way.
|
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: squish-ai
|
|
3
|
+
Version: 9.32.0
|
|
4
|
+
Summary: Local LLM inference server for Apple Silicon. Block-level paged KV cache for long-context workloads. 5.4× faster end-to-end on 4K-token prompts vs Ollama, less RAM, INT3 support for Qwen3. OpenAI-compatible API.
|
|
5
|
+
License: BUSL-1.1
|
|
6
|
+
Project-URL: Homepage, https://github.com/wesleyscholl/squish
|
|
7
|
+
Project-URL: Bug Tracker, https://github.com/wesleyscholl/squish/issues
|
|
8
|
+
Project-URL: Documentation, https://wesleyscholl.github.io/squish
|
|
9
|
+
Keywords: llm,inference,quantization,apple-silicon,mlx,speculative-decoding
|
|
10
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: Other/Proprietary License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: mlx>=0.18; sys_platform == "darwin" and platform_machine == "arm64"
|
|
23
|
+
Requires-Dist: mlx-lm>=0.19; sys_platform == "darwin" and platform_machine == "arm64"
|
|
24
|
+
Requires-Dist: numpy>=1.26
|
|
25
|
+
Requires-Dist: safetensors>=0.4
|
|
26
|
+
Requires-Dist: fastapi>=0.111
|
|
27
|
+
Requires-Dist: uvicorn[standard]>=0.29
|
|
28
|
+
Requires-Dist: sse-starlette>=1.8
|
|
29
|
+
Requires-Dist: huggingface-hub>=0.23
|
|
30
|
+
Requires-Dist: transformers>=4.40
|
|
31
|
+
Requires-Dist: zstandard>=0.22
|
|
32
|
+
Requires-Dist: rich>=13.0
|
|
33
|
+
Requires-Dist: orjson>=3.9
|
|
34
|
+
Provides-Extra: quant
|
|
35
|
+
Provides-Extra: retrieval
|
|
36
|
+
Requires-Dist: hnswlib>=0.8; extra == "retrieval"
|
|
37
|
+
Provides-Extra: llmlingua
|
|
38
|
+
Requires-Dist: llmlingua>=0.2; extra == "llmlingua"
|
|
39
|
+
Provides-Extra: grammar
|
|
40
|
+
Requires-Dist: xgrammar>=0.1; extra == "grammar"
|
|
41
|
+
Provides-Extra: cache
|
|
42
|
+
Requires-Dist: sqlite-vec>=0.1; extra == "cache"
|
|
43
|
+
Provides-Extra: whatsapp
|
|
44
|
+
Provides-Extra: eval
|
|
45
|
+
Requires-Dist: lm-eval>=0.4; extra == "eval"
|
|
46
|
+
Requires-Dist: datasets>=2.18; extra == "eval"
|
|
47
|
+
Requires-Dist: accelerate>=0.29; extra == "eval"
|
|
48
|
+
Requires-Dist: sacrebleu; extra == "eval"
|
|
49
|
+
Requires-Dist: rouge_score; extra == "eval"
|
|
50
|
+
Requires-Dist: nltk; extra == "eval"
|
|
51
|
+
Provides-Extra: linux
|
|
52
|
+
Requires-Dist: torch>=2.0; extra == "linux"
|
|
53
|
+
Provides-Extra: dev
|
|
54
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
55
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
|
|
56
|
+
Requires-Dist: pytest-timeout>=2.3; extra == "dev"
|
|
57
|
+
Requires-Dist: httpx>=0.27; extra == "dev"
|
|
58
|
+
Requires-Dist: maturin>=1.5; extra == "dev"
|
|
59
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
60
|
+
Requires-Dist: mypy>=1.10; extra == "dev"
|
|
61
|
+
Requires-Dist: mkdocs-material>=9.5; extra == "dev"
|
|
62
|
+
Requires-Dist: mkdocs-exclude>=1.0; extra == "dev"
|
|
63
|
+
Requires-Dist: cryptography>=42.0; extra == "dev"
|
|
64
|
+
Dynamic: license-file
|
|
65
|
+
|
|
66
|
+
# Squish
|
|
67
|
+
|
|
68
|
+
**Local LLM inference for Apple Silicon. Faster end-to-end response on long contexts, less RAM, INT3 support.**
|
|
69
|
+
|
|
70
|
+
[License: BUSL-1.1](LICENSE)
|
|
71
|
+
[PyPI](https://pypi.org/project/squish-ai/)
|
|
72
|
+
[CI](https://github.com/konjoai/squish/actions/workflows/ci.yml)
|
|
73
|
+
[GitHub](https://github.com/konjoai/squish)
|
|
74
|
+
[Hugging Face](https://huggingface.co/squish-community)
|
|
75
|
+
|
|
76
|
+
<img src="assets/squish-logo-1.png" height="320" alt="Squish Logo"/>
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
## The Numbers (v9.32.0 / bench v5.1.1)
|
|
81
|
+
|
|
82
|
+
Measured 2026-06-02 on Apple M3 MacBook Pro, 16 GB unified memory.
|
|
83
|
+
Model: Qwen2.5-7B-Instruct. Quant: INT4 (squish) / Q4_K_M (Ollama).
|
|
84
|
+
Five-run medians. Raw artifacts in [`results/benchmarks_v5_1_1/`](results/benchmarks_v5_1_1/).
|
|
85
|
+
|
|
86
|
+
| Metric | Ollama 0.18.2 | **Squish (recommended)** |
|
|
87
|
+
|---|---:|---:|
|
|
88
|
+
| **E2E response @ 4000-token prompt** | 69.63 s | **12.78 s** _(5.4× faster)_ |
|
|
89
|
+
| **E2E response @ 75-token prompt** | 8.09 s | **5.50 s** _(1.5× faster)_ |
|
|
90
|
+
| **Peak RAM during inference** | ~5 GB | **3.36 GB** |
|
|
91
|
+
| **Disk size — INT4** | 4.36 GB | **4.00 GB** |
|
|
92
|
+
| **Disk size — INT3 (Qwen3)** | not supported | **3.56 GB** |
|
|
93
|
+
| **TTFT @ 75-token prompt** | **131 ms** | 279 ms _(honest loss)_ |
|
|
94
|
+
|
|
95
|
+
**Squish wins end-to-end response time at every prompt size measured**, with
|
|
96
|
+
the largest win on long contexts (5.4× at 4000 tokens), uses ~33% less RAM,
|
|
97
|
+
and supports INT3 for compatible model families.
|
|
98
|
+
|
|
99
|
+
**Ollama wins time-to-first-token at every prompt size**, and has lower inter-token
|
|
100
|
+
jitter on long contexts. If first-byte latency matters more than full-response
|
|
101
|
+
latency, Ollama is the right tool.
|
|
102
|
+
|
|
103
|
+
Full table, methodology, and ablation: [`docs/RESULTS.md`](docs/RESULTS.md)
|
|
104
|
+
(v5.1.1 section).
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## Why Squish
|
|
109
|
+
|
|
110
|
+
Squish is for the workload most local-LLM tools aren't tuned for: **the same
|
|
111
|
+
model called many times an hour from the terminal with shifting context** —
|
|
112
|
+
git-commit-message generation, code-review prompts, agent loops, multi-turn
|
|
113
|
+
chat, document Q&A.
|
|
114
|
+
|
|
115
|
+
On a 16 GB Mac, that workload collides with the rest of your work. Ollama
|
|
116
|
+
keeps ~5 GB resident and pays a long prefill cost on each new long prompt.
|
|
117
|
+
Squish is a persistent daemon: the model loads once when the daemon starts,
|
|
118
|
+
and a two-cache architecture (block-paged KV cache for shifting prefixes,
|
|
119
|
+
prompt KV cache for exact repeats) avoids re-prefilling work the daemon has
|
|
120
|
+
already done.
|
|
121
|
+
|
|
122
|
+
Designed for one developer on one machine. Not a production multi-tenant API.
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## Install
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
# PyPI
|
|
130
|
+
pip install squish-ai
|
|
131
|
+
|
|
132
|
+
# Homebrew tap (coming with v9.32.0)
|
|
133
|
+
brew tap konjoai/squish
|
|
134
|
+
brew install squish
|
|
135
|
+
|
|
136
|
+
# From source
|
|
137
|
+
git clone https://github.com/konjoai/squish
|
|
138
|
+
cd squish
|
|
139
|
+
pip install -e .
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
Requirements: macOS 13+, Apple Silicon (M1–M5), Python 3.10+.
|
|
143
|
+
|
|
144
|
+
---
|
|
145
|
+
|
|
146
|
+
## Quick Start
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
# Pull a pre-quantised model from the catalog
|
|
150
|
+
squish pull qwen2.5-7b-int4
|
|
151
|
+
|
|
152
|
+
# Start the daemon with both caches enabled (recommended config)
|
|
153
|
+
squish run qwen2.5-7b-int4 \
|
|
154
|
+
--block-kv-cache ~/.cache/squish/blocks \
|
|
155
|
+
--prompt-kv-cache ~/.cache/squish/pkv \
|
|
156
|
+
--port 8080
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
Query it through the OpenAI-compatible API:
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
curl http://localhost:8080/v1/chat/completions \
|
|
163
|
+
-H "Content-Type: application/json" \
|
|
164
|
+
-d '{
|
|
165
|
+
"model": "qwen2.5-7b-int4",
|
|
166
|
+
"messages": [{"role": "user", "content": "Hello"}]
|
|
167
|
+
}'
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
Or point any OpenAI / Ollama client at it:
|
|
171
|
+
|
|
172
|
+
```bash
|
|
173
|
+
export OPENAI_BASE_URL=http://localhost:8080/v1
|
|
174
|
+
export OPENAI_API_KEY=squish
|
|
175
|
+
# Ollama-compatible /api/* endpoints also work
|
|
176
|
+
export OLLAMA_HOST=http://localhost:8080
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
Install the macOS LaunchAgent so the daemon starts at login:
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
squish daemon install
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
The **SquishBar** menu-bar app (`apps/macos/SquishBar/`) ships alongside the
|
|
186
|
+
daemon — model picker, load progress, and a global hotkey for the chat panel.
|
|
187
|
+
Build it from Xcode or grab the signed `.app` from the GitHub release page.
|
|
188
|
+
|
|
189
|
+
---
|
|
190
|
+
|
|
191
|
+
## Configuration
|
|
192
|
+
|
|
193
|
+
| Flag | Purpose |
|
|
194
|
+
|---|---|
|
|
195
|
+
| `--block-kv-cache <DIR>` | Block-paged KV cache for shifting-prefix workloads (agents, multi-turn). Persists across daemon restarts via `.safetensors` blocks. |
|
|
196
|
+
| `--prompt-kv-cache <DIR>` | Exact-prompt KV cache. Single-digit-millisecond TTFT on verbatim repeats. |
|
|
197
|
+
| `--block-kv-size N` | Block size in tokens (default 64). |
|
|
198
|
+
| `--draft-model <MODEL>` | Speculative-decode draft model (opt-in; see [v5.2 diagnosis](results/benchmarks_v5_2/SPEC_DECODE_DIAGNOSIS.md) for current status — net-negative on M3 INT4 with the draft models tested, kept off by default). |
|
|
199
|
+
| `--draft-depth N` | Speculative decode depth K. |
|
|
200
|
+
| `--no-spec`, `--no-cache` | Disable flags, intended for benchmark controls. |
|
|
201
|
+
| `squish daemon install` / `uninstall` | macOS LaunchAgent integration. |
|
|
202
|
+
|
|
203
|
+
Picking the right cache for your workload:
|
|
204
|
+
|
|
205
|
+
- **Exact-prompt repeats** (cached scripts, fixed templates, automated jobs):
|
|
206
|
+
`--prompt-kv-cache` alone. ~9 ms TTFT on a cache hit.
|
|
207
|
+
- **Shifting-prefix workloads** (agents, multi-turn conversations):
|
|
208
|
+
`--block-kv-cache` alone, or combined config.
|
|
209
|
+
- **General use without knowing the workload**: combined config (both caches
|
|
210
|
+
enabled). Best end-to-end completion time across prompt sizes.
|
|
211
|
+
|
|
212
|
+
The combined config currently doesn't inherit PKV's fast-hit TTFT due to a
|
|
213
|
+
lookup ordering issue documented in
|
|
214
|
+
[`results/benchmarks_v5_1_1/DIAGNOSIS.md`](results/benchmarks_v5_1_1/DIAGNOSIS.md);
|
|
215
|
+
reordering is tracked as a v5.2 follow-up.
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## Benchmarks
|
|
220
|
+
|
|
221
|
+
Full table, methodology, ablation, jitter analysis, and raw per-run JSON:
|
|
222
|
+
|
|
223
|
+
- [`docs/RESULTS.md`](docs/RESULTS.md) — v5.1.1 section is the source of truth
|
|
224
|
+
- [`benchmarks/ollama_vs_squish/RESULTS.md`](benchmarks/ollama_vs_squish/RESULTS.md) — bench harness output
|
|
225
|
+
- [`results/benchmarks_v5_1_1/DIAGNOSIS.md`](results/benchmarks_v5_1_1/DIAGNOSIS.md) — combined-cache ordering write-up
|
|
226
|
+
- [`results/benchmarks_v5_1_1/JITTER_ANALYSIS.md`](results/benchmarks_v5_1_1/JITTER_ANALYSIS.md) — inter-token p95 explanation
|
|
227
|
+
- [`results/benchmarks_v5_2/SPEC_DECODE_DIAGNOSIS.md`](results/benchmarks_v5_2/SPEC_DECODE_DIAGNOSIS.md) — why speculative decoding is currently opt-in
|
|
228
|
+
|
|
229
|
+
Reproduce locally:
|
|
230
|
+
|
|
231
|
+
```bash
|
|
232
|
+
python benchmarks/ollama_vs_squish/bench_v5_1.py
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
---
|
|
236
|
+
|
|
237
|
+
## What Squish Doesn't Do
|
|
238
|
+
|
|
239
|
+
In the spirit of honesty:
|
|
240
|
+
|
|
241
|
+
- **No GPU support outside Apple Silicon.** It's MLX-based. CUDA users should use vLLM or llama.cpp.
|
|
242
|
+
- **No multi-user serving.** Designed for one developer, one machine — not a production API.
|
|
243
|
+
- **No multimodal models.** Text only.
|
|
244
|
+
- **Higher inter-token p95 on long prompts** than Ollama. Conscious tradeoff (deferred KV-cache restore off the TTFT critical path); details in [`JITTER_ANALYSIS.md`](results/benchmarks_v5_1_1/JITTER_ANALYSIS.md).
|
|
245
|
+
- **Slower first-token on short prompts** than Ollama. Fundamental MLX prefill kernel cost.
|
|
246
|
+
- **Model conversion is slow and not user-friendly.** Squish needs models in its own format. Conversion takes time and isn't fully automated.
|
|
247
|
+
|
|
248
|
+
If any of those matter for your workflow, Ollama or LM Studio is the right choice.
|
|
249
|
+
|
|
250
|
+
---
|
|
251
|
+
|
|
252
|
+
## Architecture
|
|
253
|
+
|
|
254
|
+
**Persistent daemon.** The model loads once when the daemon starts and stays
|
|
255
|
+
resident. Per-invocation model-load cost becomes a once-per-login cost.
|
|
256
|
+
|
|
257
|
+
**Two-cache architecture.** A block-paged KV cache stores KV state for
|
|
258
|
+
fixed-size token blocks on disk (`.safetensors`) and reconstructs partial-match
|
|
259
|
+
prefixes for shifting-prefix workloads. A prompt KV cache catches exact-prompt
|
|
260
|
+
repeats with single-digit-millisecond TTFT.
|
|
261
|
+
|
|
262
|
+
**INT3 quantization with a hard-block list.** INT3 behaviour is not uniform
|
|
263
|
+
across model families. Qwen3 holds within ~1pp of FP16; Gemma-3 collapses
|
|
264
|
+
(~15pp on common benchmarks). Squish enables INT3 only for families where it's
|
|
265
|
+
safe and hard-blocks the rest. Try to load Gemma-3 at INT3 and the accuracy
|
|
266
|
+
gate refuses — you can't accidentally ship a config that quietly degrades.
|
|
267
|
+
|
|
268
|
+
---
|
|
269
|
+
|
|
270
|
+
## Contributing
|
|
271
|
+
|
|
272
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md). Issues, benchmarks, and PRs welcome.
|
|
273
|
+
The bench harness lives in `benchmarks/ollama_vs_squish/`; if you re-run on
|
|
274
|
+
different hardware, please share the raw JSON output.
|
|
275
|
+
|
|
276
|
+
---
|
|
277
|
+
|
|
278
|
+
## License
|
|
279
|
+
|
|
280
|
+
BUSL-1.1 — see [LICENSE](LICENSE).
|
|
281
|
+
|
|
282
|
+
---
|
|
283
|
+
|
|
284
|
+
## Links
|
|
285
|
+
|
|
286
|
+
- Article: _Local LLM Server That Wins End-to-End on Long Contexts_ — in progress
|
|
287
|
+
- Org: [konjoai](https://github.com/konjoai) · [konjoai.org](https://konjoai.org)
|
|
288
|
+
- Related: [Kohaku](https://github.com/konjoai/kohaku), [Vectro](https://github.com/konjoai/vectro), [Squash](https://github.com/konjoai/squash) (EU AI Act compliance, extracted from squish in v9.15.0)
|
|
289
|
+
- HuggingFace models: [huggingface.co/squish-community](https://huggingface.co/squish-community)
|
|
290
|
+
- Module reference: [MODULES.md](MODULES.md)
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
# Squish
|
|
2
|
+
|
|
3
|
+
**Local LLM inference for Apple Silicon. Faster end-to-end response on long contexts, less RAM, INT3 support.**
|
|
4
|
+
|
|
5
|
+
[License: BUSL-1.1](LICENSE)
|
|
6
|
+
[PyPI](https://pypi.org/project/squish/)
|
|
7
|
+
[CI](https://github.com/konjoai/squish/actions/workflows/ci.yml)
|
|
8
|
+
[GitHub](https://github.com/konjoai/squish)
|
|
9
|
+
[HuggingFace](https://huggingface.co/squish-community)
|
|
10
|
+
|
|
11
|
+
<img src="assets/squish-logo-1.png" height="320" alt="Squish Logo"/>
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
## The Numbers (v9.32.0 / bench v5.1.1)
|
|
16
|
+
|
|
17
|
+
Measured 2026-06-02 on Apple M3 MacBook Pro, 16 GB unified memory.
|
|
18
|
+
Model: Qwen2.5-7B-Instruct. Quant: INT4 (squish) / Q4_K_M (Ollama).
|
|
19
|
+
Five-run medians. Raw artifacts in [`results/benchmarks_v5_1_1/`](results/benchmarks_v5_1_1/).
|
|
20
|
+
|
|
21
|
+
| Metric | Ollama 0.18.2 | **Squish (recommended)** |
|
|
22
|
+
|---|---:|---:|
|
|
23
|
+
| **E2E response @ 4000-token prompt** | 69.63 s | **12.78 s** _(5.4× faster)_ |
|
|
24
|
+
| **E2E response @ 75-token prompt** | 8.09 s | **5.50 s** _(1.5× faster)_ |
|
|
25
|
+
| **Peak RAM during inference** | ~5 GB | **3.36 GB** |
|
|
26
|
+
| **Disk size — INT4** | 4.36 GB | **4.00 GB** |
|
|
27
|
+
| **Disk size — INT3 (Qwen3)** | not supported | **3.56 GB** |
|
|
28
|
+
| **TTFT @ 75-token prompt** | **131 ms** | 279 ms _(honest loss)_ |
|
|
29
|
+
|
|
30
|
+
**Squish wins end-to-end response time at every prompt size measured**, with
|
|
31
|
+
the largest win on long contexts (5.4× at 4000 tokens), uses ~33% less RAM,
|
|
32
|
+
and supports INT3 for compatible model families.
|
|
33
|
+
|
|
34
|
+
**Ollama wins time-to-first-token at every prompt size**, and has lower inter-token
|
|
35
|
+
jitter on long contexts. If first-byte latency matters more than full-response
|
|
36
|
+
latency, Ollama is the right tool.
|
|
37
|
+
|
|
38
|
+
Full table, methodology, and ablation: [`docs/RESULTS.md`](docs/RESULTS.md)
|
|
39
|
+
(v5.1.1 section).
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Why Squish
|
|
44
|
+
|
|
45
|
+
Squish is for the workload most local-LLM tools aren't tuned for: **the same
|
|
46
|
+
model called many times an hour from the terminal with shifting context** —
|
|
47
|
+
git-commit-message generation, code-review prompts, agent loops, multi-turn
|
|
48
|
+
chat, document Q&A.
|
|
49
|
+
|
|
50
|
+
On a 16 GB Mac, that workload collides with the rest of your work. Ollama
|
|
51
|
+
keeps ~5 GB resident and pays a long prefill cost on each new long prompt.
|
|
52
|
+
Squish is a persistent daemon: the model loads once when the daemon starts,
|
|
53
|
+
and a two-cache architecture (block-paged KV cache for shifting prefixes,
|
|
54
|
+
prompt KV cache for exact repeats) avoids re-prefilling work the daemon has
|
|
55
|
+
already done.
|
|
56
|
+
|
|
57
|
+
Designed for one developer on one machine. Not a production multi-tenant API.
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## Install
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# PyPI
|
|
65
|
+
pip install squish-ai
|
|
66
|
+
|
|
67
|
+
# Homebrew tap (coming with v9.32.0)
|
|
68
|
+
brew tap konjoai/squish
|
|
69
|
+
brew install squish
|
|
70
|
+
|
|
71
|
+
# From source
|
|
72
|
+
git clone https://github.com/konjoai/squish
|
|
73
|
+
cd squish
|
|
74
|
+
pip install -e .
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Requirements: macOS 13+, Apple Silicon (M1–M5), Python 3.10+.
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## Quick Start
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
# Pull a pre-quantised model from the catalog
|
|
85
|
+
squish pull qwen2.5-7b-int4
|
|
86
|
+
|
|
87
|
+
# Start the daemon with both caches enabled (recommended config)
|
|
88
|
+
squish run qwen2.5-7b-int4 \
|
|
89
|
+
--block-kv-cache ~/.cache/squish/blocks \
|
|
90
|
+
--prompt-kv-cache ~/.cache/squish/pkv \
|
|
91
|
+
--port 8080
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Query it through the OpenAI-compatible API:
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
curl http://localhost:8080/v1/chat/completions \
|
|
98
|
+
-H "Content-Type: application/json" \
|
|
99
|
+
-d '{
|
|
100
|
+
"model": "qwen2.5-7b-int4",
|
|
101
|
+
"messages": [{"role": "user", "content": "Hello"}]
|
|
102
|
+
}'
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Or point any OpenAI / Ollama client at it:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
export OPENAI_BASE_URL=http://localhost:8080/v1
|
|
109
|
+
export OPENAI_API_KEY=squish
|
|
110
|
+
# Ollama-compatible /api/* endpoints also work
|
|
111
|
+
export OLLAMA_HOST=http://localhost:8080
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Install the macOS LaunchAgent so the daemon starts at login:
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
squish daemon install
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
The **SquishBar** menu-bar app (`apps/macos/SquishBar/`) ships alongside the
|
|
121
|
+
daemon — model picker, load progress, and a global hotkey for the chat panel.
|
|
122
|
+
Build it from Xcode or grab the signed `.app` from the GitHub release page.
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## Configuration
|
|
127
|
+
|
|
128
|
+
| Flag | Purpose |
|
|
129
|
+
|---|---|
|
|
130
|
+
| `--block-kv-cache <DIR>` | Block-paged KV cache for shifting-prefix workloads (agents, multi-turn). Persists across daemon restarts via `.safetensors` blocks. |
|
|
131
|
+
| `--prompt-kv-cache <DIR>` | Exact-prompt KV cache. Single-digit-millisecond TTFT on verbatim repeats. |
|
|
132
|
+
| `--block-kv-size N` | Block size in tokens (default 64). |
|
|
133
|
+
| `--draft-model <MODEL>` | Speculative-decode draft model (opt-in; see [v5.2 diagnosis](results/benchmarks_v5_2/SPEC_DECODE_DIAGNOSIS.md) for current status — net-negative on M3 INT4 with the draft models tested, kept off by default). |
|
|
134
|
+
| `--draft-depth N` | Speculative decode depth K. |
|
|
135
|
+
| `--no-spec`, `--no-cache` | Disable flags, intended for benchmark controls. |
|
|
136
|
+
| `squish daemon install` / `uninstall` | macOS LaunchAgent integration. |
|
|
137
|
+
|
|
138
|
+
Picking the right cache for your workload:
|
|
139
|
+
|
|
140
|
+
- **Exact-prompt repeats** (cached scripts, fixed templates, automated jobs):
|
|
141
|
+
`--prompt-kv-cache` alone. ~9 ms TTFT on a cache hit.
|
|
142
|
+
- **Shifting-prefix workloads** (agents, multi-turn conversations):
|
|
143
|
+
`--block-kv-cache` alone, or the combined config.
|
|
144
|
+
- **General use without knowing the workload**: combined config (both caches
|
|
145
|
+
enabled). Best end-to-end completion time across prompt sizes.
|
|
146
|
+
|
|
147
|
+
The combined config currently doesn't inherit the prompt KV cache's fast-hit TTFT due to a
|
|
148
|
+
lookup ordering issue documented in
|
|
149
|
+
[`results/benchmarks_v5_1_1/DIAGNOSIS.md`](results/benchmarks_v5_1_1/DIAGNOSIS.md);
|
|
150
|
+
reordering is tracked as a v5.2 follow-up.
|
|
151
|
+
|
|
152
|
+
---
|
|
153
|
+
|
|
154
|
+
## Benchmarks
|
|
155
|
+
|
|
156
|
+
Full table, methodology, ablation, jitter analysis, and raw per-run JSON:
|
|
157
|
+
|
|
158
|
+
- [`docs/RESULTS.md`](docs/RESULTS.md) — v5.1.1 section is the source of truth
|
|
159
|
+
- [`benchmarks/ollama_vs_squish/RESULTS.md`](benchmarks/ollama_vs_squish/RESULTS.md) — bench harness output
|
|
160
|
+
- [`results/benchmarks_v5_1_1/DIAGNOSIS.md`](results/benchmarks_v5_1_1/DIAGNOSIS.md) — combined-cache ordering write-up
|
|
161
|
+
- [`results/benchmarks_v5_1_1/JITTER_ANALYSIS.md`](results/benchmarks_v5_1_1/JITTER_ANALYSIS.md) — inter-token p95 explanation
|
|
162
|
+
- [`results/benchmarks_v5_2/SPEC_DECODE_DIAGNOSIS.md`](results/benchmarks_v5_2/SPEC_DECODE_DIAGNOSIS.md) — why speculative decoding is currently opt-in
|
|
163
|
+
|
|
164
|
+
Reproduce locally:
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
python benchmarks/ollama_vs_squish/bench_v5_1.py
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## What Squish Doesn't Do
|
|
173
|
+
|
|
174
|
+
In the spirit of honesty:
|
|
175
|
+
|
|
176
|
+
- **No GPU support outside Apple Silicon.** It's MLX-based. CUDA users should use vLLM or llama.cpp.
|
|
177
|
+
- **No multi-user serving.** Designed for one developer, one machine — not a production API.
|
|
178
|
+
- **No multimodal models.** Text only.
|
|
179
|
+
- **Higher inter-token p95 on long prompts** than Ollama. Conscious tradeoff (deferred KV-cache restore off the TTFT critical path); details in [`JITTER_ANALYSIS.md`](results/benchmarks_v5_1_1/JITTER_ANALYSIS.md).
|
|
180
|
+
- **Slower first-token on short prompts** than Ollama. Fundamental MLX prefill kernel cost.
|
|
181
|
+
- **Model conversion is slow and not user-friendly.** Squish needs models in its own format. Conversion takes time and isn't fully automated.
|
|
182
|
+
|
|
183
|
+
If any of those matter for your workflow, Ollama or LM Studio is the right choice.
|
|
184
|
+
|
|
185
|
+
---
|
|
186
|
+
|
|
187
|
+
## Architecture
|
|
188
|
+
|
|
189
|
+
**Persistent daemon.** The model loads once when the daemon starts and stays
|
|
190
|
+
resident. Per-invocation model-load cost becomes a once-per-login cost.
|
|
191
|
+
|
|
192
|
+
**Two-cache architecture.** A block-paged KV cache stores KV state for
|
|
193
|
+
fixed-size token blocks on disk (`.safetensors`) and reconstructs partial-match
|
|
194
|
+
prefixes for shifting-prefix workloads. A prompt KV cache catches exact-prompt
|
|
195
|
+
repeats with single-digit-millisecond TTFT.
|
|
196
|
+
|
|
197
|
+
**INT3 quantization with a hard-block list.** INT3 behaviour is not uniform
|
|
198
|
+
across model families. Qwen3 holds within ~1pp of FP16; Gemma-3 collapses
|
|
199
|
+
(~15pp on common benchmarks). Squish enables INT3 only for families where it's
|
|
200
|
+
safe and hard-blocks the rest. Try to load Gemma-3 at INT3 and the accuracy
|
|
201
|
+
gate refuses — you can't accidentally ship a config that quietly degrades.
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## Contributing
|
|
206
|
+
|
|
207
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md). Issues, benchmarks, and PRs welcome.
|
|
208
|
+
The bench harness lives in `benchmarks/ollama_vs_squish/`; if you re-run on
|
|
209
|
+
different hardware, please share the raw JSON output.
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
## License
|
|
214
|
+
|
|
215
|
+
BUSL-1.1 — see [LICENSE](LICENSE).
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## Links
|
|
220
|
+
|
|
221
|
+
- Article: _Local LLM Server That Wins End-to-End on Long Contexts_ — in progress
|
|
222
|
+
- Org: [konjoai](https://github.com/konjoai) · [konjoai.org](https://konjoai.org)
|
|
223
|
+
- Related: [Kohaku](https://github.com/konjoai/kohaku), [Vectro](https://github.com/konjoai/vectro), [Squash](https://github.com/konjoai/squash) (EU AI Act compliance, extracted from squish in v9.15.0)
|
|
224
|
+
- HuggingFace models: [huggingface.co/squish-community](https://huggingface.co/squish-community)
|
|
225
|
+
- Module reference: [MODULES.md](MODULES.md)
|