squish-ai 9.32.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225) hide show
  1. squish_ai-9.32.0/LICENSE +104 -0
  2. squish_ai-9.32.0/PKG-INFO +290 -0
  3. squish_ai-9.32.0/README.md +225 -0
  4. squish_ai-9.32.0/pyproject.toml +167 -0
  5. squish_ai-9.32.0/setup.cfg +4 -0
  6. squish_ai-9.32.0/squish/__init__.py +437 -0
  7. squish_ai-9.32.0/squish/_fast_imports.py +149 -0
  8. squish_ai-9.32.0/squish/_term.py +257 -0
  9. squish_ai-9.32.0/squish/agent/__init__.py +5 -0
  10. squish_ai-9.32.0/squish/agent/builtin_tools.py +745 -0
  11. squish_ai-9.32.0/squish/agent/tool_name_map.py +90 -0
  12. squish_ai-9.32.0/squish/agent/tool_registry.py +339 -0
  13. squish_ai-9.32.0/squish/api/__init__.py +6 -0
  14. squish_ai-9.32.0/squish/api/v1_router.py +447 -0
  15. squish_ai-9.32.0/squish/backend.py +413 -0
  16. squish_ai-9.32.0/squish/catalog.py +1139 -0
  17. squish_ai-9.32.0/squish/cli.py +7361 -0
  18. squish_ai-9.32.0/squish/compressed_loader_torch.py +193 -0
  19. squish_ai-9.32.0/squish/config.py +167 -0
  20. squish_ai-9.32.0/squish/context/__init__.py +1 -0
  21. squish_ai-9.32.0/squish/context/lazy_llm.py +334 -0
  22. squish_ai-9.32.0/squish/context/prompt_compressor.py +189 -0
  23. squish_ai-9.32.0/squish/convert.py +1259 -0
  24. squish_ai-9.32.0/squish/daemon/__init__.py +32 -0
  25. squish_ai-9.32.0/squish/daemon/client.py +151 -0
  26. squish_ai-9.32.0/squish/daemon/launchagent.py +226 -0
  27. squish_ai-9.32.0/squish/daemon/squishd.py +654 -0
  28. squish_ai-9.32.0/squish/experimental/__init__.py +0 -0
  29. squish_ai-9.32.0/squish/experimental/_eval_torch.py +409 -0
  30. squish_ai-9.32.0/squish/experimental/astc_loader.py +420 -0
  31. squish_ai-9.32.0/squish/experimental/convert_coreml.py +450 -0
  32. squish_ai-9.32.0/squish/experimental/coreml_loader.py +347 -0
  33. squish_ai-9.32.0/squish/experimental/hqq_quant.py +23 -0
  34. squish_ai-9.32.0/squish/experimental/jacobi_decode.py +344 -0
  35. squish_ai-9.32.0/squish/experimental/layer_overlap_loader.py +351 -0
  36. squish_ai-9.32.0/squish/experimental/lm_studio_bridge.py +252 -0
  37. squish_ai-9.32.0/squish/experimental/localai_compat.py +77 -0
  38. squish_ai-9.32.0/squish/experimental/spin_quant.py +440 -0
  39. squish_ai-9.32.0/squish/experimental/structured_sparsity.py +245 -0
  40. squish_ai-9.32.0/squish/experimental/torch_ops.py +173 -0
  41. squish_ai-9.32.0/squish/grammar/__init__.py +1 -0
  42. squish_ai-9.32.0/squish/grammar/grammar_cache.py +439 -0
  43. squish_ai-9.32.0/squish/grammar/grammar_engine.py +578 -0
  44. squish_ai-9.32.0/squish/grammar/schema_gen.py +488 -0
  45. squish_ai-9.32.0/squish/hardware/__init__.py +1 -0
  46. squish_ai-9.32.0/squish/hardware/capability_probe.py +269 -0
  47. squish_ai-9.32.0/squish/hardware/chip_detector.py +383 -0
  48. squish_ai-9.32.0/squish/hardware/fused_kernels.py +438 -0
  49. squish_ai-9.32.0/squish/hardware/fused_sampler.py +315 -0
  50. squish_ai-9.32.0/squish/hardware/production_profiler.py +266 -0
  51. squish_ai-9.32.0/squish/integrations/__init__.py +6 -0
  52. squish_ai-9.32.0/squish/integrations/hf.py +366 -0
  53. squish_ai-9.32.0/squish/io/__init__.py +1 -0
  54. squish_ai-9.32.0/squish/io/entropy.py +362 -0
  55. squish_ai-9.32.0/squish/io/gguf_loader.py +484 -0
  56. squish_ai-9.32.0/squish/io/loader_utils.py +401 -0
  57. squish_ai-9.32.0/squish/io/model_shard_loader.py +386 -0
  58. squish_ai-9.32.0/squish/io/split_loader.py +480 -0
  59. squish_ai-9.32.0/squish/io/weight_decompress_stream.py +350 -0
  60. squish_ai-9.32.0/squish/kv/__init__.py +1 -0
  61. squish_ai-9.32.0/squish/kv/block_kv_cache.py +589 -0
  62. squish_ai-9.32.0/squish/kv/delta.py +314 -0
  63. squish_ai-9.32.0/squish/kv/head_importance.py +242 -0
  64. squish_ai-9.32.0/squish/kv/kv_cache.py +3740 -0
  65. squish_ai-9.32.0/squish/kv/mmap_cache.py +363 -0
  66. squish_ai-9.32.0/squish/kv/prompt_kv_cache.py +499 -0
  67. squish_ai-9.32.0/squish/kv/radix_cache.py +495 -0
  68. squish_ai-9.32.0/squish/loaders/__init__.py +12 -0
  69. squish_ai-9.32.0/squish/logging_config.py +118 -0
  70. squish_ai-9.32.0/squish/platform/__init__.py +30 -0
  71. squish_ai-9.32.0/squish/platform/ane_router.py +326 -0
  72. squish_ai-9.32.0/squish/platform/detector.py +347 -0
  73. squish_ai-9.32.0/squish/platform/feature_registry.py +249 -0
  74. squish_ai-9.32.0/squish/platform/platform_router.py +388 -0
  75. squish_ai-9.32.0/squish/quant/__init__.py +1 -0
  76. squish_ai-9.32.0/squish/quant/aqlm.py +579 -0
  77. squish_ai-9.32.0/squish/quant/awq.py +840 -0
  78. squish_ai-9.32.0/squish/quant/compressed_loader.py +2083 -0
  79. squish_ai-9.32.0/squish/quant/hqq.py +337 -0
  80. squish_ai-9.32.0/squish/quant/int3_linear.py +256 -0
  81. squish_ai-9.32.0/squish/quant/int3_runtime.py +245 -0
  82. squish_ai-9.32.0/squish/quant/milo_quant.py +623 -0
  83. squish_ai-9.32.0/squish/quant/quantizer.py +810 -0
  84. squish_ai-9.32.0/squish/quant/sqint2.py +1454 -0
  85. squish_ai-9.32.0/squish/quant/sqint2_linear.py +423 -0
  86. squish_ai-9.32.0/squish/reasoning/__init__.py +1 -0
  87. squish_ai-9.32.0/squish/reasoning/coconut.py +256 -0
  88. squish_ai-9.32.0/squish/reasoning/self_consistency.py +163 -0
  89. squish_ai-9.32.0/squish/runtime/__init__.py +8 -0
  90. squish_ai-9.32.0/squish/runtime/auto_profile.py +399 -0
  91. squish_ai-9.32.0/squish/runtime/format_validator.py +394 -0
  92. squish_ai-9.32.0/squish/runtime/squish_runtime.py +501 -0
  93. squish_ai-9.32.0/squish/semantic_cache.py +10 -0
  94. squish_ai-9.32.0/squish/server.py +5514 -0
  95. squish_ai-9.32.0/squish/serving/__init__.py +1 -0
  96. squish_ai-9.32.0/squish/serving/backend_router.py +156 -0
  97. squish_ai-9.32.0/squish/serving/blazing.py +145 -0
  98. squish_ai-9.32.0/squish/serving/feature_state.py +103 -0
  99. squish_ai-9.32.0/squish/serving/kernel_cache.py +148 -0
  100. squish_ai-9.32.0/squish/serving/local_model_scanner.py +701 -0
  101. squish_ai-9.32.0/squish/serving/mcp_client.py +367 -0
  102. squish_ai-9.32.0/squish/serving/memory_governor.py +345 -0
  103. squish_ai-9.32.0/squish/serving/obs_report.py +156 -0
  104. squish_ai-9.32.0/squish/serving/ollama_compat.py +523 -0
  105. squish_ai-9.32.0/squish/serving/quality_monitor.py +440 -0
  106. squish_ai-9.32.0/squish/serving/router.py +411 -0
  107. squish_ai-9.32.0/squish/serving/scheduler.py +1373 -0
  108. squish_ai-9.32.0/squish/serving/startup_profiler.py +195 -0
  109. squish_ai-9.32.0/squish/serving/tool_calling.py +410 -0
  110. squish_ai-9.32.0/squish/speculative/__init__.py +22 -0
  111. squish_ai-9.32.0/squish/speculative/eagle3.py +463 -0
  112. squish_ai-9.32.0/squish/speculative/prompt_lookup.py +314 -0
  113. squish_ai-9.32.0/squish/speculative/speculative.py +2111 -0
  114. squish_ai-9.32.0/squish/streaming/__init__.py +1 -0
  115. squish_ai-9.32.0/squish/streaming/chunked_prefill.py +110 -0
  116. squish_ai-9.32.0/squish/streaming/streaming_sink.py +269 -0
  117. squish_ai-9.32.0/squish/telemetry.py +568 -0
  118. squish_ai-9.32.0/squish/ui.py +684 -0
  119. squish_ai-9.32.0/squish_ai.egg-info/PKG-INFO +290 -0
  120. squish_ai-9.32.0/squish_ai.egg-info/SOURCES.txt +223 -0
  121. squish_ai-9.32.0/squish_ai.egg-info/dependency_links.txt +1 -0
  122. squish_ai-9.32.0/squish_ai.egg-info/entry_points.txt +5 -0
  123. squish_ai-9.32.0/squish_ai.egg-info/requires.txt +53 -0
  124. squish_ai-9.32.0/squish_ai.egg-info/top_level.txt +2 -0
  125. squish_ai-9.32.0/tests/test_auto_calibrate.py +402 -0
  126. squish_ai-9.32.0/tests/test_backend_unit.py +634 -0
  127. squish_ai-9.32.0/tests/test_bench.py +198 -0
  128. squish_ai-9.32.0/tests/test_block_kv_cache.py +237 -0
  129. squish_ai-9.32.0/tests/test_catalog_branches.py +220 -0
  130. squish_ai-9.32.0/tests/test_catalog_extended.py +234 -0
  131. squish_ai-9.32.0/tests/test_catalog_hash.py +171 -0
  132. squish_ai-9.32.0/tests/test_catalog_ssl.py +180 -0
  133. squish_ai-9.32.0/tests/test_catalog_unit.py +289 -0
  134. squish_ai-9.32.0/tests/test_cli_eval.py +346 -0
  135. squish_ai-9.32.0/tests/test_cli_extras.py +163 -0
  136. squish_ai-9.32.0/tests/test_cli_sbom.py +158 -0
  137. squish_ai-9.32.0/tests/test_cli_unit.py +438 -0
  138. squish_ai-9.32.0/tests/test_compressed_loader_torch_unit.py +219 -0
  139. squish_ai-9.32.0/tests/test_config_unit.py +309 -0
  140. squish_ai-9.32.0/tests/test_convert_unit.py +356 -0
  141. squish_ai-9.32.0/tests/test_demo_server.py +312 -0
  142. squish_ai-9.32.0/tests/test_docker_entrypoint_unit.py +397 -0
  143. squish_ai-9.32.0/tests/test_eval_binder.py +289 -0
  144. squish_ai-9.32.0/tests/test_fast_imports.py +213 -0
  145. squish_ai-9.32.0/tests/test_governor_middleware.py +290 -0
  146. squish_ai-9.32.0/tests/test_grammar_independent_mask.py +304 -0
  147. squish_ai-9.32.0/tests/test_helm_chart_unit.py +422 -0
  148. squish_ai-9.32.0/tests/test_kitty_channel_sensitivity.py +482 -0
  149. squish_ai-9.32.0/tests/test_kv_budget.py +359 -0
  150. squish_ai-9.32.0/tests/test_kv_int2.py +379 -0
  151. squish_ai-9.32.0/tests/test_kv_int4.py +441 -0
  152. squish_ai-9.32.0/tests/test_kv_p1.py +408 -0
  153. squish_ai-9.32.0/tests/test_lazy_load_modes.py +370 -0
  154. squish_ai-9.32.0/tests/test_load_mlx_model_parallel.py +205 -0
  155. squish_ai-9.32.0/tests/test_logging_config_unit.py +213 -0
  156. squish_ai-9.32.0/tests/test_model_pipeline_unit.py +522 -0
  157. squish_ai-9.32.0/tests/test_oms_signer.py +80 -0
  158. squish_ai-9.32.0/tests/test_openai_compat.py +259 -0
  159. squish_ai-9.32.0/tests/test_overnight_bench_unit.py +299 -0
  160. squish_ai-9.32.0/tests/test_predownload_scan.py +483 -0
  161. squish_ai-9.32.0/tests/test_prompt_kv_cache.py +404 -0
  162. squish_ai-9.32.0/tests/test_quality_monitor.py +374 -0
  163. squish_ai-9.32.0/tests/test_quant_aqlm.py +580 -0
  164. squish_ai-9.32.0/tests/test_radix_kv_reuse_integration.py +197 -0
  165. squish_ai-9.32.0/tests/test_router.py +394 -0
  166. squish_ai-9.32.0/tests/test_rust_matmul.py +207 -0
  167. squish_ai-9.32.0/tests/test_sbom_builder.py +298 -0
  168. squish_ai-9.32.0/tests/test_spaces_demo.py +349 -0
  169. squish_ai-9.32.0/tests/test_sparsity_trim.py +337 -0
  170. squish_ai-9.32.0/tests/test_sqint2.py +920 -0
  171. squish_ai-9.32.0/tests/test_sqint2_compress.py +455 -0
  172. squish_ai-9.32.0/tests/test_sqint2_linear.py +605 -0
  173. squish_ai-9.32.0/tests/test_sqint2_loader.py +329 -0
  174. squish_ai-9.32.0/tests/test_sqint2_residual_gemv.py +377 -0
  175. squish_ai-9.32.0/tests/test_sqint2_router.py +437 -0
  176. squish_ai-9.32.0/tests/test_squishd_unit.py +469 -0
  177. squish_ai-9.32.0/tests/test_stop_token_suppression.py +269 -0
  178. squish_ai-9.32.0/tests/test_synthetic_model_fixture.py +168 -0
  179. squish_ai-9.32.0/tests/test_telemetry_unit.py +689 -0
  180. squish_ai-9.32.0/tests/test_term_unit.py +279 -0
  181. squish_ai-9.32.0/tests/test_tool_choice_unit.py +158 -0
  182. squish_ai-9.32.0/tests/test_torch_ops_unit.py +201 -0
  183. squish_ai-9.32.0/tests/test_ui_unit.py +278 -0
  184. squish_ai-9.32.0/tests/test_version.py +128 -0
  185. squish_ai-9.32.0/tests/test_wave108_calculator.py +323 -0
  186. squish_ai-9.32.0/tests/test_wave114_rep_loop.py +187 -0
  187. squish_ai-9.32.0/tests/test_wave119_dead_stub_purge.py +158 -0
  188. squish_ai-9.32.0/tests/test_wave120_dead_global_purge.py +108 -0
  189. squish_ai-9.32.0/tests/test_wave121_dead_flag_purge.py +165 -0
  190. squish_ai-9.32.0/tests/test_wave122_dead_const_purge.py +149 -0
  191. squish_ai-9.32.0/tests/test_wave123_empty_section_purge.py +100 -0
  192. squish_ai-9.32.0/tests/test_wave124_orphan_global_purge.py +78 -0
  193. squish_ai-9.32.0/tests/test_wave125_stale_comment_purge.py +48 -0
  194. squish_ai-9.32.0/tests/test_wave126_empty_header_purge.py +42 -0
  195. squish_ai-9.32.0/tests/test_wave64a_trace_endpoint.py +293 -0
  196. squish_ai-9.32.0/tests/test_wave70_squish_runtime.py +627 -0
  197. squish_ai-9.32.0/tests/test_wave72_quantize_fix.py +919 -0
  198. squish_ai-9.32.0/tests/test_wave72_resquish.py +393 -0
  199. squish_ai-9.32.0/tests/test_wave74_run_polish.py +250 -0
  200. squish_ai-9.32.0/tests/test_wave74_web_ui.py +39 -0
  201. squish_ai-9.32.0/tests/test_wave75_perf_foundations.py +411 -0
  202. squish_ai-9.32.0/tests/test_wave76_agent_tools.py +526 -0
  203. squish_ai-9.32.0/tests/test_wave78_perf_quality.py +576 -0
  204. squish_ai-9.32.0/tests/test_wave79_auto_profile.py +461 -0
  205. squish_ai-9.32.0/tests/test_wave79_startup_inference.py +283 -0
  206. squish_ai-9.32.0/tests/test_wave80_chunk_fingerprint.py +336 -0
  207. squish_ai-9.32.0/tests/test_wave81_blazing_m3.py +429 -0
  208. squish_ai-9.32.0/tests/test_wave81_orjson_sse.py +366 -0
  209. squish_ai-9.32.0/tests/test_wave82_autoload_eagle3.py +473 -0
  210. squish_ai-9.32.0/tests/test_wave82_ux_polish.py +343 -0
  211. squish_ai-9.32.0/tests/test_wave85_color_dedup.py +381 -0
  212. squish_ai-9.32.0/tests/test_wave86_observability.py +312 -0
  213. squish_ai-9.32.0/tests/test_wave87_agent_tools.py +208 -0
  214. squish_ai-9.32.0/tests/test_wave88_api_compat.py +311 -0
  215. squish_ai-9.32.0/tests/test_wave89_local_model_scan.py +448 -0
  216. squish_ai-9.32.0/tests/test_wave90_startup_lean.py +296 -0
  217. squish_ai-9.32.0/tests/test_wave91_performance.py +266 -0
  218. squish_ai-9.32.0/tests/test_wave92_presquish.py +244 -0
  219. squish_ai-9.32.0/tests/test_wave93_squishbar.py +238 -0
  220. squish_ai-9.32.0/tests/test_wave95_ps_logs.py +418 -0
  221. squish_ai-9.32.0/tests/test_wave95_release.py +261 -0
  222. squish_ai-9.32.0/tests/test_wave96_lm_studio.py +404 -0
  223. squish_ai-9.32.0/tests/test_wave97_inference_fixes.py +258 -0
  224. squish_ai-9.32.0/tests/test_wave98_lean_server.py +251 -0
  225. squish_ai-9.32.0/tests/test_wave99_speed_restore.py +294 -0
@@ -0,0 +1,104 @@
1
+ Business Source License 1.1
2
+
3
+ Parameters
4
+
5
+ Licensor: squishai
6
+ Licensed Work: squish
7
+ The Licensed Work is (c) 2025 squishai
8
+ Additional Use Grant: You may make production use of the Licensed Work,
9
+ provided that such use is for any of the following
10
+ purposes: (i) personal or household use; (ii)
11
+ non-commercial research, academic study, or
12
+ educational purposes; (iii) evaluation or testing of
13
+ the Licensed Work for potential adoption, provided
14
+ that such evaluation does not itself constitute a
15
+ production commercial deployment. You may also make
16
+ any non-production use of the Licensed Work (including
17
+ development, staging, and internal testing).
18
+ Change Date: 2030-01-01
19
+ Change License: MIT
20
+
21
+ For information about alternative commercial licensing arrangements for the
22
+ Licensed Work, please contact: wes@squish.run
23
+
24
+ -----------------------------------------------------------------------------
25
+
26
+ Notice
27
+
28
+ The Business Source License (this document, or the "License") is not an Open
29
+ Source license. However, the Licensed Work will eventually be made available
30
+ under an Open Source License, as stated in this License.
31
+
32
+ License text copyright (c) 2017 MariaDB Corporation Ab, All Rights Reserved.
33
+ "Business Source License" is a trademark of MariaDB Corporation Ab.
34
+
35
+ -----------------------------------------------------------------------------
36
+
37
+ Business Source License 1.1
38
+
39
+ Terms
40
+
41
+ The Licensor hereby grants you the right to copy, modify, create derivative
42
+ works, redistribute, and make non-production use of the Licensed Work. The
43
+ Licensor may make an Additional Use Grant, above, permitting limited
44
+ production use.
45
+
46
+ Effective on the Change Date, or the fourth anniversary of the first publicly
47
+ available distribution of a specific version of the Licensed Work under this
48
+ License, whichever comes first, the Licensor hereby grants you rights under
49
+ the terms of the Change License, and the rights granted in the paragraph
50
+ above terminate.
51
+
52
+ If your use of the Licensed Work does not comply with the requirements
53
+ currently in effect as described in this License, you must purchase a
54
+ commercial license from the Licensor, its affiliated entities, or authorized
55
+ resellers, or you must refrain from using the Licensed Work.
56
+
57
+ All copies of the original and modified Licensed Work, and derivative works
58
+ of the Licensed Work, are subject to this License. This License applies
59
+ separately for each version of the Licensed Work and the Change Date may vary
60
+ for each version of the Licensed Work released by Licensor.
61
+
62
+ You must conspicuously display this License on each original or modified copy
63
+ of the Licensed Work. If you receive the Licensed Work in original or modified
64
+ form from a third party, the terms and conditions set forth in this License
65
+ apply to your use of that work.
66
+
67
+ Any use of the Licensed Work in violation of this License will automatically
68
+ terminate your rights under this License for the current and all other
69
+ versions of the Licensed Work.
70
+
71
+ This License does not grant you any right in any trademark or logo of
72
+ Licensor or its affiliates (provided that you may use a trademark or logo of
73
+ Licensor as expressly required by this License).
74
+
75
+ TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON
76
+ AN "AS IS" BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS,
77
+ EXPRESS OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF
78
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND
79
+ TITLE.
80
+
81
+ MariaDB hereby grants you permission to use this License's text to license
82
+ your works, and to refer to it using the trademark "Business Source License",
83
+ as long as you comply with the Covenants of Licensor below.
84
+
85
+ Covenants of Licensor
86
+
87
+ In consideration of the right to use this License's text and the "Business
88
+ Source License" name and trademark, Licensor covenants to MariaDB, and to
89
+ all recipients of the licensed work to be provided by Licensor:
90
+
91
+ 1. To specify as the Change License the GPL Version 2.0 or any later
92
+ version, or a license that is compatible with GPL Version 2.0 or a later
93
+ version, where "compatible" means that software provided under the Change
94
+ License can be included in a program with software provided under GPL
95
+ Version 2.0 or a later version. Licensor may specify additional Change
96
+ Licenses as an alternative to the Change License.
97
+
98
+ 2. To either: (a) specify an additional grant of rights to use that does not
99
+ impose any additional restriction on the right granted in this License,
100
+ as the Additional Use Grant; or (b) insert the text "None".
101
+
102
+ 3. To specify a Change Date.
103
+
104
+ 4. Not to modify this License in any other way.
@@ -0,0 +1,290 @@
1
+ Metadata-Version: 2.4
2
+ Name: squish-ai
3
+ Version: 9.32.0
4
+ Summary: Local LLM inference server for Apple Silicon. Block-level paged KV cache for long-context workloads. 5.4× faster end-to-end on 4K-token prompts vs Ollama, less RAM, INT3 support for Qwen3. OpenAI-compatible API.
5
+ License: BUSL-1.1
6
+ Project-URL: Homepage, https://github.com/wesleyscholl/squish
7
+ Project-URL: Bug Tracker, https://github.com/wesleyscholl/squish/issues
8
+ Project-URL: Documentation, https://wesleyscholl.github.io/squish
9
+ Keywords: llm,inference,quantization,apple-silicon,mlx,speculative-decoding
10
+ Classifier: Development Status :: 5 - Production/Stable
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: Other/Proprietary License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Requires-Python: >=3.10
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: mlx>=0.18; sys_platform == "darwin" and platform_machine == "arm64"
23
+ Requires-Dist: mlx-lm>=0.19; sys_platform == "darwin" and platform_machine == "arm64"
24
+ Requires-Dist: numpy>=1.26
25
+ Requires-Dist: safetensors>=0.4
26
+ Requires-Dist: fastapi>=0.111
27
+ Requires-Dist: uvicorn[standard]>=0.29
28
+ Requires-Dist: sse-starlette>=1.8
29
+ Requires-Dist: huggingface-hub>=0.23
30
+ Requires-Dist: transformers>=4.40
31
+ Requires-Dist: zstandard>=0.22
32
+ Requires-Dist: rich>=13.0
33
+ Requires-Dist: orjson>=3.9
34
+ Provides-Extra: quant
35
+ Provides-Extra: retrieval
36
+ Requires-Dist: hnswlib>=0.8; extra == "retrieval"
37
+ Provides-Extra: llmlingua
38
+ Requires-Dist: llmlingua>=0.2; extra == "llmlingua"
39
+ Provides-Extra: grammar
40
+ Requires-Dist: xgrammar>=0.1; extra == "grammar"
41
+ Provides-Extra: cache
42
+ Requires-Dist: sqlite-vec>=0.1; extra == "cache"
43
+ Provides-Extra: whatsapp
44
+ Provides-Extra: eval
45
+ Requires-Dist: lm-eval>=0.4; extra == "eval"
46
+ Requires-Dist: datasets>=2.18; extra == "eval"
47
+ Requires-Dist: accelerate>=0.29; extra == "eval"
48
+ Requires-Dist: sacrebleu; extra == "eval"
49
+ Requires-Dist: rouge_score; extra == "eval"
50
+ Requires-Dist: nltk; extra == "eval"
51
+ Provides-Extra: linux
52
+ Requires-Dist: torch>=2.0; extra == "linux"
53
+ Provides-Extra: dev
54
+ Requires-Dist: pytest>=8; extra == "dev"
55
+ Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
56
+ Requires-Dist: pytest-timeout>=2.3; extra == "dev"
57
+ Requires-Dist: httpx>=0.27; extra == "dev"
58
+ Requires-Dist: maturin>=1.5; extra == "dev"
59
+ Requires-Dist: ruff>=0.4; extra == "dev"
60
+ Requires-Dist: mypy>=1.10; extra == "dev"
61
+ Requires-Dist: mkdocs-material>=9.5; extra == "dev"
62
+ Requires-Dist: mkdocs-exclude>=1.0; extra == "dev"
63
+ Requires-Dist: cryptography>=42.0; extra == "dev"
64
+ Dynamic: license-file
65
+
66
+ # Squish
67
+
68
+ **Local LLM inference for Apple Silicon. Faster end-to-end response on long contexts, less RAM, INT3 support.**
69
+
70
+ [![License: BUSL-1.1](https://img.shields.io/badge/License-BUSL--1.1-blue.svg)](LICENSE)
71
+ [![PyPI version](https://img.shields.io/pypi/v/squish-ai.svg)](https://pypi.org/project/squish-ai/)
72
+ [![CI](https://github.com/konjoai/squish/actions/workflows/ci.yml/badge.svg)](https://github.com/konjoai/squish/actions/workflows/ci.yml)
73
+ [![Platform](https://img.shields.io/badge/platform-Apple%20Silicon-lightgrey.svg)](https://github.com/konjoai/squish)
74
+ [![HuggingFace](https://img.shields.io/badge/🤗%20Models-squish--community-yellow)](https://huggingface.co/squish-community)
75
+
76
+ <img src="assets/squish-logo-1.png" height="320" alt="Squish Logo"/>
77
+
78
+ ---
79
+
80
+ ## The Numbers (v9.32.0 / bench v5.1.1)
81
+
82
+ Measured 2026-06-02 on Apple M3 MacBook Pro, 16 GB unified memory.
83
+ Model: Qwen2.5-7B-Instruct. Quant: INT4 (squish) / Q4_K_M (Ollama).
84
+ Five-run medians. Raw artifacts in [`results/benchmarks_v5_1_1/`](results/benchmarks_v5_1_1/).
85
+
86
+ | Metric | Ollama 0.18.2 | **Squish (recommended)** |
87
+ |---|---:|---:|
88
+ | **E2E response @ 4000-token prompt** | 69.63 s | **12.78 s** &nbsp;_(5.4× faster)_ |
89
+ | **E2E response @ 75-token prompt** | 8.09 s | **5.50 s** &nbsp;_(1.5× faster)_ |
90
+ | **Peak RAM during inference** | ~5 GB | **3.36 GB** |
91
+ | **Disk size — INT4** | 4.36 GB | **4.00 GB** |
92
+ | **Disk size — INT3 (Qwen3)** | not supported | **3.56 GB** |
93
+ | **TTFT @ 75-token prompt** | **131 ms** | 279 ms &nbsp;_(honest loss)_ |
94
+
95
+ **Squish wins end-to-end response time at every prompt size measured**, with
96
+ the largest win on long contexts (5.4× at 4000 tokens), uses ~33% less RAM,
97
+ and supports INT3 for compatible model families.
98
+
99
+ **Ollama wins time-to-first-token at every prompt size**, and has lower inter-token
100
+ jitter on long contexts. If first-byte latency matters more than full-response
101
+ latency, Ollama is the right tool.
102
+
103
+ Full table, methodology, and ablation: [`docs/RESULTS.md`](docs/RESULTS.md)
104
+ (v5.1.1 section).
105
+
106
+ ---
107
+
108
+ ## Why Squish
109
+
110
+ Squish is for the workload most local-LLM tools aren't tuned for: **the same
111
+ model called many times an hour from the terminal with shifting context** —
112
+ git-commit-message generation, code-review prompts, agent loops, multi-turn
113
+ chat, document Q&A.
114
+
115
+ On a 16 GB Mac, that workload collides with the rest of your work. Ollama
116
+ keeps ~5 GB resident and pays a long prefill cost on each new long prompt.
117
+ Squish is a persistent daemon: the model loads once when the daemon starts,
118
+ and a two-cache architecture (block-paged KV cache for shifting prefixes,
119
+ prompt KV cache for exact repeats) avoids re-prefilling work the daemon has
120
+ already done.
121
+
122
+ Designed for one developer on one machine. Not a production multi-tenant API.
123
+
124
+ ---
125
+
126
+ ## Install
127
+
128
+ ```bash
129
+ # PyPI
130
+ pip install squish-ai
131
+
132
+ # Homebrew tap (coming with v9.32.0)
133
+ brew tap konjoai/squish
134
+ brew install squish
135
+
136
+ # From source
137
+ git clone https://github.com/konjoai/squish
138
+ cd squish
139
+ pip install -e .
140
+ ```
141
+
142
+ Requirements: macOS 13+, Apple Silicon (M1–M5), Python 3.10+.
143
+
144
+ ---
145
+
146
+ ## Quick Start
147
+
148
+ ```bash
149
+ # Pull a pre-quantised model from the catalog
150
+ squish pull qwen2.5-7b-int4
151
+
152
+ # Start the daemon with both caches enabled (recommended config)
153
+ squish run qwen2.5-7b-int4 \
154
+ --block-kv-cache ~/.cache/squish/blocks \
155
+ --prompt-kv-cache ~/.cache/squish/pkv \
156
+ --port 8080
157
+ ```
158
+
159
+ Call it through the OpenAI-compatible API:
160
+
161
+ ```bash
162
+ curl http://localhost:8080/v1/chat/completions \
163
+ -H "Content-Type: application/json" \
164
+ -d '{
165
+ "model": "qwen2.5-7b-int4",
166
+ "messages": [{"role": "user", "content": "Hello"}]
167
+ }'
168
+ ```
169
+
170
+ Or point any OpenAI / Ollama client at it:
171
+
172
+ ```bash
173
+ export OPENAI_BASE_URL=http://localhost:8080/v1
174
+ export OPENAI_API_KEY=squish
175
+ # Ollama-compatible /api/* endpoints also work
176
+ export OLLAMA_HOST=http://localhost:8080
177
+ ```
178
+
179
+ Install the macOS LaunchAgent so the daemon starts at login:
180
+
181
+ ```bash
182
+ squish daemon install
183
+ ```
184
+
185
+ The **SquishBar** menu-bar app (`apps/macos/SquishBar/`) ships alongside the
186
+ daemon — model picker, load progress, and a global hotkey for the chat panel.
187
+ Build it from Xcode or grab the signed `.app` from the GitHub release page.
188
+
189
+ ---
190
+
191
+ ## Configuration
192
+
193
+ | Flag | Purpose |
194
+ |---|---|
195
+ | `--block-kv-cache <DIR>` | Block-paged KV cache for shifting-prefix workloads (agents, multi-turn). Persists across daemon restarts via `.safetensors` blocks. |
196
+ | `--prompt-kv-cache <DIR>` | Exact-prompt KV cache. Single-digit-millisecond TTFT on verbatim repeats. |
197
+ | `--block-kv-size N` | Block size in tokens (default 64). |
198
+ | `--draft-model <MODEL>` | Speculative-decode draft model (opt-in; see [v5.2 diagnosis](results/benchmarks_v5_2/SPEC_DECODE_DIAGNOSIS.md) for current status — net-negative on M3 INT4 with the draft models tested, kept off by default). |
199
+ | `--draft-depth N` | Speculative decode depth K. |
200
+ | `--no-spec`, `--no-cache` | Disable flags, intended for benchmark controls. |
201
+ | `squish daemon install` / `uninstall` | macOS LaunchAgent integration. |
202
+
203
+ Picking the right cache for your workload:
204
+
205
+ - **Exact-prompt repeats** (cached scripts, fixed templates, automated jobs):
206
+ `--prompt-kv-cache` alone. ~9 ms TTFT on a cache hit.
207
+ - **Shifting-prefix workloads** (agents, multi-turn conversations):
208
+ `--block-kv-cache` alone, or combined config.
209
+ - **General use without knowing the workload**: combined config (both caches
210
+ enabled). Best end-to-end completion time across prompt sizes.
211
+
212
+ The combined config currently doesn't inherit the prompt KV cache's (PKV) fast-hit TTFT due to a
213
+ lookup ordering issue documented in
214
+ [`results/benchmarks_v5_1_1/DIAGNOSIS.md`](results/benchmarks_v5_1_1/DIAGNOSIS.md);
215
+ reordering is tracked as a v5.2 follow-up.
216
+
217
+ ---
218
+
219
+ ## Benchmarks
220
+
221
+ Full table, methodology, ablation, jitter analysis, and raw per-run JSON:
222
+
223
+ - [`docs/RESULTS.md`](docs/RESULTS.md) — v5.1.1 section is the source of truth
224
+ - [`benchmarks/ollama_vs_squish/RESULTS.md`](benchmarks/ollama_vs_squish/RESULTS.md) — bench harness output
225
+ - [`results/benchmarks_v5_1_1/DIAGNOSIS.md`](results/benchmarks_v5_1_1/DIAGNOSIS.md) — combined-cache ordering write-up
226
+ - [`results/benchmarks_v5_1_1/JITTER_ANALYSIS.md`](results/benchmarks_v5_1_1/JITTER_ANALYSIS.md) — inter-token p95 explanation
227
+ - [`results/benchmarks_v5_2/SPEC_DECODE_DIAGNOSIS.md`](results/benchmarks_v5_2/SPEC_DECODE_DIAGNOSIS.md) — why speculative decoding is currently opt-in
228
+
229
+ Reproduce locally:
230
+
231
+ ```bash
232
+ python benchmarks/ollama_vs_squish/bench_v5_1.py
233
+ ```
234
+
235
+ ---
236
+
237
+ ## What Squish Doesn't Do
238
+
239
+ In the spirit of honesty:
240
+
241
+ - **No GPU support outside Apple Silicon.** It's MLX-based. CUDA users should use vLLM or llama.cpp.
242
+ - **No multi-user serving.** Designed for one developer, one machine — not a production API.
243
+ - **No multimodal models.** Text only.
244
+ - **Higher inter-token p95 on long prompts** than Ollama. Conscious tradeoff (deferred KV-cache restore off the TTFT critical path); details in [`JITTER_ANALYSIS.md`](results/benchmarks_v5_1_1/JITTER_ANALYSIS.md).
245
+ - **Slower first-token on short prompts** than Ollama. Fundamental MLX prefill kernel cost.
246
+ - **Model conversion is slow and not user-friendly.** Squish needs models in its own format. Conversion takes time and isn't fully automated.
247
+
248
+ If any of those matter for your workflow, Ollama or LM Studio is the right choice.
249
+
250
+ ---
251
+
252
+ ## Architecture
253
+
254
+ **Persistent daemon.** The model loads once when the daemon starts and stays
255
+ resident. Per-invocation model-load cost becomes a once-per-login cost.
256
+
257
+ **Two-cache architecture.** A block-paged KV cache stores KV state for
258
+ fixed-size token blocks on disk (`.safetensors`) and reconstructs partial-match
259
+ prefixes for shifting-prefix workloads. A prompt KV cache catches exact-prompt
260
+ repeats with single-digit-millisecond TTFT.
261
+
262
+ **INT3 quantization with a hard-block list.** INT3 behaviour is not uniform
263
+ across model families. Qwen3 holds within ~1pp of FP16; Gemma-3 collapses
264
+ (~15pp on common benchmarks). Squish enables INT3 only for families where it's
265
+ safe and hard-blocks the rest. Try to load Gemma-3 at INT3 and the accuracy
266
+ gate refuses — you can't accidentally ship a config that quietly degrades.
267
+
268
+ ---
269
+
270
+ ## Contributing
271
+
272
+ See [CONTRIBUTING.md](CONTRIBUTING.md). Issues, benchmarks, and PRs welcome.
273
+ The bench harness lives in `benchmarks/ollama_vs_squish/`; if you re-run on
274
+ different hardware, please share the raw JSON output.
275
+
276
+ ---
277
+
278
+ ## License
279
+
280
+ BUSL-1.1 — see [LICENSE](LICENSE).
281
+
282
+ ---
283
+
284
+ ## Links
285
+
286
+ - Article: _Local LLM Server That Wins End-to-End on Long Contexts_ — in progress
287
+ - Org: [konjoai](https://github.com/konjoai) · [konjoai.org](https://konjoai.org)
288
+ - Related: [Kohaku](https://github.com/konjoai/kohaku), [Vectro](https://github.com/konjoai/vectro), [Squash](https://github.com/konjoai/squash) (EU AI Act compliance, extracted from squish in v9.15.0)
289
+ - HuggingFace models: [huggingface.co/squish-community](https://huggingface.co/squish-community)
290
+ - Module reference: [MODULES.md](MODULES.md)
@@ -0,0 +1,225 @@
1
+ # Squish
2
+
3
+ **Local LLM inference for Apple Silicon. Faster end-to-end response on long contexts, less RAM, INT3 support.**
4
+
5
+ [![License: BUSL-1.1](https://img.shields.io/badge/License-BUSL--1.1-blue.svg)](LICENSE)
6
+ [![PyPI version](https://img.shields.io/pypi/v/squish-ai.svg)](https://pypi.org/project/squish-ai/)
7
+ [![CI](https://github.com/konjoai/squish/actions/workflows/ci.yml/badge.svg)](https://github.com/konjoai/squish/actions/workflows/ci.yml)
8
+ [![Platform](https://img.shields.io/badge/platform-Apple%20Silicon-lightgrey.svg)](https://github.com/konjoai/squish)
9
+ [![HuggingFace](https://img.shields.io/badge/🤗%20Models-squish--community-yellow)](https://huggingface.co/squish-community)
10
+
11
+ <img src="assets/squish-logo-1.png" height="320" alt="Squish Logo"/>
12
+
13
+ ---
14
+
15
+ ## The Numbers (v9.32.0 / bench v5.1.1)
16
+
17
+ Measured 2026-06-02 on Apple M3 MacBook Pro, 16 GB unified memory.
18
+ Model: Qwen2.5-7B-Instruct. Quant: INT4 (squish) / Q4_K_M (Ollama).
19
+ Five-run medians. Raw artifacts in [`results/benchmarks_v5_1_1/`](results/benchmarks_v5_1_1/).
20
+
21
+ | Metric | Ollama 0.18.2 | **Squish (recommended)** |
22
+ |---|---:|---:|
23
+ | **E2E response @ 4000-token prompt** | 69.63 s | **12.78 s** &nbsp;_(5.4× faster)_ |
24
+ | **E2E response @ 75-token prompt** | 8.09 s | **5.50 s** &nbsp;_(1.5× faster)_ |
25
+ | **Peak RAM during inference** | ~5 GB | **3.36 GB** |
26
+ | **Disk size — INT4** | 4.36 GB | **4.00 GB** |
27
+ | **Disk size — INT3 (Qwen3)** | not supported | **3.56 GB** |
28
+ | **TTFT @ 75-token prompt** | **131 ms** | 279 ms &nbsp;_(honest loss)_ |
29
+
30
+ **Squish wins end-to-end response time at every prompt size measured**, with
31
+ the largest win on long contexts (5.4× at 4000 tokens), uses ~33% less RAM,
32
+ and supports INT3 for compatible model families.
33
+
34
+ **Ollama wins time-to-first-token at every prompt size**, and has lower inter-token
35
+ jitter on long contexts. If first-byte latency matters more than full-response
36
+ latency, Ollama is the right tool.
37
+
38
+ Full table, methodology, and ablation: [`docs/RESULTS.md`](docs/RESULTS.md)
39
+ (v5.1.1 section).
40
+
41
+ ---
42
+
43
+ ## Why Squish
44
+
45
+ Squish is for the workload most local-LLM tools aren't tuned for: **the same
46
+ model called many times an hour from the terminal with shifting context** —
47
+ git-commit-message generation, code-review prompts, agent loops, multi-turn
48
+ chat, document Q&A.
49
+
50
+ On a 16 GB Mac, that workload collides with the rest of your work. Ollama
51
+ keeps ~5 GB resident and pays a long prefill cost on each new long prompt.
52
+ Squish is a persistent daemon: the model loads once when the daemon starts,
53
+ and a two-cache architecture (block-paged KV cache for shifting prefixes,
54
+ prompt KV cache for exact repeats) avoids re-prefilling work the daemon has
55
+ already done.
56
+
57
+ Designed for one developer on one machine. Not a production multi-tenant API.
58
+
59
+ ---
60
+
61
+ ## Install
62
+
63
+ ```bash
64
+ # PyPI
65
+ pip install squish-ai
66
+
67
+ # Homebrew tap (coming with v9.32.0)
68
+ brew tap konjoai/squish
69
+ brew install squish
70
+
71
+ # From source
72
+ git clone https://github.com/konjoai/squish
73
+ cd squish
74
+ pip install -e .
75
+ ```
76
+
77
+ Requirements: macOS 13+, Apple Silicon (M1–M5), Python 3.10+.
78
+
79
+ ---
80
+
81
+ ## Quick Start
82
+
83
+ ```bash
84
+ # Pull a pre-quantised model from the catalog
85
+ squish pull qwen2.5-7b-int4
86
+
87
+ # Start the daemon with both caches enabled (recommended config)
88
+ squish run qwen2.5-7b-int4 \
89
+ --block-kv-cache ~/.cache/squish/blocks \
90
+ --prompt-kv-cache ~/.cache/squish/pkv \
91
+ --port 8080
92
+ ```
93
+
94
+ Query it through the OpenAI-compatible API:
95
+
96
+ ```bash
97
+ curl http://localhost:8080/v1/chat/completions \
98
+ -H "Content-Type: application/json" \
99
+ -d '{
100
+ "model": "qwen2.5-7b-int4",
101
+ "messages": [{"role": "user", "content": "Hello"}]
102
+ }'
103
+ ```
104
+
105
+ Or point any OpenAI / Ollama client at it:
106
+
107
+ ```bash
108
+ export OPENAI_BASE_URL=http://localhost:8080/v1
109
+ export OPENAI_API_KEY=squish
110
+ # Ollama-compatible /api/* endpoints also work
111
+ export OLLAMA_HOST=http://localhost:8080
112
+ ```
113
+
114
+ Install the macOS LaunchAgent so the daemon starts at login:
115
+
116
+ ```bash
117
+ squish daemon install
118
+ ```
119
+
120
+ The **SquishBar** menu-bar app (`apps/macos/SquishBar/`) ships alongside the
121
+ daemon — model picker, load progress, and a global hotkey for the chat panel.
122
+ Build it from Xcode or grab the signed `.app` from the GitHub release page.
123
+
124
+ ---
125
+
126
+ ## Configuration
127
+
128
+ | Flag | Purpose |
129
+ |---|---|
130
+ | `--block-kv-cache <DIR>` | Block-paged KV cache for shifting-prefix workloads (agents, multi-turn). Persists across daemon restarts via `.safetensors` blocks. |
131
+ | `--prompt-kv-cache <DIR>` | Exact-prompt KV cache. Single-digit-millisecond TTFT on verbatim repeats. |
132
+ | `--block-kv-size N` | Block size in tokens (default 64). |
133
+ | `--draft-model <MODEL>` | Speculative-decode draft model (opt-in; see [v5.2 diagnosis](results/benchmarks_v5_2/SPEC_DECODE_DIAGNOSIS.md) for current status — net-negative on M3 INT4 with the draft models tested, kept off by default). |
134
+ | `--draft-depth N` | Speculative decode depth K. |
135
+ | `--no-spec`, `--no-cache` | Disable flags, intended for benchmark controls. |
136
+ | `squish daemon install` / `uninstall` | macOS LaunchAgent integration. |
137
+
138
+ Picking the right cache for your workload:
139
+
140
+ - **Exact-prompt repeats** (cached scripts, fixed templates, automated jobs):
141
+ `--prompt-kv-cache` alone. ~9 ms TTFT on a cache hit.
142
+ - **Shifting-prefix workloads** (agents, multi-turn conversations):
143
+ `--block-kv-cache` alone, or the combined config.
144
+ - **General use without knowing the workload**: combined config (both caches
145
+ enabled). Best end-to-end completion time across prompt sizes.
146
+
147
+ The combined config currently doesn't inherit the prompt KV cache's fast-hit TTFT due to a
148
+ lookup ordering issue documented in
149
+ [`results/benchmarks_v5_1_1/DIAGNOSIS.md`](results/benchmarks_v5_1_1/DIAGNOSIS.md);
150
+ reordering is tracked as a v5.2 follow-up.
151
+
152
+ ---
153
+
154
+ ## Benchmarks
155
+
156
+ Full table, methodology, ablation, jitter analysis, and raw per-run JSON:
157
+
158
+ - [`docs/RESULTS.md`](docs/RESULTS.md) — v5.1.1 section is the source of truth
159
+ - [`benchmarks/ollama_vs_squish/RESULTS.md`](benchmarks/ollama_vs_squish/RESULTS.md) — bench harness output
160
+ - [`results/benchmarks_v5_1_1/DIAGNOSIS.md`](results/benchmarks_v5_1_1/DIAGNOSIS.md) — combined-cache ordering write-up
161
+ - [`results/benchmarks_v5_1_1/JITTER_ANALYSIS.md`](results/benchmarks_v5_1_1/JITTER_ANALYSIS.md) — inter-token p95 explanation
162
+ - [`results/benchmarks_v5_2/SPEC_DECODE_DIAGNOSIS.md`](results/benchmarks_v5_2/SPEC_DECODE_DIAGNOSIS.md) — why speculative decoding is currently opt-in
163
+
164
+ Reproduce locally:
165
+
166
+ ```bash
167
+ python benchmarks/ollama_vs_squish/bench_v5_1.py
168
+ ```
169
+
170
+ ---
171
+
172
+ ## What Squish Doesn't Do
173
+
174
+ In the spirit of honesty:
175
+
176
+ - **No GPU support outside Apple Silicon.** It's MLX-based. CUDA users should use vLLM or llama.cpp.
177
+ - **No multi-user serving.** Designed for one developer, one machine — not a production API.
178
+ - **No multimodal models.** Text only.
179
+ - **Higher inter-token p95 on long prompts** than Ollama. Conscious tradeoff (deferred KV-cache restore off the TTFT critical path); details in [`JITTER_ANALYSIS.md`](results/benchmarks_v5_1_1/JITTER_ANALYSIS.md).
180
+ - **Slower time-to-first-token on short prompts** than Ollama. Fundamental MLX prefill kernel cost.
181
+ - **Model conversion is slow and not user-friendly.** Squish needs models in its own format. Conversion takes time and isn't fully automated.
182
+
183
+ If any of those matter for your workflow, Ollama or LM Studio is the right choice.
184
+
185
+ ---
186
+
187
+ ## Architecture
188
+
189
+ **Persistent daemon.** The model loads once when the daemon starts and stays
190
+ resident. Per-invocation model-load cost becomes a once-per-login cost.
191
+
192
+ **Two-cache architecture.** A block-paged KV cache stores KV state for
193
+ fixed-size token blocks on disk (`.safetensors`) and reconstructs partial-match
194
+ prefixes for shifting-prefix workloads. A prompt KV cache catches exact-prompt
195
+ repeats with single-digit-millisecond TTFT.
196
+
197
+ **INT3 quantization with a hard-block list.** INT3 behaviour is not uniform
198
+ across model families. Qwen3 holds within ~1pp of FP16; Gemma-3 collapses
199
+ (~15pp on common benchmarks). Squish enables INT3 only for families where it's
200
+ safe and hard-blocks the rest. Try to load Gemma-3 at INT3 and the accuracy
201
+ gate refuses — you can't accidentally ship a config that quietly degrades.
202
+
203
+ ---
204
+
205
+ ## Contributing
206
+
207
+ See [CONTRIBUTING.md](CONTRIBUTING.md). Issues, benchmarks, and PRs welcome.
208
+ The bench harness lives in `benchmarks/ollama_vs_squish/`; if you re-run on
209
+ different hardware, please share the raw JSON output.
210
+
211
+ ---
212
+
213
+ ## License
214
+
215
+ BUSL-1.1 — see [LICENSE](LICENSE).
216
+
217
+ ---
218
+
219
+ ## Links
220
+
221
+ - Article: _Local LLM Server That Wins End-to-End on Long Contexts_ — in progress
222
+ - Org: [konjoai](https://github.com/konjoai) · [konjoai.org](https://konjoai.org)
223
+ - Related: [Kohaku](https://github.com/konjoai/kohaku), [Vectro](https://github.com/konjoai/vectro), [Squash](https://github.com/konjoai/squash) (EU AI Act compliance, extracted from squish in v9.15.0)
224
+ - HuggingFace models: [huggingface.co/squish-community](https://huggingface.co/squish-community)
225
+ - Module reference: [MODULES.md](MODULES.md)