PyPI - mlx-code - Versions diffs - 0.0.35__tar.gz → 0.0.36__tar.gz - Mend

mlx-code 0.0.35tar.gz → 0.0.36tar.gz

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of each package version as they appear in their respective public registries.

Files changed (38) hide show

{mlx_code-0.0.35 → mlx_code-0.0.36}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mlx-code
-Version: 0.0.35
+Version: 0.0.36
 Summary: Coding Agent for Mac
 Home-page: https://josefalbers.github.io/mlx-code/
 Author: J Joe
@@ -13,7 +13,7 @@ Requires-Python: >=3.12.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: mlx-lm>=0.31.3; platform_system == "Darwin"
-Requires-Dist: vllm; platform_system != "Darwin"
+Requires-Dist: transformers; platform_system != "Darwin"
 Requires-Dist: httpx
 Requires-Dist: pydantic
 Provides-Extra: all

{mlx_code-0.0.35 → mlx_code-0.0.36}/mlx_code/bats.py RENAMED Viewed

@@ -11,9 +11,9 @@ _M='inserted'
 _L='submit_ts'
 _K='cancel'
 _J='asyncio.Queue'
-_I=False
-_H='seed_none'
-_G='submitted'
+_I='seed_none'
+_H='submitted'
+_G=False
 _F='insert_ts'
 _E='rid'
 _D='out'
@@ -36,7 +36,7 @@ DEFAULT_SKIPS=['(?m)^\\[SUGGESTION MODE[\\s\\S]*','(?m)^<system-reminder>[\\s\\S
 PREFILL_STEP=2048
 MAX_ENGINE_ERRORS=3
 class Tracer:
-	_COUNTERS=_G,_M,_N,_O,_P,_Q,_R,_H,_S,_T
+	_COUNTERS=_H,_M,_N,_O,_P,_Q,_R,_I,_S,_T
 	def __init__(A,path:Optional[str]=_A):
 		B=path;A._f=open(B,'a',buffering=1)if B else _A;A._lock=threading.Lock();A._c={A:0 for A in A._COUNTERS}
 		if B:logger.info('[bats] tracing to %s',B)
@@ -54,10 +54,10 @@ class CheckpointStore:
 	def load(A,prefix_tokens):from mlx_lm.models.cache import load_prompt_cache as B;C,D=B(str(A.path(prefix_tokens)),return_metadata=_B);return C
 	def save(C,prefix_tokens,cache)->bool:
 		from mlx_lm.models.cache import save_prompt_cache as D;A=C.path(prefix_tokens)
-		if A.exists():return _I
+		if A.exists():return _G
 		B=A.with_name(A.stem+'.tmp.safetensors');D(str(B),cache);B.replace(A);return _B
 def _make_think_guard(te:int,max_tokens:int,prompt_len:int):
-	F=prompt_len;D='closed';C='gen_tokens';from.main import is_stuck as H;I=int(max_tokens*.9);A={D:_I,C:[]}
+	F=prompt_len;D='closed';C='gen_tokens';from.main import is_stuck as H;I=int(max_tokens*.9);A={D:_G,C:[]}
 	def B(tokens,logits):
 		G=tokens;B=logits
 		if A[D]:return B
@@ -176,67 +176,52 @@ class Engine:
 		try:A.bg.remove(B)
 		except Exception:pass
 def make_batch_app(model_name:str,cache_dir:str=_U,*,system:Optional[str]=_A,tool_names:Optional[list]=_A,skips:Optional[list]=_A,think_tags:Optional[list]=_A,ram_seed_slots:int=4,completion_batch_size:int=32,prefill_batch_size:int=8,trace_path:Optional[str]=_A)->Starlette:
-	P='GET';V='submit';U='empty prompt';T='max_tokens';S='text';R='model not loaded';Q='disk';O=ram_seed_slots;N='model';M='none';L='store';K='tok';J=trace_path;I='ram';H='engine';F=skips;E=model_name;D='error';B='POST';C=think_tags;F=DEFAULT_SKIPS if F is _A else F;C=['<think>','</think>']if C is _A else C;J=J or os.environ.get('BATS_TRACE');A:dict[str,Any]={H:_A,K:_A,L:_A,I:_A}
+	M='submit';L='disk';K=ram_seed_slots;J='none';I='store';H='tok';G=trace_path;F='ram';E=skips;B='engine';D=model_name;C=think_tags;from.import main as N;E=DEFAULT_SKIPS if E is _A else E;C=['<think>','</think>']if C is _A else C;G=G or os.environ.get('BATS_TRACE');A:dict[str,Any]={B:_A,H:_A,I:_A,F:_A}
 	@asynccontextmanager
-	async def W(_app):
-		from mlx_lm import load;from mlx_lm.tokenizer_utils import TokenizerWrapper as G;logger.info('[bats] loading model %r',E);L,B=load(E)
-		if not isinstance(B,G):B=G(B)
+	async def O(_app):
+		from mlx_lm import load;from mlx_lm.tokenizer_utils import TokenizerWrapper as H;logger.info('[bats] loading model %r',D);M,B=load(D)
+		if not isinstance(B,H):B=H(B)
 		if _new_detokenizer(B)is _new_detokenizer(B):logger.warning('[bats] detokenizer factory returned a shared instance!')
-		D=B.convert_tokens_to_ids(C[1]);H=getattr(B,'unk_token_id',_A)
-		if D is _A or H is not _A and D==H:D=_A;logger.info('[bats] no %s token; doom-loop guard disabled',C[1])
-		I=CheckpointStore(E,cache_dir);K=RamSeedCache(O);M=Tracer(J);F=Engine(L,B,I,K,D,asyncio.get_running_loop(),M,completion_batch_size=completion_batch_size,prefill_batch_size=prefill_batch_size);F.start();A.update(engine=F,tok=B,store=I,ram=K);logger.info('[bats] ready (ram_seed_slots=%d)',O)
+		E=B.convert_tokens_to_ids(C[1]);I=getattr(B,'unk_token_id',_A)
+		if E is _A or I is not _A and E==I:E=_A;logger.info('[bats] no %s token; doom-loop guard disabled',C[1])
+		J=CheckpointStore(D,cache_dir);L=RamSeedCache(K);N=Tracer(G);F=Engine(M,B,J,L,E,asyncio.get_running_loop(),N,completion_batch_size=completion_batch_size,prefill_batch_size=prefill_batch_size);F.start();A.update(engine=F,tok=B,store=J,ram=L);logger.info('[bats] ready (ram_seed_slots=%d)',K)
 		try:yield
 		finally:F.stop()
-	def d(prompt,ckpts):
-		C=prompt;D,E=A[L],A[I]
-		def H(c):A=C[:c];return E.get(hash_tokens(A))is not _A or D.path(A).exists()
-		B=choose_seed_ckpt(ckpts,H)
-		if B==0:return _A,0,M
-		F=C[:B];G=E.get(hash_tokens(F))
-		if G is not _A:return G,B,I
-		try:return D.load(F),B,Q
-		except Exception as J:logger.info('[bats] checkpoint load failed (%s); falling back to recompute',J);return _A,0,M
-	async def e(api,q,cancel,msg_id,in_tokens,think_tags,initial_state='thinking'):
-		from.import main as A;B=A.select_adapter(api,msg_id,in_tokens);E=initial_state;F=''
+	def P(prompt,ckpts):
+		C=prompt;D,E=A[I],A[F]
+		def K(c):A=C[:c];return E.get(hash_tokens(A))is not _A or D.path(A).exists()
+		B=choose_seed_ckpt(ckpts,K)
+		if B==0:return _A,0,J
+		G=C[:B];H=E.get(hash_tokens(G))
+		if H is not _A:return H,B,F
+		try:return D.load(G),B,L
+		except Exception as M:logger.info('[bats] checkpoint load failed (%s); falling back to recompute',M);return _A,0,J
+	async def Q(q,cancel):
 		try:
-			yield B.start()
 			while _B:
-				C=await q.get()
-				if C is _A:break
-				F+=C;G,E=A.apply_think_state(B,E,C,think_tags)
-				for D in G:yield D
-			for D in A.finish_sse(B,F):yield D
+				A=await q.get()
+				if A is _A:break
+				yield A
 		finally:cancel.set()
-	async def G(request:Request):
-		L=request;from.import main as W;B=A[H]
-		if B is _A:return JSONResponse({D:R},status_code=503)
-		X=L.url.path.split('?')[0].rstrip('/');G=W.detect_api_from_path(X)
-		if G=='gemini':
-			f=str(L.url.query)or''
-			if'alt=sse'not in f and'streamGenerateContent'not in X:return JSONResponse({'candidates':[{'content':{'role':N,'parts':[{S:'{"complexity_reasoning":"local","complexity_score":50}'}]},'finishReason':'STOP'}],'usageMetadata':{'promptTokenCount':0,'candidatesTokenCount':0}})
-		O=await L.json();Y=int(O.get(T,O.get('max_completion_tokens',8192)))
-		try:E,J,g=W.encode(O,G,A[K],system,tool_names,F,think_tags=C)
-		except Exception as h:logger.exception('[bats] encode failed');return JSONResponse({D:f"encode: {h}"},status_code=500)
-		if J is _A or not E:return JSONResponse({D:U},status_code=400)
-		P=uuid.uuid4().hex[:8];i=time.perf_counter();B.tracer.count(_G);B.tracer.event(V,rid=P,api=G,prompt_tokens=len(E),n_ckpts=len(J),max_tokens=Y);j=asyncio.get_running_loop();k=time.perf_counter();l,Z,a=await j.run_in_executor(_A,d,E,J);B.tracer.count({I:_Q,Q:_R,M:_H}[a]);B.tracer.event('seed',rid=P,seed_len=Z,source=a,lookup_ms=round((time.perf_counter()-k)*1000,1));b:_J=asyncio.Queue();c=threading.Event();B.submit(_Req(E,J,l,Z,Y,b,c,P,i));m=f"msg_{uuid.uuid4().hex}";return StreamingResponse(e(G,b,c,m,len(E),C,initial_state=g),media_type='text/event-stream')
-	async def X(request:Request):
-		P='messages';B=A[H]
-		if B is _A:return JSONResponse({D:R},status_code=503)
-		C=await request.json();I=A[K];J=int(C.get(T,256))
-		if P in C:L=I.apply_chat_template(C[P],tokenize=_I,add_generation_prompt=_B)
-		else:L=C.get('prompt','')
-		E=I.encode(L)
-		if not E:return JSONResponse({D:U},status_code=400)
-		M=uuid.uuid4().hex[:8];Q=time.perf_counter();B.tracer.count(_G);B.tracer.count(_H);B.tracer.event(V,rid=M,api='generate',prompt_tokens=len(E),n_ckpts=0,max_tokens=J);F:_J=asyncio.Queue();G=threading.Event();B.submit(_Req(list(E),[],_A,0,J,F,G,M,Q))
-		if C.get('stream',_B):
-			async def W():
+	async def R(api,prompt,ckpts,*,max_tokens,temperature,top_p,rid):H=max_tokens;G=rid;E=ckpts;D=prompt;C=A[B];R=time.perf_counter();C.tracer.count(_H);C.tracer.event(M,rid=G,api=api,prompt_tokens=len(D),n_ckpts=len(E),max_tokens=H);S=asyncio.get_running_loop();T=time.perf_counter();U,I,K=await S.run_in_executor(_A,P,D,E);C.tracer.count({F:_Q,L:_R,J:_I}[K]);C.tracer.event('seed',rid=G,seed_len=I,source=K,lookup_ms=round((time.perf_counter()-T)*1000,1));N:_J=asyncio.Queue();O=threading.Event();C.submit(_Req(D,E,U,I,H,N,O,G,R));return Q(N,O)
+	async def S(request:Request):
+		Q='messages';P='error';C=A[B]
+		if C is _A:return JSONResponse({P:'model not loaded'},status_code=503)
+		D=await request.json();I=A[H];J=int(D.get('max_tokens',256))
+		if Q in D:K=I.apply_chat_template(D[Q],tokenize=_G,add_generation_prompt=_B)
+		else:K=D.get('prompt','')
+		E=I.encode(K)
+		if not E:return JSONResponse({P:'empty prompt'},status_code=400)
+		L=uuid.uuid4().hex[:8];R=time.perf_counter();C.tracer.count(_H);C.tracer.count(_I);C.tracer.event(M,rid=L,api='generate',prompt_tokens=len(E),n_ckpts=0,max_tokens=J);F:_J=asyncio.Queue();G=threading.Event();C.submit(_Req(list(E),[],_A,0,J,F,G,L,R))
+		if D.get('stream',_B):
+			async def S():
 				try:
 					while _B:
 						A=await F.get()
 						if A is _A:break
 						yield A
 				finally:G.set()
-			return StreamingResponse(W(),media_type='text/plain')
+			return StreamingResponse(S(),media_type='text/plain')
 		N=[]
 		try:
 			while _B:
@@ -244,19 +229,11 @@ def make_batch_app(model_name:str,cache_dir:str=_U,*,system:Optional[str]=_A,too
 				if O is _A:break
 				N.append(O)
 		finally:G.set()
-		return JSONResponse({S:''.join(N)})
-	async def Y(_req):A='local';return JSONResponse({'data':[{'id':A,'object':N,'created':int(time.time()),'owned_by':A}]})
-	async def Z(_req):return JSONResponse({'input_tokens':0})
-	async def a(_req):
-		C,B=A[L],A[H];F=sum(1 for A in C.cache_dir.glob('*.safetensors'))if C else 0;D={'status':'ok',N:E,'active_sequences':len(B.active)if B else 0,'checkpoint_files':F}
-		if B:D['counters']=B.tracer.snapshot()
-		return JSONResponse(D)
-	return Starlette(routes=[Route('/v1/models',Y,methods=[P]),Route('/v1/messages/count_tokens',Z,methods=[B]),Route('/v1/chat/completions',G,methods=[B]),Route('/v1/messages',G,methods=[B]),Route('/v1/responses',G,methods=[B]),Route('/v1beta/models/{rest:path}',G,methods=[B]),Route('/generate',X,methods=[B]),Route('/health',a,methods=[P])],lifespan=W)
-class BatchServer:
-	def __init__(A,app,host:str,port:int):A._server=uvicorn.Server(uvicorn.Config(app,host=host,port=port,loop='asyncio',log_level='warning'));A.host=host;A.port=port
-	def serve_forever(A):A._server.run()
-	@property
-	def started(self)->bool:return self._server.started
-	def stop(A):A._server.should_exit=_B
-def make_batch_server(host:str,port:int,model,cache_dir:str=_U,*,system:Optional[str]=_A,tool_names:Optional[list]=_A,skips:Optional[list]=_A,think_tags:Optional[list]=_A,ram_seed_slots:int=4,trace_path:Optional[str]=_A)->BatchServer:A=make_batch_app(model,cache_dir=cache_dir,system=system,tool_names=tool_names,skips=skips,think_tags=think_tags,ram_seed_slots=ram_seed_slots,trace_path=trace_path);return BatchServer(A,host,port)
+		return JSONResponse({'text':''.join(N)})
+	async def T(_req):
+		E,C=A[I],A[B];G=sum(1 for A in E.cache_dir.glob('*.safetensors'))if E else 0;F={'status':'ok','model':D,'active_sequences':len(C.active)if C else 0,'checkpoint_files':G}
+		if C:F['counters']=C.tracer.snapshot()
+		return JSONResponse(F)
+	return N.make_async_app(model_name=D,ready=lambda:A[B]is not _A,encode_tok=lambda:A[H],submit=R,system=system,tool_names=tool_names,skips=E,think_tags=C,lifespan=O,health=T,extra_routes=[('/generate',['POST'],S)])
+def make_server(host,port,model,*,cache_dir=_U,think_tags=_A,ram_seed_slots=4,trace_path=_A,fixed_port=_G):B=host;A=port;from.import main as C;A=C._find_port(B,A,fixed_port);E=make_batch_app(model,cache_dir=cache_dir,think_tags=think_tags,ram_seed_slots=ram_seed_slots,trace_path=trace_path);F=C.UvicornServer(E,B,A);D=f"http://{B}:{A}";logger.debug('batch server bound to %s',D);return F,D
 if __name__=='__main__':uvicorn.run(make_batch_app('mlx-community/Qwen3.5-4B-OptiQ-4bit'),host='0.0.0.0',port=8000)

mlx-code 0.0.35__tar.gz → 0.0.36__tar.gz

mlx-code 0.0.35tar.gz → 0.0.36tar.gz