@modular-prompt/driver 0.12.0 → 0.13.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/anthropic/anthropic-driver.d.ts +38 -8
- package/dist/anthropic/anthropic-driver.d.ts.map +1 -1
- package/dist/anthropic/anthropic-driver.js +180 -164
- package/dist/anthropic/anthropic-driver.js.map +1 -1
- package/dist/cache-controller.d.ts +28 -0
- package/dist/cache-controller.d.ts.map +1 -0
- package/dist/cache-controller.js +2 -0
- package/dist/cache-controller.js.map +1 -0
- package/dist/cache-utils.d.ts +20 -0
- package/dist/cache-utils.d.ts.map +1 -0
- package/dist/cache-utils.js +71 -0
- package/dist/cache-utils.js.map +1 -0
- package/dist/content-utils.d.ts.map +1 -1
- package/dist/content-utils.js +20 -0
- package/dist/content-utils.js.map +1 -1
- package/dist/driver-registry/config-based-factory.d.ts.map +1 -1
- package/dist/driver-registry/config-based-factory.js +7 -0
- package/dist/driver-registry/config-based-factory.js.map +1 -1
- package/dist/driver-registry/factory-helper.d.ts.map +1 -1
- package/dist/driver-registry/factory-helper.js +7 -4
- package/dist/driver-registry/factory-helper.js.map +1 -1
- package/dist/driver-registry/types.d.ts +6 -0
- package/dist/driver-registry/types.d.ts.map +1 -1
- package/dist/formatter/converter.js +1 -1
- package/dist/formatter/converter.js.map +1 -1
- package/dist/google-genai/element-converter.d.ts +11 -0
- package/dist/google-genai/element-converter.d.ts.map +1 -0
- package/dist/google-genai/element-converter.js +126 -0
- package/dist/google-genai/element-converter.js.map +1 -0
- package/dist/google-genai/google-genai-cache-controller.d.ts +24 -0
- package/dist/google-genai/google-genai-cache-controller.d.ts.map +1 -0
- package/dist/google-genai/google-genai-cache-controller.js +127 -0
- package/dist/google-genai/google-genai-cache-controller.js.map +1 -0
- package/dist/google-genai/google-genai-driver.d.ts +5 -29
- package/dist/google-genai/google-genai-driver.d.ts.map +1 -1
- package/dist/google-genai/google-genai-driver.js +92 -255
- package/dist/google-genai/google-genai-driver.js.map +1 -1
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -1
- package/dist/mlx-ml/mlx-cache-controller.d.ts +66 -0
- package/dist/mlx-ml/mlx-cache-controller.d.ts.map +1 -0
- package/dist/mlx-ml/mlx-cache-controller.js +600 -0
- package/dist/mlx-ml/mlx-cache-controller.js.map +1 -0
- package/dist/mlx-ml/mlx-driver.d.ts +12 -7
- package/dist/mlx-ml/mlx-driver.d.ts.map +1 -1
- package/dist/mlx-ml/mlx-driver.js +192 -124
- package/dist/mlx-ml/mlx-driver.js.map +1 -1
- package/dist/mlx-ml/mlx-message-utils.d.ts +9 -0
- package/dist/mlx-ml/mlx-message-utils.d.ts.map +1 -0
- package/dist/mlx-ml/mlx-message-utils.js +71 -0
- package/dist/mlx-ml/mlx-message-utils.js.map +1 -0
- package/dist/mlx-ml/process/index.d.ts +7 -3
- package/dist/mlx-ml/process/index.d.ts.map +1 -1
- package/dist/mlx-ml/process/index.js +22 -7
- package/dist/mlx-ml/process/index.js.map +1 -1
- package/dist/mlx-ml/process/model-handlers.d.ts +4 -59
- package/dist/mlx-ml/process/model-handlers.d.ts.map +1 -1
- package/dist/mlx-ml/process/model-handlers.js +15 -14
- package/dist/mlx-ml/process/model-handlers.js.map +1 -1
- package/dist/mlx-ml/process/model-specific.d.ts +7 -0
- package/dist/mlx-ml/process/model-specific.d.ts.map +1 -1
- package/dist/mlx-ml/process/model-specific.js +3 -0
- package/dist/mlx-ml/process/model-specific.js.map +1 -1
- package/dist/mlx-ml/process/process-communication.d.ts +3 -0
- package/dist/mlx-ml/process/process-communication.d.ts.map +1 -1
- package/dist/mlx-ml/process/process-communication.js +13 -0
- package/dist/mlx-ml/process/process-communication.js.map +1 -1
- package/dist/mlx-ml/process/queue.d.ts +5 -2
- package/dist/mlx-ml/process/queue.d.ts.map +1 -1
- package/dist/mlx-ml/process/queue.js +101 -14
- package/dist/mlx-ml/process/queue.js.map +1 -1
- package/dist/mlx-ml/process/response-processor.d.ts +10 -0
- package/dist/mlx-ml/process/response-processor.d.ts.map +1 -1
- package/dist/mlx-ml/process/response-processor.js +23 -1
- package/dist/mlx-ml/process/response-processor.js.map +1 -1
- package/dist/mlx-ml/process/types.d.ts +50 -4
- package/dist/mlx-ml/process/types.d.ts.map +1 -1
- package/dist/mlx-ml/tool-call-parser.d.ts.map +1 -1
- package/dist/mlx-ml/tool-call-parser.js +44 -25
- package/dist/mlx-ml/tool-call-parser.js.map +1 -1
- package/dist/types.d.ts +2 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +7 -4
- package/src/mlx-ml/python/__main__.py +41 -449
- package/src/mlx-ml/python/backends/__init__.py +3 -0
- package/src/mlx-ml/python/backends/base.py +84 -0
- package/src/mlx-ml/python/backends/mlx_lm.py +202 -0
- package/src/mlx-ml/python/backends/mlx_vlm.py +99 -0
- package/src/mlx-ml/python/handlers/__init__.py +6 -0
- package/src/mlx-ml/python/handlers/cache.py +81 -0
- package/src/mlx-ml/python/handlers/capabilities.py +6 -0
- package/src/mlx-ml/python/handlers/chat.py +221 -0
- package/src/mlx-ml/python/handlers/completion.py +36 -0
- package/src/mlx-ml/python/handlers/format_test.py +70 -0
- package/src/mlx-ml/python/handlers/tokenize.py +63 -0
- package/src/mlx-ml/python/pyproject.toml +13 -3
- package/src/mlx-ml/python/server.py +126 -0
- package/src/mlx-ml/python/tests/__init__.py +0 -0
- package/src/mlx-ml/python/utils/__init__.py +0 -0
- package/src/mlx-ml/python/utils/prompt_builder.py +54 -0
- package/src/mlx-ml/python/{token_utils.py → utils/token_utils.py} +1 -2
- package/src/mlx-ml/python/uv.lock +266 -41
- /package/src/mlx-ml/python/{example_basic.py → examples/example_basic.py} +0 -0
- /package/src/mlx-ml/python/{example_tool_call.py → examples/example_tool_call.py} +0 -0
- /package/src/mlx-ml/python/{chat_template_constraints.py → utils/chat_template_constraints.py} +0 -0
- /package/src/mlx-ml/python/{vlm_utils.py → utils/vlm_utils.py} +0 -0
package/dist/types.d.ts
CHANGED
|
@@ -133,6 +133,8 @@ export interface QueryOptions {
|
|
|
133
133
|
toolChoice?: ToolChoice;
|
|
134
134
|
/** Reasoning effort level for thinking models (e.g., o-series, llm-jp-4-thinking) */
|
|
135
135
|
reasoningEffort?: 'low' | 'medium' | 'high';
|
|
136
|
+
/** Enable prompt caching (driver-specific optimization) */
|
|
137
|
+
cache?: boolean;
|
|
136
138
|
}
|
|
137
139
|
/**
|
|
138
140
|
* Stream result with both stream and final result
|
package/dist/types.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,QAAQ,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACjF,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,uBAAuB,CAAC;AAGtD,YAAY,EAAE,UAAU,EAAE,cAAc,EAAE,QAAQ,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AAEjG;;GAEG;AACH,MAAM,MAAM,IAAI,GAAG,QAAQ,GAAG,WAAW,GAAG,MAAM,GAAG,MAAM,CAAC;AAE5D;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAClC,IAAI,EAAE,QAAQ,GAAG,MAAM,GAAG,WAAW,CAAC;IACtC,OAAO,EAAE,MAAM,GAAG,UAAU,EAAE,CAAC;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,wBAAwB;IACvC,IAAI,EAAE,WAAW,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,QAAQ,EAAE,CAAC;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,IAAI,EAAE,MAAM,CAAC;IACb,6CAA6C;IAC7C,UAAU,EAAE,MAAM,CAAC;IACnB,oBAAoB;IACpB,IAAI,EAAE,MAAM,CAAC;IACb,+BAA+B;IAC/B,IAAI,EAAE,cAAc,CAAC;IACrB,8BAA8B;IAC9B,KAAK,EAAE,OAAO,CAAC;CAChB;AAED;;GAEG;AACH,MAAM,MAAM,WAAW,GAAG,mBAAmB,GAAG,wBAAwB,GAAG,iBAAiB,CAAC;AAE7F;;GAEG;AACH,wBAAgB,YAAY,CAAC,OAAO,EAAE,WAAW,GAAG,OAAO,IAAI,wBAAwB,CAEtF;AAED;;GAEG;AACH,wBAAgB,YAAY,CAAC,OAAO,EAAE,WAAW,GAAG,OAAO,IAAI,iBAAiB,CAE/E;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,yDAAyD;IACzD,IAAI,EAAE,MAAM,CAAC;IACb,oEAAoE;IACpE,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,wCAAwC;IACxC,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACrC,0DAA0D;IAC1D,MAAM,CAAC,EAAE,OAAO,CAAC;CAClB;AAED;;GAEG;AACH,MAAM,MAAM,UAAU,GAClB,MAAM,GACN,MAAM,GACN,UAAU,GACV;IAAE,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC;AAGrB;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B;;OAEG;IACH,OAAO,EAAE,MAAM,CAAC;IAEhB;;;OAGG;IACH,eAAe,CAAC,EAAE,MAAM,CAAC;IAEzB;;;;OAIG;IACH,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAE3B,KAAK,CAAC,EAAE;QACN,YAAY,EAAE,MAAM,CAAC;QACrB,gBAAgB,EAAE,MAAM,CAAC;QACzB,WAAW,EAAE,MAAM,CAAC;KACrB,CAAC;IAEF,uCAAuC;IACvC,SAAS,CAAC,EAAE,QAAQ,EAAE,CAAC;IAEvB,YAAY,CAAC,EAAE,YAAY,CAAC;IAE5B,kDAAkD;IAClD,UAAU,CAAC,EAAE,QAAQ,EAAE,CAAC;IAExB,8DAA8D;IAC9D,MAAM,CAAC,EAAE,QAAQ,EAAE,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,MAAM,YAAY,GAAG,MAAM,GAAG,QAAQ,GAAG,OAAO,GAAG,YAAY,CAAC;AAEtE;;;;;;GAMG;AACH,MAAM,MAAM,SAAS,GAAG,SAAS,GAAG,UAAU,GAAG,UAAU,GAAG,MAAM,CAAC;AAErE;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,8EAA8E;IAC9E,IAAI,CAAC,EAAE,SAAS,CAAC;IACjB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB;;;;;OAKG;IACH,WAAW,CAAC,EAAE,MAAM,GAAG,YAAY,GAAG,kBAAkB,CAAC;IACzD,iCAAiC;IACjC,KAAK,CAAC,EAAE,cAAc,EAAE,CAAC;IACzB,0BAA0B;IAC1B,UAAU,CAAC,EAAE,UAAU,CAAC;IACxB,qFAAqF;IACrF,eAAe,CAAC,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,UAAU,EAAE,QAAQ,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AACjF,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,uBAAuB,CAAC;AAGtD,YAAY,EAAE,UAAU,EAAE,cAAc,EAAE,QAAQ,EAAE,cAAc,EAAE,MAAM,sBAAsB,CAAC;AAEjG;;GAEG;AACH,MAAM,MAAM,IAAI,GAAG,QAAQ,GAAG,WAAW,GAAG,MAAM,GAAG,MAAM,CAAC;AAE5D;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAClC,IAAI,EAAE,QAAQ,GAAG,MAAM,GAAG,WAAW,CAAC;IACtC,OAAO,EAAE,MAAM,GAAG,UAAU,EAAE,CAAC;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,wBAAwB;IACvC,IAAI,EAAE,WAAW,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,QAAQ,EAAE,CAAC;CACvB;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,IAAI,EAAE,MAAM,CAAC;IACb,6CAA6C;IAC7C,UAAU,EAAE,MAAM,CAAC;IACnB,oBAAoB;IACpB,IAAI,EAAE,MAAM,CAAC;IACb,+BAA+B;IAC/B,IAAI,EAAE,cAAc,CAAC;IACrB,8BAA8B;IAC9B,KAAK,EAAE,OAAO,CAAC;CAChB;AAED;;GAEG;AACH,MAAM,MAAM,WAAW,GAAG,mBAAmB,GAAG,wBAAwB,GAAG,iBAAiB,CAAC;AAE7F;;GAEG;AACH,wBAAgB,YAAY,CAAC,OAAO,EAAE,WAAW,GAAG,OAAO,IAAI,wBAAwB,CAEtF;AAED;;GAEG;AACH,wBAAgB,YAAY,CAAC,OAAO,EAAE,WAAW,GAAG,OAAO,IAAI,iBAAiB,CAE/E;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,yDAAyD;IACzD,IAAI,EAAE,MAAM,CAAC;IACb,oEAAoE;IACpE,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,wCAAwC;IACxC,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACrC,0DAA0D;IAC1D,MAAM,CAAC,EAAE,OAAO,CAAC;CAClB;AAED;;GAEG;AACH,MAAM,MAAM,UAAU,GAClB,MAAM,GACN,MAAM,GACN,UAAU,GACV;IAAE,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC;AAGrB;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B;;OAEG;IACH,OAAO,EAAE,MAAM,CAAC;IAEhB;;;OAGG;IACH,eAAe,CAAC,EAAE,MAAM,CAAC;IAEzB;;;;OAIG;IACH,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAE3B,KAAK,CAAC,EAAE;QACN,YAAY,EAAE,MAAM,CAAC;QACrB,gBAAgB,EAAE,MAAM,CAAC;QACzB,WAAW,EAAE,MAAM,CAAC;KACrB,CAAC;IAEF,uCAAuC;IACvC,SAAS,CAAC,EAAE,QAAQ,EAAE,CAAC;IAEvB,YAAY,CAAC,EAAE,YAAY,CAAC;IAE5B,kDAAkD;IAClD,UAAU,CAAC,EAAE,QAAQ,EAAE,CAAC;IAExB,8DAA8D;IAC9D,MAAM,CAAC,EAAE,QAAQ,EAAE,CAAC;CACrB;AAED;;GAEG;AACH,MAAM,MAAM,YAAY,GAAG,MAAM,GAAG,QAAQ,GAAG,OAAO,GAAG,YAAY,CAAC;AAEtE;;;;;;GAMG;AACH,MAAM,MAAM,SAAS,GAAG,SAAS,GAAG,UAAU,GAAG,UAAU,GAAG,MAAM,CAAC;AAErE;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,8EAA8E;IAC9E,IAAI,CAAC,EAAE,SAAS,CAAC;IACjB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB;;;;;OAKG;IACH,WAAW,CAAC,EAAE,MAAM,GAAG,YAAY,GAAG,kBAAkB,CAAC;IACzD,iCAAiC;IACjC,KAAK,CAAC,EAAE,cAAc,EAAE,CAAC;IACzB,0BAA0B;IAC1B,UAAU,CAAC,EAAE,UAAU,CAAC;IACxB,qFAAqF;IACrF,eAAe,CAAC,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;IAC5C,2DAA2D;IAC3D,KAAK,CAAC,EAAE,OAAO,CAAC;CACjB;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B;;OAEG;IACH,MAAM,EAAE,aAAa,CAAC,MAAM,CAAC,CAAC;IAE9B;;OAEG;IACH,MAAM,EAAE,OAAO,CAAC,WAAW,CAAC,CAAC;CAC9B;AAED;;GAEG;AACH,MAAM,WAAW,QAAQ;IACvB;;OAEG;IACH,KAAK,CAAC,MAAM,EAAE,OAAO,sBAAsB,EAAE,cAAc,EAAE,OAAO,CAAC,EAAE,YAAY,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;IAE3G;;OAEG;IACH,WAAW,CAAC,MAAM,EAAE,OAAO,sBAAsB,EAAE,cAAc,EAAE,OAAO,CAAC,EAAE,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC,CAAC;IAElH;;OAEG;IACH,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CACxB;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,QAAQ,EAAE,QAAQ,GAAG,WAAW,GAAG,QAAQ,GAAG,MAAM,CAAC;IACrD,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,cAAc,CAAC,EAAE,YAAY,CAAC;CAC/B"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@modular-prompt/driver",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.13.1",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"types": "./dist/index.d.ts",
|
|
@@ -20,13 +20,13 @@
|
|
|
20
20
|
"@anthropic-ai/sdk": "0.61.0",
|
|
21
21
|
"@anthropic-ai/vertex-sdk": "0.14.4",
|
|
22
22
|
"@google-cloud/vertexai": "1.10.0",
|
|
23
|
-
"@google/genai": "
|
|
23
|
+
"@google/genai": "2.0.1",
|
|
24
24
|
"@types/js-yaml": "4.0.9",
|
|
25
25
|
"google-auth-library": "9.15.1",
|
|
26
26
|
"js-yaml": "4.1.1",
|
|
27
27
|
"openai": "5.23.2",
|
|
28
|
-
"@modular-prompt/core": "0.
|
|
29
|
-
"@modular-prompt/utils": "0.3.
|
|
28
|
+
"@modular-prompt/core": "0.3.0",
|
|
29
|
+
"@modular-prompt/utils": "0.3.5"
|
|
30
30
|
},
|
|
31
31
|
"devDependencies": {
|
|
32
32
|
"@eslint/js": "9.39.2",
|
|
@@ -38,6 +38,9 @@
|
|
|
38
38
|
"typescript": "5.9.3",
|
|
39
39
|
"vitest": "3.2.4"
|
|
40
40
|
},
|
|
41
|
+
"engines": {
|
|
42
|
+
"node": ">=20.0.0"
|
|
43
|
+
},
|
|
41
44
|
"publishConfig": {
|
|
42
45
|
"access": "public",
|
|
43
46
|
"registry": "https://registry.npmjs.org/"
|
|
@@ -1,466 +1,58 @@
|
|
|
1
1
|
import sys
|
|
2
|
-
|
|
3
|
-
from
|
|
4
|
-
from token_utils import get_capabilities
|
|
2
|
+
|
|
3
|
+
from backends import MlxLmBackend, MlxVlmBackend
|
|
4
|
+
from utils.token_utils import get_capabilities
|
|
5
|
+
from utils.vlm_utils import detect_model_kind
|
|
6
|
+
from server import Server
|
|
5
7
|
|
|
6
8
|
model_name = sys.argv[1] if len(sys.argv) > 1 else "mlx-community/gemma-3-270m-it-qat-4bit"
|
|
7
9
|
text_only = "--text-only" in sys.argv
|
|
8
10
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
if
|
|
13
|
-
|
|
14
|
-
try:
|
|
15
|
-
model, processor = vlm_load(model_name)
|
|
16
|
-
tokenizer = processor # capabilities取得用(VLMのprocessorもtokenizer互換)
|
|
17
|
-
except (ValueError, Exception) as e:
|
|
18
|
-
# mlx_vlm.models にモジュールが存在しても、実際のモデルに vision コンポーネントが
|
|
19
|
-
# ない場合(例: Qwen3.5 テキストモデルが qwen2_vl として認識される)にフォールバック
|
|
20
|
-
sys.stderr.write(f"VLM load failed, falling back to LM: {e}\n")
|
|
21
|
-
model_kind = "lm"
|
|
22
|
-
from mlx_lm import load, stream_generate
|
|
23
|
-
from mlx_lm.sample_utils import make_sampler
|
|
24
|
-
model, tokenizer = load(model_name)
|
|
25
|
-
else:
|
|
26
|
-
from mlx_lm import load, stream_generate
|
|
27
|
-
from mlx_lm.sample_utils import make_sampler
|
|
28
|
-
model, tokenizer = load(model_name)
|
|
29
|
-
|
|
30
|
-
# Capabilities情報の取得
|
|
31
|
-
capabilities = get_capabilities(tokenizer)
|
|
32
|
-
capabilities["model_kind"] = model_kind
|
|
33
|
-
|
|
34
|
-
def read():
|
|
35
|
-
lines = []
|
|
36
|
-
data = None
|
|
37
|
-
eof = False
|
|
38
|
-
while not eof:
|
|
39
|
-
line = sys.stdin.readline()
|
|
40
|
-
# sys.stderr.write('line:' + line + '\n')
|
|
41
|
-
if not line:
|
|
42
|
-
eof = True
|
|
43
|
-
else:
|
|
44
|
-
lines.append(line)
|
|
45
|
-
try:
|
|
46
|
-
data = json.loads(''.join(lines))
|
|
47
|
-
except json.JSONDecodeError as e:
|
|
48
|
-
data = None
|
|
49
|
-
continue
|
|
50
|
-
break
|
|
51
|
-
return data
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
def supports_chat_template():
|
|
55
|
-
"""
|
|
56
|
-
チャットテンプレートがサポートされているかを判定
|
|
57
|
-
|
|
58
|
-
apply_chat_templateメソッドの存在と、tokenizer.chat_templateの両方を確認する。
|
|
59
|
-
tokenizer.chat_templateが設定されていない場合、apply_chat_templateを呼んでも
|
|
60
|
-
エラーになるため、両方の条件をチェックする必要がある。
|
|
61
|
-
|
|
62
|
-
Returns:
|
|
63
|
-
bool: チャットテンプレートがサポートされている場合True
|
|
64
|
-
"""
|
|
65
|
-
return (hasattr(tokenizer, 'apply_chat_template') and
|
|
66
|
-
hasattr(tokenizer, 'chat_template') and
|
|
67
|
-
tokenizer.chat_template is not None)
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
def handle_capabilities():
|
|
71
|
-
"""capabilities API の処理"""
|
|
72
|
-
print(json.dumps(capabilities), end='\0', flush=True)
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
def handle_format_test(messages, options=None, tools=None):
|
|
76
|
-
"""フォーマットテスト API の処理(実際に生成せずフォーマットのみ)"""
|
|
77
|
-
if options is None:
|
|
78
|
-
options = {}
|
|
79
|
-
|
|
80
|
-
result = {
|
|
81
|
-
"formatted_prompt": None,
|
|
82
|
-
"template_applied": False,
|
|
83
|
-
"model_specific_processing": None,
|
|
84
|
-
"error": None
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
try:
|
|
88
|
-
# チャットテンプレートが利用可能かチェック
|
|
89
|
-
if supports_chat_template():
|
|
90
|
-
# messagesはTypeScript側で既にモデル固有処理済み
|
|
91
|
-
result["model_specific_processing"] = messages
|
|
92
|
-
|
|
93
|
-
# プロンプト生成(フォーマットのみ)
|
|
94
|
-
primer = options.get('primer')
|
|
95
|
-
add_generation_prompt = True
|
|
96
|
-
tokenize = False # 常にテキストで返す
|
|
97
|
-
|
|
98
|
-
if primer is not None:
|
|
99
|
-
messages.append({'role': 'assistant', 'content': primer})
|
|
100
|
-
add_generation_prompt = False
|
|
11
|
+
drafter_model = None
|
|
12
|
+
if "--drafter" in sys.argv:
|
|
13
|
+
idx = sys.argv.index("--drafter")
|
|
14
|
+
if idx + 1 < len(sys.argv):
|
|
15
|
+
drafter_model = sys.argv[idx + 1]
|
|
101
16
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
tools=tools,
|
|
107
|
-
add_generation_prompt=add_generation_prompt,
|
|
108
|
-
tokenize=tokenize,
|
|
109
|
-
)
|
|
110
|
-
except TypeError:
|
|
111
|
-
formatted_prompt = tokenizer.apply_chat_template(
|
|
112
|
-
messages,
|
|
113
|
-
add_generation_prompt=add_generation_prompt,
|
|
114
|
-
tokenize=tokenize,
|
|
115
|
-
)
|
|
116
|
-
|
|
117
|
-
if primer is not None:
|
|
118
|
-
formatted_prompt = primer.join(formatted_prompt.split(primer)[0:-1]) + primer
|
|
119
|
-
|
|
120
|
-
result["formatted_prompt"] = formatted_prompt
|
|
121
|
-
result["template_applied"] = True
|
|
122
|
-
else:
|
|
123
|
-
# チャットテンプレートがない場合はcompletionフォーマット
|
|
124
|
-
formatted_prompt = generate_merged_prompt(messages)
|
|
125
|
-
primer = options.get('primer')
|
|
126
|
-
if primer is not None:
|
|
127
|
-
formatted_prompt += primer
|
|
128
|
-
|
|
129
|
-
result["formatted_prompt"] = formatted_prompt
|
|
130
|
-
result["template_applied"] = False
|
|
131
|
-
|
|
132
|
-
except Exception as e:
|
|
133
|
-
result["error"] = str(e)
|
|
134
|
-
|
|
135
|
-
print(json.dumps(result), end='\0', flush=True)
|
|
136
|
-
|
|
137
|
-
def handle_chat(messages, primer=None, options=None, tools=None, reasoning_effort=None):
|
|
138
|
-
"""chat API の処理"""
|
|
139
|
-
if options is None:
|
|
140
|
-
options = {}
|
|
141
|
-
|
|
142
|
-
trust_remote_code = options.pop('trust_remote_code', None)
|
|
143
|
-
|
|
144
|
-
# チャットテンプレートが利用可能かチェック
|
|
145
|
-
if not supports_chat_template():
|
|
146
|
-
# チャットテンプレートがない場合はcompletionフォーマットに変換
|
|
147
|
-
prompt = generate_merged_prompt(messages)
|
|
148
|
-
if primer is not None:
|
|
149
|
-
print(primer, end='', flush=True)
|
|
150
|
-
generate_text(prompt, options)
|
|
151
|
-
return
|
|
152
|
-
|
|
153
|
-
# プロンプト生成
|
|
154
|
-
add_generation_prompt = True
|
|
155
|
-
tokenize = False
|
|
156
|
-
|
|
157
|
-
if primer is not None:
|
|
158
|
-
messages.append({'role': 'assistant', 'content': primer})
|
|
159
|
-
add_generation_prompt = False
|
|
160
|
-
tokenize = False
|
|
161
|
-
|
|
162
|
-
# apply_chat_templateの追加引数(reasoning_effort等)
|
|
163
|
-
extra_kwargs = {}
|
|
164
|
-
if tools is not None:
|
|
165
|
-
extra_kwargs['tools'] = tools
|
|
166
|
-
if reasoning_effort is not None:
|
|
167
|
-
extra_kwargs['reasoning_effort'] = reasoning_effort
|
|
168
|
-
if trust_remote_code is not None:
|
|
169
|
-
extra_kwargs['trust_remote_code'] = trust_remote_code
|
|
170
|
-
|
|
171
|
-
# テンプレート適用(対応していないkwargsはTypeErrorになるので段階的にフォールバック)
|
|
172
|
-
try:
|
|
173
|
-
prompt = tokenizer.apply_chat_template(
|
|
174
|
-
messages,
|
|
175
|
-
add_generation_prompt=add_generation_prompt,
|
|
176
|
-
tokenize=tokenize,
|
|
177
|
-
**extra_kwargs,
|
|
178
|
-
)
|
|
179
|
-
except TypeError:
|
|
180
|
-
# reasoning_effort非対応の場合、toolsのみで再試行
|
|
17
|
+
draft_block_size = None
|
|
18
|
+
if "--draft-block-size" in sys.argv:
|
|
19
|
+
idx = sys.argv.index("--draft-block-size")
|
|
20
|
+
if idx + 1 < len(sys.argv):
|
|
181
21
|
try:
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
messages,
|
|
187
|
-
add_generation_prompt=add_generation_prompt,
|
|
188
|
-
tokenize=tokenize,
|
|
189
|
-
**fallback_kwargs,
|
|
190
|
-
)
|
|
191
|
-
except TypeError:
|
|
192
|
-
prompt = tokenizer.apply_chat_template(
|
|
193
|
-
messages,
|
|
194
|
-
add_generation_prompt=add_generation_prompt,
|
|
195
|
-
tokenize=tokenize,
|
|
196
|
-
)
|
|
197
|
-
|
|
198
|
-
if primer is not None:
|
|
199
|
-
prompt = primer.join(prompt.split(primer)[0:-1]) + primer
|
|
200
|
-
print(primer, end='', flush=True)
|
|
201
|
-
|
|
202
|
-
generate_text(prompt, options)
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
def generate_merged_prompt(messages):
|
|
206
|
-
"""apply_chat_templateがない場合のプロンプト生成"""
|
|
207
|
-
# messagesはTypeScript側で既にmergeSystemMessages処理済み
|
|
208
|
-
# TypeScript側のformatterと同じフォーマットを維持
|
|
209
|
-
|
|
210
|
-
prompt_parts = []
|
|
211
|
-
special_tokens = capabilities.get('special_tokens', {})
|
|
22
|
+
draft_block_size = int(sys.argv[idx + 1])
|
|
23
|
+
except ValueError:
|
|
24
|
+
sys.stderr.write(f"Invalid --draft-block-size value: {sys.argv[idx + 1]}\n")
|
|
25
|
+
sys.exit(1)
|
|
212
26
|
|
|
213
|
-
for msg in messages:
|
|
214
|
-
role = msg['role'] # 小文字のまま
|
|
215
|
-
role_upper = role.upper()
|
|
216
27
|
|
|
217
|
-
|
|
218
|
-
|
|
28
|
+
def create_backend(model_name: str, text_only: bool = False):
|
|
29
|
+
model_kind = "lm" if text_only else detect_model_kind(model_name)
|
|
219
30
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
start_token = role_token['start']['text']
|
|
223
|
-
end_token = role_token['end']['text']
|
|
224
|
-
prompt_parts.extend([
|
|
225
|
-
start_token,
|
|
226
|
-
msg['content'].strip(),
|
|
227
|
-
end_token,
|
|
228
|
-
'' # 空行で区切る
|
|
229
|
-
])
|
|
230
|
-
else:
|
|
231
|
-
# 2. 専用トークンがない場合、汎用blockトークンを探す
|
|
232
|
-
# blockやcontextなどの汎用的なペアトークンを探す
|
|
233
|
-
block_token = None
|
|
234
|
-
for candidate in ['block', 'context', 'quote', 'section']:
|
|
235
|
-
token = special_tokens.get(candidate)
|
|
236
|
-
if token and isinstance(token, dict) and 'start' in token:
|
|
237
|
-
block_token = token
|
|
238
|
-
break
|
|
239
|
-
|
|
240
|
-
if block_token:
|
|
241
|
-
# 汎用blockトークンがある場合: {block_begin}{role}:\n...{block_end}
|
|
242
|
-
start_token = block_token['start']['text']
|
|
243
|
-
end_token = block_token['end']['text']
|
|
244
|
-
prompt_parts.extend([
|
|
245
|
-
f'{start_token}{role_upper}:\n{msg["content"].strip()}',
|
|
246
|
-
end_token,
|
|
247
|
-
'' # 空行で区切る
|
|
248
|
-
])
|
|
249
|
-
else:
|
|
250
|
-
# 3. どちらもない場合は、HTMLコメント形式(フォールバック)
|
|
251
|
-
prompt_parts.extend([
|
|
252
|
-
f'<!-- begin of {role_upper} -->',
|
|
253
|
-
msg['content'].strip(),
|
|
254
|
-
f'<!-- end of {role_upper} -->',
|
|
255
|
-
'' # 空行で区切る
|
|
256
|
-
])
|
|
257
|
-
|
|
258
|
-
# 最後の空行を削除して、ダブル改行で結合
|
|
259
|
-
return '\n'.join(prompt_parts[:-1])
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
def handle_completion(prompt, options=None, images=None, max_image_size=768):
|
|
263
|
-
"""completion API の処理
|
|
264
|
-
|
|
265
|
-
VLMモデルの場合、TypeScript側でプロンプトにimageトークンが挿入済み。
|
|
266
|
-
images が渡された場合は VLM 生成を使用する。
|
|
267
|
-
"""
|
|
268
|
-
if options is None:
|
|
269
|
-
options = {}
|
|
270
|
-
|
|
271
|
-
# promptはTypeScript側で既にモデル固有処理済み
|
|
272
|
-
|
|
273
|
-
if images:
|
|
274
|
-
pil_images = load_and_resize_images(images, max_image_size)
|
|
275
|
-
|
|
276
|
-
import re
|
|
277
|
-
display_prompt = re.sub(r'(<\|image_pad\|>)+', '<|image_pad|>...', prompt)
|
|
278
|
-
sys.stderr.write(f"--- vlm completion (images: {len(pil_images)}, max_size: {max_image_size})\n{display_prompt}\n")
|
|
279
|
-
|
|
280
|
-
generate_text_vlm(prompt, pil_images, options)
|
|
281
|
-
else:
|
|
282
|
-
generate_text(prompt, options)
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
def handle_chat_vlm(messages, images, options=None, max_image_size=768, tools=None, primer=None):
|
|
286
|
-
"""VLMモデル用のチャット処理
|
|
287
|
-
|
|
288
|
-
messages: TypeScript側で画像プレースホルダー({type: "image"})が挿入済み
|
|
289
|
-
images: 画像ファイルパスの配列(プレースホルダーと位置が対応)
|
|
290
|
-
tools: ツール定義(テンプレートが対応している場合のみ使用)
|
|
291
|
-
primer: アシスタント応答のプリフィックス
|
|
292
|
-
"""
|
|
293
|
-
if options is None:
|
|
294
|
-
options = {}
|
|
295
|
-
|
|
296
|
-
# primer処理
|
|
297
|
-
add_generation_prompt = True
|
|
298
|
-
if primer is not None:
|
|
299
|
-
messages.append({'role': 'assistant', 'content': primer})
|
|
300
|
-
add_generation_prompt = False
|
|
301
|
-
|
|
302
|
-
# processorのapply_chat_templateを直接使用
|
|
303
|
-
# systemメッセージのマージはTypeScript側でchat_restrictionsに基づき処理済み
|
|
304
|
-
# tools対応を試みる(テンプレートが対応していなければtools無しで実行)
|
|
305
|
-
try:
|
|
306
|
-
formatted_prompt = processor.apply_chat_template(
|
|
307
|
-
messages,
|
|
308
|
-
tools=tools,
|
|
309
|
-
add_generation_prompt=add_generation_prompt,
|
|
310
|
-
tokenize=False,
|
|
311
|
-
)
|
|
312
|
-
except TypeError:
|
|
313
|
-
formatted_prompt = processor.apply_chat_template(
|
|
314
|
-
messages,
|
|
315
|
-
add_generation_prompt=add_generation_prompt,
|
|
316
|
-
tokenize=False,
|
|
317
|
-
)
|
|
318
|
-
|
|
319
|
-
if primer is not None:
|
|
320
|
-
formatted_prompt = primer.join(formatted_prompt.split(primer)[0:-1]) + primer
|
|
321
|
-
print(primer, end='', flush=True)
|
|
322
|
-
|
|
323
|
-
# 画像ファイルを読み込み・リサイズ
|
|
324
|
-
pil_images = load_and_resize_images(images, max_image_size)
|
|
325
|
-
|
|
326
|
-
# image_padトークンを省略して表示(大量のパディングで読みづらいため)
|
|
327
|
-
import re
|
|
328
|
-
display_prompt = re.sub(r'(<\|image_pad\|>)+', '<|image_pad|>...', formatted_prompt)
|
|
329
|
-
sys.stderr.write(f"--- vlm prompt (images: {len(pil_images)}, max_size: {max_image_size})\n{display_prompt}\n")
|
|
330
|
-
|
|
331
|
-
generate_text_vlm(formatted_prompt, pil_images, options)
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
def generate_text_vlm(prompt, images, options, stop_token_ids=None):
|
|
335
|
-
"""VLMストリーミング生成"""
|
|
336
|
-
temperature = options.pop('temperature', 1.0) if 'temperature' in options else 1.0
|
|
337
|
-
max_tokens = options.pop('max_tokens', 1000) if 'max_tokens' in options else 1000
|
|
338
|
-
top_p = options.pop('top_p', 0.0) if 'top_p' in options else 0.0
|
|
339
|
-
top_k = options.pop('top_k', 0) if 'top_k' in options else 0
|
|
340
|
-
|
|
341
|
-
for response in vlm_stream_generate(
|
|
342
|
-
model, processor, prompt,
|
|
343
|
-
image=images if images else None,
|
|
344
|
-
max_tokens=max_tokens,
|
|
345
|
-
temperature=temperature,
|
|
346
|
-
top_p=top_p,
|
|
347
|
-
top_k=top_k,
|
|
348
|
-
):
|
|
349
|
-
# 追加 stop token チェック(tool call end 等)
|
|
350
|
-
if stop_token_ids and hasattr(response, 'token') and int(response.token) in stop_token_ids:
|
|
351
|
-
sys.stderr.write(f"--- stop token detected (vlm): {int(response.token)}\n")
|
|
352
|
-
print('\n', end='\0', flush=True)
|
|
353
|
-
return
|
|
354
|
-
print(response.text.replace('\0', ''), end='', flush=True)
|
|
355
|
-
|
|
356
|
-
print('\n', end='\0', flush=True)
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
def generate_text(prompt, options):
|
|
360
|
-
"""テキスト生成の共通処理
|
|
361
|
-
|
|
362
|
-
注意: optionsはTypeScript側で事前にバリデーション済み
|
|
363
|
-
- temperatureパラメータはsamplerオブジェクトに変換
|
|
364
|
-
- サポートされていないパラメータはTS側でフィルタリング
|
|
365
|
-
"""
|
|
366
|
-
# デフォルトオプションの設定
|
|
367
|
-
default_options = {'max_tokens': 1000}
|
|
368
|
-
|
|
369
|
-
# temperatureパラメータを抽出してsamplerを作成
|
|
370
|
-
temperature = options.pop('temperature', 1.0) if 'temperature' in options else 1.0
|
|
371
|
-
top_p = options.pop('top_p', 0.0) if 'top_p' in options else 0.0
|
|
372
|
-
top_k = options.pop('top_k', 0) if 'top_k' in options else 0
|
|
373
|
-
|
|
374
|
-
# samplerオブジェクトを作成
|
|
375
|
-
sampler = make_sampler(temp=temperature, top_p=top_p, top_k=top_k)
|
|
376
|
-
|
|
377
|
-
# 残りのオプションとマージ
|
|
378
|
-
final_options = {**default_options, **options, 'sampler': sampler}
|
|
379
|
-
|
|
380
|
-
if isinstance(prompt, list): # tokenized
|
|
381
|
-
sys.stderr.write(f"--- prompt: len={len(prompt)}\n")
|
|
382
|
-
else:
|
|
383
|
-
sys.stderr.write(f"--- prompt\n{prompt}\n")
|
|
384
|
-
|
|
385
|
-
eos_detected = False
|
|
386
|
-
for response in stream_generate(model, tokenizer, prompt, **final_options):
|
|
387
|
-
# トークンIDによるEOS判定(より確実)
|
|
388
|
-
if is_eod_token(response, tokenizer):
|
|
389
|
-
eos_detected = True
|
|
390
|
-
print('\n', end='\0', flush=True)
|
|
391
|
-
break
|
|
392
|
-
if not eos_detected:
|
|
393
|
-
print(response.text.replace('\0', ''), end='', flush=True)
|
|
394
|
-
|
|
395
|
-
if not eos_detected:
|
|
396
|
-
print('\n', end='\0', flush=True)
|
|
397
|
-
|
|
398
|
-
def main():
|
|
399
|
-
while True:
|
|
400
|
-
req = read()
|
|
401
|
-
if req is None:
|
|
402
|
-
break
|
|
403
|
-
|
|
404
|
-
method = req.get('method')
|
|
405
|
-
if not method:
|
|
406
|
-
sys.stderr.write("Error: 'method' field is required\n")
|
|
407
|
-
print('\n', end='\0', flush=True)
|
|
408
|
-
continue
|
|
409
|
-
|
|
31
|
+
if model_kind == "vlm":
|
|
32
|
+
backend = MlxVlmBackend()
|
|
410
33
|
try:
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
messages = req.get('messages')
|
|
416
|
-
if not messages:
|
|
417
|
-
sys.stderr.write("Error: 'messages' field is required for format_test method\n")
|
|
418
|
-
print('\n', end='\0', flush=True)
|
|
419
|
-
continue
|
|
34
|
+
backend.load(model_name)
|
|
35
|
+
return backend, "vlm"
|
|
36
|
+
except (ValueError, Exception) as e:
|
|
37
|
+
sys.stderr.write(f"VLM load failed, falling back to LM: {e}\n")
|
|
420
38
|
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
39
|
+
backend = MlxLmBackend()
|
|
40
|
+
backend.load(model_name)
|
|
41
|
+
return backend, "lm"
|
|
424
42
|
|
|
425
|
-
elif method == 'chat':
|
|
426
|
-
messages = req.get('messages')
|
|
427
|
-
if not messages:
|
|
428
|
-
sys.stderr.write("Error: 'messages' field is required for chat method\n")
|
|
429
|
-
print('\n', end='\0', flush=True)
|
|
430
|
-
continue
|
|
431
43
|
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
tools = req.get('tools')
|
|
435
|
-
images = req.get('images', [])
|
|
436
|
-
reasoning_effort = req.get('reasoning_effort')
|
|
44
|
+
if __name__ == "__main__":
|
|
45
|
+
backend, model_kind = create_backend(model_name, text_only)
|
|
437
46
|
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
handle_chat(messages, primer, options, tools, reasoning_effort=reasoning_effort)
|
|
443
|
-
|
|
444
|
-
elif method == 'completion':
|
|
445
|
-
prompt = req.get('prompt')
|
|
446
|
-
if not prompt:
|
|
447
|
-
sys.stderr.write("Error: 'prompt' field is required for completion method\n")
|
|
448
|
-
print('\n', end='\0', flush=True)
|
|
449
|
-
continue
|
|
450
|
-
|
|
451
|
-
options = req.get('options', {})
|
|
452
|
-
images = req.get('images', [])
|
|
453
|
-
max_image_size = req.get('maxImageSize', 768)
|
|
454
|
-
handle_completion(prompt, options, images if images else None, max_image_size)
|
|
455
|
-
|
|
456
|
-
else:
|
|
457
|
-
sys.stderr.write(f"Error: Unknown method '{method}'\n")
|
|
458
|
-
print('\n', end='\0', flush=True)
|
|
459
|
-
|
|
460
|
-
except Exception as e:
|
|
461
|
-
sys.stderr.write(f"Error processing request: {e}\n")
|
|
462
|
-
print('\n', end='\0', flush=True)
|
|
47
|
+
if drafter_model:
|
|
48
|
+
backend.load_drafter(drafter_model)
|
|
49
|
+
if draft_block_size is not None and hasattr(backend, 'draft_block_size'):
|
|
50
|
+
backend.draft_block_size = draft_block_size
|
|
463
51
|
|
|
52
|
+
capabilities = get_capabilities(backend.get_tokenizer())
|
|
53
|
+
capabilities["model_kind"] = model_kind
|
|
54
|
+
if model_kind == "lm":
|
|
55
|
+
capabilities["methods"].append("cache_prefill")
|
|
464
56
|
|
|
465
|
-
|
|
466
|
-
|
|
57
|
+
server = Server(backend, capabilities)
|
|
58
|
+
server.run()
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Any, Iterator
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class ModelBackend(ABC):
|
|
6
|
+
"""Abstract base class for model backends."""
|
|
7
|
+
|
|
8
|
+
@abstractmethod
|
|
9
|
+
def load(self, model_name: str) -> None:
|
|
10
|
+
"""Load the target model."""
|
|
11
|
+
raise NotImplementedError
|
|
12
|
+
|
|
13
|
+
@abstractmethod
|
|
14
|
+
def get_tokenizer(self) -> Any:
|
|
15
|
+
"""Return the tokenizer or processor."""
|
|
16
|
+
raise NotImplementedError
|
|
17
|
+
|
|
18
|
+
@abstractmethod
|
|
19
|
+
def stream_generate(
|
|
20
|
+
self, prompt: str | list[int], options: dict, images: list | None = None,
|
|
21
|
+
prompt_cache: list | None = None,
|
|
22
|
+
) -> Iterator[Any]:
|
|
23
|
+
"""Stream generation results."""
|
|
24
|
+
raise NotImplementedError
|
|
25
|
+
|
|
26
|
+
@abstractmethod
|
|
27
|
+
def supports_vision(self) -> bool:
|
|
28
|
+
"""Return whether image input is supported."""
|
|
29
|
+
raise NotImplementedError
|
|
30
|
+
|
|
31
|
+
@property
|
|
32
|
+
@abstractmethod
|
|
33
|
+
def model_kind(self) -> str:
|
|
34
|
+
"""Return "lm" or "vlm"."""
|
|
35
|
+
raise NotImplementedError
|
|
36
|
+
|
|
37
|
+
def load_drafter(self, drafter_model: str) -> None:
|
|
38
|
+
"""Load a drafter model for speculative decoding."""
|
|
39
|
+
raise NotImplementedError(
|
|
40
|
+
f"{type(self).__name__} does not support drafter models"
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
def has_drafter(self) -> bool:
|
|
44
|
+
"""Return whether a drafter model is loaded."""
|
|
45
|
+
return False
|
|
46
|
+
|
|
47
|
+
def cache_prefill(
|
|
48
|
+
self,
|
|
49
|
+
cache_path: str,
|
|
50
|
+
prompt: str,
|
|
51
|
+
base_cache_path: str | None = None,
|
|
52
|
+
trim_to_tokens: int | None = None,
|
|
53
|
+
prefix_offsets: list[int] | None = None,
|
|
54
|
+
prefix_hashes: list[str] | None = None,
|
|
55
|
+
) -> dict:
|
|
56
|
+
"""Build a KV cache from a prompt prefix."""
|
|
57
|
+
raise NotImplementedError(
|
|
58
|
+
f"{type(self).__name__} does not support prompt caching"
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
def load_cache_from_file(self, cache_path: str) -> list | None:
|
|
62
|
+
"""Load a prompt cache from file, or None."""
|
|
63
|
+
return None
|
|
64
|
+
|
|
65
|
+
def get_cache_offset(self, prompt_cache: list) -> int:
|
|
66
|
+
"""Get the number of tokens stored in a loaded prompt cache."""
|
|
67
|
+
if not prompt_cache:
|
|
68
|
+
return 0
|
|
69
|
+
layer0 = prompt_cache[0]
|
|
70
|
+
if hasattr(layer0, 'offset'):
|
|
71
|
+
off = layer0.offset
|
|
72
|
+
return int(off.item() if hasattr(off, 'item') else off)
|
|
73
|
+
if hasattr(layer0, 'caches'):
|
|
74
|
+
for c in layer0.caches:
|
|
75
|
+
if hasattr(c, 'offset'):
|
|
76
|
+
off = c.offset
|
|
77
|
+
return int(off.item() if hasattr(off, 'item') else off)
|
|
78
|
+
try:
|
|
79
|
+
return int(layer0[0].shape[2])
|
|
80
|
+
except Exception:
|
|
81
|
+
pass
|
|
82
|
+
if hasattr(layer0, 'keys') and layer0.keys is not None:
|
|
83
|
+
return int(layer0.keys.shape[2])
|
|
84
|
+
return 0
|