cembedding 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ # Downloaded ONNX/MLX models + runtime index (fetched via download_model.py)
2
+ data/
3
+
4
+ # Python
5
+ __pycache__/
6
+ *.py[cod]
7
+ .venv/
8
+ venv/
9
+ *.egg-info/
10
+ dist/
11
+ build/
12
+ .uv/
13
+ uv.lock
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025-2026 ClotoCore Project <ClotoCore@proton.me>
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,138 @@
1
+ Metadata-Version: 2.4
2
+ Name: cembedding
3
+ Version: 0.5.0
4
+ Summary: Local-first embedding server: vector generation + index/search over HTTP (ONNX on-device or API providers). The reference /embed server for CPersona.
5
+ Project-URL: Homepage, https://github.com/Cloto-dev/CEmbedding
6
+ Project-URL: Repository, https://github.com/Cloto-dev/CEmbedding
7
+ Author-email: ClotoCore Project <ClotoCore@proton.me>
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Keywords: bge-m3,embedding,jina,mcp,onnx,vector-search
11
+ Requires-Python: >=3.10
12
+ Requires-Dist: aiohttp>=3.9.0
13
+ Requires-Dist: aiosqlite>=0.20.0
14
+ Requires-Dist: httpx>=0.27.0
15
+ Requires-Dist: mcp<1.27.0,>=1.0.0
16
+ Requires-Dist: numpy>=1.24.0
17
+ Provides-Extra: mlx
18
+ Requires-Dist: mlx-embeddings>=0.1.0; extra == 'mlx'
19
+ Requires-Dist: mlx>=0.18.0; extra == 'mlx'
20
+ Provides-Extra: onnx
21
+ Requires-Dist: onnxruntime>=1.17.0; extra == 'onnx'
22
+ Requires-Dist: tokenizers>=0.15.0; extra == 'onnx'
23
+ Provides-Extra: onnx-gpu
24
+ Requires-Dist: onnxruntime-gpu>=1.17.0; extra == 'onnx-gpu'
25
+ Requires-Dist: tokenizers>=0.15.0; extra == 'onnx-gpu'
26
+ Description-Content-Type: text/markdown
27
+
28
+ <div align="center">
29
+
30
+ # CEmbedding
31
+
32
+ ### Local-first embedding server
33
+
34
+ Vector embeddings over a tiny HTTP contract.
35
+ On-device ONNX or any OpenAI-compatible API. The reference `/embed` server for [CPersona](https://github.com/Cloto-dev/CPersona).
36
+
37
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
38
+ [![Python](https://img.shields.io/badge/python-3.10+-blue.svg)]()
39
+
40
+ </div>
41
+
42
+ ---
43
+
44
+ > **Standalone repository** — extracted from the (now private) `clotohub-servers` monorepo so it can be used on its own. [ClotoCore](https://github.com/Cloto-dev/ClotoCore) users get this through the in-app marketplace ([ClotoHub](https://hub.cloto.dev)); everyone else can run it directly as described below.
45
+
46
+ ## What it is
47
+
48
+ A small server that turns text into vectors. It speaks a minimal HTTP contract so anything can call it — its primary consumer is [CPersona](https://github.com/Cloto-dev/CPersona), whose hybrid search uses it for the vector-similarity layer. It can run a model **on-device** via ONNX (no API key, no network) or proxy an **OpenAI-compatible API**.
49
+
50
+ It also exposes an MCP (stdio) surface and an optional persistent vector index (`/index`, `/search`), but the HTTP `/embed` endpoint is all CPersona needs.
51
+
52
+ ## The `/embed` contract
53
+
54
+ ```
55
+ POST /embed
56
+ Request: { "texts": ["string", ...] } # non-empty array, max 100 per batch
57
+ Response: { "embeddings": [[float, ...], ...], "dimensions": <int> }
58
+ ```
59
+
60
+ Point any client (e.g. CPersona's `CPERSONA_EMBEDDING_URL` / generic `EMBEDDING_HTTP_URL`) at `http://127.0.0.1:8401/embed`.
61
+
62
+ ## Quick Start (on-device ONNX)
63
+
64
+ **Prerequisites:** Python 3.10+
65
+
66
+ ```bash
67
+ # Download a model into ./data/models (jina-v5-nano is what CPersona is tuned for)
68
+ uvx --from "cembedding[onnx]" cembedding-download-model --model jina-v5-nano
69
+
70
+ # Run the server (reads ./data/models from the current directory)
71
+ EMBEDDING_PROVIDER=onnx_jina_v5_nano uvx --from "cembedding[onnx]" cembedding
72
+ ```
73
+
74
+ Or install it onto your PATH with `pip install "cembedding[onnx]"`, then run
75
+ `cembedding-download-model --model jina-v5-nano` and `cembedding`.
76
+
77
+ From source (development):
78
+
79
+ ```bash
80
+ git clone https://github.com/Cloto-dev/CEmbedding.git
81
+ cd CEmbedding
82
+ python -m venv .venv
83
+ source .venv/bin/activate # Windows: .venv\Scripts\activate
84
+ pip install ".[onnx]"
85
+ python -m cembedding.download_model --model jina-v5-nano
86
+ EMBEDDING_PROVIDER=onnx_jina_v5_nano python -m cembedding # or: python server.py
87
+ ```
88
+
89
+ You should see `HTTP embedding endpoint started on http://127.0.0.1:8401/embed`. Verify it:
90
+
91
+ ```bash
92
+ curl -s http://127.0.0.1:8401/embed \
93
+ -H 'content-type: application/json' \
94
+ -d '{"texts":["hello world"]}' | head -c 200
95
+ ```
96
+
97
+ ## Providers
98
+
99
+ Set `EMBEDDING_PROVIDER`:
100
+
101
+ | Value | Model | Notes |
102
+ |-------|-------|-------|
103
+ | `onnx_jina_v5_nano` | jina-v5-nano (33M, 768d) | Local CPU, what CPersona is benchmarked against |
104
+ | `onnx_bge_m3` | bge-m3 | Local CPU, larger / multilingual |
105
+ | `onnx_miniml` | all-MiniLM-L6-v2 (22M, 384d) | Local CPU, smallest |
106
+ | `mlx_bge_m3` | bge-m3 (MLX) | Apple Silicon only — `pip install ".[mlx]"` |
107
+ | `api_openai` | provider's model | OpenAI-compatible API; needs `EMBEDDING_API_KEY` (+ optional `EMBEDDING_API_URL`, `EMBEDDING_MODEL`) |
108
+
109
+ Download a local model with `cembedding-download-model --model {miniml,jina-v5-nano,bge-m3}` (or `python -m cembedding.download_model ...` from a source checkout; fetched from HuggingFace into `./data/models`, not committed to this repo).
110
+
111
+ ## Configuration
112
+
113
+ | Env var | Default | Description |
114
+ |---------|---------|-------------|
115
+ | `EMBEDDING_PROVIDER` | `api_openai` | Provider (see table above) |
116
+ | `EMBEDDING_HTTP_PORT` | `8401` | HTTP port for `/embed` |
117
+ | `EMBEDDING_INDEX_ENABLED` | `true` | Enable the persistent vector index endpoints (`/index`, `/search`, `/remove`, `/purge`) |
118
+ | `ONNX_MODEL_DIR` | (auto) | Override the model directory for ONNX providers |
119
+ | `ONNX_EP_PREFERENCE` | (auto) | ONNX execution providers, comma-separated. Empty = auto (CoreML on macOS, DirectML on Windows, else CPU; CPU always ensured) |
120
+ | `ONNX_MAX_SEQ_LEN` | `2048` | Max tokenization length (1–8192; MiniLM clamped to 512 internally) |
121
+ | `EMBEDDING_API_KEY` | — | Required for `api_openai` |
122
+ | `EMBEDDING_API_URL` | `https://api.openai.com/v1/embeddings` | API endpoint for `api_openai` |
123
+
124
+ ## Use with CPersona
125
+
126
+ Run this server, then tell CPersona to use it:
127
+
128
+ ```bash
129
+ # CPersona MCP config env
130
+ CPERSONA_EMBEDDING_MODE=http
131
+ CPERSONA_EMBEDDING_URL=http://127.0.0.1:8401/embed
132
+ ```
133
+
134
+ Without an embedding server CPersona still works (FTS5 + keyword search); adding one enables the vector-similarity layer.
135
+
136
+ ## License
137
+
138
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,111 @@
1
+ <div align="center">
2
+
3
+ # CEmbedding
4
+
5
+ ### Local-first embedding server
6
+
7
+ Vector embeddings over a tiny HTTP contract.
8
+ On-device ONNX or any OpenAI-compatible API. The reference `/embed` server for [CPersona](https://github.com/Cloto-dev/CPersona).
9
+
10
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
11
+ [![Python](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
12
+
13
+ </div>
14
+
15
+ ---
16
+
17
+ > **Standalone repository** — extracted from the (now private) `clotohub-servers` monorepo so it can be used on its own. [ClotoCore](https://github.com/Cloto-dev/ClotoCore) users get this through the in-app marketplace ([ClotoHub](https://hub.cloto.dev)); everyone else can run it directly as described below.
18
+
19
+ ## What it is
20
+
21
+ A small server that turns text into vectors. It speaks a minimal HTTP contract so anything can call it — its primary consumer is [CPersona](https://github.com/Cloto-dev/CPersona), whose hybrid search uses it for the vector-similarity layer. It can run a model **on-device** via ONNX (no API key, no network) or proxy an **OpenAI-compatible API**.
22
+
23
+ It also exposes an MCP (stdio) surface and an optional persistent vector index (`/index`, `/search`), but the HTTP `/embed` endpoint is all CPersona needs.
24
+
25
+ ## The `/embed` contract
26
+
27
+ ```
28
+ POST /embed
29
+ Request: { "texts": ["string", ...] } # non-empty array, max 100 per batch
30
+ Response: { "embeddings": [[float, ...], ...], "dimensions": <int> }
31
+ ```
32
+
33
+ Point any client (e.g. CPersona's `CPERSONA_EMBEDDING_URL` / generic `EMBEDDING_HTTP_URL`) at `http://127.0.0.1:8401/embed`.
34
+
35
+ ## Quick Start (on-device ONNX)
36
+
37
+ **Prerequisites:** Python 3.10+
38
+
39
+ ```bash
40
+ # Download a model into ./data/models (jina-v5-nano is what CPersona is tuned for)
41
+ uvx --from "cembedding[onnx]" cembedding-download-model --model jina-v5-nano
42
+
43
+ # Run the server (reads ./data/models from the current directory)
44
+ EMBEDDING_PROVIDER=onnx_jina_v5_nano uvx --from "cembedding[onnx]" cembedding
45
+ ```
46
+
47
+ Or install it onto your PATH with `pip install "cembedding[onnx]"`, then run
48
+ `cembedding-download-model --model jina-v5-nano` and `cembedding`.
49
+
50
+ From source (development):
51
+
52
+ ```bash
53
+ git clone https://github.com/Cloto-dev/CEmbedding.git
54
+ cd CEmbedding
55
+ python -m venv .venv
56
+ source .venv/bin/activate # Windows: .venv\Scripts\activate
57
+ pip install ".[onnx]"
58
+ python -m cembedding.download_model --model jina-v5-nano
59
+ EMBEDDING_PROVIDER=onnx_jina_v5_nano python -m cembedding # or: python server.py
60
+ ```
61
+
62
+ You should see `HTTP embedding endpoint started on http://127.0.0.1:8401/embed`. Verify it:
63
+
64
+ ```bash
65
+ curl -s http://127.0.0.1:8401/embed \
66
+ -H 'content-type: application/json' \
67
+ -d '{"texts":["hello world"]}' | head -c 200
68
+ ```
69
+
70
+ ## Providers
71
+
72
+ Set `EMBEDDING_PROVIDER`:
73
+
74
+ | Value | Model | Notes |
75
+ |-------|-------|-------|
76
+ | `onnx_jina_v5_nano` | jina-v5-nano (33M, 768d) | Local CPU, what CPersona is benchmarked against |
77
+ | `onnx_bge_m3` | bge-m3 | Local CPU, larger / multilingual |
78
+ | `onnx_miniml` | all-MiniLM-L6-v2 (22M, 384d) | Local CPU, smallest |
79
+ | `mlx_bge_m3` | bge-m3 (MLX) | Apple Silicon only — `pip install ".[mlx]"` |
80
+ | `api_openai` | provider's model | OpenAI-compatible API; needs `EMBEDDING_API_KEY` (+ optional `EMBEDDING_API_URL`, `EMBEDDING_MODEL`) |
81
+
82
+ Download a local model with `cembedding-download-model --model {miniml,jina-v5-nano,bge-m3}` (or `python -m cembedding.download_model ...` from a source checkout). Models are fetched from Hugging Face into `./data/models` and are not committed to this repo.
83
+
84
+ ## Configuration
85
+
86
+ | Env var | Default | Description |
87
+ |---------|---------|-------------|
88
+ | `EMBEDDING_PROVIDER` | `api_openai` | Provider (see table above) |
89
+ | `EMBEDDING_HTTP_PORT` | `8401` | HTTP port for `/embed` |
90
+ | `EMBEDDING_INDEX_ENABLED` | `true` | Enable the persistent vector index endpoints (`/index`, `/search`, `/remove`, `/purge`) |
91
+ | `ONNX_MODEL_DIR` | (auto) | Override the model directory for ONNX providers |
92
+ | `ONNX_EP_PREFERENCE` | (auto) | ONNX execution providers, comma-separated. Empty = auto (CoreML on macOS, DirectML on Windows, else CPU; CPU always ensured) |
93
+ | `ONNX_MAX_SEQ_LEN` | `2048` | Max tokenization length (1–8192; MiniLM clamped to 512 internally) |
94
+ | `EMBEDDING_API_KEY` | — | Required for `api_openai` |
95
+ | `EMBEDDING_API_URL` | `https://api.openai.com/v1/embeddings` | API endpoint for `api_openai` |
96
+
97
+ ## Use with CPersona
98
+
99
+ Run this server, then tell CPersona to use it:
100
+
101
+ ```bash
102
+ # CPersona MCP config env
103
+ CPERSONA_EMBEDDING_MODE=http
104
+ CPERSONA_EMBEDDING_URL=http://127.0.0.1:8401/embed
105
+ ```
106
+
107
+ Without an embedding server CPersona still works (FTS5 + keyword search); adding one enables the vector-similarity layer.
108
+
109
+ ## License
110
+
111
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,3 @@
1
+ """CEmbedding — local-first embedding server (the reference /embed server for CPersona)."""
2
+
3
+ __version__ = "0.5.0"
@@ -0,0 +1,6 @@
1
+ """Enable ``python -m cembedding``."""
2
+
3
+ from cembedding.server import run
4
+
5
+ if __name__ == "__main__":
6
+ run()
@@ -0,0 +1,14 @@
1
+ """Vendored subset of the MGP Python common layer.
2
+
3
+ Ported from ``clotohub-servers/servers/common/`` so this server runs
4
+ standalone without depending on that (now private) monorepo:
5
+
6
+ - :mod:`cembedding._vendored_mcp_common.validation` — graceful-degradation argument
7
+ validators (``validate_bool`` / ``validate_str`` / ``validate_int`` /
8
+ ``validate_dict`` / ``validate_float`` / ``validate_list``).
9
+ - :mod:`cembedding._vendored_mcp_common.mcp_utils` — ``ToolRegistry`` MCP tool
10
+ registration helper.
11
+
12
+ Only the symbols this server actually imports are vendored; keep this
13
+ copy in sync with the upstream common layer when it changes.
14
+ """
@@ -0,0 +1,152 @@
1
+ """
2
+ Decorator-based MCP tool registration utility.
3
+ Eliminates boilerplate list_tools/call_tool patterns across all servers.
4
+ """
5
+
6
+ import json
7
+ import logging
8
+ from collections.abc import Callable
9
+
10
+ from mcp.server import Server
11
+ from mcp.server.stdio import stdio_server
12
+ from mcp.types import TextContent, Tool, ToolAnnotations
13
+
14
+ from cembedding._vendored_mcp_common.validation import validate_bool, validate_dict, validate_float, validate_int, validate_list, validate_str
15
+
16
+
17
class _MgpValidationFilter(logging.Filter):
    """Silence the MCP SDK's bulk "Failed to validate ..." warnings.

    MGP extensions (``mgp/callback/respond``, ``notifications/mgp.*``) are
    not part of the Python MCP SDK's ``ClientRequest`` union, so each one the
    kernel sends makes the SDK log a 30+ line ``Failed to validate request``
    warning listing every known method. The SDK's own error-response path
    already deals with these messages cleanly, so the warnings carry no
    information and only bury genuine errors.
    """

    def filter(self, record: logging.LogRecord) -> bool:
        """Return ``False`` (drop) for the SDK validation warnings, else ``True``."""
        noise_prefixes = (
            "Failed to validate request:",
            "Failed to validate notification:",
        )
        # str.startswith accepts a tuple and matches if any prefix matches.
        return not record.getMessage().startswith(noise_prefixes)
33
+
34
+
35
# Guards against installing the filter more than once per process.
_MGP_FILTER_INSTALLED = False


def install_mgp_validation_filter() -> None:
    """Install the MGP validation log filter (idempotent).

    The filter is attached to the root logger and to the
    ``mcp.shared.session`` logger. Called automatically by
    ``run_mcp_server``. Servers with a custom main loop (e.g. ones that
    also serve HTTP) should call this explicitly before entering
    ``stdio_server``.
    """
    global _MGP_FILTER_INSTALLED
    if _MGP_FILTER_INSTALLED:
        return
    mgp_filter = _MgpValidationFilter()
    # A filter attached to a logger is only consulted for records logged
    # directly on that logger; records propagated up from descendant loggers
    # bypass it. The root-logger filter therefore only covers warnings issued
    # through the root logger itself, so the SDK session module's own logger
    # is covered as well in case the warnings are emitted there.
    logging.getLogger().addFilter(mgp_filter)
    logging.getLogger("mcp.shared.session").addFilter(mgp_filter)
    _MGP_FILTER_INSTALLED = True
50
+
51
+
52
# Maps the Python type named in a ``ToolRegistry.auto_tool`` param spec to the
# validator used to extract a value of that type from the tool's arguments.
_VALIDATORS: dict[type, Callable] = {
    bool: validate_bool,
    str: validate_str,
    int: validate_int,
    float: validate_float,
    dict: validate_dict,
    list: validate_list,
}
60
+
61
+
62
class ToolRegistry:
    """Decorator-based MCP tool registration.

    Owns an MCP ``Server`` together with the ``Tool`` definitions and the
    name -> handler map that back the server's ``list_tools`` and
    ``call_tool`` hooks.
    """

    def __init__(self, server_name: str):
        """Create the underlying ``Server`` and bind the list/call hooks to it."""
        self.server = Server(server_name)
        self._tools: list[Tool] = []
        self._handlers: dict[str, Callable] = {}
        self._bind()

    def tool(
        self,
        name: str,
        description: str,
        schema: dict,
        annotations: ToolAnnotations | None = None,
    ):
        """Decorator: register a tool handler.

        The decorated function receives (arguments: dict) and returns a dict.
        JSON serialization and TextContent wrapping are handled automatically.

        *annotations* is forwarded to the MCP Tool schema. The kernel reads
        ``destructiveHint`` from annotations to trigger the HITL approval
        gate for destructive tools.
        """

        def decorator(fn):
            tool_kwargs = {"name": name, "description": description, "inputSchema": schema}
            if annotations is not None:
                tool_kwargs["annotations"] = annotations
            self._tools.append(Tool(**tool_kwargs))
            self._handlers[name] = fn
            return fn

        return decorator

    def auto_tool(
        self,
        name: str,
        description: str,
        schema: dict,
        handler: Callable,
        params: list[tuple],
        annotations: ToolAnnotations | None = None,
    ):
        """Register a tool with auto-validated parameter extraction.

        Each entry in *params* is ``(key, type)`` or ``(key, type, default)``.
        Supported types: ``bool``, ``str``, ``int``, ``float``, ``dict``,
        ``list``. A default of ``None`` is treated as "no default", so the
        validator's own fallback value applies.
        The extracted values are passed positionally to *handler*, which must
        be awaitable.

        An unsupported type is only detected when the tool is invoked: the
        resulting ``KeyError`` is reported to the client as an error payload.
        """

        async def _handler(arguments: dict) -> dict:
            args = []
            for spec in params:
                key, typ = spec[0], spec[1]
                default = spec[2] if len(spec) > 2 else None
                validator = _VALIDATORS[typ]
                if default is not None:
                    args.append(validator(arguments, key, default))
                else:
                    args.append(validator(arguments, key))
            return await handler(*args)

        self._tools.append(Tool(name=name, description=description, inputSchema=schema, annotations=annotations))
        self._handlers[name] = _handler

    def _bind(self):
        """Attach ``list_tools`` / ``call_tool`` implementations to the server."""
        registry = self

        @self.server.list_tools()
        async def list_tools() -> list[Tool]:
            return registry._tools

        @self.server.call_tool()
        async def call_tool(name: str, arguments: dict) -> list[TextContent]:
            handler = registry._handlers.get(name)
            if handler is None:
                # ensure_ascii=False keeps non-ASCII text readable, matching
                # the success path below.
                payload = json.dumps({"error": f"Unknown tool: {name}"}, ensure_ascii=False)
                return [TextContent(type="text", text=payload)]
            try:
                result = await handler(arguments)
                return [TextContent(type="text", text=json.dumps(result, ensure_ascii=False))]
            except Exception as e:
                # The client only receives str(e); keep the traceback
                # available at debug level for diagnosis.
                logging.getLogger(__name__).debug("Tool %r raised", name, exc_info=True)
                payload = json.dumps({"error": str(e)}, ensure_ascii=False)
                return [TextContent(type="text", text=payload)]
146
+
147
+
148
async def run_mcp_server(registry: ToolRegistry):
    """Serve *registry*'s MCP server over the stdio transport.

    Installs the MGP validation log filter first, then runs the server on
    the stream pair yielded by ``stdio_server``.
    """
    install_mgp_validation_filter()
    mcp_server = registry.server
    async with stdio_server() as streams:
        reader, writer = streams
        init_options = mcp_server.create_initialization_options()
        await mcp_server.run(reader, writer, init_options)
@@ -0,0 +1,65 @@
1
+ """Common argument validation helpers for MCP tool handlers.
2
+
3
+ All validators return a safe default on type mismatch (graceful degradation).
4
+ """
5
+
6
+
7
def validate_bool(arguments: dict, key: str, default: bool = False) -> bool:
    """Return ``arguments[key]`` when it is a ``bool``; otherwise *default*.

    A missing key and a value of any other type both yield *default*.
    """
    candidate = arguments.get(key, default)
    return candidate if isinstance(candidate, bool) else default
13
+
14
+
15
def validate_str(arguments: dict, key: str, default: str = "") -> str:
    """Return ``arguments[key]`` when it is a ``str``; otherwise *default*.

    A missing key and a value of any other type both yield *default*.
    """
    candidate = arguments.get(key, default)
    return candidate if isinstance(candidate, str) else default
21
+
22
+
23
def validate_int(arguments: dict, key: str, default: int = 0) -> int:
    """Return ``arguments[key]`` when it is a genuine ``int``; otherwise *default*.

    ``bool`` values are rejected even though ``bool`` subclasses ``int``
    (``isinstance(True, int)`` is ``True``).
    """
    candidate = arguments.get(key, default)
    if isinstance(candidate, int) and not isinstance(candidate, bool):
        return candidate
    return default
32
+
33
+
34
+ def validate_dict(arguments: dict, key: str, default: dict | None = None) -> dict:
35
+ """Extract a dict value, returning *default* (or ``{}``) if missing or wrong type."""
36
+ if default is None:
37
+ default = {}
38
+ val = arguments.get(key, default)
39
+ if not isinstance(val, dict):
40
+ return default
41
+ return val
42
+
43
+
44
def validate_float(arguments: dict, key: str, default: float = 0.0) -> float:
    """Return ``arguments[key]`` as a ``float``; otherwise *default*.

    Both ``float`` and ``int`` values are accepted (JSON integers are valid
    float inputs) and converted with ``float()``. ``bool`` values are
    rejected. On rejection *default* is returned as given, unconverted.
    """
    candidate = arguments.get(key, default)
    is_number = isinstance(candidate, (int, float)) and not isinstance(candidate, bool)
    if is_number:
        return float(candidate)
    return default
56
+
57
+
58
+ def validate_list(arguments: dict, key: str, default: list | None = None) -> list:
59
+ """Extract a list value, returning *default* (or ``[]``) if missing or wrong type."""
60
+ if default is None:
61
+ default = []
62
+ val = arguments.get(key, default)
63
+ if not isinstance(val, list):
64
+ return default
65
+ return val
@@ -0,0 +1,132 @@
1
+ """Download ONNX embedding models for local inference."""
2
+
3
+ import argparse
4
+ import os
5
+ import sys
6
+ import urllib.request
7
+
8
+
9
def _hf_download(repo_id: str, repo_filename: str, dest_path: str) -> bool:
    """Download a single file from HuggingFace Hub to *dest_path*.

    Uses huggingface_hub if available (handles LFS, caching, auth).
    Falls back to direct urllib download for minimal-dependency environments.

    The data is staged in ``<dest_path>.part`` and moved into place only once
    the transfer has completed, so an interrupted run never leaves a
    truncated file that a later run would report as "Already exists".

    Args:
        repo_id: HuggingFace repository id, e.g. ``"Xenova/bge-m3"``.
        repo_filename: Path of the file inside the repository.
        dest_path: Local path to write the file to.

    Returns:
        ``True`` if the file already existed or was downloaded; ``False`` if
        the download failed (the reason is printed to stderr).
    """
    if os.path.exists(dest_path):
        print(f"  Already exists: {dest_path}")
        return True

    # dirname() is "" for a bare filename, and os.makedirs("") raises.
    parent_dir = os.path.dirname(dest_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    print(f"  Downloading: {repo_filename} ...")

    # Only the import decides which backend is used; a failure inside the
    # backend itself is reported below rather than escaping as an exception.
    try:
        from huggingface_hub import hf_hub_download
    except ImportError:
        hf_hub_download = None

    staging_path = f"{dest_path}.part"
    try:
        if hf_hub_download is not None:
            import shutil

            cached = hf_hub_download(repo_id=repo_id, filename=repo_filename)
            shutil.copy2(cached, staging_path)
        else:
            # Fallback: direct URL
            url = f"https://huggingface.co/{repo_id}/resolve/main/{repo_filename}"
            urllib.request.urlretrieve(url, staging_path)
        os.replace(staging_path, dest_path)
    except Exception as e:
        print(f"  Failed: {e}", file=sys.stderr)
        return False
    finally:
        # Also runs on KeyboardInterrupt, so no partial file is left behind.
        if os.path.exists(staging_path):
            os.remove(staging_path)

    size_mb = os.path.getsize(dest_path) / (1024 * 1024)
    print(f"  Saved: {dest_path} ({size_mb:.1f} MB)")
    return True
47
+
48
+
49
# Each *_FILES dict below maps the local filename (written inside the model
# directory) to the file's path within the HuggingFace repository.

# MiniLM
# ONNX_MODEL_DIR is read once, when this module is imported.
MINIML_DIR = os.environ.get("ONNX_MODEL_DIR", "data/models/all-MiniLM-L6-v2")
MINIML_REPO = "sentence-transformers/all-MiniLM-L6-v2"
MINIML_FILES = {
    "model.onnx": "onnx/model.onnx",
    "tokenizer.json": "tokenizer.json",
}

# jina-v5-nano (retrieval variant with merged LoRA, external data format)
JINA_REPO = "jinaai/jina-embeddings-v5-text-nano-retrieval"
JINA_FILES = {
    "model.onnx": "onnx/model.onnx",
    "model.onnx_data": "onnx/model.onnx_data",
    "tokenizer.json": "tokenizer.json",
}

# bge-m3 (Xenova int8 single-file, ~542MB)
# Xenova/bge-m3 is the canonical Transformers.js ONNX conversion maintained by HuggingFace
# The int8 file in the repo is stored locally under the plain name model.onnx.
BGE_M3_REPO = "Xenova/bge-m3"
BGE_M3_FILES = {
    "model.onnx": "onnx/model_int8.onnx",
    "tokenizer.json": "tokenizer.json",
    "sentencepiece.bpe.model": "sentencepiece.bpe.model",
}
73
+
74
+
75
def _download_repo_files(repo_id: str, files: dict[str, str], model_dir: str) -> bool:
    """Fetch every file listed in *files* from *repo_id* into *model_dir*.

    *files* maps the local filename (created inside *model_dir*) to the
    file's path within the repository. Files are fetched in dict order and
    the first failure stops the run.

    Returns ``True`` when every file is present afterwards, else ``False``.
    """
    os.makedirs(model_dir, exist_ok=True)
    # all() stops at the first falsy result, so nothing after a failed
    # download is attempted.
    return all(
        _hf_download(repo_id, remote_path, os.path.join(model_dir, local_name))
        for local_name, remote_path in files.items()
    )
83
+
84
+
85
def download():
    """Download the all-MiniLM-L6-v2 files (legacy entrypoint).

    Returns the result of ``_download_repo_files`` for the MiniLM repo,
    file map and directory constants.
    """
    print("=== Downloading all-MiniLM-L6-v2 ONNX model ===")
    succeeded = _download_repo_files(MINIML_REPO, MINIML_FILES, MINIML_DIR)
    if not succeeded:
        return succeeded
    print(f"Model ready at {MINIML_DIR}")
    return succeeded
92
+
93
+
94
def download_jina_v5_nano(model_dir: str = "") -> bool:
    """Download the jina-embeddings-v5-text-nano-retrieval ONNX files.

    When *model_dir* is empty, the directory comes from ``ONNX_MODEL_DIR``,
    falling back to ``data/models/jina-embeddings-v5-text-nano``.
    """
    target_dir = model_dir or os.environ.get("ONNX_MODEL_DIR", "data/models/jina-embeddings-v5-text-nano")
    print("=== Downloading jina-embeddings-v5-text-nano-retrieval ===")
    succeeded = _download_repo_files(JINA_REPO, JINA_FILES, target_dir)
    if not succeeded:
        return succeeded
    print(f"Model ready at {target_dir}")
    return succeeded
103
+
104
+
105
def download_bge_m3(model_dir: str = "") -> bool:
    """Download the BAAI/bge-m3 int8 ONNX files (~542MB) from Xenova/bge-m3.

    When *model_dir* is empty, the directory comes from ``ONNX_MODEL_DIR``,
    falling back to ``data/models/bge-m3``.
    """
    target_dir = model_dir or os.environ.get("ONNX_MODEL_DIR", "data/models/bge-m3")
    print("=== Downloading BAAI/bge-m3 ONNX int8 (~542 MB) from Xenova/bge-m3 ===")
    succeeded = _download_repo_files(BGE_M3_REPO, BGE_M3_FILES, target_dir)
    if not succeeded:
        return succeeded
    print(f"Model ready at {target_dir}")
    return succeeded
114
+
115
+
116
def main():
    """Console-script / ``python -m cembedding.download_model`` entry point.

    Parses ``--model`` (default ``miniml``), runs the matching downloader and
    exits with status 0 on success or 1 on failure.
    """
    cli = argparse.ArgumentParser(description="Download ONNX embedding models")
    cli.add_argument("--model", default="miniml", choices=["miniml", "jina-v5-nano", "bge-m3"])
    chosen = cli.parse_args().model

    downloaders = {
        "miniml": download,
        "jina-v5-nano": download_jina_v5_nano,
    }
    # Any other accepted choice (i.e. bge-m3) uses the bge-m3 downloader.
    fetch = downloaders.get(chosen, download_bge_m3)
    sys.exit(0 if fetch() else 1)
129
+
130
+
131
+ if __name__ == "__main__":
132
+ main()