dpone-native-accel 0.29.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,31 @@
1
+ Metadata-Version: 2.4
2
+ Name: dpone-native-accel
3
+ Version: 0.29.0
4
+ Summary: Optional native acceleration provider for dpone native transfer
5
+ Author: PaulKov
6
+ License-Expression: Apache-2.0
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: License :: OSI Approved :: Apache Software License
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Typing :: Typed
13
+ Maintainer: PaulKov
14
+ Requires-Python: >=3.11, <3.13
15
+ Project-URL: Homepage, https://github.com/PaulKov/dpone
16
+ Project-URL: Repository, https://github.com/PaulKov/dpone
17
+ Description-Content-Type: text/markdown
18
+
19
+ # dpone-native-accel
20
+
21
+ Optional provider package for dpone native-transfer acceleration.
22
+
23
+ The package exposes the stable provider boundary used by `dpone[accel]`.
24
+ Backends declare certified capabilities before dpone can select them for
25
+ `native_transfer.wire.acceleration.mode: auto|required`.
26
+
27
+ The v0.29 provider includes a fused MSSQL BCP native -> ClickHouse Native
28
+ encoder for certified primitive, decimal, temporal, UUID, binary, and text
29
+ layouts. It avoids the core row-dict reference pipeline while preserving
30
+ byte-for-byte equivalence with the Python reference path. Unsupported layouts
31
+ fall back to the reference path unless acceleration is explicitly required.
@@ -0,0 +1,13 @@
1
+ # dpone-native-accel
2
+
3
+ Optional provider package for dpone native-transfer acceleration.
4
+
5
+ The package exposes the stable provider boundary used by `dpone[accel]`.
6
+ Backends declare certified capabilities before dpone can select them for
7
+ `native_transfer.wire.acceleration.mode: auto|required`.
8
+
9
+ The v0.29 provider includes a fused MSSQL BCP native -> ClickHouse Native
10
+ encoder for certified primitive, decimal, temporal, UUID, binary, and text
11
+ layouts. It avoids the core row-dict reference pipeline while preserving
12
+ byte-for-byte equivalence with the Python reference path. Unsupported layouts
13
+ fall back to the reference path unless acceleration is explicitly required.
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["uv_build>=0.9.18,<0.12.0"]
3
+ build-backend = "uv_build"
4
+
5
+ [project]
6
+ name = "dpone-native-accel"
7
+ version = "0.29.0"
8
+ description = "Optional native acceleration provider for dpone native transfer"
9
+ readme = "README.md"
10
+ requires-python = ">=3.11,<3.13"
11
+ license = "Apache-2.0"
12
+ authors = [{ name = "PaulKov" }]
13
+ maintainers = [{ name = "PaulKov" }]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "License :: OSI Approved :: Apache Software License",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Typing :: Typed",
21
+ ]
22
+
23
+ [project.urls]
24
+ Homepage = "https://github.com/PaulKov/dpone"
25
+ Repository = "https://github.com/PaulKov/dpone"
@@ -0,0 +1,7 @@
1
+ """Optional dpone native acceleration provider."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dpone_native_accel._provider import SCHEMA_VERSION, __version__, capabilities, transcode
6
+
7
+ __all__ = ["SCHEMA_VERSION", "__version__", "capabilities", "transcode"]
@@ -0,0 +1,336 @@
1
+ """Fused MSSQL BCP native to ClickHouse Native block encoder."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ import struct
7
+ import uuid
8
+ from collections.abc import Iterator, Mapping, Sequence
9
+ from dataclasses import dataclass
10
+ from datetime import date
11
+ from pathlib import Path
12
+ from typing import Any, BinaryIO
13
+
14
+ _DAYS_TO_UNIX_EPOCH = (date(1970, 1, 1) - date(1, 1, 1)).days
15
+ _DAYS_TO_SQL_SERVER_EPOCH = (date(1970, 1, 1) - date(1900, 1, 1)).days
16
+
17
+
18
+ @dataclass(frozen=True, slots=True)
19
+ class ColumnLayout:
20
+ name: str
21
+ storage_type: str
22
+ clickhouse_type: str
23
+ nullable: bool
24
+ prefix_width: int
25
+ fixed_length: int | None
26
+ precision: int | None
27
+ scale: int | None
28
+ encoding: str
29
+
30
+ @classmethod
31
+ def from_contract(cls, raw: Mapping[str, Any], target_type: str | None) -> ColumnLayout:
32
+ return cls(
33
+ name=str(raw["name"]),
34
+ storage_type=str(raw["storage_type"]).lower(),
35
+ clickhouse_type=str(target_type or raw["target_type"]),
36
+ nullable=bool(raw.get("nullable")),
37
+ prefix_width=int(raw.get("prefix_width") or 0),
38
+ fixed_length=int(raw["fixed_length"]) if raw.get("fixed_length") is not None else None,
39
+ precision=int(raw["precision"]) if raw.get("precision") is not None else None,
40
+ scale=int(raw["scale"]) if raw.get("scale") is not None else None,
41
+ encoding=str(raw.get("encoding") or "utf-8"),
42
+ )
43
+
44
+
45
class MssqlBcpClickHouseNativeBackend:
    """Stream an MSSQL native artifact file as encoded ClickHouse Native blocks."""

    def __init__(
        self,
        *,
        artifact_path: Path,
        columns: Sequence[ColumnLayout],
        block_rows: int,
        block_bytes: int | None,
    ) -> None:
        """Store the transfer settings.

        Args:
            artifact_path: File holding the source rows.
            columns: Column layouts in on-disk order.
            block_rows: Maximum rows per emitted block; values below 1 become 1.
            block_bytes: Optional soft cap on the estimated payload bytes per block.
        """
        self._artifact_path = artifact_path
        self._columns = tuple(columns)
        self._block_rows = max(1, block_rows)
        self._block_bytes = block_bytes

    @classmethod
    def from_request(cls, request: Mapping[str, Any]) -> MssqlBcpClickHouseNativeBackend:
        """Build a backend from a transcode request mapping.

        Target types from ``clickhouse_schema`` are matched to contract columns
        by position; columns beyond the end of that schema fall back to the
        contract's own ``target_type``.

        Raises:
            ValueError: If a contract section or column entry is not a mapping.
            KeyError: If ``artifact_path`` or a required column key is missing.
        """
        wire_contract = _mapping(request.get("native_wire_contract"))
        bulk_contract = _mapping(request.get("bulk_wire_contract"))
        schema_pairs = [(str(name), str(dtype)) for name, dtype in request.get("clickhouse_schema") or ()]
        target_types = [dtype for _, dtype in schema_pairs]
        # Validate every column entry before any layout is built.
        raw_columns = [_mapping(entry) for entry in wire_contract.get("columns") or ()]
        layouts = []
        for position, raw in enumerate(raw_columns):
            override = target_types[position] if position < len(target_types) else None
            layouts.append(ColumnLayout.from_contract(raw, override))
        return cls(
            artifact_path=Path(str(request["artifact_path"])),
            columns=tuple(layouts),
            block_rows=int(bulk_contract.get("block_rows") or 65_536),
            block_bytes=_parse_bytes(bulk_contract.get("block_bytes")),
        )

    def iter_blocks(self) -> Iterator[bytes]:
        """Yield encoded blocks until the artifact file is exhausted.

        A block is closed when it reaches the row limit, when its estimated
        size reaches the byte limit, or at end of file. Nothing is yielded for
        an empty file.
        """
        with self._artifact_path.open("rb") as source:
            while True:
                builder = _BlockBuilder(self._columns)
                pending_bytes = 0
                while len(builder) < self._block_rows and not _eof(source):
                    row = [_read_cell(source, column) for column in self._columns]
                    # One extra byte per cell is counted on top of the payload size.
                    pending_bytes += sum(1 + len(cell or b"") for cell in row)
                    builder.append(row)
                    if self._block_bytes is not None and pending_bytes >= self._block_bytes:
                        break
                if not builder:
                    return
                yield builder.to_bytes()
93
+
94
+
95
class _BlockBuilder:
    """Accumulate rows column by column and serialize them as one block."""

    def __init__(self, columns: Sequence[ColumnLayout]) -> None:
        """Create empty per-column buffers for ``columns``."""
        self._columns = tuple(columns)
        # Size the buffers from the materialized tuple so a one-shot iterable
        # passed as ``columns`` cannot leave them empty.
        self._null_maps = [bytearray() for _ in self._columns]
        self._data = [bytearray() for _ in self._columns]
        self._rows = 0
        # The Nullable(...) wrapper is fixed per column, so parse it once here
        # instead of once per cell inside append().
        self._nullability = tuple(_unwrap_nullable(column.clickhouse_type) for column in self._columns)

    def __len__(self) -> int:
        """Return the number of rows appended so far."""
        return self._rows

    def append(self, values: Sequence[bytes | None]) -> None:
        """Encode one row of raw cells into the column buffers.

        Args:
            values: One raw payload per column, ``None`` for NULL.

        Raises:
            ValueError: If a NULL arrives for a non-nullable target column, or
                a cell cannot be encoded for its target type.
        """
        for index, value in enumerate(values):
            column = self._columns[index]
            nullable, inner_type = self._nullability[index]
            if nullable:
                self._null_maps[index].append(1 if value is None else 0)
                # A NULL still occupies a slot in the data buffer, filled with zero bytes.
                self._data[index].extend(
                    _default_value(inner_type) if value is None else _encode_value(value, column, inner_type)
                )
            elif value is None:
                raise ValueError(f"native_acceleration_null_for_non_nullable:{column.name}")
            else:
                self._data[index].extend(_encode_value(value, column, column.clickhouse_type))
        self._rows += 1

    def to_bytes(self) -> bytes:
        """Serialize the block: column count, row count, then each column.

        Each column is written as name, type string, null map (empty for
        non-nullable columns) and data.
        """
        payload = bytearray()
        payload.extend(_var_uint(len(self._columns)))
        payload.extend(_var_uint(self._rows))
        for index, column in enumerate(self._columns):
            payload.extend(_ch_string(column.name))
            payload.extend(_ch_string(column.clickhouse_type))
            payload.extend(self._null_maps[index])
            payload.extend(self._data[index])
        return bytes(payload)
130
+
131
+
132
def _read_cell(handle: BinaryIO, column: ColumnLayout) -> bytes | None:
    """Read one raw cell for ``column`` from ``handle``.

    When the column has a length prefix, the prefix is read as a signed
    little-endian integer; ``-1`` is treated as NULL and any other value as
    the payload length. Without a prefix the column's fixed length is used.

    Returns:
        The raw payload bytes, or ``None`` for a NULL cell.

    Raises:
        ValueError: If no length is available or the length is negative.
        EOFError: If the stream ends before the prefix or payload is complete.
    """
    length = column.fixed_length
    if column.prefix_width:
        indicator = int.from_bytes(_read_exact(handle, column.prefix_width), byteorder="little", signed=True)
        if indicator == -1:
            return None
        length = indicator
    if length is None:
        raise ValueError(f"native_acceleration_missing_length:{column.name}")
    if length < 0:
        # Never hand a negative size to read(): depending on the stream it
        # either reads to end of file or fails with an unrelated message.
        raise ValueError(f"native_acceleration_invalid_length:{column.name}")
    return _read_exact(handle, length)
142
+
143
+
144
# Byte width of each fixed-width ClickHouse numeric type whose payload is copied verbatim.
_PASSTHROUGH_WIDTHS = {
    "int8": 1,
    "uint8": 1,
    "int16": 2,
    "uint16": 2,
    "int32": 4,
    "uint32": 4,
    "int64": 8,
    "uint64": 8,
    "float32": 4,
    "float64": 8,
}


def _encode_value(payload: bytes, column: ColumnLayout, clickhouse_type: str) -> bytes:
    """Encode one non-NULL source payload for the given ClickHouse type.

    Args:
        payload: Raw cell bytes as read from the artifact.
        column: Layout of the source column (storage type, scale, encoding).
        clickhouse_type: Target type with any ``Nullable(...)`` wrapper removed.

    Returns:
        The bytes to append to the column's data buffer.

    Raises:
        ValueError: If the target type is unsupported, or a fixed-width
            numeric payload does not match the target type's width.
    """
    root = _root_type(clickhouse_type)
    storage = column.storage_type
    if root == "bool":
        return struct.pack("<B", 1 if payload[0] else 0)
    expected_width = _PASSTHROUGH_WIDTHS.get(root)
    if expected_width is not None:
        # The payload is copied as-is (assumes the source bytes are already in
        # the target's layout - TODO confirm), so a width mismatch would
        # misalign every later value in the column. Fail instead.
        if len(payload) != expected_width:
            raise ValueError(f"native_acceleration_width_mismatch:{column.name}")
        return payload
    if root.startswith("decimal"):
        return _encode_decimal(payload, column, clickhouse_type)
    if root == "date":
        return _encode_date(payload)
    if root == "datetime64":
        return _encode_datetime64(payload, column, _scale(clickhouse_type))
    if root == "uuid":
        return _encode_uuid(payload)
    if root == "string":
        if storage in {"nvarchar", "nchar"}:
            # Wide-character sources are transcoded to UTF-8; all other
            # sources are written unchanged.
            value = payload.decode(column.encoding).encode("utf-8")
        else:
            value = payload
        return _var_uint(len(value)) + value
    raise ValueError(f"native_acceleration_unsupported_clickhouse_type:{clickhouse_type}")
166
+
167
+
168
def _default_value(clickhouse_type: str) -> bytes:
    """Return the zero-filled placeholder written for a NULL cell.

    Raises:
        ValueError: If the ClickHouse type is not supported.
    """
    root = _root_type(clickhouse_type)
    if root.startswith("decimal"):
        # Decimal placeholders are as wide as the declared precision requires.
        precision, _unused_scale = _decimal_precision_scale(clickhouse_type)
        return bytes(_decimal_width(precision))
    # Number of zero bytes per type; a string placeholder is a single zero byte.
    zero_widths = {
        "string": 1,
        "uuid": 16,
        "date": 2,
        "datetime64": 8,
        "bool": 1,
        "int8": 1,
        "uint8": 1,
        "int16": 2,
        "uint16": 2,
        "int32": 4,
        "uint32": 4,
        "int64": 8,
        "uint64": 8,
        "float32": 4,
        "float64": 8,
    }
    width = zero_widths.get(root)
    if width is None:
        raise ValueError(f"native_acceleration_unsupported_clickhouse_type:{clickhouse_type}")
    return bytes(width)
197
+
198
+
199
def _encode_decimal(payload: bytes, column: ColumnLayout, clickhouse_type: str) -> bytes:
    """Encode a money or decimal payload as a little-endian scaled integer.

    The source value is read as an integer at its source scale, rescaled to
    the target type's scale, and written at the width implied by the target
    precision.

    Raises:
        ValueError: If the source storage type is not a supported decimal source.
        OverflowError: If the rescaled value does not fit the target width.
    """
    precision, target_scale = _decimal_precision_scale(clickhouse_type)
    storage = column.storage_type
    if storage == "money":
        scaled = _money_integer(payload)
        # SQL Server money types hold ten-thousandths of a unit (scale 4).
        source_scale = 4
    elif storage == "smallmoney":
        scaled = struct.unpack("<i", payload)[0]
        source_scale = 4
    elif storage in {"decimal", "numeric"}:
        # Payload is read as: byte 1 = scale, byte 2 = sign flag (1 means
        # positive), bytes 3+ = unsigned little-endian magnitude.
        source_scale = int(column.scale if column.scale is not None else payload[1])
        sign = 1 if payload[2] == 1 else -1
        scaled = sign * int.from_bytes(payload[3:], byteorder="little", signed=False)
    else:
        raise ValueError(f"native_acceleration_unsupported_decimal_source:{column.storage_type}")
    # Rescale every source kind, money included; otherwise a money column
    # mapped to a Decimal whose scale is not 4 would be off by a power of ten.
    if target_scale > source_scale:
        scaled *= 10 ** (target_scale - source_scale)
    elif target_scale < source_scale:
        scaled //= 10 ** (source_scale - target_scale)
    return int(scaled).to_bytes(_decimal_width(precision), byteorder="little", signed=True)
216
+
217
+
218
def _encode_date(payload: bytes) -> bytes:
    """Convert a little-endian day count from 0001-01-01 to an unsigned 16-bit Unix day count.

    Raises:
        struct.error: If the resulting day number does not fit in 16 unsigned bits.
    """
    source_days = int.from_bytes(payload, byteorder="little", signed=False)
    unix_days = source_days - _DAYS_TO_UNIX_EPOCH
    return struct.pack("<H", unix_days)
221
+
222
+
223
def _encode_datetime64(payload: bytes, column: ColumnLayout, target_scale: int) -> bytes:
    """Encode a datetime2, datetime or smalldatetime payload as signed 64-bit ticks.

    Args:
        payload: Raw source cell bytes.
        column: Source layout; ``storage_type`` selects the decoding rule.
        target_scale: Number of fractional-second digits in the target type.

    Returns:
        Eight little-endian bytes holding ticks since the Unix epoch at
        ``target_scale``.

    Raises:
        ValueError: If the source storage type is not a supported datetime source.
    """
    storage = column.storage_type
    if storage == "datetime2":
        # The last three bytes are the day count; everything before is the time part.
        time_length = (column.fixed_length or len(payload)) - 3
        # Check for None explicitly: a declared scale of 0 is a real scale,
        # not a missing one, and must not be replaced by the target scale.
        source_scale = int(column.scale) if column.scale is not None else target_scale
        time_ticks = int.from_bytes(payload[:time_length], byteorder="little", signed=False)
        days = int.from_bytes(payload[time_length:], byteorder="little", signed=False) - _DAYS_TO_UNIX_EPOCH
        ticks = days * 86400 * (10**source_scale) + time_ticks
        return struct.pack("<q", _rescale_ticks(ticks, source_scale, target_scale))
    if storage == "datetime":
        days, sql_ticks = struct.unpack("<ii", payload)
        # Source ticks are 1/300 s; round to the nearest millisecond.
        milliseconds = (sql_ticks * 1000 + 150) // 300
        ticks = (days - _DAYS_TO_SQL_SERVER_EPOCH) * 86400 * 1000 + milliseconds
        return struct.pack("<q", _rescale_ticks(ticks, 3, target_scale))
    if storage == "smalldatetime":
        days, minutes = struct.unpack("<HH", payload)
        ticks = ((days - _DAYS_TO_SQL_SERVER_EPOCH) * 86400 + minutes * 60) * (10**target_scale)
        return struct.pack("<q", ticks)
    raise ValueError(f"native_acceleration_unsupported_datetime_source:{column.storage_type}")
246
+
247
+
248
+ def _encode_uuid(payload: bytes) -> bytes:
249
+ raw = uuid.UUID(bytes_le=payload).bytes
250
+ return raw[:8][::-1] + raw[8:][::-1]
251
+
252
+
253
+ def _rescale_ticks(value: int, source_scale: int, target_scale: int) -> int:
254
+ if target_scale > source_scale:
255
+ return value * (10 ** (target_scale - source_scale))
256
+ if target_scale < source_scale:
257
+ return value // (10 ** (source_scale - target_scale))
258
+ return value
259
+
260
+
261
+ def _money_integer(payload: bytes) -> int:
262
+ high = int.from_bytes(payload[:4], byteorder="little", signed=True)
263
+ low = int.from_bytes(payload[4:], byteorder="little", signed=False)
264
+ return (high << 32) + low
265
+
266
+
267
+ def _decimal_precision_scale(clickhouse_type: str) -> tuple[int, int]:
268
+ match = re.search(r"\((\d+)\s*,\s*(\d+)\)", clickhouse_type)
269
+ if not match:
270
+ return 38, 9
271
+ return int(match.group(1)), int(match.group(2))
272
+
273
+
274
+ def _decimal_width(precision: int) -> int:
275
+ return 4 if precision <= 9 else 8 if precision <= 18 else 16 if precision <= 38 else 32
276
+
277
+
278
+ def _scale(clickhouse_type: str) -> int:
279
+ match = re.search(r"\((\d+)", clickhouse_type)
280
+ return min(max(int(match.group(1)), 0), 9) if match else 0
281
+
282
+
283
+ def _unwrap_nullable(clickhouse_type: str) -> tuple[bool, str]:
284
+ value = clickhouse_type.strip()
285
+ if value.lower().startswith("nullable(") and value.endswith(")"):
286
+ return True, value[len("Nullable(") : -1].strip()
287
+ return False, value
288
+
289
+
290
+ def _root_type(clickhouse_type: str) -> str:
291
+ return clickhouse_type.strip().split("(", 1)[0].strip().lower()
292
+
293
+
294
def _ch_string(value: str) -> bytes:
    """Encode ``value`` as UTF-8 prefixed with its byte length as a varint."""
    encoded = value.encode("utf-8")
    length_prefix = _var_uint(len(encoded))
    return b"".join((length_prefix, encoded))
297
+
298
+
299
+ def _var_uint(value: int) -> bytes:
300
+ output = bytearray()
301
+ current = int(value)
302
+ while current >= 0x80:
303
+ output.append((current & 0x7F) | 0x80)
304
+ current >>= 7
305
+ output.append(current)
306
+ return bytes(output)
307
+
308
+
309
+ def _read_exact(handle: BinaryIO, length: int) -> bytes:
310
+ payload = handle.read(length)
311
+ if len(payload) != length:
312
+ raise EOFError("native_acceleration_unexpected_eof")
313
+ return payload
314
+
315
+
316
+ def _eof(handle: BinaryIO) -> bool:
317
+ position = handle.tell()
318
+ payload = handle.read(1)
319
+ handle.seek(position)
320
+ return not payload
321
+
322
+
323
+ def _mapping(value: Any) -> Mapping[str, Any]:
324
+ if not isinstance(value, Mapping):
325
+ raise ValueError("native_acceleration_invalid_request")
326
+ return value
327
+
328
+
329
+ def _parse_bytes(value: Any) -> int | None:
330
+ if value is None:
331
+ return None
332
+ text = str(value).strip()
333
+ units = {"kib": 1024, "mib": 1024**2, "gib": 1024**3, "kb": 1000, "mb": 1000**2, "gb": 1000**3}
334
+ suffix = "".join(re.findall(r"[A-Za-z]+", text)).lower()
335
+ number = text[: len(text) - len(suffix)] if suffix else text
336
+ return int(float(number.strip()) * units.get(suffix, 1))
@@ -0,0 +1,61 @@
1
+ """Public provider facade for dpone native acceleration."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterable, Mapping
6
+ from typing import Any
7
+
8
+ from dpone_native_accel._mssql_clickhouse_native import MssqlBcpClickHouseNativeBackend
9
+
10
+ __version__ = "0.29.0"
11
+ SCHEMA_VERSION = "dpone.native_transfer.acceleration.v1"
12
+ BACKEND_ID = "mssql_bcp_native_to_clickhouse_native"
13
+ SUPPORTED_TYPES = (
14
+ "bit",
15
+ "tinyint",
16
+ "smallint",
17
+ "int",
18
+ "bigint",
19
+ "real",
20
+ "float",
21
+ "money",
22
+ "smallmoney",
23
+ "decimal",
24
+ "numeric",
25
+ "date",
26
+ "datetime",
27
+ "datetime2",
28
+ "smalldatetime",
29
+ "uniqueidentifier",
30
+ "binary",
31
+ "varbinary",
32
+ "char",
33
+ "varchar",
34
+ "nchar",
35
+ "nvarchar",
36
+ )
37
+
38
+
39
def capabilities() -> dict[str, Any]:
    """Describe the acceleration backends this provider offers.

    Returns a fresh dictionary on every call holding the schema version, the
    package version and a single backend entry.
    """

    backend_entry = {
        "backend_id": BACKEND_ID,
        "source_format": "mssql-bcp-native",
        "target_format": "Native",
        "certified": True,
        "supported_platforms": ["linux_x86_64", "macos_arm64", "macos_x86_64"],
        # Copy into a new list so callers cannot mutate the module constant.
        "supported_types": [*SUPPORTED_TYPES],
    }
    return {
        "schema_version": SCHEMA_VERSION,
        "package_version": __version__,
        "backends": [backend_entry],
    }
56
+
57
+
58
def transcode(request: Mapping[str, Any]) -> Iterable[bytes]:
    """Build a backend from ``request`` and return its lazy iterator of encoded blocks."""

    backend = MssqlBcpClickHouseNativeBackend.from_request(request)
    return backend.iter_blocks()