ygo 1.0.2__py3-none-any.whl → 1.2.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ygo/__init__.py +30 -9
- ygo/delay.py +89 -0
- ygo/exceptions.py +17 -1
- ygo/lazy.py +50 -0
- ygo/pool.py +286 -0
- ygo/utils.py +230 -0
- ygo-1.2.12.dist-info/METADATA +119 -0
- ygo-1.2.12.dist-info/RECORD +11 -0
- {ygo-1.0.2.dist-info → ygo-1.2.12.dist-info}/WHEEL +1 -1
- ygo-1.2.12.dist-info/top_level.txt +1 -0
- ycat/__init__.py +0 -34
- ycat/client.py +0 -142
- ycat/dtype.py +0 -389
- ycat/parse.py +0 -66
- ycat/yck.py +0 -87
- ygo/ygo.py +0 -372
- ygo-1.0.2.dist-info/METADATA +0 -94
- ygo-1.0.2.dist-info/RECORD +0 -15
- ygo-1.0.2.dist-info/top_level.txt +0 -3
- ylog/__init__.py +0 -20
- ylog/core.py +0 -226
- {ygo-1.0.2.dist-info → ygo-1.2.12.dist-info}/licenses/LICENSE +0 -0
ycat/dtype.py
DELETED
|
@@ -1,389 +0,0 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
"""
|
|
3
|
-
---------------------------------------------
|
|
4
|
-
Created on 2024/11/4 下午1:20
|
|
5
|
-
@author: ZhangYundi
|
|
6
|
-
@email: yundi.xxii@outlook.com
|
|
7
|
-
---------------------------------------------
|
|
8
|
-
"""
|
|
9
|
-
import functools
|
|
10
|
-
import re
|
|
11
|
-
from typing import Any
|
|
12
|
-
import pyarrow as pa
|
|
13
|
-
import re # 正则解析 Decimal 类型
|
|
14
|
-
|
|
15
|
-
from polars._typing import PolarsDataType
|
|
16
|
-
from polars.datatypes import (
|
|
17
|
-
Binary,
|
|
18
|
-
Boolean,
|
|
19
|
-
Date,
|
|
20
|
-
Datetime,
|
|
21
|
-
Decimal,
|
|
22
|
-
Duration,
|
|
23
|
-
Float32,
|
|
24
|
-
Float64,
|
|
25
|
-
Int8,
|
|
26
|
-
Int16,
|
|
27
|
-
Int32,
|
|
28
|
-
Int64,
|
|
29
|
-
List,
|
|
30
|
-
Null,
|
|
31
|
-
String,
|
|
32
|
-
Time,
|
|
33
|
-
UInt8,
|
|
34
|
-
UInt16,
|
|
35
|
-
UInt32,
|
|
36
|
-
UInt64,
|
|
37
|
-
)
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
@functools.lru_cache(8)
def integer_dtype_from_nbits(
    bits: int,
    *,
    unsigned: bool,
    default: PolarsDataType | None = None,
) -> PolarsDataType | None:
    """
    Look up the Polars integer dtype for a bit width and signedness.

    Parameters
    ----------
    bits
        Requested width in bits; 8, 16, 32 and 64 are recognised.
    unsigned
        Select the unsigned variant when true, the signed one otherwise.
    default
        Value returned when no dtype matches the ``(bits, unsigned)`` pair.

    Returns
    -------
    The matching dtype, or ``default`` (``None`` unless given) when the
    combination is not in the table.

    Examples
    --------
    >>> integer_dtype_from_nbits(8, unsigned=False)
    Int8
    >>> integer_dtype_from_nbits(32, unsigned=True)
    UInt32
    """
    widths = (8, 16, 32, 64)
    signed_types = (Int8, Int16, Int32, Int64)
    unsigned_types = (UInt8, UInt16, UInt32, UInt64)

    # Keys are (width, is_unsigned) pairs, exactly as callers supply them.
    table = {}
    for width, signed_type, unsigned_type in zip(widths, signed_types, unsigned_types):
        table[(width, False)] = signed_type
        table[(width, True)] = unsigned_type

    return table.get((bits, unsigned), default)
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
def timeunit_from_precision(precision: int | str | None) -> str | None:
    """
    Translate a fractional-second precision into a time unit string.

    A numeric precision is rounded up to the next multiple of three and
    clamped to the range 3..9, giving ``'ms'``, ``'us'`` or ``'ns'``.
    A string is accepted either as a digit string or as one of the unit
    names ``s``/``ms``/``us``/``ns`` (case-insensitive); ``s`` is reported
    as ``'ms'``. Falsy input, and input that cannot be divided, yields
    ``None``.

    Examples
    --------
    >>> timeunit_from_precision(3)
    'ms'
    >>> timeunit_from_precision(5)
    'us'
    >>> timeunit_from_precision(7)
    'ns'
    """
    import math

    if not precision:
        return None

    if isinstance(precision, str):
        if precision.isdigit():
            precision = int(precision)
        else:
            precision = precision.lower()
            if precision in ("s", "ms", "us", "ns"):
                return "ms" if precision == "s" else precision

    try:
        # A non-numeric string lands here and fails the division.
        rounded_up = int(math.ceil(precision / 3)) * 3  # type: ignore[operator]
    except TypeError:
        return None

    clamped = min(max(3, rounded_up), 9)
    return {3: "ms", 6: "us", 9: "ns"}.get(clamped)
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
def infer_dtype_from_database_typename(
    value: str,
    *,
    raise_unmatched: bool = True,
) -> PolarsDataType | None:
    """
    Attempt to infer Polars dtype from database cursor `type_code` string value.

    The name is upper-cased and any ``TYPE`` substring removed, an optional
    trailing ``(...)`` or ``[...]`` modifier is split off, and the remainder
    is matched against families of type names (array, float, integer,
    decimal, string, binary, boolean, null, temporal).

    Parameters
    ----------
    value
        Database type name, e.g. ``"VARCHAR(64)"`` or ``"Nullable(Int32)"``.
    raise_unmatched
        When true (default) a ``ValueError`` is raised if nothing matches;
        when false ``None`` is returned instead. The flag is also honoured
        for the type wrapped inside ``Nullable(...)``.

    Returns
    -------
    The inferred dtype, or ``None`` when nothing matched and
    ``raise_unmatched`` is false. ``None`` is also returned, without
    raising, for a datetime/timestamp name that mentions a timezone without
    ``WITHOUT``.

    Notes
    -----
    The integer rules treat ``INT2``/``INT4``/``INT8`` as byte counts
    (Int16/Int32/Int64). Other widths embedded in the name (e.g. ``INT16``)
    are not parsed and fall back to the 64-bit default.

    Examples
    --------
    >>> infer_dtype_from_database_typename("INT2")
    Int16
    >>> infer_dtype_from_database_typename("NVARCHAR")
    String
    >>> infer_dtype_from_database_typename("NUMERIC(10,2)")
    Decimal(precision=10, scale=2)
    >>> infer_dtype_from_database_typename("TIMESTAMP WITHOUT TZ")
    Datetime(time_unit='us', time_zone=None)
    """
    dtype: PolarsDataType | None = None

    # normalise string name/case (eg: 'IntegerType' -> 'INTEGER')
    original_value = value
    value = value.upper().replace("TYPE", "")

    # extract optional type modifier (eg: 'VARCHAR(64)' -> '64')
    if re.search(r"\([\w,: ]+\)$", value):
        modifier = value[value.find("(") + 1: -1]
        value = value.split("(")[0]
        # Nullable wrapper: the dtype is that of the wrapped type. Forward
        # raise_unmatched so a lenient caller is not surprised by an exception.
        if value == "NULLABLE":
            return infer_dtype_from_database_typename(
                modifier,
                raise_unmatched=raise_unmatched,
            )
    elif (
        not value.startswith(("<", ">")) and re.search(r"\[[\w,\]\[: ]+]$", value)
    ) or value.endswith(("[S]", "[MS]", "[US]", "[NS]")):
        modifier = value[value.find("[") + 1: -1]
        value = value.split("[")[0]
    else:
        modifier = ""

    # array dtypes
    array_aliases = ("ARRAY", "LIST", "[]")
    if value.endswith(array_aliases) or value.startswith(array_aliases):
        for a in array_aliases:
            value = value.replace(a, "", 1) if value else ""

        nested: PolarsDataType | None = None
        if not value and modifier:
            nested = infer_dtype_from_database_typename(
                value=modifier,
                raise_unmatched=False,
            )
        else:
            if inner_value := infer_dtype_from_database_typename(
                value[1:-1]
                if (value[0], value[-1]) == ("<", ">")
                else re.sub(r"\W", "", re.sub(r"\WOF\W", "", value)),
                raise_unmatched=False,
            ):
                nested = inner_value
            elif modifier:
                nested = infer_dtype_from_database_typename(
                    value=modifier,
                    raise_unmatched=False,
                )
        if nested:
            dtype = List(nested)

    # float dtypes
    elif value.startswith("FLOAT") or ("DOUBLE" in value) or (value == "REAL"):
        dtype = (
            Float32
            if value == "FLOAT4"
            or (value.endswith(("16", "32")) or (modifier in ("16", "32")))
            else Float64
        )

    # integer dtypes
    elif ("INTERVAL" not in value) and (
        value.startswith(("INT", "UINT", "UNSIGNED"))
        or value.endswith(("INT", "SERIAL"))
        or ("INTEGER" in value)
        or value == "ROWID"
    ):
        sz: Any
        if "LARGE" in value or value.startswith("BIG") or value == "INT8":
            sz = 64
        elif "MEDIUM" in value or value in ("INT4", "SERIAL"):
            sz = 32
        elif "SMALL" in value or value == "INT2":
            sz = 16
        elif "TINY" in value:
            sz = 8
        else:
            sz = None

        # fall back to an explicit modifier (eg: 'INT(32)') when the name gave no size
        sz = modifier if (not sz and modifier) else sz
        if not isinstance(sz, int):
            sz = int(sz) if isinstance(sz, str) and sz.isdigit() else None
        if (
            ("U" in value and "MEDIUM" not in value)
            or ("UNSIGNED" in value)
            or value == "ROWID"
        ):
            dtype = integer_dtype_from_nbits(sz, unsigned=True, default=UInt64)
        else:
            dtype = integer_dtype_from_nbits(sz, unsigned=False, default=Int64)

    # number types (note: 'number' alone is not that helpful and requires refinement)
    elif "NUMBER" in value and "CARDINAL" in value:
        dtype = UInt64

    # decimal dtypes
    elif (is_dec := ("DECIMAL" in value)) or ("NUMERIC" in value):
        if "," in modifier:
            prec, scale = modifier.split(",")
            dtype = Decimal(int(prec), int(scale))
        else:
            dtype = Decimal if is_dec else Float64

    # string dtypes
    elif (
        any(tp in value for tp in ("VARCHAR", "STRING", "TEXT", "UNICODE"))
        or value.startswith(("STR", "CHAR", "BPCHAR", "NCHAR", "UTF"))
        or value.endswith(("_UTF8", "_UTF16", "_UTF32"))
    ):
        dtype = String

    # binary dtypes
    elif value in ("BYTEA", "BYTES", "BLOB", "CLOB", "BINARY"):
        dtype = Binary

    # boolean dtypes
    elif value.startswith("BOOL"):
        dtype = Boolean

    # null dtype; odd, but valid
    elif value == "NULL":
        dtype = Null

    # temporal dtypes
    elif value.startswith(("DATETIME", "TIMESTAMP")) and not (value.endswith("[D]")):
        if any((tz in value.replace(" ", "")) for tz in ("TZ", "TIMEZONE")):
            if "WITHOUT" not in value:
                return None  # there's a timezone, but we don't know what it is
        unit = timeunit_from_precision(modifier) if modifier else "us"
        dtype = Datetime(time_unit=(unit or "us"))  # type: ignore[arg-type]
    else:
        # strip digits so that eg: 'TIME6' is compared as 'TIME'
        value = re.sub(r"\d", "", value)
        if value in ("INTERVAL", "TIMEDELTA", "DURATION"):
            dtype = Duration
        elif value == "DATE":
            dtype = Date
        elif value == "TIME":
            dtype = Time

    if not dtype and raise_unmatched:
        msg = f"cannot infer dtype from {original_value!r} string value"
        raise ValueError(msg)

    return dtype
|
|
263
|
-
|
|
264
|
-
def _build_clickhouse_to_arrow_table():
    """Assemble the static ClickHouse type-name -> Arrow type lookup table."""
    integers = {
        'Int8': pa.int8(),
        'Int16': pa.int16(),
        'Int32': pa.int32(),
        'Int64': pa.int64(),
        'UInt8': pa.uint8(),
        'UInt16': pa.uint16(),
        'UInt32': pa.uint32(),
        'UInt64': pa.uint64(),
    }
    floats = {
        'Float32': pa.float32(),
        'Float64': pa.float64(),
    }
    # Scalar types that are additionally registered pre-wrapped as
    # 'Array(<name>)' and 'Nullable(<name>)'.
    wrappable = {
        **integers,
        **floats,
        'String': pa.string(),
        'Date': pa.date32(),
        'DateTime': pa.timestamp('s'),
    }

    table = {}
    table.update(integers)
    table.update(floats)

    # Strings: Arrow has no separate fixed-length string type here.
    table['String'] = pa.string()
    table['FixedString'] = pa.string()

    # Dates and times: DateTime has second resolution; DateTime64 defaults
    # to milliseconds (adjust if a different precision is required).
    table['Date'] = pa.date32()
    table['Date32'] = pa.date32()
    table['DateTime'] = pa.timestamp('s')
    table['DateTime64'] = pa.timestamp('ms')
    table['UUID'] = pa.binary(16)  # a UUID is 16 bytes of binary data

    table['Boolean'] = pa.bool_()

    # Arrays of the wrappable scalars.
    for type_name, arrow_type in wrappable.items():
        table[f'Array({type_name})'] = pa.list_(arrow_type)

    # Arrow has no tuple type; a bare 'Tuple' is an empty struct placeholder
    # because the field types must be defined dynamically.
    table['Tuple'] = pa.struct([])

    # Enums are represented as strings.
    table['Enum8'] = pa.string()
    table['Enum16'] = pa.string()

    # A bare 'Map' defaults to string keys and string values.
    table['Map'] = pa.map_(pa.string(), pa.string())

    # Nullable wrappers map to the Arrow type of the wrapped scalar.
    for type_name, arrow_type in wrappable.items():
        table[f'Nullable({type_name})'] = arrow_type
    table['Nullable(UUID)'] = pa.binary(16)

    return table


CLICKHOUSE_TO_ARROW_TYPE = _build_clickhouse_to_arrow_table()
|
|
334
|
-
|
|
335
|
-
def map_clickhouse_decimal(ch_type: str) -> pa.DataType:
    """
    Map a ClickHouse Decimal type name to the corresponding Arrow decimal type.

    Recognised spellings:

    * ``Decimal(P, S)``  -> precision ``P``, scale ``S``
    * ``Decimal(P)``     -> precision ``P``, scale 0
    * ``Decimal32(S)`` / ``Decimal64(S)`` / ``Decimal128(S)`` / ``Decimal256(S)``
      -> the single argument is the *scale*; the precision is fixed by the
      storage width (9 / 18 / 38 / 76 digits).

    A sized name given with two arguments (e.g. ``Decimal64(12, 3)``) is
    read as ``(precision, scale)``.

    :param ch_type: ClickHouse Decimal type description, e.g. 'Decimal(10, 2)' or 'Decimal128(4)'
    :return: ``pa.decimal128`` when the precision fits in 38 digits, otherwise ``pa.decimal256``
    :raises ValueError: if ``ch_type`` is not a recognised Decimal spelling
    """
    # Maximum precision implied by each sized alias.
    sized_precision = {"32": 9, "64": 18, "128": 38, "256": 76}

    parsed = re.match(
        r"Decimal(32|64|128|256)?\(\s*(\d+)\s*(?:,\s*(\d+)\s*)?\)",
        ch_type,
    )
    if parsed is None:
        raise ValueError(f"Unsupported ClickHouse Decimal type: {ch_type}")

    width, first, second = parsed.groups()
    if second is not None:
        precision, scale = int(first), int(second)
    elif width is None:
        # Decimal(P): scale defaults to 0
        precision, scale = int(first), 0
    else:
        # DecimalNN(S): the argument is the scale, the width fixes the precision
        precision, scale = sized_precision[width], int(first)

    # decimal128 holds at most 38 digits; wider values need decimal256
    if precision <= 38:
        return pa.decimal128(precision, scale)
    return pa.decimal256(precision, scale)
|
|
355
|
-
|
|
356
|
-
def _split_top_level_args(args: str) -> list[str]:
    """Split a type-argument string on commas that are not inside parentheses or single quotes."""
    parts = []
    depth = 0
    in_quote = False
    start = 0
    for pos, char in enumerate(args):
        if char == "'":
            # NOTE: backslash-escaped quotes inside enum labels are not handled
            in_quote = not in_quote
        elif in_quote:
            continue
        elif char == "(":
            depth += 1
        elif char == ")":
            depth -= 1
        elif char == "," and depth == 0:
            parts.append(args[start:pos].strip())
            start = pos + 1
    parts.append(args[start:].strip())
    return parts


def _clickhouse_datetime_to_arrow(ch_type: str) -> pa.DataType:
    """Map a parameterised ``DateTime(...)`` / ``DateTime64(...)`` name to an Arrow timestamp type."""
    base_name, _, arg_text = ch_type[:-1].partition("(")
    args = _split_top_level_args(arg_text)

    unit = 's'
    if base_name == 'DateTime64':
        precision_text = args.pop(0)
        if not precision_text.isdigit():
            raise ValueError(f"Unsupported ClickHouse type: {ch_type}")
        precision = int(precision_text)
        # pick the coarsest Arrow unit that keeps every fractional digit
        if precision == 0:
            unit = 's'
        elif precision <= 3:
            unit = 'ms'
        elif precision <= 6:
            unit = 'us'
        else:
            unit = 'ns'

    # Optional timezone argument, quoted in the type name (e.g. 'UTC').
    # NOTE(review): assumes the driver returns tz-aware values for such
    # columns - confirm against the driver in use.
    tz = args[0].strip("'\"") if args and args[0] else None
    return pa.timestamp(unit, tz=tz)


def map_clickhouse_to_arrow(ch_type: str) -> pa.DataType:
    """
    Map a ClickHouse type name to an Arrow data type.

    Resolution order:

    1. exact match in ``CLICKHOUSE_TO_ARROW_TYPE``;
    2. ``Decimal...`` names via ``map_clickhouse_decimal``;
    3. the wrappers ``Array(T)``, ``Nullable(T)`` and ``LowCardinality(T)``,
       resolved recursively (the latter two map to the type of ``T``);
    4. ``Tuple(T1, T2, ...)`` -> struct with fields ``field0``, ``field1``, ...;
       ``Map(K, V)`` -> Arrow map. Arguments are split on top-level commas
       only, so nested types such as ``Map(String, Decimal(10, 2))`` work;
    5. ``DateTime('tz')`` / ``DateTime64(p[, 'tz'])`` -> timestamp with a unit
       chosen from the precision;
    6. any other parameterised name whose base name is in the table
       (e.g. ``FixedString(16)``, ``Enum8('a' = 1)``) uses the table entry.

    Named tuple elements (``Tuple(a Int32, ...)``) are not supported.

    :param ch_type: ClickHouse type name as reported by the server
    :return: the corresponding Arrow data type
    :raises ValueError: if the type is not supported
    """
    # Plain types map directly
    if ch_type in CLICKHOUSE_TO_ARROW_TYPE:
        return CLICKHOUSE_TO_ARROW_TYPE[ch_type]

    # Decimal types
    if ch_type.startswith("Decimal"):
        return map_clickhouse_decimal(ch_type)

    # Array: list of the element type
    if ch_type.startswith('Array('):
        inner_type = ch_type[len('Array('):-1].strip()
        return pa.list_(map_clickhouse_to_arrow(inner_type))

    # Nullable / LowCardinality: transparent wrappers around the inner type
    for wrapper in ('Nullable(', 'LowCardinality('):
        if ch_type.startswith(wrapper):
            return map_clickhouse_to_arrow(ch_type[len(wrapper):-1].strip())

    # Tuple: struct with positional field names
    if ch_type.startswith('Tuple('):
        inner_types = _split_top_level_args(ch_type[len('Tuple('):-1])
        return pa.struct(
            [('field' + str(i), map_clickhouse_to_arrow(t)) for i, t in enumerate(inner_types)]
        )

    # Map: exactly one key type and one value type
    if ch_type.startswith('Map('):
        key_value = _split_top_level_args(ch_type[len('Map('):-1])
        if len(key_value) != 2:
            raise ValueError(f"Unsupported ClickHouse type: {ch_type}")
        key_type, value_type = key_value
        return pa.map_(map_clickhouse_to_arrow(key_type), map_clickhouse_to_arrow(value_type))

    # Parameterised date-times
    if ch_type.startswith(('DateTime(', 'DateTime64(')):
        return _clickhouse_datetime_to_arrow(ch_type)

    # Other parameterised names: fall back to the entry for the base name
    base_name, paren, _ = ch_type.partition('(')
    if paren and base_name in CLICKHOUSE_TO_ARROW_TYPE:
        return CLICKHOUSE_TO_ARROW_TYPE[base_name]

    raise ValueError(f"Unsupported ClickHouse type: {ch_type}")
|
|
389
|
-
|
ycat/parse.py
DELETED
|
@@ -1,66 +0,0 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
"""
|
|
3
|
-
---------------------------------------------
|
|
4
|
-
Created on 2024/11/6 下午7:25
|
|
5
|
-
@author: ZhangYundi
|
|
6
|
-
@email: yundi.xxii@outlook.com
|
|
7
|
-
---------------------------------------------
|
|
8
|
-
"""
|
|
9
|
-
import sqlparse
|
|
10
|
-
import re
|
|
11
|
-
|
|
12
|
-
def format_sql(sql_content):
    """Return ``sql_content`` re-indented by sqlparse with comments stripped; input and output are strings."""
    return sqlparse.format(sql_content, strip_comments=True, reindent=True)
|
|
16
|
-
|
|
17
|
-
def extract_temp_tables(with_clause):
    """
    Extract the temporary (CTE) table names declared in a WITH clause.

    A name is any word followed by whitespace, the keyword ``AS``
    (case-insensitive) and an opening parenthesis, e.g. ``t1 AS (``.
    Whitespace before ``AS`` is required, so that a call such as
    ``has(...)`` is not mistaken for a CTE named ``h``.

    Returns the names as a list, in order of appearance.
    """
    return re.findall(r'\b(\w+)\s+as\s*\(', with_clause, re.IGNORECASE)
|
|
21
|
-
|
|
22
|
-
def extract_table_names_from_sql(sql_query):
    """
    Extract the table names referenced by a SQL string.

    Each statement found by sqlparse is scanned for the token following
    ``FROM`` or ``JOIN``. Only the last dot-separated part of a qualified
    name is kept, and quote characters and semicolons are removed. Names
    declared as temporary tables in a ``WITH`` clause are excluded.

    Returns a sorted list of unique table names (always a list, including
    when no WITH clause is present).
    """
    # Token following FROM / JOIN, stopping at whitespace, parentheses or a comma
    table_name_pattern = r'\bFROM\s+([^\s\(\)\,]+)|\bJOIN\s+([^\s\(\)\,]+)'

    table_names = set()
    # Temporary table names from WITH clauses, accumulated over all statements
    temp_table_names = set()

    for statement in sqlparse.parse(sql_query):
        statement_str = str(statement)

        # Blank out substring(...) / extract(...) calls, whose inner FROM
        # keyword (e.g. EXTRACT(YEAR FROM d)) is not a table reference
        statement_str = re.sub(
            r'(substring|extract)\s*\([\s\S]*?\)',
            '',
            statement_str,
            flags=re.IGNORECASE,
        )

        for match in re.findall(table_name_pattern, statement_str, re.IGNORECASE):
            # One of the two groups is empty depending on which alternative matched
            for name in match:
                if not name:
                    continue
                # For qualified names keep only the last part as the table name
                table_name = name.split('.')[-1]
                # Remove quoting characters and statement terminators
                table_name = re.sub(r'("|`|\'|;)', '', table_name)
                if table_name:
                    table_names.add(table_name)

        # WITH clauses introduce temporary names that are not real tables
        if re.search(r'\bwith\b', statement_str, re.IGNORECASE):
            temp_table_names.update(extract_temp_tables(statement_str))

    return sorted(table_names - temp_table_names)
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
if __name__ == '__main__':
    # Ad-hoc demo: print the table names found in a sample query.
    demo_query = "select * from c.a/b/c/d"
    print(extract_table_names_from_sql(demo_query))
|
|
66
|
-
|
ycat/yck.py
DELETED
|
@@ -1,87 +0,0 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
"""
|
|
3
|
-
---------------------------------------------
|
|
4
|
-
Created on 2024/11/4 上午9:01
|
|
5
|
-
@author: ZhangYundi
|
|
6
|
-
@email: yundi.xxii@outlook.com
|
|
7
|
-
---------------------------------------------
|
|
8
|
-
"""
|
|
9
|
-
from random import randint
|
|
10
|
-
|
|
11
|
-
import pandas as pd
|
|
12
|
-
import polars
|
|
13
|
-
import polars as pl
|
|
14
|
-
import pyarrow as pa
|
|
15
|
-
from clickhouse_driver import Client
|
|
16
|
-
|
|
17
|
-
from . import dtype
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
def connect(urls: list[str], user: str, password: str) -> Client:
    """
    Connect to a ClickHouse server; a cluster may be given as several nodes.

    One entry of ``urls`` is chosen at random as the primary host. The full
    list is also passed to the driver as ``alt_hosts`` together with
    ``round_robin=True``.

    Parameters
    ----------
    urls: List[str]
        ["host1:port1", "host2:port2", "host3:port3"...]
    user: str
        User name
    password: str
        Password

    Returns
    -------
    client: Client
        A ``clickhouse_driver.Client`` instance configured for the given nodes

    Raises
    ------
    ValueError
        If ``urls`` is empty or the chosen entry is not of the form ``host:port``
    """
    if not urls:
        raise ValueError("urls must contain at least one 'host:port' entry")

    url_ini = urls[randint(0, len(urls) - 1)]
    host, sep, port = url_ini.partition(":")
    # exactly one colon, with a non-empty host and port on either side
    if not (sep and host and port) or ":" in port:
        raise ValueError(f"invalid ClickHouse url {url_ini!r}; expected 'host:port'")

    return Client(host, port=port, round_robin=True, alt_hosts=",".join(urls), user=user, password=password)
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
def query_pandas(sql, conn) -> pd.DataFrame:
    """
    Run a query against ClickHouse and return the result as a pandas.DataFrame.

    Parameters
    ----------
    sql: str
        Query text
    conn: Client
        ClickHouse connection; its ``query_dataframe`` method is used
        (presumably a ``clickhouse_driver.Client`` - confirm against callers)

    Returns
    -------
    pandas.DataFrame
        Whatever ``conn.query_dataframe(sql)`` returns
    """
    frame = conn.query_dataframe(sql)
    return frame
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def query_polars(sql, conn) -> pl.DataFrame:
    """
    Run a query against ClickHouse and return the result as a polars.DataFrame.

    The query is executed in columnar mode with column types. Each column is
    converted to an Arrow array using the Arrow type mapped from its
    ClickHouse type, and the resulting Arrow table is handed to Polars.

    For an empty result the schema is built from the same Arrow mapping, so
    that empty and non-empty results of one query share column dtypes. If a
    column type is not supported by that mapping, the generic type-name
    inference is used for the empty frame instead.

    Parameters
    ----------
    sql: str
        Query text
    conn: Client
        ClickHouse connection; its ``execute`` method is called with
        ``columnar=True, with_column_types=True``

    Returns
    -------
    polars.DataFrame
        The query result; an empty frame carrying the column schema when the
        query returned no data
    """
    data, column_types = conn.execute(sql, columnar=True, with_column_types=True)

    if len(data) < 1:
        try:
            empty_schema = pa.schema(
                [(name, dtype.map_clickhouse_to_arrow(type_)) for name, type_ in column_types]
            )
        except ValueError:
            # Arrow mapping does not cover this type: fall back to name-based inference
            polars_schema = {
                name: dtype.infer_dtype_from_database_typename(type_)
                for name, type_ in column_types
            }
            return pl.DataFrame(schema=polars_schema)
        return pl.from_arrow(empty_schema.empty_table())

    arrow_types = {name: dtype.map_clickhouse_to_arrow(type_) for name, type_ in column_types}

    # Build the Arrow table column by column with explicit types
    arrow_table = pa.Table.from_arrays(
        [pa.array(col, type=col_type) for col, col_type in zip(data, arrow_types.values())],
        schema=pa.schema(arrow_types))

    # Construct the Polars DataFrame from the Arrow table
    return pl.from_arrow(arrow_table)
|