databricks-ddbxutils 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {databricks_ddbxutils-0.2.0.dist-info → databricks_ddbxutils-0.4.0.dist-info}/METADATA +6 -2
- {databricks_ddbxutils-0.2.0.dist-info → databricks_ddbxutils-0.4.0.dist-info}/RECORD +6 -4
- ddbxutils/datasources/__init__.py +0 -0
- ddbxutils/datasources/pyfunc.py +122 -0
- {databricks_ddbxutils-0.2.0.dist-info → databricks_ddbxutils-0.4.0.dist-info}/LICENSE +0 -0
- {databricks_ddbxutils-0.2.0.dist-info → databricks_ddbxutils-0.4.0.dist-info}/WHEEL +0 -0

{databricks_ddbxutils-0.2.0.dist-info → databricks_ddbxutils-0.4.0.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: databricks-ddbxutils
-Version: 0.2.0
+Version: 0.4.0
 Summary: extends databricks dbutils
 Author: Haneul Kim
 Author-email: haneul.kim@data-dynamics.io
@@ -8,10 +8,14 @@ Requires-Python: >=3.11,<4.0
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-Requires-Dist:
+Requires-Dist: cloudpickle (>=3.1.1,<4.0.0)
+Requires-Dist: databricks-sdk (>=0.64.0,<0.65.0)
 Requires-Dist: dotenv (>=0.9.9,<0.10.0)
 Requires-Dist: jinja2 (>=3.1.6,<4.0.0)
+Requires-Dist: pyarrow (==20.0.0)
+Requires-Dist: pyspark (>=4.0.0,<5.0.0)
 Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
+Requires-Dist: pytz (>=2025.2,<2026.0)
 Description-Content-Type: text/markdown
 
 # databricks-ddbxutils

{databricks_ddbxutils-0.2.0.dist-info → databricks_ddbxutils-0.4.0.dist-info}/RECORD

@@ -1,9 +1,11 @@
 ddbxutils/__init__.py,sha256=ElK7lAZwcQh50qAVQ16t9Cfk0caZAYLJtKkDJDkLljk,158
+ddbxutils/datasources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ddbxutils/datasources/pyfunc.py,sha256=AAPCdR9Pw7NDIbAAZi8yOB2T9usLTD2u9lpwqJMv2pI,4171
 ddbxutils/functions.py,sha256=eb1cM5tpJKB1MuzJ8ncTpGxTjZA70kDj_gJFQC6y7zg,1567
 ddbxutils/main.py,sha256=9pjd5VXAQOj-_lZsTojv-A4LUiZ-SLuK5oMgNwO4Fzc,488
 ddbxutils/widgets/__init__.py,sha256=aeFsmeixYUPaFFw4DqgvuQreBlkNwmBM4v2bCZUB2zU,1364
 ddbxutils/widgets/core.py,sha256=uBhbq5KKSyFT5CLRPsTw4zzeWYxGuJvImEQgh_iiEqk,989
-databricks_ddbxutils-0.
-databricks_ddbxutils-0.
-databricks_ddbxutils-0.
-databricks_ddbxutils-0.
+databricks_ddbxutils-0.4.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+databricks_ddbxutils-0.4.0.dist-info/METADATA,sha256=7en42X_IwuMIDocSmWyIQfB_McsNIxFqk2oS6UpMzk4,2881
+databricks_ddbxutils-0.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+databricks_ddbxutils-0.4.0.dist-info/RECORD,,

ddbxutils/datasources/__init__.py
File without changes

ddbxutils/datasources/pyfunc.py

@@ -0,0 +1,122 @@
+import base64
+from dataclasses import dataclass
+
+import cloudpickle
+from pyspark.sql.datasource import DataSource, DataSourceReader, InputPartition
+from pyspark.sql.types import StructType, StructField, IntegerType
+
+
+@dataclass
+class PythonFunctionPartition(InputPartition):
+    """
+    Partition definition: the start and end of each partition's range.
+    """
+    start: int
+    end: int
+
+
+class PythonFunctionReader(DataSourceReader):
+    """
+    DataSourceReader implementation.
+    """
+
+    def __init__(self, schema: StructType, options: dict, serialized_func_b64):
+        self.schema = schema
+        self.options = options
+        self.func = cloudpickle.loads(base64.b64decode(serialized_func_b64))
+
+    def partitions(self):
+        lower = int(self.options.get('lowerLimit', '0'))
+        upper = int(self.options.get('upperLimit', '0'))
+        num_parts = int(self.options.get('numPartitions', '1'))
+        step = (upper - lower) // num_parts if num_parts > 0 else (upper - lower)
+        # print(f'step={step}')
+        parts = []
+        start = lower
+        for i in range(num_parts):
+            end = upper if i == num_parts - 1 else start + step
+            parts.append(PythonFunctionPartition(start, end))
+            start = end
+        return parts
+
+    def read(self, partition: PythonFunctionPartition):
+        for x in range(partition.start, partition.end):
+            # yield (self.func(x),)
+            yield self.func(x)
+
+
+class PythonFunctionDataSource(DataSource):
+    """
+    DataSource implementation.
+
+    .. versionadded: 0.3.0
+
+    Notes
+    -----
+    The user-defined function must return one of the following types: tuple, list, `pyspark.sql.types.Row`, or `pyarrow.RecordBatch`.
+
+    Examples
+    --------
+    >>> spark = ...
+
+    Use the default input partition implementation:
+
+    >>> def partitions(self):
+    ...     return [PythonFunctionPartition(1, 3)]
+
+    Subclass the input partition class:
+
+    >>> def partitions(self):
+    ...     return [PythonFunctionPartition(1, 3), PythonFunctionPartition(4, 6)]
+
+    Example of using a function that returns a `pyspark.sql.Row` in the PySpark shell:
+
+    >>> import base64
+    >>> import cloudpickle
+    >>> from ddbxutils.datasources.pyfunc import PythonFunctionDataSource
+    >>> spark.dataSource.register(PythonFunctionDataSource)
+    >>> # ...
+    >>> from pyspark.sql import Row
+    >>> def user_function_row(x) -> Row:
+    ...     from datetime import datetime
+    ...     from pytz import timezone
+    ...     return Row(str(x), str(x * x), datetime.now(timezone('Asia/Seoul')).strftime("%Y-%m-%d %H:%M:%S"))
+    ...
+    >>> df = (spark.read.format("pyfunc").
+    ...     schema('value1 string, value2 string, ts string').
+    ...     option("lowerLimit", "0").
+    ...     option("upperLimit", "10").
+    ...     option("numPartitions", "100").
+    ...     option("func", base64.b64encode(cloudpickle.dumps(user_function_row)).decode('utf-8')).
+    ...     load())
+    >>> df.show()
+
+    Example of using a function that returns a list in the PySpark shell:
+
+    >>> def user_function_row(x):
+    ...     from datetime import datetime
+    ...     from pytz import timezone
+    ...     return [str(x), str(x * x), datetime.now(timezone('Asia/Seoul')).strftime("%Y-%m-%d %H:%M:%S")]
+    ...
+    >>> df = (spark.read.format("pyfunc").
+    ...     schema('value1 string, value2 string, ts string').
+    ...     option("lowerLimit", "0").
+    ...     option("upperLimit", "10").
+    ...     option("numPartitions", "100").
+    ...     option("func", base64.b64encode(cloudpickle.dumps(user_function_row)).decode('utf-8')).
+    ...     load())
+    >>> df.show()
+    """
+
+    @classmethod
+    def name(cls):
+        return 'pyfunc'
+
+    def schema(self):
+        return StructType([StructField('value', IntegerType(), nullable=False)])
+
+    def reader(self, schema: StructType):
+        # options are strings; convert them as needed
+        # func = self.options.get('func', None)
+        func = self.options['func']
+        return PythonFunctionReader(self.schema(), self.options, func)

{databricks_ddbxutils-0.2.0.dist-info → databricks_ddbxutils-0.4.0.dist-info}/LICENSE
File without changes

{databricks_ddbxutils-0.2.0.dist-info → databricks_ddbxutils-0.4.0.dist-info}/WHEEL
File without changes
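
For orientation, here is a minimal usage sketch of the new `pyfunc` data source added in this release, adapted from the docstring in `ddbxutils/datasources/pyfunc.py`. It assumes PySpark >= 4.0 (the Python data source API) and that the 0.4.0 wheel is installed; the `simple_row` function, the schema string, and the option values are illustrative placeholders, not part of the package.

```python
# Hypothetical usage sketch for the "pyfunc" data source introduced in 0.3.0/0.4.0.
# Assumes PySpark >= 4.0 and that databricks-ddbxutils 0.4.0 is installed;
# `simple_row`, the schema string, and the option values are illustrative only.
import base64

import cloudpickle
from pyspark.sql import Row, SparkSession

from ddbxutils.datasources.pyfunc import PythonFunctionDataSource

spark = SparkSession.builder.getOrCreate()

# Register the data source once per session under its short name "pyfunc".
spark.dataSource.register(PythonFunctionDataSource)


def simple_row(x) -> Row:
    # Runs on the executors; per the module docstring it must return a
    # tuple, list, pyspark.sql.types.Row, or pyarrow.RecordBatch.
    return Row(str(x), str(x * x))


df = (spark.read.format("pyfunc")
      .schema("value string, square string")
      .option("lowerLimit", "0")       # start of the input range (inclusive)
      .option("upperLimit", "10")      # end of the input range (exclusive)
      .option("numPartitions", "2")    # the range is split into this many partitions
      # The function travels to the executors as a base64-encoded cloudpickle payload.
      .option("func", base64.b64encode(cloudpickle.dumps(simple_row)).decode("utf-8"))
      .load())

df.show()  # expect ten rows: x and x*x as strings for x in 0..9
```

Because the function is shipped as a base64-encoded cloudpickle payload in a read option, anything it depends on has to resolve on the executors, which is presumably why the docstring examples import `datetime` and `pytz` inside the function body.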