databricks-ddbxutils 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: databricks-ddbxutils
- Version: 0.2.0
+ Version: 0.4.0
  Summary: extends databricks dbutils
  Author: Haneul Kim
  Author-email: haneul.kim@data-dynamics.io
@@ -8,10 +8,14 @@ Requires-Python: >=3.11,<4.0
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
- Requires-Dist: databricks-sdk (>=0.57.0,<0.58.0)
+ Requires-Dist: cloudpickle (>=3.1.1,<4.0.0)
+ Requires-Dist: databricks-sdk (>=0.64.0,<0.65.0)
  Requires-Dist: dotenv (>=0.9.9,<0.10.0)
  Requires-Dist: jinja2 (>=3.1.6,<4.0.0)
+ Requires-Dist: pyarrow (==20.0.0)
+ Requires-Dist: pyspark (>=4.0.0,<5.0.0)
  Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
+ Requires-Dist: pytz (>=2025.2,<2026.0)
  Description-Content-Type: text/markdown

  # databricks-ddbxutils
@@ -1,9 +1,11 @@
  ddbxutils/__init__.py,sha256=ElK7lAZwcQh50qAVQ16t9Cfk0caZAYLJtKkDJDkLljk,158
+ ddbxutils/datasources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ ddbxutils/datasources/pyfunc.py,sha256=AAPCdR9Pw7NDIbAAZi8yOB2T9usLTD2u9lpwqJMv2pI,4171
  ddbxutils/functions.py,sha256=eb1cM5tpJKB1MuzJ8ncTpGxTjZA70kDj_gJFQC6y7zg,1567
  ddbxutils/main.py,sha256=9pjd5VXAQOj-_lZsTojv-A4LUiZ-SLuK5oMgNwO4Fzc,488
  ddbxutils/widgets/__init__.py,sha256=aeFsmeixYUPaFFw4DqgvuQreBlkNwmBM4v2bCZUB2zU,1364
  ddbxutils/widgets/core.py,sha256=uBhbq5KKSyFT5CLRPsTw4zzeWYxGuJvImEQgh_iiEqk,989
- databricks_ddbxutils-0.2.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- databricks_ddbxutils-0.2.0.dist-info/METADATA,sha256=QQsJYSPRCg-fUKeCF05fCFLuozonYeTvX0KYyc0-XwM,2724
- databricks_ddbxutils-0.2.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- databricks_ddbxutils-0.2.0.dist-info/RECORD,,
+ databricks_ddbxutils-0.4.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ databricks_ddbxutils-0.4.0.dist-info/METADATA,sha256=7en42X_IwuMIDocSmWyIQfB_McsNIxFqk2oS6UpMzk4,2881
+ databricks_ddbxutils-0.4.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ databricks_ddbxutils-0.4.0.dist-info/RECORD,,
@@ -0,0 +1,122 @@
+ import base64
+ from dataclasses import dataclass
+
+ import cloudpickle
+ from pyspark.sql.datasource import DataSource, DataSourceReader, InputPartition
+ from pyspark.sql.types import StructType, StructField, IntegerType
+
+
+ @dataclass
+ class PythonFunctionPartition(InputPartition):
+     """
+     Partition definition: the start and end of each partition's range.
+     """
+     start: int
+     end: int
+
+
+ class PythonFunctionReader(DataSourceReader):
+     """
+     DataSourceReader implementation.
+     """
+
+     def __init__(self, schema: StructType, options: dict, serialized_func_b64):
+         self.schema = schema
+         self.options = options
+         self.func = cloudpickle.loads(base64.b64decode(serialized_func_b64))
+
+     def partitions(self):
+         lower = int(self.options.get('lowerLimit', '0'))
+         upper = int(self.options.get('upperLimit', '0'))
+         num_parts = int(self.options.get('numPartitions', '1'))
+         step = (upper - lower) // num_parts if num_parts > 0 else (upper - lower)
+         # print(f'step={step}')
+         parts = []
+         start = lower
+         for i in range(num_parts):
+             end = upper if i == num_parts - 1 else start + step
+             parts.append(PythonFunctionPartition(start, end))
+             start = end
+         return parts
+
+     def read(self, partition: PythonFunctionPartition):
+         for x in range(partition.start, partition.end):
+             # yield (self.func(x),)
+             yield self.func(x)
+
+
+ class PythonFunctionDataSource(DataSource):
+     """
+     DataSource implementation.
+
+     .. versionadded:: 0.3.0
+
+     Notes
+     -----
+     The user-defined function must return one of tuple, list, `pyspark.sql.types.Row`, or `pyarrow.RecordBatch`.
+
+     Examples
+     --------
+     >>> spark = ...
+
+     Use the default input partition implementation:
+
+     >>> def partitions(self):
+     ...     return [PythonFunctionPartition(1, 3)]
+
+     Subclass the input partition class:
+
+     >>> def partitions(self):
+     ...     return [PythonFunctionPartition(1, 3), PythonFunctionPartition(4, 6)]
+
+     Example in the PySpark shell, using a function that returns a `pyspark.sql.Row`:
+
+     >>> import base64
+     >>> import cloudpickle
+     >>> from ddbxutils.datasources.pyfunc import PythonFunctionDataSource
+     >>> spark.dataSource.register(PythonFunctionDataSource)
+     >>> # ...
+     >>> from pyspark.sql import Row
+     >>> def user_function_row(x) -> Row:
+     ...     from datetime import datetime
+     ...     from pytz import timezone
+     ...     return Row(str(x), str(x * x), datetime.now(timezone('Asia/Seoul')).strftime("%Y-%m-%d %H:%M:%S"))
+     ...
+     >>> df = (spark.read.format("pyfunc").
+     ... schema('value1 string, value2 string, ts string').
+     ... option("lowerLimit", "0").
+     ... option("upperLimit", "10").
+     ... option("numPartitions", "100").
+     ... option("func", base64.b64encode(cloudpickle.dumps(user_function_row)).decode('utf-8')).
+     ... load())
+     >>> df.show()
+
+     Example in the PySpark shell, using a function that returns an array (list):
+
+     >>> def user_function_row(x):
+     ...     from datetime import datetime
+     ...     from pytz import timezone
+     ...     return [str(x), str(x * x), datetime.now(timezone('Asia/Seoul')).strftime("%Y-%m-%d %H:%M:%S")]
+     ...
+     >>> df = (spark.read.format("pyfunc").
+     ... schema('value1 string, value2 string, ts string').
+     ... option("lowerLimit", "0").
+     ... option("upperLimit", "10").
+     ... option("numPartitions", "100").
+     ... option("func", base64.b64encode(cloudpickle.dumps(user_function_row)).decode('utf-8')).
+     ... load())
+     >>> df.show()
+     """
+
+     @classmethod
+     def name(cls):
+         return 'pyfunc'
+
+     def schema(self):
+         return StructType([StructField('value', IntegerType(), nullable=False)])
+
+     def reader(self, schema: StructType):
+         # options are strings; convert them as needed
+         # func = self.options.get('func', None)
+         func = self.options['func']
+         return PythonFunctionReader(self.schema(), self.options, func)
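
The docstring above demonstrates functions that return `pyspark.sql.Row` and list values; its Notes section also allows plain tuples. The following is a minimal sketch of that tuple variant, assuming a PySpark 4 session where `spark` is already defined (for example a Databricks notebook or the pyspark shell) and databricks-ddbxutils 0.4.0 is installed. The function name `square_tuple`, the schema, and the option values are illustrative only, not part of the package.

import base64

import cloudpickle

from ddbxutils.datasources.pyfunc import PythonFunctionDataSource

# Register the custom data source under its short name "pyfunc".
spark.dataSource.register(PythonFunctionDataSource)


def square_tuple(x):
    # Per the Notes section, a tuple is also an accepted return type;
    # one element per column of the schema passed to the reader.
    return (x, x * x)


df = (
    spark.read.format("pyfunc")
    .schema("value int, squared int")
    .option("lowerLimit", "0")
    .option("upperLimit", "10")
    .option("numPartitions", "2")
    # The function reaches the executors as a base64-encoded cloudpickle blob.
    .option("func", base64.b64encode(cloudpickle.dumps(square_tuple)).decode("utf-8"))
    .load()
)
df.show()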