databricks-ddbxutils 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
databricks_ddbxutils-0.3.0.dist-info/METADATA ADDED
@@ -0,0 +1,108 @@
+ Metadata-Version: 2.1
+ Name: databricks-ddbxutils
+ Version: 0.3.0
+ Summary: extends databricks dbutils
+ Author: Haneul Kim
+ Author-email: haneul.kim@data-dynamics.io
+ Requires-Python: >=3.11,<4.0
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Requires-Dist: cloudpickle (>=3.1.1,<4.0.0)
+ Requires-Dist: databricks-sdk (>=0.64.0,<0.65.0)
+ Requires-Dist: dotenv (>=0.9.9,<0.10.0)
+ Requires-Dist: jinja2 (>=3.1.6,<4.0.0)
+ Requires-Dist: pyarrow (==20.0.0)
+ Requires-Dist: pyspark (>=4.0.0,<5.0.0)
+ Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
+ Requires-Dist: pytz (>=2025.2,<2026.0)
+ Description-Content-Type: text/markdown
+
+ # databricks-ddbxutils
+
+ ddbxutils extends dbutils where it falls short.
+
+ ## Features
+
+ * [x] jinja2 templates applied to `dbutils.widgets`
+
+ ## Setup
+
+ ```shell
+ cd <PROJECT_ROOT>
+ pip install poetry
+ ```
+
+ ## Venv
+
+ ```shell
+ poetry shell
+ ```
+
+ ## Build
+
+ ```shell
+ poetry build
+ ```
+
+ ## Run
+
+ ### In Databricks without an init_script (Serverless)
+
+ * Add Wheel
+     * Create a Volume for wheel uploads, then upload the wheel
+         * `/Volumes/<CATALOG>/<DATABASE>/<VOLUME_NAME>/ddbxutils-<VERSION>-py3-none-any.whl`
+     * In the Environment panel on the right side of the notebook, set Environment version to 2, add the wheel file uploaded to the volume, then Apply
+ * Usage
+ ```python
+ # dbutils.widgets.text('rawdate', '2025-05-24', 'Raw Date')
+ # dbutils.widgets.text('next_day', '{{add_days(rawdate, "%Y-%m-%d", "", 1)}}', 'Next Day')
+ import ddbxutils
+ next_day = ddbxutils.widgets.get('next_day')
+ # next_day: 2025-05-25
+ ```
+
+ ### In Databricks with an init_script
+
+ * Add Wheel
+     * Create a Volume for wheel uploads, then upload the wheel
+         * `/Volumes/<CATALOG>/<DATABASE>/<VOLUME_NAME>/ddbxutils-<VERSION>-py3-none-any.whl`
+ * Libraries
+     * `/Volumes/<CATALOG>/<DATABASE>/<VOLUME_NAME>/ddbxutils-<VERSION>-py3-none-any.whl`
+ * `/Volumes/<CATALOG>/<DATABASE>/<VOLUME_NAME>/init_script_ddbxutils.sh`
+ ```shell
+ #!/bin/bash
+
+ STARTUP_SCRIPT=/tmp/pyspark_startup.py
+
+ cat >> ${STARTUP_SCRIPT} << EOF
+
+ prefix = 'PYTHONSTARTUP_ddbxutils'
+ print(f'{prefix} custom startup script loading...')
+ try:
+     import ddbxutils
+     print(f'{prefix} Custom modules [ddbxutils] are loaded.')
+ except Exception as e:
+     print(f'{prefix} e={e}')
+     print(f'{prefix} import ddbxutils failed')
+ EOF
+ ```
+ * Spark config
+ ```text
+ spark.executorEnv.PYTHONSTARTUP /tmp/pyspark_startup.py
+ ```
+ * Environment variables
+ ```shell
+ PYTHONSTARTUP=/tmp/pyspark_startup.py
+ ```
+ * Init scripts
+ ```text
+ /Volumes/<CATALOG>/<DATABASE>/<VOLUME_NAME>/init_script_ddbxutils.sh
+ ```
+ * Usage
+ ```python
+ # dbutils.widgets.text('rawdate', '2025-05-24', 'Raw Date')
+ # dbutils.widgets.text('next_day', '{{add_days(rawdate, "%Y-%m-%d", "", 1)}}', 'Next Day')
+ next_day = ddbxutils.widgets.get('next_day')
+ # next_day: 2025-05-25
+ ```
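The templated widgets above resolve lazily: `ddbxutils.widgets.get` renders every widget value as a jinja2 template against the full widget dict, so `next_day` can reference `rawdate`. A slightly fuller notebook sketch of the feature; the `prev_day` widget and the negative day offset are illustrative assumptions, not taken from the package docs:

```python
# Hypothetical notebook cell; assumes the wheel is attached as described above
# and that `dbutils` is the usual Databricks notebook global.
dbutils.widgets.text('rawdate', '2025-05-24', 'Raw Date')
dbutils.widgets.text('next_day', '{{add_days(rawdate, "%Y-%m-%d", "", 1)}}', 'Next Day')
# Assumption: add_days also accepts a negative offset.
dbutils.widgets.text('prev_day', '{{add_days(rawdate, "%Y-%m-%d", "", -1)}}', 'Prev Day')

import ddbxutils
print(ddbxutils.widgets.get('rawdate'))   # 2025-05-24 (plain values pass through untouched)
print(ddbxutils.widgets.get('next_day'))  # 2025-05-25
print(ddbxutils.widgets.get('prev_day'))  # 2025-05-23, if negative offsets are supported
```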
databricks_ddbxutils-0.3.0.dist-info/RECORD ADDED
@@ -0,0 +1,11 @@
+ ddbxutils/__init__.py,sha256=ElK7lAZwcQh50qAVQ16t9Cfk0caZAYLJtKkDJDkLljk,158
+ ddbxutils/datasources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ ddbxutils/datasources/pyfunc.py,sha256=AAPCdR9Pw7NDIbAAZi8yOB2T9usLTD2u9lpwqJMv2pI,4171
+ ddbxutils/functions.py,sha256=eb1cM5tpJKB1MuzJ8ncTpGxTjZA70kDj_gJFQC6y7zg,1567
+ ddbxutils/main.py,sha256=9pjd5VXAQOj-_lZsTojv-A4LUiZ-SLuK5oMgNwO4Fzc,488
+ ddbxutils/widgets/__init__.py,sha256=aeFsmeixYUPaFFw4DqgvuQreBlkNwmBM4v2bCZUB2zU,1364
+ ddbxutils/widgets/core.py,sha256=uBhbq5KKSyFT5CLRPsTw4zzeWYxGuJvImEQgh_iiEqk,989
+ databricks_ddbxutils-0.3.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ databricks_ddbxutils-0.3.0.dist-info/METADATA,sha256=wWrm7QWS-CDGgC4lQdjujWO0lOucmjbo4ud4foQqvRM,2881
+ databricks_ddbxutils-0.3.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ databricks_ddbxutils-0.3.0.dist-info/RECORD,,
ddbxutils/__init__.py CHANGED
@@ -1,6 +1,4 @@
- from databricks.sdk import WorkspaceClient
-
  from . import widgets
 
- w = WorkspaceClient()
- widgets.init(w.dbutils)
+ # wrapped in () so a generator is returned, deferring evaluation until first use
+ generator = (widgets.get_instance() for x in range(1))
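The comment relies on a basic Python guarantee: a generator expression does not evaluate its body until it is iterated, so importing ddbxutils no longer constructs a `WorkspaceClient` at import time (the 0.1.0 behavior shown in the removed lines). A minimal standalone sketch of the mechanism:

```python
def expensive_init():
    # Stand-in for widgets.get_instance(), which builds a WorkspaceClient.
    print('initializing...')
    return object()

# Nothing runs here: the generator expression only stores the deferred call.
gen = (expensive_init() for _ in range(1))

instance = next(gen)  # 'initializing...' is printed now, on first use
```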
ddbxutils/datasources/pyfunc.py ADDED
@@ -0,0 +1,122 @@
+ import base64
+ from dataclasses import dataclass
+
+ import cloudpickle
+ from pyspark.sql.datasource import DataSource, DataSourceReader, InputPartition
+ from pyspark.sql.types import StructType, StructField, IntegerType
+
+
+ @dataclass
+ class PythonFunctionPartition(InputPartition):
+     """
+     Partition definition: the start and end of each partition's range
+     """
+     start: int
+     end: int
+
+
+ class PythonFunctionReader(DataSourceReader):
+     """
+     DataSourceReader implementation
+     """
+
+     def __init__(self, schema: StructType, options: dict, serialized_func_b64):
+         self.schema = schema
+         self.options = options
+         self.func = cloudpickle.loads(base64.b64decode(serialized_func_b64))
+
+     def partitions(self):
+         lower = int(self.options.get('lowerLimit', '0'))
+         upper = int(self.options.get('upperLimit', '0'))
+         num_parts = int(self.options.get('numPartitions', '1'))
+         step = (upper - lower) // num_parts if num_parts > 0 else (upper - lower)
+         parts = []
+         start = lower
+         for i in range(num_parts):
+             # The last partition absorbs any remainder from the integer division.
+             end = upper if i == num_parts - 1 else start + step
+             parts.append(PythonFunctionPartition(start, end))
+             start = end
+         return parts
+
+     def read(self, partition: PythonFunctionPartition):
+         for x in range(partition.start, partition.end):
+             yield self.func(x)
+
+
+ class PythonFunctionDataSource(DataSource):
+     """
+     DataSource implementation
+
+     .. versionadded:: 0.3.0
+
+     Notes
+     -----
+     The user-defined function must return one of: tuple, list, `pyspark.sql.types.Row`, or `pyarrow.RecordBatch`.
+
+     Examples
+     --------
+     >>> spark = ...
+
+     Use the default input partition implementation:
+
+     >>> def partitions(self):
+     ...     return [PythonFunctionPartition(1, 3)]
+
+     Subclass the input partition class:
+
+     >>> def partitions(self):
+     ...     return [PythonFunctionPartition(1, 3), PythonFunctionPartition(4, 6)]
+
+     Example in the PySpark shell, using a function that returns a `pyspark.sql.Row`:
+
+     >>> import base64
+     >>> import cloudpickle
+     >>> from ddbxutils.datasources.pyfunc import PythonFunctionDataSource
+     >>> spark.dataSource.register(PythonFunctionDataSource)
+     >>> # ...
+     >>> from pyspark.sql import Row
+     >>> def user_function_row(x) -> Row:
+     ...     from datetime import datetime
+     ...     from pytz import timezone
+     ...     return Row(str(x), str(x * x), datetime.now(timezone('Asia/Seoul')).strftime("%Y-%m-%d %H:%M:%S"))
+     ...
+     >>> df = (spark.read.format("pyfunc").
+     ...     schema('value1 string, value2 string, ts string').
+     ...     option("lowerLimit", "0").
+     ...     option("upperLimit", "10").
+     ...     option("numPartitions", "100").
+     ...     option("func", base64.b64encode(cloudpickle.dumps(user_function_row)).decode('utf-8')).
+     ...     load())
+     >>> df.show()
+
+     Example in the PySpark shell, using a function that returns a list:
+
+     >>> def user_function_list(x):
+     ...     from datetime import datetime
+     ...     from pytz import timezone
+     ...     return [str(x), str(x * x), datetime.now(timezone('Asia/Seoul')).strftime("%Y-%m-%d %H:%M:%S")]
+     ...
+     >>> df = (spark.read.format("pyfunc").
+     ...     schema('value1 string, value2 string, ts string').
+     ...     option("lowerLimit", "0").
+     ...     option("upperLimit", "10").
+     ...     option("numPartitions", "100").
+     ...     option("func", base64.b64encode(cloudpickle.dumps(user_function_list)).decode('utf-8')).
+     ...     load())
+     >>> df.show()
+     """
+
+     @classmethod
+     def name(cls):
+         return 'pyfunc'
+
+     def schema(self):
+         return StructType([StructField('value', IntegerType(), nullable=False)])
+
+     def reader(self, schema: StructType):
+         # Options are strings; convert them as needed.
+         func = self.options['func']
+         return PythonFunctionReader(self.schema(), self.options, func)
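When the caller supplies no `.schema(...)`, Spark falls back to `PythonFunctionDataSource.schema()`, a single non-nullable int column named `value`, so the pickled function must return a one-element tuple. A minimal sketch under that assumption (`square` is an illustrative name; assumes a PySpark 4 session with the wheel installed):

```python
import base64

import cloudpickle
from ddbxutils.datasources.pyfunc import PythonFunctionDataSource

spark.dataSource.register(PythonFunctionDataSource)

def square(x):
    # One int column, matching StructField('value', IntegerType(), nullable=False).
    return (x * x,)

df = (spark.read.format('pyfunc')
      .option('lowerLimit', '0')
      .option('upperLimit', '10')
      .option('numPartitions', '2')  # step = (10 - 0) // 2 = 5 -> partitions [0, 5) and [5, 10)
      .option('func', base64.b64encode(cloudpickle.dumps(square)).decode('utf-8'))
      .load())
df.show()  # value: 0, 1, 4, ..., 81
```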
ddbxutils/main.py CHANGED
@@ -2,13 +2,13 @@
  import ddbxutils
 
  # Get the default value of the 'next_day' widget
- initial_value = ddbxutils.widgets.get("next_day")
- print(f"Initial 'next_day' value: {initial_value}")
+ initial_value = ddbxutils.widgets.get('next_day')
+ print(f'Initial next_day value: {initial_value}')
 
  # Fetch the changed value again
- updated_value = ddbxutils.widgets.get("next_day")
- print(f"Updated 'next_day' value: {updated_value}")
+ updated_value = ddbxutils.widgets.get('next_day')
+ print(f'Updated next_day value: {updated_value}')
 
  # Fetch a widget that does not exist
- other_value = ddbxutils.widgets.get("another_widget")
- print(f"'another_widget' value: {other_value}")
+ other_value = ddbxutils.widgets.get('another_widget')
+ print(f'another_widget value: {other_value}')
ddbxutils/widgets/__init__.py CHANGED
@@ -3,16 +3,17 @@ from .core import WidgetImpl
  _widget_impl_instance: WidgetImpl = None
 
 
- def init(dbutils):
+ def get_instance():
      """
      Initializes the widgets module lazily, on first use.
 
-     :param dbutils: dbutils
-     :return: None
+     :return: the WidgetImpl singleton
      """
      global _widget_impl_instance
-     _widget_impl_instance = WidgetImpl(dbutils)
+     if _widget_impl_instance is None:
+         _widget_impl_instance = WidgetImpl()
+     return _widget_impl_instance
 
 
  def get(widget_name: str):
@@ -23,20 +24,22 @@ def get(widget_name: str):
      :param widget_name: widget key
      :return: resolved widget value
      """
-     if _widget_impl_instance is None:
-         raise RuntimeError('ddbxutils.widgets has not been initialized. Call `ddbxutils.widgets.init(dbutils)` first.')
-     return _widget_impl_instance.get(widget_name)
+     widget_impl = get_instance()
+     return widget_impl.get(widget_name)
 
 
- def refresh(dbutils):
+ def refresh():
      """
      Refreshes the widget values.
 
      :return: None
      """
-     if _widget_impl_instance is None:
-         raise RuntimeError('ddbxutils.widgets has not been initialized. Call `ddbxutils.widgets.init(dbutils)` first.')
-     if dbutils is None:
-         raise RuntimeError('dbutils is required.')
-     _widget_impl_instance.refresh(dbutils)
+     widget_impl = get_instance()
+     widget_impl.refresh()
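The net effect of the 0.3.0 API is that callers never touch dbutils directly. A minimal notebook-side sketch, assuming the wheel is installed:

```python
import ddbxutils

# The first call lazily constructs the WidgetImpl singleton, which obtains
# dbutils on its own via WorkspaceClient() (see core.py below).
next_day = ddbxutils.widgets.get('next_day')

# Re-read and re-render all widget values; no dbutils argument anymore.
ddbxutils.widgets.refresh()
```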
ddbxutils/widgets/core.py CHANGED
@@ -1,6 +1,8 @@
- from ddbxutils.functions import add_days, add_datetime
+ from databricks.sdk import WorkspaceClient
  from jinja2 import Environment
 
+ from ddbxutils.functions import add_days, add_datetime
+
  environment = Environment()
  environment.globals['add_days'] = add_days
  environment.globals['add_datetime'] = add_datetime
@@ -10,14 +12,15 @@ class WidgetImpl:
      dbutils = None
      rendered_widget_values = None
 
-     def __init__(self, dbutils):
-         self.refresh(dbutils)
+     def __init__(self):
+         self.refresh()
 
-     def refresh(self, dbutils):
+     def refresh(self):
          """
          Sets or adds the widget values.
          """
-         self.dbutils = dbutils
+         if self.dbutils is None:
+             self.dbutils = WorkspaceClient().dbutils
          widget_values = self.dbutils.widgets.getAll()
          self.rendered_widget_values = {key: environment.from_string(value).render(widget_values) for key, value in widget_values.items()}
 
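The rendering step in `refresh()` is plain jinja2 and can be reproduced outside Databricks. A minimal sketch; the `add_days` stub is a hypothetical stand-in for `ddbxutils.functions.add_days`, whose source this diff does not include:

```python
from datetime import datetime, timedelta

from jinja2 import Environment

environment = Environment()

# Hypothetical stand-in: parse `value` with `fmt`, shift it by `days`,
# and format with `out_fmt` (falling back to `fmt` when empty).
def add_days(value, fmt, out_fmt, days):
    out_fmt = out_fmt or fmt
    return (datetime.strptime(value, fmt) + timedelta(days=days)).strftime(out_fmt)

environment.globals['add_days'] = add_days

# Raw widget values, as dbutils.widgets.getAll() would return them.
widget_values = {
    'rawdate': '2025-05-24',
    'next_day': '{{add_days(rawdate, "%Y-%m-%d", "", 1)}}',
}

# The same dict comprehension as WidgetImpl.refresh(): each value is rendered
# against the full widget dict, so one widget can reference another.
rendered = {key: environment.from_string(value).render(widget_values)
            for key, value in widget_values.items()}
print(rendered['next_day'])  # 2025-05-25
```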
databricks_ddbxutils-0.1.0.dist-info/METADATA DELETED
@@ -1,56 +0,0 @@
- Metadata-Version: 2.1
- Name: databricks-ddbxutils
- Version: 0.1.0
- Summary: extends databricks dbutils
- Author: Haneul Kim
- Author-email: haneul.kim@data-dynamics.io
- Requires-Python: >=3.11,<4.0
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.11
- Classifier: Programming Language :: Python :: 3.12
- Requires-Dist: databricks-sdk (>=0.57.0,<0.58.0)
- Requires-Dist: dotenv (>=0.9.9,<0.10.0)
- Requires-Dist: jinja2 (>=3.1.6,<4.0.0)
- Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
- Description-Content-Type: text/markdown
-
- # databricks-ddbxutils
-
- ddbxutils extends dbutils where it falls short.
-
- ## Features
-
- * [x] jinja2 templates applied to `dbutils.widgets`
-
- ## Setup
-
- ```shell
- cd <PROJECT_ROOT>
- pip install poetry
- ```
-
- ## Venv
-
- ```shell
- poetry shell
- ```
-
- ## Build
-
- ```shell
- poetry build
- ```
-
- ## Run
-
- ### In Databricks without an init_script
-
- * Add Wheel
-     * Create a Volume for wheel uploads, then upload the wheel
-         * `/Volumes/<CATALOG>/<DATABASE>/<VOLUME_NAME>/ddbxutils-<VERSION>-py3-none-any.whl`
-     * In the Environment panel on the right side of the notebook, set Environment version to 2, add the wheel file uploaded to the volume, then Apply
-
- ### In Databricks with an init_script
-
- [//]: # (TODO)
-
databricks_ddbxutils-0.1.0.dist-info/RECORD DELETED
@@ -1,9 +0,0 @@
- ddbxutils/__init__.py,sha256=PKtRu9A6hvfLvcB8DLqtt1IN6ozJqb9rUA3mueiaSpc,113
- ddbxutils/functions.py,sha256=eb1cM5tpJKB1MuzJ8ncTpGxTjZA70kDj_gJFQC6y7zg,1567
- ddbxutils/main.py,sha256=Hfo0I4lyOZKSUHJplqQnNFNrElb54XHGAZFTJdxQ9LE,494
- ddbxutils/widgets/__init__.py,sha256=yZQLYkRtXTBISZlbIJlBx5e9xpgikCzSvr8B6KWj5hw,1279
- ddbxutils/widgets/core.py,sha256=b2SciKS2TEUQS7HjOK8FAvjR228hv8-k0P5CxnnZpHA,915
- databricks_ddbxutils-0.1.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- databricks_ddbxutils-0.1.0.dist-info/METADATA,sha256=FxItGiTlCDR9QXPHZisMdl-P7L0V_5stGom3XggUoeU,1208
- databricks_ddbxutils-0.1.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- databricks_ddbxutils-0.1.0.dist-info/RECORD,,