databricks-ddbxutils 0.1.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,108 @@
+Metadata-Version: 2.1
+Name: databricks-ddbxutils
+Version: 0.3.0
+Summary: extends databricks dbutils
+Author: Haneul Kim
+Author-email: haneul.kim@data-dynamics.io
+Requires-Python: >=3.11,<4.0
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: cloudpickle (>=3.1.1,<4.0.0)
+Requires-Dist: databricks-sdk (>=0.64.0,<0.65.0)
+Requires-Dist: dotenv (>=0.9.9,<0.10.0)
+Requires-Dist: jinja2 (>=3.1.6,<4.0.0)
+Requires-Dist: pyarrow (==20.0.0)
+Requires-Dist: pyspark (>=4.0.0,<5.0.0)
+Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
+Requires-Dist: pytz (>=2025.2,<2026.0)
+Description-Content-Type: text/markdown
+
+# databricks-ddbxutils
+
+ddbxutils extends dbutils where it falls short.
+
+## Features
+
+* [x] jinja2 templates applied to `dbutils.widgets`
+
+## Setup
+
+```shell
+cd <PROJECT_ROOT>
+pip install poetry
+```
+
+## venv
+
+```shell
+poetry shell
+```
+
+## Build
+
+```shell
+poetry build
+```
+
+## Run
+
+### In Databricks w/o init_script (= Serverless)
+
+* Add Wheel
+  * Create a Volume for wheel uploads, then upload the wheel
+    * `/Volumes/<CATALOG>/<DATABASE>/<VOLUME_NAME>/ddbxutils-<VERSION>-py3-none-any.whl`
+  * In the notebook's Environment panel (right side), set the Environment version to 2, add the wheel file uploaded to the Volume, then click Apply
+* Usage
+  ```python
+  # dbutils.widgets.text('rawdate', '2025-05-24', 'Raw Date')
+  # dbutils.widgets.text('next_day', '{{add_days(rawdate, "%Y-%m-%d", "", 1)}}', 'Next Day')
+  import ddbxutils
+  next_day = ddbxutils.widgets.get('next_day')
+  # next_day: 2025-05-25
+  ```
+
+### In Databricks w/ init_script
+
+* Add Wheel
+  * Create a Volume for wheel uploads, then upload the wheel
+    * `/Volumes/<CATALOG>/<DATABASE>/<VOLUME_NAME>/ddbxutils-<VERSION>-py3-none-any.whl`
+* Libraries
+  * `/Volumes/<CATALOG>/<DATABASE>/<VOLUME_NAME>/ddbxutils-<VERSION>-py3-none-any.whl`
+  * `/Volumes/<CATALOG>/<DATABASE>/<VOLUME_NAME>/init_script_ddbxutils.sh`
+  ```shell
+  #!/bin/bash
+
+  STARTUP_SCRIPT=/tmp/pyspark_startup.py
+
+  cat >> ${STARTUP_SCRIPT} << EOF
+
+  prefix = 'PYTHONSTARTUP_ddbxutils'
+  print(f'{prefix} custom startup script loading...')
+  try:
+      import ddbxutils
+      print(f'{prefix} Custom modules [ddbxutils] are loaded.')
+  except Exception as e:
+      print(f'{prefix} e={e}')
+      print(f'{prefix} import ddbxutils failed')
+  EOF
+  ```
+* Spark config
+  ```text
+  spark.executorEnv.PYTHONSTARTUP /tmp/pyspark_startup.py
+  ```
+* Environment variables
+  ```shell
+  PYTHONSTARTUP=/tmp/pyspark_startup.py
+  ```
+* Init scripts
+  ```text
+  /Volumes/<CATALOG>/<DATABASE>/<VOLUME_NAME>/init_script_ddbxutils.sh
+  ```
+* Usage
+  ```python
+  # dbutils.widgets.text('rawdate', '2025-05-24', 'Raw Date')
+  # dbutils.widgets.text('next_day', '{{add_days(rawdate, "%Y-%m-%d", "", 1)}}', 'Next Day')
+  next_day = ddbxutils.widgets.get('next_day')
+  # next_day: 2025-05-25
+  ```
@@ -0,0 +1,88 @@
+# databricks-ddbxutils
+
+ddbxutils extends dbutils where it falls short.
+
+## Features
+
+* [x] jinja2 templates applied to `dbutils.widgets`
+
+## Setup
+
+```shell
+cd <PROJECT_ROOT>
+pip install poetry
+```
+
+## venv
+
+```shell
+poetry shell
+```
+
+## Build
+
+```shell
+poetry build
+```
+
+## Run
+
+### In Databricks w/o init_script (= Serverless)
+
+* Add Wheel
+  * Create a Volume for wheel uploads, then upload the wheel
+    * `/Volumes/<CATALOG>/<DATABASE>/<VOLUME_NAME>/ddbxutils-<VERSION>-py3-none-any.whl`
+  * In the notebook's Environment panel (right side), set the Environment version to 2, add the wheel file uploaded to the Volume, then click Apply
+* Usage
+  ```python
+  # dbutils.widgets.text('rawdate', '2025-05-24', 'Raw Date')
+  # dbutils.widgets.text('next_day', '{{add_days(rawdate, "%Y-%m-%d", "", 1)}}', 'Next Day')
+  import ddbxutils
+  next_day = ddbxutils.widgets.get('next_day')
+  # next_day: 2025-05-25
+  ```
+
+### In Databricks w/ init_script
+
+* Add Wheel
+  * Create a Volume for wheel uploads, then upload the wheel
+    * `/Volumes/<CATALOG>/<DATABASE>/<VOLUME_NAME>/ddbxutils-<VERSION>-py3-none-any.whl`
+* Libraries
+  * `/Volumes/<CATALOG>/<DATABASE>/<VOLUME_NAME>/ddbxutils-<VERSION>-py3-none-any.whl`
+  * `/Volumes/<CATALOG>/<DATABASE>/<VOLUME_NAME>/init_script_ddbxutils.sh`
+  ```shell
+  #!/bin/bash
+
+  STARTUP_SCRIPT=/tmp/pyspark_startup.py
+
+  cat >> ${STARTUP_SCRIPT} << EOF
+
+  prefix = 'PYTHONSTARTUP_ddbxutils'
+  print(f'{prefix} custom startup script loading...')
+  try:
+      import ddbxutils
+      print(f'{prefix} Custom modules [ddbxutils] are loaded.')
+  except Exception as e:
+      print(f'{prefix} e={e}')
+      print(f'{prefix} import ddbxutils failed')
+  EOF
+  ```
+* Spark config
+  ```text
+  spark.executorEnv.PYTHONSTARTUP /tmp/pyspark_startup.py
+  ```
+* Environment variables
+  ```shell
+  PYTHONSTARTUP=/tmp/pyspark_startup.py
+  ```
+* Init scripts
+  ```text
+  /Volumes/<CATALOG>/<DATABASE>/<VOLUME_NAME>/init_script_ddbxutils.sh
+  ```
+* Usage
+  ```python
+  # dbutils.widgets.text('rawdate', '2025-05-24', 'Raw Date')
+  # dbutils.widgets.text('next_day', '{{add_days(rawdate, "%Y-%m-%d", "", 1)}}', 'Next Day')
+  next_day = ddbxutils.widgets.get('next_day')
+  # next_day: 2025-05-25
+  ```
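
The `{{add_days(...)}}` resolution shown in the Usage blocks is plain jinja2: the `WidgetImpl` hunk further below registers `add_days` and `add_datetime` as template globals and renders each widget value against the full widget dictionary. Here is a minimal standalone sketch of that path; the `add_days` body is a hypothetical stand-in, since `ddbxutils.functions` is not included in this diff (the third argument is assumed to be an output format, with `''` meaning "same as the input format"):

```python
from datetime import datetime, timedelta

from jinja2 import Environment


def add_days(value, fmt, out_fmt, days):
    # Hypothetical stand-in: the real ddbxutils.functions.add_days is not in this diff.
    # Assumes the third argument is an output format; '' falls back to the input format.
    dt = datetime.strptime(value, fmt) + timedelta(days=days)
    return dt.strftime(out_fmt or fmt)


environment = Environment()
environment.globals['add_days'] = add_days

widget_values = {
    'rawdate': '2025-05-24',
    'next_day': '{{add_days(rawdate, "%Y-%m-%d", "", 1)}}',
}
# Render every widget value against the raw widget dictionary, as WidgetImpl.refresh does.
rendered = {key: environment.from_string(value).render(widget_values)
            for key, value in widget_values.items()}
print(rendered['next_day'])  # 2025-05-25
```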
@@ -0,0 +1,4 @@
+from . import widgets
+
+# Wrapped in () so a generator is returned, for lazy evaluation
+generator = (widgets.get_instance() for x in range(1))
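
The comment's intent: a bare `widgets.get_instance()` call at import time would construct the instance (and its `WorkspaceClient`, per the `WidgetImpl` hunk below) immediately, whereas a generator expression defers the call until the generator is first consumed. A minimal sketch of the idiom, with a hypothetical `expensive_init` placeholder:

```python
def expensive_init():
    # Stands in for widgets.get_instance(); nothing here runs at definition time.
    print('connecting...')
    return object()

lazy = (expensive_init() for _ in range(1))  # no side effects yet
instance = next(lazy)                        # 'connecting...' prints only now
```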
@@ -0,0 +1,122 @@
+import base64
+from dataclasses import dataclass
+
+import cloudpickle
+from pyspark.sql.datasource import DataSource, DataSourceReader, InputPartition
+from pyspark.sql.types import StructType, StructField, IntegerType
+
+
+@dataclass
+class PythonFunctionPartition(InputPartition):
+    """
+    Partition definition: the start and end of each partition's range
+    """
+    start: int
+    end: int
+
+
+class PythonFunctionReader(DataSourceReader):
+    """
+    DataSourceReader implementation
+    """
+
+    def __init__(self, schema: StructType, options: dict, serialized_func_b64):
+        self.schema = schema
+        self.options = options
+        self.func = cloudpickle.loads(base64.b64decode(serialized_func_b64))
+
+    def partitions(self):
+        lower = int(self.options.get('lowerLimit', '0'))
+        upper = int(self.options.get('upperLimit', '0'))
+        num_parts = int(self.options.get('numPartitions', '1'))
+        step = (upper - lower) // num_parts if num_parts > 0 else (upper - lower)
+        # print(f'step={step}')
+        parts = []
+        start = lower
+        for i in range(num_parts):
+            end = upper if i == num_parts - 1 else start + step
+            parts.append(PythonFunctionPartition(start, end))
+            start = end
+        return parts
+
+    def read(self, partition: PythonFunctionPartition):
+        for x in range(partition.start, partition.end):
+            # yield (self.func(x),)
+            yield self.func(x)
+
+
+class PythonFunctionDataSource(DataSource):
+    """
+    DataSource implementation
+
+    .. versionadded:: 0.3.0
+
+    Notes
+    -----
+    The user-defined function must return one of the following types: tuple, list, `pyspark.sql.types.Row`, or `pyarrow.RecordBatch`.
+
+    Examples
+    --------
+    >>> spark = ...
+
+    Return a single input partition:
+
+    >>> def partitions(self):
+    ...     return [PythonFunctionPartition(1, 3)]
+
+    Return multiple input partitions:
+
+    >>> def partitions(self):
+    ...     return [PythonFunctionPartition(1, 3), PythonFunctionPartition(4, 6)]
+
+    Example in the PySpark shell, using a function that returns `pyspark.sql.Row`:
+
+    >>> import base64
+    >>> import cloudpickle
+    >>> from ddbxutils.datasources.pyfunc import PythonFunctionDataSource
+    >>> spark.dataSource.register(PythonFunctionDataSource)
+    >>> # ...
+    >>> from pyspark.sql import Row
+    >>> def user_function_row(x) -> Row:
+    ...     from datetime import datetime
+    ...     from pytz import timezone
+    ...     return Row(str(x), str(x * x), datetime.now(timezone('Asia/Seoul')).strftime("%Y-%m-%d %H:%M:%S"))
+    ...
+    >>> df = (spark.read.format("pyfunc").
+    ...       schema('value1 string, value2 string, ts string').
+    ...       option("lowerLimit", "0").
+    ...       option("upperLimit", "10").
+    ...       option("numPartitions", "100").
+    ...       option("func", base64.b64encode(cloudpickle.dumps(user_function_row)).decode('utf-8')).
+    ...       load())
+    >>> df.show()
+
+    Example in the PySpark shell, using a function that returns a list:
+
+    >>> def user_function_row(x):
+    ...     from datetime import datetime
+    ...     from pytz import timezone
+    ...     return [str(x), str(x * x), datetime.now(timezone('Asia/Seoul')).strftime("%Y-%m-%d %H:%M:%S")]
+    ...
+    >>> df = (spark.read.format("pyfunc").
+    ...       schema('value1 string, value2 string, ts string').
+    ...       option("lowerLimit", "0").
+    ...       option("upperLimit", "10").
+    ...       option("numPartitions", "100").
+    ...       option("func", base64.b64encode(cloudpickle.dumps(user_function_row)).decode('utf-8')).
+    ...       load())
+    >>> df.show()
+    """
+
+    @classmethod
+    def name(cls):
+        return 'pyfunc'
+
+    def schema(self):
+        return StructType([StructField('value', IntegerType(), nullable=False)])
+
+    def reader(self, schema: StructType):
+        # options are strings, so convert as needed
+        # func = self.options.get('func', None)
+        func = self.options['func']
+        return PythonFunctionReader(self.schema(), self.options, func)
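
To make the arithmetic in `PythonFunctionReader.partitions()` concrete: the step is the integer quotient of the range by `numPartitions`, and the last partition absorbs the remainder. A standalone check of the same logic with plain tuples:

```python
def split(lower: int, upper: int, num_parts: int) -> list[tuple[int, int]]:
    # Mirrors PythonFunctionReader.partitions(): fixed step, remainder to the last partition.
    step = (upper - lower) // num_parts if num_parts > 0 else (upper - lower)
    parts, start = [], lower
    for i in range(num_parts):
        end = upper if i == num_parts - 1 else start + step
        parts.append((start, end))
        start = end
    return parts

print(split(0, 10, 3))  # [(0, 3), (3, 6), (6, 10)]
```

Note that when `numPartitions` exceeds the range, as in the docstring's `upperLimit=10` with `numPartitions=100`, the step is 0 and every partition except the last is empty; the last one then covers the entire range.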
@@ -0,0 +1,14 @@
+# Import our custom ddbxutils module
+import ddbxutils
+
+# Get the default value of the 'next_day' widget
+initial_value = ddbxutils.widgets.get('next_day')
+print(f'initial next_day value: {initial_value}')
+
+# Fetch the value again after it changes
+updated_value = ddbxutils.widgets.get('next_day')
+print(f'updated next_day value: {updated_value}')
+
+# Fetch a widget that does not exist
+other_value = ddbxutils.widgets.get('another_widget')
+print(f'another_widget value: {other_value}')
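
`WidgetImpl.get` itself does not appear in this diff, so what the last lookup does for a non-existent widget (raise, or return a default) is not determined here. If it surfaces a `KeyError` from the rendered-values dictionary, a defensive pattern would be:

```python
# Hypothetical guard; assumes WidgetImpl.get raises KeyError for unknown widgets.
try:
    other_value = ddbxutils.widgets.get('another_widget')
except KeyError:
    other_value = None
```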
@@ -0,0 +1,45 @@
+from .core import WidgetImpl
+
+_widget_impl_instance: WidgetImpl | None = None
+
+
+def get_instance():
+    """
+    Returns the singleton WidgetImpl instance,
+    creating it on the first call.
+
+    :return: the WidgetImpl instance
+    """
+    global _widget_impl_instance
+    if _widget_impl_instance is None:
+        _widget_impl_instance = WidgetImpl()
+    return _widget_impl_instance
+
+
+def get(widget_name: str):
+    """
+    Gets a rendered widget value from the singleton instance.
+    The instance is created automatically on first use.
+
+    :param widget_name: widget key
+    :return: resolved widget value
+    """
+    widget_impl = get_instance()
+    # if _widget_impl_instance is None:
+    #     raise RuntimeError('ddbxutils.widgets is not initialized. Call `ddbxutils.widgets.init(dbutils)` first.')
+    return widget_impl.get(widget_name)
+
+
+def refresh():
+    """
+    Refreshes the widget values.
+
+    Re-reads all widgets and re-renders their templates.
+    :return: None
+    """
+    widget_impl = get_instance()
+    # if _widget_impl_instance is None:
+    #     raise RuntimeError('ddbxutils.widgets is not initialized. Call `ddbxutils.widgets.init(dbutils)` first.')
+    # if dbutils is None:
+    #     raise RuntimeError('dbutils is required.')
+    widget_impl.refresh()
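
Net effect of this rewrite: the 0.1.0 `init(dbutils)` step disappears, and `get()`/`refresh()` bootstrap the singleton themselves via `get_instance()`. A usage sketch, assuming an environment where `WorkspaceClient()` can authenticate (see the `WidgetImpl` hunk below):

```python
import ddbxutils

value = ddbxutils.widgets.get('next_day')  # WidgetImpl is created on first use
ddbxutils.widgets.refresh()                # re-reads and re-renders all widgets
value = ddbxutils.widgets.get('next_day')
```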
@@ -1,6 +1,8 @@
-from ddbxutils.functions import add_days, add_datetime
+from databricks.sdk import WorkspaceClient
 from jinja2 import Environment
 
+from ddbxutils.functions import add_days, add_datetime
+
 environment = Environment()
 environment.globals['add_days'] = add_days
 environment.globals['add_datetime'] = add_datetime
@@ -10,14 +12,15 @@ class WidgetImpl:
     dbutils = None
     rendered_widget_values = None
 
-    def __init__(self, dbutils):
-        self.refresh(dbutils)
+    def __init__(self):
+        self.refresh()
 
-    def refresh(self, dbutils):
+    def refresh(self):
         """
         Sets or adds the widget values.
         """
-        self.dbutils = dbutils
+        if self.dbutils is None:
+            self.dbutils = WorkspaceClient().dbutils
         widget_values = self.dbutils.widgets.getAll()
         self.rendered_widget_values = {key: environment.from_string(value).render(widget_values) for key, value in widget_values.items()}
 
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "databricks-ddbxutils"
-version = "0.1.0"
+version = "0.3.0"
 description = "extends databricks dbutils"
 authors = ["Haneul Kim <haneul.kim@data-dynamics.io>"]
 readme = "README.md"
@@ -8,10 +8,14 @@ packages = [{ include = "ddbxutils" }]
 
 [tool.poetry.dependencies]
 python = "^3.11"
-databricks-sdk = "^0.57.0"
+databricks-sdk = "^0.64.0"
 dotenv = "^0.9.9"
 jinja2 = "^3.1.6"
 python-dateutil = "^2.9.0.post0"
+pyspark = "^4.0.0"
+pyarrow = "20.0.0"
+cloudpickle = "^3.1.1"
+pytz = "^2025.2"
 
 
 [build-system]
@@ -1,56 +0,0 @@
-Metadata-Version: 2.1
-Name: databricks-ddbxutils
-Version: 0.1.0
-Summary: extends databricks dbutils
-Author: Haneul Kim
-Author-email: haneul.kim@data-dynamics.io
-Requires-Python: >=3.11,<4.0
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Requires-Dist: databricks-sdk (>=0.57.0,<0.58.0)
-Requires-Dist: dotenv (>=0.9.9,<0.10.0)
-Requires-Dist: jinja2 (>=3.1.6,<4.0.0)
-Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
-Description-Content-Type: text/markdown
-
-# databricks-ddbxutils
-
-ddbxutils extends dbutils where it falls short.
-
-## Features
-
-* [x] jinja2 templates applied to `dbutils.widgets`
-
-## Setup
-
-```shell
-cd <PROJECT_ROOT>
-pip install poetry
-```
-
-## venv
-
-```shell
-poetry shell
-```
-
-## Build
-
-```shell
-poetry build
-```
-
-## Run
-
-### In Databricks w/o init_script
-
-* Add Wheel
-  * Create a Volume for wheel uploads, then upload the wheel
-    * `/Volumes/<CATALOG>/<DATABASE>/<VOLUME_NAME>/ddbxutils-<VERSION>-py3-none-any.whl`
-  * In the notebook's Environment panel (right side), set the Environment version to 2, add the wheel file uploaded to the Volume, then click Apply
-
-### In Databricks w/ init_script
-
-[//]: # (TODO)
-
@@ -1,39 +0,0 @@
-# databricks-ddbxutils
-
-ddbxutils extends dbutils where it falls short.
-
-## Features
-
-* [x] jinja2 templates applied to `dbutils.widgets`
-
-## Setup
-
-```shell
-cd <PROJECT_ROOT>
-pip install poetry
-```
-
-## venv
-
-```shell
-poetry shell
-```
-
-## Build
-
-```shell
-poetry build
-```
-
-## Run
-
-### In Databricks w/o init_script
-
-* Add Wheel
-  * Create a Volume for wheel uploads, then upload the wheel
-    * `/Volumes/<CATALOG>/<DATABASE>/<VOLUME_NAME>/ddbxutils-<VERSION>-py3-none-any.whl`
-  * In the notebook's Environment panel (right side), set the Environment version to 2, add the wheel file uploaded to the Volume, then click Apply
-
-### In Databricks w/ init_script
-
-[//]: # (TODO)
@@ -1,6 +0,0 @@
-from databricks.sdk import WorkspaceClient
-
-from . import widgets
-
-w = WorkspaceClient()
-widgets.init(w.dbutils)
@@ -1,14 +0,0 @@
-# Import our custom ddbxutils module
-import ddbxutils
-
-# Get the default value of the 'next_day' widget
-initial_value = ddbxutils.widgets.get("next_day")
-print(f"initial 'next_day' value: {initial_value}")
-
-# Fetch the value again after it changes
-updated_value = ddbxutils.widgets.get("next_day")
-print(f"updated 'next_day' value: {updated_value}")
-
-# Fetch a widget that does not exist
-other_value = ddbxutils.widgets.get("another_widget")
-print(f"'another_widget' value: {other_value}")
@@ -1,42 +0,0 @@
-from .core import WidgetImpl
-
-_widget_impl_instance: WidgetImpl = None
-
-
-def init(dbutils):
-    """
-    Initializes the widgets module.
-    This function must be called once with a dbutils object.
-
-    :param dbutils: dbutils
-    :return: None
-    """
-    global _widget_impl_instance
-    _widget_impl_instance = WidgetImpl(dbutils)
-
-
-def get(widget_name: str):
-    """
-    Gets a widget value from the initialized instance.
-    Raises an exception if init() has not been called.
-
-    :param widget_name: widget key
-    :return: resolved widget value
-    """
-    if _widget_impl_instance is None:
-        raise RuntimeError('ddbxutils.widgets is not initialized. Call `ddbxutils.widgets.init(dbutils)` first.')
-    return _widget_impl_instance.get(widget_name)
-
-
-def refresh(dbutils):
-    """
-    Refreshes the widget values.
-
-    :param dbutils: dbutils
-    :return: None
-    """
-    if _widget_impl_instance is None:
-        raise RuntimeError('ddbxutils.widgets is not initialized. Call `ddbxutils.widgets.init(dbutils)` first.')
-    if dbutils is None:
-        raise RuntimeError('dbutils is required.')
-    _widget_impl_instance.refresh(dbutils)