duckdb 1.5.0.dev86__cp314-cp314-macosx_10_15_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of duckdb might be problematic. Click here for more details.

Files changed (52) hide show
  1. _duckdb-stubs/__init__.pyi +1443 -0
  2. _duckdb-stubs/_func.pyi +46 -0
  3. _duckdb-stubs/_sqltypes.pyi +75 -0
  4. _duckdb.cpython-314-darwin.so +0 -0
  5. adbc_driver_duckdb/__init__.py +50 -0
  6. adbc_driver_duckdb/dbapi.py +115 -0
  7. duckdb/__init__.py +381 -0
  8. duckdb/_dbapi_type_object.py +231 -0
  9. duckdb/_version.py +22 -0
  10. duckdb/bytes_io_wrapper.py +69 -0
  11. duckdb/experimental/__init__.py +3 -0
  12. duckdb/experimental/spark/LICENSE +260 -0
  13. duckdb/experimental/spark/__init__.py +6 -0
  14. duckdb/experimental/spark/_globals.py +77 -0
  15. duckdb/experimental/spark/_typing.py +46 -0
  16. duckdb/experimental/spark/conf.py +46 -0
  17. duckdb/experimental/spark/context.py +180 -0
  18. duckdb/experimental/spark/errors/__init__.py +70 -0
  19. duckdb/experimental/spark/errors/error_classes.py +918 -0
  20. duckdb/experimental/spark/errors/exceptions/__init__.py +16 -0
  21. duckdb/experimental/spark/errors/exceptions/base.py +168 -0
  22. duckdb/experimental/spark/errors/utils.py +111 -0
  23. duckdb/experimental/spark/exception.py +18 -0
  24. duckdb/experimental/spark/sql/__init__.py +7 -0
  25. duckdb/experimental/spark/sql/_typing.py +86 -0
  26. duckdb/experimental/spark/sql/catalog.py +79 -0
  27. duckdb/experimental/spark/sql/column.py +361 -0
  28. duckdb/experimental/spark/sql/conf.py +24 -0
  29. duckdb/experimental/spark/sql/dataframe.py +1389 -0
  30. duckdb/experimental/spark/sql/functions.py +6195 -0
  31. duckdb/experimental/spark/sql/group.py +424 -0
  32. duckdb/experimental/spark/sql/readwriter.py +435 -0
  33. duckdb/experimental/spark/sql/session.py +297 -0
  34. duckdb/experimental/spark/sql/streaming.py +36 -0
  35. duckdb/experimental/spark/sql/type_utils.py +107 -0
  36. duckdb/experimental/spark/sql/types.py +1239 -0
  37. duckdb/experimental/spark/sql/udf.py +37 -0
  38. duckdb/filesystem.py +33 -0
  39. duckdb/func/__init__.py +3 -0
  40. duckdb/functional/__init__.py +13 -0
  41. duckdb/polars_io.py +284 -0
  42. duckdb/py.typed +0 -0
  43. duckdb/query_graph/__main__.py +358 -0
  44. duckdb/sqltypes/__init__.py +63 -0
  45. duckdb/typing/__init__.py +71 -0
  46. duckdb/udf.py +24 -0
  47. duckdb/value/__init__.py +1 -0
  48. duckdb/value/constant/__init__.py +270 -0
  49. duckdb-1.5.0.dev86.dist-info/METADATA +88 -0
  50. duckdb-1.5.0.dev86.dist-info/RECORD +52 -0
  51. duckdb-1.5.0.dev86.dist-info/WHEEL +6 -0
  52. duckdb-1.5.0.dev86.dist-info/licenses/LICENSE +7 -0
@@ -0,0 +1,358 @@
1
+ import argparse # noqa: D100
2
+ import json
3
+ import re
4
+ import webbrowser
5
+ from functools import reduce
6
+ from pathlib import Path
7
+
8
# Stylesheet for the generated query-graph page. It is embedded verbatim by
# generate_style_html() and covers three things: the timing table
# (.styled-table), the tree nodes (.tf-nc / .node-body) and the hover tooltip
# that shows an operator's extra info (.custom-tooltip / .tooltip-text).
qgraph_css = """
.styled-table {
border-collapse: collapse;
margin: 25px 0;
font-size: 0.9em;
font-family: sans-serif;
min-width: 400px;
box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
}
.styled-table thead tr {
background-color: #009879;
color: #ffffff;
text-align: left;
}
.styled-table th,
.styled-table td {
padding: 12px 15px;
}
.styled-table tbody tr {
border-bottom: 1px solid #dddddd;
}

.styled-table tbody tr:nth-of-type(even) {
background-color: #f3f3f3;
}

.styled-table tbody tr:last-of-type {
border-bottom: 2px solid #009879;
}

.node-body {
font-size:15px;
}
.tf-nc {
position: relative;
width: 180px;
text-align: center;
background-color: #fff100;
}
.custom-tooltip {
position: relative;
display: inline-block;
}

.tooltip-text {
visibility: hidden;
background-color: #333;
color: #fff;
text-align: center;
padding: 0px;
border-radius: 1px;

/* Positioning */
position: absolute;
z-index: 1;
bottom: 100%;
left: 50%;
transform: translateX(-50%);
margin-bottom: 8px;

/* Tooltip Arrow */
width: 400px;
}

.custom-tooltip:hover .tooltip-text {
visibility: visible;
}
"""
76
+
77
+
78
class NodeTiming:
    """Timing recorded for one operator, or the combined timing of a phase.

    Attributes:
        phase: Name of the phase (operator type) this timing belongs to.
        time: Time value as reported in the profile.
        percentage: ``time`` as a fraction of some total; stays ``0`` until
            :meth:`calculate_percentage` is called.
    """

    def __init__(self, phase: str, time: float) -> None:
        """Store the phase name and its time; the percentage is filled in later."""
        self.phase = phase
        self.time = time
        # percentage is determined later.
        self.percentage = 0

    def calculate_percentage(self, total_time: float) -> None:
        """Set ``percentage`` to ``time / total_time``.

        A zero ``total_time`` (possible for trivially fast queries) yields a
        percentage of ``0`` instead of raising ``ZeroDivisionError``.
        """
        self.percentage = self.time / total_time if total_time else 0

    def combine_timing(self, r: "NodeTiming") -> "NodeTiming":
        """Return a new timing whose time is the sum of this one and ``r``.

        The result keeps this object's phase name; ``r.phase`` is not checked.
        """
        # TODO: can only add timings for same-phase nodes # noqa: TD002, TD003
        return NodeTiming(self.phase, self.time + r.time)
92
+
93
+
94
class AllTimings:
    """Collects node timings grouped by phase name.

    Attributes:
        phase_to_timings: Maps a phase name to the list of timings added for
            it, in insertion order.
    """

    def __init__(self) -> None:
        """Start with no recorded timings."""
        self.phase_to_timings = {}

    def add_node_timing(self, node_timing: "NodeTiming") -> None:
        """Append ``node_timing`` to the list kept for its phase."""
        self.phase_to_timings.setdefault(node_timing.phase, []).append(node_timing)

    def get_phase_timings(self, phase: str) -> "list[NodeTiming]":
        """Return the timings recorded for ``phase``; raises ``KeyError`` if unknown."""
        return self.phase_to_timings[phase]

    def get_summary_phase_timings(self, phase: str) -> "NodeTiming":
        """Fold all timings of ``phase`` into one via ``combine_timing``.

        With a single recorded timing that very object is returned (no copy).
        Raises ``KeyError`` if the phase is unknown.
        """
        return reduce(lambda acc, nxt: acc.combine_timing(nxt), self.phase_to_timings[phase])

    def get_phases(self) -> list[str]:
        """Return the phase names ordered from largest to smallest summed time."""
        phases = list(self.phase_to_timings)
        # Ascending sort followed by reverse (rather than reverse=True) keeps
        # the historical ordering among phases with equal times.
        phases.sort(key=lambda name: self.get_summary_phase_timings(name).time)
        phases.reverse()
        return phases

    def get_sum_of_all_timings(self) -> float:
        """Return the sum of the summarized time of every phase (``0`` when empty)."""
        return sum(self.get_summary_phase_timings(phase).time for phase in self.phase_to_timings)
121
+
122
+
123
def open_utf8(fpath: str, flags: str) -> object:
    """Open ``fpath`` with mode ``flags`` using UTF-8 and return the file object."""
    target = Path(fpath)
    return target.open(mode=flags, encoding="utf8")
125
+
126
+
127
def get_child_timings(top_node: object, query_timings: object) -> None:
    """Record the timing of ``top_node`` and, recursively, of all its children.

    Args:
        top_node: A profile node; its ``"operator_type"``, ``"operator_timing"``
            and ``"children"`` entries are read.
        query_timings: Collector whose ``add_node_timing`` receives one
            ``NodeTiming`` per visited node, parents before children.

    Nothing is returned; results accumulate in ``query_timings``.
    """
    node_timing = NodeTiming(top_node["operator_type"], float(top_node["operator_timing"]))
    query_timings.add_node_timing(node_timing)
    for child in top_node["children"]:
        get_child_timings(child, query_timings)
132
+
133
+
134
def get_pink_shade_hex(fraction: float) -> str:
    """Return a ``#rrggbb`` colour between near-white pink and deep pink.

    ``fraction`` is clamped to ``[0, 1]``; 0 gives the lightest shade and 1 the
    darkest. Each channel is linearly interpolated and truncated to an int.
    """
    weight = max(0, min(1, fraction))

    pale = (255, 250, 250)  # very light pink (almost white)
    deep = (255, 20, 147)  # dark pink

    channels = [int(lo + (hi - lo) * weight) for lo, hi in zip(pale, deep)]
    return "#" + "".join(f"{channel:02x}" for channel in channels)
148
+
149
+
150
def get_node_body(name: str, result: str, cpu_time: float, card: int, est: int, width: int, extra_info: str) -> str:
    """Build the HTML for one operator node of the query tree.

    Args:
        name: Operator type; ``"INVALID"`` is displayed as ``BRIDGE`` and
            underscores are shown as spaces.
        result: The operator's timing (anything ``float()`` accepts).
        cpu_time: Total CPU time used to scale the background shade.
        card: Operator cardinality, shown only when ``width > 0``.
        est: Estimated cardinality, shown only when ``width > 0``.
        width: Average row width in bytes.
        extra_info: HTML fragment placed in the hover tooltip as-is.

    Returns:
        A ``<span>`` element as a string.
    """
    elapsed = float(result)
    # A profile can report cpu_time == 0; fall back to the lightest shade
    # rather than failing with ZeroDivisionError.
    share = elapsed / cpu_time if cpu_time else 0.0
    node_style = f"background-color: {get_pink_shade_hex(share)};"
    new_name = "BRIDGE" if name == "INVALID" else name.replace("_", " ")

    parts = [
        f'<span class="tf-nc custom-tooltip" style="{node_style}">',
        '<div class="node-body">',
        f"<p><b>{new_name}</b> </p><p>time: {elapsed:.4f} seconds</p>",
        f'<span class="tooltip-text"> {extra_info} </span>',
    ]
    if width > 0:
        parts.append(f"<p>cardinality: {card}</p>")
        parts.append(f"<p>estimate: {est}</p>")
        parts.append(f"<p>width: {width} bytes</p>")
    # TODO: Expand on timing. Usually available from a detailed profiling # noqa: TD002, TD003
    parts.append("</div>")
    parts.append("</span>")
    return "".join(parts)
167
+
168
+
169
def generate_tree_recursive(json_graph: object, cpu_time: float) -> str:
    """Render a profile node and all of its descendants as nested ``<li>`` HTML.

    Args:
        json_graph: A profile node; its ``"extra_info"``, ``"operator_type"``,
            ``"operator_timing"``, ``"operator_cardinality"``,
            ``"result_set_size"`` and ``"children"`` entries are read.
        cpu_time: Total CPU time, forwarded to every node for shading.

    Returns:
        ``<li>...</li>`` containing the node body and, when the node has
        children, a ``<ul>`` with one recursive entry per child.
    """
    import html

    estimate = 0
    info_lines = []
    for key, value in json_graph["extra_info"].items():
        if key == "Estimated Cardinality":
            estimate = int(value)
        else:
            # Profile text (e.g. filter expressions such as "a<5") is not HTML;
            # escape it so it cannot be parsed as markup inside the tooltip.
            safe_key = html.escape(str(key), quote=False)
            safe_value = html.escape(str(value), quote=False)
            info_lines.append(f"{safe_key}: {safe_value} <br>")
    extra_info = "".join(info_lines)

    cardinality = json_graph["operator_cardinality"]
    # Average bytes per row; max(1, ...) avoids dividing by a zero cardinality.
    width = int(json_graph["result_set_size"] / max(1, cardinality))

    # get rid of some typically long names
    extra_info = re.sub(r"__internal_\s*", "__", extra_info)
    extra_info = re.sub(r"compress_integral\s*", "compress", extra_info)

    node_body = get_node_body(
        json_graph["operator_type"],
        json_graph["operator_timing"],
        cpu_time,
        cardinality,
        estimate,
        width,
        re.sub(r",\s*", ", ", extra_info),
    )

    children = json_graph["children"]
    children_html = ""
    if children:
        rendered = "".join(generate_tree_recursive(child, cpu_time) for child in children)
        children_html = f"<ul>{rendered}</ul>"
    return "<li>" + node_body + children_html + "</li>"
205
+
206
+
207
+ # For generating the table in the top left.
208
def generate_timing_html(graph_json: object, query_timings: object) -> str:
    """Build the HTML timing table (phase, time, percentage) for a profile.

    Args:
        graph_json: The profile as a JSON document (passed to ``json.loads``).
        query_timings: Collector that is filled with every operator timing and
            additionally receives the ``"TOTAL TIME"`` and ``"Execution Time"``
            summary entries (it is mutated by this call).

    Returns:
        A complete ``<table class="styled-table">`` element as a string. The
        two summary rows come first, followed by the phases ordered from
        slowest to fastest.
    """
    json_graph = json.loads(graph_json)
    gather_timing_information(json_graph, query_timings)
    # Prefer the root's operator_timing, fall back to latency; a profile with
    # neither yields 0 instead of a TypeError from float(None).
    total_time = float(json_graph.get("operator_timing") or json_graph.get("latency") or 0)
    table_head = """
<table class="styled-table">
<thead>
<tr>
<th>Phase</th>
<th>Time</th>
<th>Percentage</th>
</tr>
</thead>"""

    execution_time = query_timings.get_sum_of_all_timings()

    # Phases are read before the summary rows are registered so that the
    # summaries are not sorted in among the operators.
    operator_phases = query_timings.get_phases()
    summary_phases = ("TOTAL TIME", "Execution Time")
    query_timings.add_node_timing(NodeTiming("TOTAL TIME", total_time))
    query_timings.add_node_timing(NodeTiming("Execution Time", execution_time))

    rows = []
    for phase in [*summary_phases, *operator_phases]:
        summarized_phase = query_timings.get_summary_phase_timings(phase)
        if total_time:
            summarized_phase.calculate_percentage(total_time)
        phase_column = f"<b>{phase}</b>" if phase in summary_phases else phase
        # Fixed-point formatting: slicing str(float) mangles values that repr
        # prints in scientific notation (e.g. "1.2e-07" -> "1.2e-0").
        percent = f"{summarized_phase.percentage * 100:.3f}"
        rows.append(f"""
<tr>
<td>{phase_column}</td>
<td>{summarized_phase.time}</td>
<td>{percent}%</td>
</tr>
""")
    return table_head + "<tbody>" + "".join(rows) + "</tbody></table>"
244
+
245
+
246
def generate_tree_html(graph_json: object) -> str:
    """Return the treeflex ``<div>`` holding the operator tree of a JSON profile.

    The profile's top level is treated as a general overview; rendering starts
    at its first child, shaded relative to the profile's ``"cpu_time"``.
    """
    profile = json.loads(graph_json)
    total_cpu = float(profile["cpu_time"])
    # first level of json is general overview
    # TODO: make sure json output first level always has only 1 level # noqa: TD002, TD003
    root_operator = profile["children"][0]
    rendered = generate_tree_recursive(root_operator, total_cpu)
    return "".join(('<div class="tf-tree tf-gap-sm"> \n <ul>', rendered, "</ul> </div>"))
255
+
256
+
257
def generate_ipython(json_input: str) -> object:
    """Return an ``IPython`` ``HTML`` object showing the profile in a notebook.

    The previous implementation called an undefined ``generate_html`` and read
    a ``"css"`` key that no helper produces, so it could only raise. It now
    uses the same helpers as ``translate_json_to_html``.

    Args:
        json_input: The profile as a JSON document string.

    Returns:
        ``IPython.core.display.HTML`` wrapping the stylesheets, the timing
        table and the operator tree.
    """
    from IPython.core.display import HTML

    html_output = generate_style_html(json_input, False)
    timing_table = generate_timing_html(json_input, AllTimings())
    tree_output = generate_tree_html(json_input)

    return HTML(
        (
            "\n ${CSS}\n ${LIBRARIES}\n"
            ' <div class="chart" id="query-profile">${TIMING_TABLE}</div>\n'
            " ${TREE}\n ${CHART_SCRIPT}\n "
        )
        # duckdb_css already carries its own <style> element.
        .replace("${CSS}", html_output["treeflex_css"] + html_output["duckdb_css"])
        .replace("${CHART_SCRIPT}", html_output["chart_script"])
        .replace("${LIBRARIES}", html_output["libraries"])
        .replace("${TIMING_TABLE}", timing_table)
        .replace("${TREE}", tree_output)
    )
268
+
269
+
270
def generate_style_html(graph_json: str, include_meta_info: bool) -> dict[str, str]:  # noqa: FBT001
    """Return the style-related HTML snippets for a query-graph page.

    Both arguments are currently unused; they are kept for interface
    compatibility.

    Returns:
        A dict with four string entries: ``"treeflex_css"`` (a ``<link>`` to
        the treeflex stylesheet), ``"duckdb_css"`` (``qgraph_css`` wrapped in
        its own ``<style>`` element), and ``"libraries"`` / ``"chart_script"``
        (both empty).
    """
    treeflex_css = '<link rel="stylesheet" href="https://unpkg.com/treeflex/dist/css/treeflex.css">\n'
    css = f"<style>\n{qgraph_css}\n</style>\n"
    return {"treeflex_css": treeflex_css, "duckdb_css": css, "libraries": "", "chart_script": ""}
276
+
277
+
278
def gather_timing_information(json: dict, query_timings: object) -> None:
    """Collect the timing of every operator below the profile's first child.

    Args:
        json: The already-parsed profile (a mapping, not a JSON string); only
            ``json["children"][0]`` is read. The parameter name shadows the
            ``json`` module inside this function and is kept for
            compatibility with keyword callers.
        query_timings: Collector that receives one timing per operator.
    """
    # add up all of the times
    # measure each time as a percentage of the total time.
    # then you can return a list of [phase, time, percentage]
    get_child_timings(json["children"][0], query_timings)
283
+
284
+
285
def translate_json_to_html(input_file: str, output_file: str) -> None:
    """Read a JSON profile and write a standalone HTML page visualising it.

    Args:
        input_file: Path of the JSON profile to read (UTF-8).
        output_file: Path of the HTML file to create or overwrite (UTF-8).
    """
    query_timings = AllTimings()
    with open_utf8(input_file, "r") as f:
        text = f.read()

    html_output = generate_style_html(text, True)
    timing_table = generate_timing_html(text, query_timings)
    tree_output = generate_tree_html(text)

    # ${DUCKDB_CSS} already contains its own <style>...</style> element, so the
    # template must not wrap it in a second one: nesting made the first CSS
    # rule invalid and left a stray </style> in the head.
    html = """<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width">
<title>Query Profile Graph for Query</title>
${TREEFLEX_CSS}
${DUCKDB_CSS}
</head>
<body>
<div id="meta-info"></div>
<div class="chart" id="query-profile">
${TIMING_TABLE}
</div>
${TREE}
</body>
</html>
"""
    html = html.replace("${TREEFLEX_CSS}", html_output["treeflex_css"])
    html = html.replace("${DUCKDB_CSS}", html_output["duckdb_css"])
    html = html.replace("${TIMING_TABLE}", timing_table)
    html = html.replace("${TREE}", tree_output)

    # finally create and write the html
    with open_utf8(output_file, "w+") as f:
        f.write(html)
321
+
322
+
323
def main() -> None:
    """Command-line entry point: turn a JSON profile into an HTML query graph.

    Usage: ``profile_input [--out FILE.html] [--open | --no-open]``.
    Without ``--out`` the output name is derived from the input by replacing
    its last ``.json`` with ``.html``. The result is opened in a browser
    unless ``--no-open`` is given. Exits with status 1 on invalid names.
    """
    parser = argparse.ArgumentParser(
        prog="Query Graph Generator",
        description=(
            "Given a json profile output, generate a html file showing the query graph and timings of operators"
        ),
    )
    parser.add_argument("profile_input", help="profile input in json")
    parser.add_argument("--out", required=False, default=False)
    # BooleanOptionalAction keeps `--open` (still the default) and adds
    # `--no-open`; with store_true and default=True the flag could never be
    # turned off.
    parser.add_argument("--open", action=argparse.BooleanOptionalAction, default=True)
    args = parser.parse_args()

    profile_path = args.profile_input
    if args.out:
        if ".html" not in args.out:
            print("please provide valid .html file for output name")
            raise SystemExit(1)
        output = args.out
    else:
        # Swap only the last ".json" so directory names that happen to
        # contain ".json" are left alone.
        head, found, tail = profile_path.rpartition(".json")
        if not found:
            print("please provide profile output in json")
            raise SystemExit(1)
        output = f"{head}.html{tail}"

    translate_json_to_html(profile_path, output)

    if args.open:
        # as_uri() yields a correctly escaped file:// URL on every platform.
        webbrowser.open(Path(output).resolve().as_uri(), new=2)
356
+
357
+ if __name__ == "__main__":
358
+ main()
@@ -0,0 +1,63 @@
1
+ """DuckDB's SQL types."""
2
+
3
+ from _duckdb._sqltypes import (
4
+ BIGINT,
5
+ BIT,
6
+ BLOB,
7
+ BOOLEAN,
8
+ DATE,
9
+ DOUBLE,
10
+ FLOAT,
11
+ HUGEINT,
12
+ INTEGER,
13
+ INTERVAL,
14
+ SMALLINT,
15
+ SQLNULL,
16
+ TIME,
17
+ TIME_TZ,
18
+ TIMESTAMP,
19
+ TIMESTAMP_MS,
20
+ TIMESTAMP_NS,
21
+ TIMESTAMP_S,
22
+ TIMESTAMP_TZ,
23
+ TINYINT,
24
+ UBIGINT,
25
+ UHUGEINT,
26
+ UINTEGER,
27
+ USMALLINT,
28
+ UTINYINT,
29
+ UUID,
30
+ VARCHAR,
31
+ DuckDBPyType,
32
+ )
33
+
34
+ __all__ = [
35
+ "BIGINT",
36
+ "BIT",
37
+ "BLOB",
38
+ "BOOLEAN",
39
+ "DATE",
40
+ "DOUBLE",
41
+ "FLOAT",
42
+ "HUGEINT",
43
+ "INTEGER",
44
+ "INTERVAL",
45
+ "SMALLINT",
46
+ "SQLNULL",
47
+ "TIME",
48
+ "TIMESTAMP",
49
+ "TIMESTAMP_MS",
50
+ "TIMESTAMP_NS",
51
+ "TIMESTAMP_S",
52
+ "TIMESTAMP_TZ",
53
+ "TIME_TZ",
54
+ "TINYINT",
55
+ "UBIGINT",
56
+ "UHUGEINT",
57
+ "UINTEGER",
58
+ "USMALLINT",
59
+ "UTINYINT",
60
+ "UUID",
61
+ "VARCHAR",
62
+ "DuckDBPyType",
63
+ ]
@@ -0,0 +1,71 @@
1
+ """DuckDB's SQL types. DEPRECATED. Please use `duckdb.sqltypes` instead."""
2
+
3
+ import warnings
4
+
5
+ from duckdb.sqltypes import (
6
+ BIGINT,
7
+ BIT,
8
+ BLOB,
9
+ BOOLEAN,
10
+ DATE,
11
+ DOUBLE,
12
+ FLOAT,
13
+ HUGEINT,
14
+ INTEGER,
15
+ INTERVAL,
16
+ SMALLINT,
17
+ SQLNULL,
18
+ TIME,
19
+ TIME_TZ,
20
+ TIMESTAMP,
21
+ TIMESTAMP_MS,
22
+ TIMESTAMP_NS,
23
+ TIMESTAMP_S,
24
+ TIMESTAMP_TZ,
25
+ TINYINT,
26
+ UBIGINT,
27
+ UHUGEINT,
28
+ UINTEGER,
29
+ USMALLINT,
30
+ UTINYINT,
31
+ UUID,
32
+ VARCHAR,
33
+ DuckDBPyType,
34
+ )
35
+
36
+ __all__ = [
37
+ "BIGINT",
38
+ "BIT",
39
+ "BLOB",
40
+ "BOOLEAN",
41
+ "DATE",
42
+ "DOUBLE",
43
+ "FLOAT",
44
+ "HUGEINT",
45
+ "INTEGER",
46
+ "INTERVAL",
47
+ "SMALLINT",
48
+ "SQLNULL",
49
+ "TIME",
50
+ "TIMESTAMP",
51
+ "TIMESTAMP_MS",
52
+ "TIMESTAMP_NS",
53
+ "TIMESTAMP_S",
54
+ "TIMESTAMP_TZ",
55
+ "TIME_TZ",
56
+ "TINYINT",
57
+ "UBIGINT",
58
+ "UHUGEINT",
59
+ "UINTEGER",
60
+ "USMALLINT",
61
+ "UTINYINT",
62
+ "UUID",
63
+ "VARCHAR",
64
+ "DuckDBPyType",
65
+ ]
66
+
67
+ warnings.warn(
68
+ "`duckdb.typing` is deprecated and will be removed in a future version. Please use `duckdb.sqltypes` instead.",
69
+ DeprecationWarning,
70
+ stacklevel=2,
71
+ )
duckdb/udf.py ADDED
@@ -0,0 +1,24 @@
1
+ # ruff: noqa: D100
2
+ import typing
3
+
4
+
5
def vectorized(func: typing.Callable[..., typing.Any]) -> typing.Callable[..., typing.Any]:
    """Decorate a function with annotated function parameters.

    This allows DuckDB to infer that the function should be provided with pyarrow arrays and should expect
    pyarrow array(s) as output.

    The returned object is a new function sharing ``func``'s code, globals,
    defaults and closure; every parameter is annotated as
    ``pyarrow.lib.ChunkedArray`` and any previous annotations are discarded.
    Requires ``pyarrow`` to be importable.
    """
    import types
    from inspect import signature

    import pyarrow as pa

    new_func = types.FunctionType(func.__code__, func.__globals__, func.__name__, func.__defaults__, func.__closure__)
    # FunctionType() has no parameter for keyword-only defaults and resets the
    # qualified name, so carry those over explicitly. __dict__ is deliberately
    # not copied: a "__wrapped__" entry would make inspect.signature() bypass
    # the annotations set below.
    new_func.__kwdefaults__ = dict(func.__kwdefaults__) if func.__kwdefaults__ else None
    new_func.__qualname__ = func.__qualname__
    new_func.__doc__ = func.__doc__

    # Construct the annotations:
    new_func.__annotations__ = {param: pa.lib.ChunkedArray for param in signature(func).parameters}
    return new_func
@@ -0,0 +1 @@
1
+ # noqa: D104