duckdb 1.5.0.dev37__cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of duckdb might be problematic. Click here for more details.

Files changed (47)
  1. _duckdb.cpython-314-aarch64-linux-gnu.so +0 -0
  2. duckdb/__init__.py +475 -0
  3. duckdb/__init__.pyi +713 -0
  4. duckdb/bytes_io_wrapper.py +66 -0
  5. duckdb/experimental/__init__.py +2 -0
  6. duckdb/experimental/spark/LICENSE +260 -0
  7. duckdb/experimental/spark/__init__.py +7 -0
  8. duckdb/experimental/spark/_globals.py +77 -0
  9. duckdb/experimental/spark/_typing.py +48 -0
  10. duckdb/experimental/spark/conf.py +45 -0
  11. duckdb/experimental/spark/context.py +164 -0
  12. duckdb/experimental/spark/errors/__init__.py +72 -0
  13. duckdb/experimental/spark/errors/error_classes.py +918 -0
  14. duckdb/experimental/spark/errors/exceptions/__init__.py +16 -0
  15. duckdb/experimental/spark/errors/exceptions/base.py +217 -0
  16. duckdb/experimental/spark/errors/utils.py +116 -0
  17. duckdb/experimental/spark/exception.py +15 -0
  18. duckdb/experimental/spark/sql/__init__.py +7 -0
  19. duckdb/experimental/spark/sql/_typing.py +93 -0
  20. duckdb/experimental/spark/sql/catalog.py +78 -0
  21. duckdb/experimental/spark/sql/column.py +368 -0
  22. duckdb/experimental/spark/sql/conf.py +23 -0
  23. duckdb/experimental/spark/sql/dataframe.py +1437 -0
  24. duckdb/experimental/spark/sql/functions.py +6221 -0
  25. duckdb/experimental/spark/sql/group.py +420 -0
  26. duckdb/experimental/spark/sql/readwriter.py +449 -0
  27. duckdb/experimental/spark/sql/session.py +292 -0
  28. duckdb/experimental/spark/sql/streaming.py +37 -0
  29. duckdb/experimental/spark/sql/type_utils.py +105 -0
  30. duckdb/experimental/spark/sql/types.py +1275 -0
  31. duckdb/experimental/spark/sql/udf.py +37 -0
  32. duckdb/filesystem.py +23 -0
  33. duckdb/functional/__init__.py +17 -0
  34. duckdb/functional/__init__.pyi +31 -0
  35. duckdb/polars_io.py +237 -0
  36. duckdb/query_graph/__main__.py +363 -0
  37. duckdb/typing/__init__.py +61 -0
  38. duckdb/typing/__init__.pyi +36 -0
  39. duckdb/udf.py +19 -0
  40. duckdb/value/__init__.py +0 -0
  41. duckdb/value/__init__.pyi +0 -0
  42. duckdb/value/constant/__init__.py +268 -0
  43. duckdb/value/constant/__init__.pyi +115 -0
  44. duckdb-1.5.0.dev37.dist-info/METADATA +80 -0
  45. duckdb-1.5.0.dev37.dist-info/RECORD +47 -0
  46. duckdb-1.5.0.dev37.dist-info/WHEEL +6 -0
  47. duckdb-1.5.0.dev37.dist-info/licenses/LICENSE +7 -0
duckdb/query_graph/__main__.py ADDED
@@ -0,0 +1,363 @@
1
+ import json
2
+ import os
3
+ import sys
4
+ import re
5
+ import webbrowser
6
+ from functools import reduce
7
+ import argparse
8
+
9
+ qgraph_css = """
10
+ .styled-table {
11
+ border-collapse: collapse;
12
+ margin: 25px 0;
13
+ font-size: 0.9em;
14
+ font-family: sans-serif;
15
+ min-width: 400px;
16
+ box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
17
+ }
18
+ .styled-table thead tr {
19
+ background-color: #009879;
20
+ color: #ffffff;
21
+ text-align: left;
22
+ }
23
+ .styled-table th,
24
+ .styled-table td {
25
+ padding: 12px 15px;
26
+ }
27
+ .styled-table tbody tr {
28
+ border-bottom: 1px solid #dddddd;
29
+ }
30
+
31
+ .styled-table tbody tr:nth-of-type(even) {
32
+ background-color: #f3f3f3;
33
+ }
34
+
35
+ .styled-table tbody tr:last-of-type {
36
+ border-bottom: 2px solid #009879;
37
+ }
38
+
39
+ .node-body {
40
+ font-size:15px;
41
+ }
42
+ .tf-nc {
43
+ position: relative;
44
+ width: 180px;
45
+ text-align: center;
46
+ background-color: #fff100;
47
+ }
48
+ .custom-tooltip {
49
+ position: relative;
50
+ display: inline-block;
51
+ }
52
+
53
+ .tooltip-text {
54
+ visibility: hidden;
55
+ background-color: #333;
56
+ color: #fff;
57
+ text-align: center;
58
+ padding: 0px;
59
+ border-radius: 1px;
60
+
61
+ /* Positioning */
62
+ position: absolute;
63
+ z-index: 1;
64
+ bottom: 100%;
65
+ left: 50%;
66
+ transform: translateX(-50%);
67
+ margin-bottom: 8px;
68
+
69
+ /* Tooltip Arrow */
70
+ width: 400px;
71
+ }
72
+
73
+ .custom-tooltip:hover .tooltip-text {
74
+ visibility: visible;
75
+ }
76
+ """
77
+
78
+
79
class NodeTiming:
    """Time spent in one operator, or the combined time of one phase.

    Instances are created per profile node and can be folded together with
    ``combine_timing`` to obtain a per-phase total.
    """

    def __init__(self, phase: str, time: float) -> None:
        # Name of the phase (operator type) this timing belongs to.
        self.phase = phase
        # Time attributed to the phase, as reported by the profile.
        self.time = time
        # percentage is determined later (see calculate_percentage); despite
        # the name it holds a fraction, the caller multiplies by 100.
        self.percentage = 0

    def calculate_percentage(self, total_time: float) -> None:
        """Store this timing's share of ``total_time`` in ``self.percentage``.

        A total of zero yields a share of 0 instead of raising
        ``ZeroDivisionError``, so profiles of trivially fast queries can
        still be rendered.
        """
        self.percentage = self.time / total_time if total_time else 0

    @staticmethod
    def combine_timing(l: "NodeTiming", r: "NodeTiming") -> "NodeTiming":
        """Return a new timing for ``l.phase`` whose time is ``l.time + r.time``.

        Declared static because it is used as a plain two-argument function
        (e.g. with ``functools.reduce``) and takes no ``self``.
        """
        # TODO: can only add timings for same-phase nodes
        return NodeTiming(l.phase, l.time + r.time)
94
+
95
+
96
class AllTimings:
    """Registry of NodeTiming objects, grouped by their phase name."""

    def __init__(self):
        # phase name -> list of NodeTiming objects recorded for that phase
        self.phase_to_timings = {}

    def add_node_timing(self, node_timing: NodeTiming):
        """Record ``node_timing`` under its own phase name."""
        bucket = self.phase_to_timings.setdefault(node_timing.phase, [])
        bucket.append(node_timing)

    def get_phase_timings(self, phase: str):
        """Return the list of timings recorded for ``phase``."""
        return self.phase_to_timings[phase]

    def get_summary_phase_timings(self, phase: str):
        """Fold all timings of ``phase`` into one NodeTiming."""
        recorded = self.phase_to_timings[phase]
        return reduce(NodeTiming.combine_timing, recorded)

    def get_phases(self):
        """Return the phase names ordered from slowest to fastest."""

        def summed_time(phase_name):
            return self.get_summary_phase_timings(phase_name).time

        # Sort ascending and then flip, so phases with equal totals keep the
        # same relative order as a sort followed by an in-place reverse.
        slowest_last = sorted(self.phase_to_timings, key=summed_time)
        return slowest_last[::-1]

    def get_sum_of_all_timings(self):
        """Return the sum of the summarized time of every recorded phase."""
        return sum(
            self.get_summary_phase_timings(phase_name).time
            for phase_name in self.phase_to_timings
        )
124
+
125
+
126
def open_utf8(fpath: str, flags: str) -> object:
    """Open ``fpath`` with mode ``flags``, always using the utf8 encoding."""
    handle = open(fpath, mode=flags, encoding="utf8")
    return handle
128
+
129
+
130
def get_child_timings(top_node: object, query_timings: object) -> None:
    """Record the timing of ``top_node`` and of every node below it.

    Walks the tree depth-first (node first, then its children in order) and
    adds one NodeTiming per node to ``query_timings``.

    Args:
        top_node: mapping with the keys ``operator_type``,
            ``operator_timing`` and ``children`` (a list of nodes of the
            same shape).
        query_timings: collector exposing ``add_node_timing``.

    Returns:
        None; the result is accumulated in ``query_timings``.
    """
    node_timing = NodeTiming(top_node['operator_type'], float(top_node['operator_timing']))
    query_timings.add_node_timing(node_timing)
    for child in top_node['children']:
        get_child_timings(child, query_timings)
135
+
136
+
137
def get_pink_shade_hex(fraction: float):
    """Map ``fraction`` to a ``#rrggbb`` colour between light and dark pink.

    ``fraction`` is clamped to the range 0..1; 0 gives the very light pink
    end of the gradient and 1 gives the dark pink end.
    """
    fraction = max(0, min(1, fraction))

    gradient_start = (255, 250, 250)  # very light pink, almost white
    gradient_end = (255, 20, 147)  # dark pink

    # Linear interpolation per colour channel, truncated to an integer.
    channels = [
        int(start + (end - start) * fraction)
        for start, end in zip(gradient_start, gradient_end)
    ]
    return "#" + "".join(f"{channel:02x}" for channel in channels)
151
+
152
def get_node_body(name: str, result: str, cpu_time: float, card: int, est: int, width: int, extra_info: str) -> str:
    """Build the HTML for a single operator node of the query tree.

    Args:
        name: operator type; ``"INVALID"`` is displayed as ``BRIDGE`` and
            underscores are shown as spaces.
        result: the operator's timing (anything ``float()`` accepts).
        cpu_time: total time used to scale the node's background shade.
        card: cardinality to display.
        est: estimated cardinality to display.
        width: bytes per row; cardinality, estimate and width are only
            shown when this is greater than 0.
        extra_info: HTML fragment placed in the hover tooltip. It is
            inserted verbatim, so it must already be valid HTML.

    Returns:
        The node markup as a string.
    """
    operator_time = float(result)
    # A total of 0 would make the division below fail; fall back to the
    # lightest shade so such profiles still render.
    time_fraction = operator_time / cpu_time if cpu_time else 0
    node_style = f"background-color: {get_pink_shade_hex(time_fraction)};"

    new_name = "BRIDGE" if (name == "INVALID") else name.replace("_", " ")
    formatted_num = f"{operator_time:.4f}"

    parts = [
        f"<span class=\"tf-nc custom-tooltip\" style=\"{node_style}\">",
        "<div class=\"node-body\">",
        f"<p><b>{new_name}</b> </p><p>time: {formatted_num} seconds</p>",
        f"<span class=\"tooltip-text\"> {extra_info} </span>",
    ]
    if width > 0:
        parts.append(f"<p>cardinality: {card}</p>")
        parts.append(f"<p>estimate: {est}</p>")
        parts.append(f"<p>width: {width} bytes</p>")
    # TODO: Expand on timing. Usually available from a detailed profiling
    parts.append("</div>")
    parts.append("</span>")
    return "".join(parts)
169
+
170
+
171
def generate_tree_recursive(json_graph: object, cpu_time: float) -> str:
    """Render ``json_graph`` and all of its descendants as nested ``<li>`` markup.

    The node's ``extra_info`` entries become the tooltip text, except
    ``Estimated Cardinality`` which is passed on as the node's estimate.
    Children, when present, are rendered inside a ``<ul>`` after the node.
    """
    info = json_graph['extra_info']
    estimate = 0
    tooltip_lines = []
    for key in info:
        value = info[key]
        if key == "Estimated Cardinality":
            estimate = int(value)
            continue
        tooltip_lines.append(f"{key}: {value} <br>")
    extra_info = "".join(tooltip_lines)

    cardinality = json_graph["operator_cardinality"]
    # Average bytes per row; max() guards against a cardinality of 0.
    width = int(json_graph["result_set_size"] / max(1, cardinality))

    # get rid of some typically long names
    for pattern, replacement in (
        (r"__internal_\s*", "__"),
        (r"compress_integral\s*", "compress"),
    ):
        extra_info = re.sub(pattern, replacement, extra_info)
    # Normalize the whitespace that follows each comma.
    extra_info = re.sub(r",\s*", ", ", extra_info)

    node_body = get_node_body(
        json_graph["operator_type"],
        json_graph["operator_timing"],
        cpu_time,
        cardinality,
        estimate,
        width,
        extra_info,
    )

    children = json_graph['children']
    children_html = ""
    if len(children) > 0:
        rendered = [generate_tree_recursive(child, cpu_time) for child in children]
        children_html = "<ul>" + "".join(rendered) + "</ul>"

    return "<li>" + node_body + children_html + "</li>"
202
+
203
+
204
# For generating the table in the top left.
def generate_timing_html(graph_json: object, query_timings: object) -> str:
    """Build the HTML table listing the time and share of every phase.

    Args:
        graph_json: the profile as JSON text.
        query_timings: collector that is filled with the timing of every
            operator; the synthetic phases ``TOTAL TIME`` and
            ``Execution Time`` are added to it as a side effect.

    Returns:
        The ``<table>`` markup. The first two rows are ``TOTAL TIME`` and
        ``Execution Time``, followed by the phases from slowest to fastest.
    """
    json_graph = json.loads(graph_json)
    gather_timing_information(json_graph, query_timings)
    # The total is read from 'operator_timing' and, when that is missing or
    # zero, from 'latency'.
    total_time = float(json_graph.get('operator_timing') or json_graph.get('latency'))
    table_head = """
    <table class=\"styled-table\">
        <thead>
            <tr>
                <th>Phase</th>
                <th>Time</th>
                <th>Percentage</th>
            </tr>
        </thead>"""

    table_body = "<tbody>"
    table_end = "</tbody></table>"

    # Both values must be taken before the synthetic phases are registered,
    # otherwise they would be counted and listed as ordinary phases.
    execution_time = query_timings.get_sum_of_all_timings()
    all_phases = query_timings.get_phases()

    query_timings.add_node_timing(NodeTiming("TOTAL TIME", total_time))
    query_timings.add_node_timing(NodeTiming("Execution Time", execution_time))
    all_phases = ["TOTAL TIME", "Execution Time"] + all_phases
    for phase in all_phases:
        summarized_phase = query_timings.get_summary_phase_timings(phase)
        summarized_phase.calculate_percentage(total_time)
        phase_column = f"<b>{phase}</b>" if phase in ("TOTAL TIME", "Execution Time") else phase
        # Fixed-point formatting: slicing str(float) would cut the exponent
        # off very small values (e.g. 1.2345678e-05 shown as 1.2345).
        percentage_text = f"{summarized_phase.percentage * 100:.3f}"
        table_body += f"""
        <tr>
            <td>{phase_column}</td>
            <td>{summarized_phase.time}</td>
            <td>{percentage_text}%</td>
        </tr>
        """
    table_body += table_end
    return table_head + table_body
241
+
242
+
243
def generate_tree_html(graph_json: object) -> str:
    """Parse the JSON profile and return the operator tree as HTML.

    The tree is rendered from the first child of the top-level object and
    wrapped in the ``tf-tree`` container.
    """
    profile = json.loads(graph_json)
    total_cpu_time = float(profile['cpu_time'])
    # first level of json is general overview
    # FIXME: make sure json output first level always has only 1 level
    root_operator = profile['children'][0]
    pieces = [
        "<div class=\"tf-tree tf-gap-sm\"> \n <ul>",
        generate_tree_recursive(root_operator, total_cpu_time),
        "</ul> </div>",
    ]
    return "".join(pieces)
252
+
253
+
254
def generate_ipython(json_input: str) -> object:
    """Return an IPython ``HTML`` object for displaying in a notebook.

    The markup consists of the stylesheets, the library includes, an empty
    ``query-profile`` container and the chart script, all taken from
    ``generate_style_html``.

    Args:
        json_input: the profile as JSON text.

    Returns:
        An ``IPython.core.display.HTML`` instance.

    Raises:
        ImportError: if IPython is not installed.
    """
    from IPython.core.display import HTML

    # generate_style_html is the helper in this module that produces the
    # pieces below; it splits the CSS into 'treeflex_css' and 'duckdb_css'.
    html_output = generate_style_html(json_input, False)
    css = html_output['treeflex_css'] + html_output['duckdb_css']

    template = ("\n"
                " ${CSS}\n"
                " ${LIBRARIES}\n"
                " <div class=\"chart\" id=\"query-profile\"></div>\n"
                " ${CHART_SCRIPT}\n"
                " ")
    markup = template.replace("${CSS}", css)
    markup = markup.replace('${CHART_SCRIPT}', html_output['chart_script'])
    markup = markup.replace('${LIBRARIES}', html_output['libraries'])
    return HTML(markup)
267
+
268
+
269
def generate_style_html(graph_json: str, include_meta_info: bool) -> dict:
    """Return the style-related HTML fragments of the report.

    Args:
        graph_json: currently unused.
        include_meta_info: currently unused.

    Returns:
        A dict with the keys ``treeflex_css`` (a ``<link>`` tag for the
        treeflex stylesheet), ``duckdb_css`` (``qgraph_css`` wrapped in its
        own ``<style>`` element), ``libraries`` and ``chart_script`` (both
        empty strings).
    """
    treeflex_css = "<link rel=\"stylesheet\" href=\"https://unpkg.com/treeflex/dist/css/treeflex.css\">\n"
    css = "<style>\n" + qgraph_css + "\n" + "</style>\n"
    return {
        'treeflex_css': treeflex_css,
        'duckdb_css': css,
        'libraries': '',
        'chart_script': ''
    }
280
+
281
+
282
def gather_timing_information(json: str, query_timings: object) -> None:
    """Collect the timing of every operator into ``query_timings``.

    Collection starts at the first child of the top-level object and
    covers everything below it.
    """
    root_operator = json['children'][0]
    get_child_timings(root_operator, query_timings)
287
+
288
+
289
def translate_json_to_html(input_file: str, output_file: str) -> None:
    """Read a JSON profile and write the HTML report for it.

    Args:
        input_file: path of the JSON profile (read as utf8).
        output_file: path of the HTML file to create or overwrite.
    """
    query_timings = AllTimings()
    with open_utf8(input_file, 'r') as f:
        text = f.read()

    html_output = generate_style_html(text, True)
    timing_table = generate_timing_html(text, query_timings)
    tree_output = generate_tree_html(text)

    # ${DUCKDB_CSS} is replaced by a fragment that already carries its own
    # <style> element, so the template must not wrap it in another one: a
    # nested "<style>" is parsed as CSS text and invalidates the first rule.
    html = """<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width">
    <title>Query Profile Graph for Query</title>
    ${TREEFLEX_CSS}
    ${DUCKDB_CSS}
</head>
<body>
    <div id="meta-info"></div>
    <div class="chart" id="query-profile">
        ${TIMING_TABLE}
    </div>
    ${TREE}
</body>
</html>
"""
    html = html.replace("${TREEFLEX_CSS}", html_output['treeflex_css'])
    html = html.replace("${DUCKDB_CSS}", html_output['duckdb_css'])
    html = html.replace("${TIMING_TABLE}", timing_table)
    html = html.replace('${TREE}', tree_output)

    # finally create and write the html
    with open_utf8(output_file, "w+") as f:
        f.write(html)
325
+
326
+
327
def main() -> None:
    """Command-line entry point: turn a JSON profile into an HTML report.

    Usage: ``profile_input [--out FILE.html] [--open | --no-open]``.
    Without ``--out`` the report is written next to the input, with the
    last ``.json`` in the file name replaced by ``.html``. The report is
    opened in a web browser unless ``--no-open`` is given.

    Exits with status 1 when the input name contains no ``.json`` or the
    requested output name contains no ``.html``.
    """
    if sys.version_info[0] < 3:
        print("Please use python3")
        sys.exit(1)
    parser = argparse.ArgumentParser(
        prog='Query Graph Generator',
        description='Given a json profile output, generate a html file showing the query graph and timings of operators')
    parser.add_argument('profile_input', help='profile input in json')
    parser.add_argument('--out', required=False, default=False)
    # --open stays accepted (and remains the default); --no-open is its
    # counterpart, since a store_true flag that defaults to True can never
    # be turned off on its own.
    parser.add_argument('--open', dest='open', required=False, action='store_true', default=True,
                        help='open the generated html in a web browser (default)')
    parser.add_argument('--no-open', dest='open', action='store_false',
                        help='do not open the generated html in a web browser')
    args = parser.parse_args()

    profile_path = args.profile_input
    if not args.out:
        # Only the file name is rewritten, and only its last ".json", so a
        # directory that happens to contain ".json" is left alone.
        filename = os.path.basename(profile_path)
        if ".json" not in filename:
            print("please provide profile output in json")
            sys.exit(1)
        stem, _, remainder = filename.rpartition(".json")
        directory_prefix = profile_path[:len(profile_path) - len(filename)]
        output_path = directory_prefix + stem + ".html" + remainder
    else:
        if ".html" not in args.out:
            print("please provide valid .html file for output name")
            sys.exit(1)
        output_path = args.out

    translate_json_to_html(profile_path, output_path)

    if args.open:
        webbrowser.open('file://' + os.path.abspath(output_path), new=2)
360
+
361
+
362
+ if __name__ == '__main__':
363
+ main()
duckdb/typing/__init__.py ADDED
@@ -0,0 +1,61 @@
1
+ from _duckdb.typing import (
2
+ DuckDBPyType,
3
+ BIGINT,
4
+ BIT,
5
+ BLOB,
6
+ BOOLEAN,
7
+ DATE,
8
+ DOUBLE,
9
+ FLOAT,
10
+ HUGEINT,
11
+ UHUGEINT,
12
+ INTEGER,
13
+ INTERVAL,
14
+ SMALLINT,
15
+ SQLNULL,
16
+ TIME,
17
+ TIMESTAMP,
18
+ TIMESTAMP_MS,
19
+ TIMESTAMP_NS,
20
+ TIMESTAMP_S,
21
+ TIMESTAMP_TZ,
22
+ TIME_TZ,
23
+ TINYINT,
24
+ UBIGINT,
25
+ UINTEGER,
26
+ USMALLINT,
27
+ UTINYINT,
28
+ UUID,
29
+ VARCHAR
30
+ )
31
+
32
+ __all__ = [
33
+ "DuckDBPyType",
34
+ "BIGINT",
35
+ "BIT",
36
+ "BLOB",
37
+ "BOOLEAN",
38
+ "DATE",
39
+ "DOUBLE",
40
+ "FLOAT",
41
+ "HUGEINT",
42
+ "UHUGEINT",
43
+ "INTEGER",
44
+ "INTERVAL",
45
+ "SMALLINT",
46
+ "SQLNULL",
47
+ "TIME",
48
+ "TIMESTAMP",
49
+ "TIMESTAMP_MS",
50
+ "TIMESTAMP_NS",
51
+ "TIMESTAMP_S",
52
+ "TIMESTAMP_TZ",
53
+ "TIME_TZ",
54
+ "TINYINT",
55
+ "UBIGINT",
56
+ "UINTEGER",
57
+ "USMALLINT",
58
+ "UTINYINT",
59
+ "UUID",
60
+ "VARCHAR"
61
+ ]
duckdb/typing/__init__.pyi ADDED
@@ -0,0 +1,36 @@
1
+ from duckdb import DuckDBPyConnection
2
+
3
+ SQLNULL: DuckDBPyType
4
+ BOOLEAN: DuckDBPyType
5
+ TINYINT: DuckDBPyType
6
+ UTINYINT: DuckDBPyType
7
+ SMALLINT: DuckDBPyType
8
+ USMALLINT: DuckDBPyType
9
+ INTEGER: DuckDBPyType
10
+ UINTEGER: DuckDBPyType
11
+ BIGINT: DuckDBPyType
12
+ UBIGINT: DuckDBPyType
13
+ HUGEINT: DuckDBPyType
14
+ UHUGEINT: DuckDBPyType
15
+ UUID: DuckDBPyType
16
+ FLOAT: DuckDBPyType
17
+ DOUBLE: DuckDBPyType
18
+ DATE: DuckDBPyType
19
+ TIMESTAMP: DuckDBPyType
20
+ TIMESTAMP_MS: DuckDBPyType
21
+ TIMESTAMP_NS: DuckDBPyType
22
+ TIMESTAMP_S: DuckDBPyType
23
+ TIME: DuckDBPyType
24
+ TIME_TZ: DuckDBPyType
25
+ TIMESTAMP_TZ: DuckDBPyType
26
+ VARCHAR: DuckDBPyType
27
+ BLOB: DuckDBPyType
28
+ BIT: DuckDBPyType
29
+ INTERVAL: DuckDBPyType
30
+
31
class DuckDBPyType:
    # Type stub: bodies are intentionally "..." and only signatures matter.
    def __init__(self, type_str: str, connection: DuckDBPyConnection = ...) -> None: ...
    def __repr__(self) -> str: ...
    def __eq__(self, other: object) -> bool: ...
    # Attribute and item lookup by name both yield another DuckDBPyType;
    # the type belongs in the return annotation, not in the body.
    def __getattr__(self, name: str) -> DuckDBPyType: ...
    def __getitem__(self, name: str) -> DuckDBPyType: ...
duckdb/udf.py ADDED
@@ -0,0 +1,19 @@
1
def vectorized(func):
    """
    Decorate a function so that every parameter is annotated as a pyarrow
    ``ChunkedArray``. DuckDB can then infer that the function should be
    provided with pyarrow arrays and should expect pyarrow array(s) as output.

    The original function is left untouched; a copy sharing its code,
    globals, defaults and closure is returned with the new annotations.

    Args:
        func: a plain Python function (it must expose ``__code__``).

    Returns:
        The re-annotated copy of ``func``.

    Raises:
        ImportError: if pyarrow is not installed.
    """
    from inspect import signature
    import types

    new_func = types.FunctionType(func.__code__, func.__globals__, func.__name__, func.__defaults__, func.__closure__)
    # The constructor call above only receives positional defaults; carry
    # over keyword-only defaults and the remaining metadata explicitly so
    # the copy can be called and introspected like the original.
    new_func.__kwdefaults__ = func.__kwdefaults__
    new_func.__doc__ = func.__doc__
    new_func.__qualname__ = func.__qualname__
    new_func.__dict__.update(func.__dict__)

    # Construct the annotations:
    import pyarrow as pa

    new_func.__annotations__ = {param: pa.lib.ChunkedArray for param in signature(func).parameters}
    return new_func
File without changes
File without changes