duckdb 1.4.1.dev113__cp311-cp311-macosx_10_9_universal2.whl → 1.5.0.dev37__cp311-cp311-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of duckdb has been flagged as potentially problematic.

Files changed (46)
  1. _duckdb.cpython-311-darwin.so +0 -0
  2. duckdb/__init__.py +374 -373
  3. duckdb/__init__.pyi +180 -604
  4. duckdb/bytes_io_wrapper.py +7 -6
  5. duckdb/experimental/__init__.py +1 -2
  6. duckdb/experimental/spark/__init__.py +4 -3
  7. duckdb/experimental/spark/_globals.py +8 -8
  8. duckdb/experimental/spark/_typing.py +9 -7
  9. duckdb/experimental/spark/conf.py +15 -16
  10. duckdb/experimental/spark/context.py +44 -60
  11. duckdb/experimental/spark/errors/__init__.py +35 -33
  12. duckdb/experimental/spark/errors/error_classes.py +1 -1
  13. duckdb/experimental/spark/errors/exceptions/__init__.py +1 -1
  14. duckdb/experimental/spark/errors/exceptions/base.py +88 -39
  15. duckdb/experimental/spark/errors/utils.py +16 -11
  16. duckdb/experimental/spark/exception.py +6 -9
  17. duckdb/experimental/spark/sql/__init__.py +5 -5
  18. duckdb/experimental/spark/sql/_typing.py +15 -8
  19. duckdb/experimental/spark/sql/catalog.py +20 -21
  20. duckdb/experimental/spark/sql/column.py +54 -47
  21. duckdb/experimental/spark/sql/conf.py +8 -9
  22. duckdb/experimental/spark/sql/dataframe.py +233 -185
  23. duckdb/experimental/spark/sql/functions.py +1248 -1222
  24. duckdb/experimental/spark/sql/group.py +52 -56
  25. duckdb/experimental/spark/sql/readwriter.py +94 -80
  26. duckdb/experimental/spark/sql/session.py +59 -64
  27. duckdb/experimental/spark/sql/streaming.py +10 -9
  28. duckdb/experimental/spark/sql/type_utils.py +64 -66
  29. duckdb/experimental/spark/sql/types.py +344 -308
  30. duckdb/experimental/spark/sql/udf.py +6 -6
  31. duckdb/filesystem.py +8 -13
  32. duckdb/functional/__init__.py +16 -2
  33. duckdb/polars_io.py +57 -66
  34. duckdb/query_graph/__main__.py +96 -91
  35. duckdb/typing/__init__.py +8 -8
  36. duckdb/typing/__init__.pyi +2 -4
  37. duckdb/udf.py +5 -10
  38. duckdb/value/__init__.py +0 -1
  39. duckdb/value/constant/__init__.py +59 -61
  40. duckdb/value/constant/__init__.pyi +4 -3
  41. duckdb-1.5.0.dev37.dist-info/METADATA +80 -0
  42. duckdb-1.5.0.dev37.dist-info/RECORD +47 -0
  43. duckdb-1.4.1.dev113.dist-info/METADATA +0 -326
  44. duckdb-1.4.1.dev113.dist-info/RECORD +0 -47
  45. {duckdb-1.4.1.dev113.dist-info → duckdb-1.5.0.dev37.dist-info}/WHEEL +0 -0
  46. {duckdb-1.4.1.dev113.dist-info → duckdb-1.5.0.dev37.dist-info}/licenses/LICENSE +0 -0
duckdb/experimental/spark/sql/session.py

@@ -1,31 +1,32 @@
-import uuid  # noqa: D100
-from collections.abc import Iterable, Sized
-from typing import TYPE_CHECKING, Any, NoReturn, Optional, Union
-
-import duckdb
+from typing import Optional, List, Any, Union, Iterable, TYPE_CHECKING
+import uuid
 
 if TYPE_CHECKING:
-    from pandas.core.frame import DataFrame as PandasDataFrame
-
     from .catalog import Catalog
+    from pandas.core.frame import DataFrame as PandasDataFrame
 
-
-from ..conf import SparkConf
-from ..context import SparkContext
-from ..errors import PySparkTypeError
 from ..exception import ContributionsAcceptedError
-from .conf import RuntimeConfig
+from .types import StructType, AtomicType, DataType
+from ..conf import SparkConf
 from .dataframe import DataFrame
+from .conf import RuntimeConfig
 from .readwriter import DataFrameReader
-from .streaming import DataStreamReader
-from .types import StructType
+from ..context import SparkContext
 from .udf import UDFRegistration
+from .streaming import DataStreamReader
+import duckdb
+
+from ..errors import (
+    PySparkTypeError,
+    PySparkValueError
+)
+
+from ..errors.error_classes import *
 
 # In spark:
 # SparkSession holds a SparkContext
 # SparkContext gets created from SparkConf
-# At this level the check is made to determine whether the instance already exists and just needs
-# to be retrieved or it needs to be created.
+# At this level the check is made to determine whether the instance already exists and just needs to be retrieved or it needs to be created
 
 # For us this is done inside of `duckdb.connect`, based on the passed in path + configuration
 # SparkContext can be compared to our Connection class, and SparkConf to our ClientContext class
@@ -33,7 +34,7 @@ from .udf import UDFRegistration
 
 # data is a List of rows
 # every value in each row needs to be turned into a Value
-def _combine_data_and_schema(data: Iterable[Any], schema: StructType) -> list[duckdb.Value]:
+def _combine_data_and_schema(data: Iterable[Any], schema: StructType):
     from duckdb import Value
 
     new_data = []
@@ -43,8 +44,8 @@ def _combine_data_and_schema(data: Iterable[Any], schema: StructType) -> list[du
     return new_data
 
 
-class SparkSession:  # noqa: D101
-    def __init__(self, context: SparkContext) -> None:  # noqa: D107
+class SparkSession:
+    def __init__(self, context: SparkContext):
         self.conn = context.connection
         self._context = context
         self._conf = RuntimeConfig(self.conn)
@@ -52,16 +53,15 @@ class SparkSession:  # noqa: D101
     def _create_dataframe(self, data: Union[Iterable[Any], "PandasDataFrame"]) -> DataFrame:
         try:
             import pandas
-
             has_pandas = True
         except ImportError:
             has_pandas = False
         if has_pandas and isinstance(data, pandas.DataFrame):
-            unique_name = f"pyspark_pandas_df_{uuid.uuid1()}"
+            unique_name = f'pyspark_pandas_df_{uuid.uuid1()}'
             self.conn.register(unique_name, data)
             return DataFrame(self.conn.sql(f'select * from "{unique_name}"'), self)
 
-        def verify_tuple_integrity(tuples: list[tuple]) -> None:
+        def verify_tuple_integrity(tuples):
            if len(tuples) <= 1:
                return
            expected_length = len(tuples[0])
@@ -73,9 +73,9 @@ class SparkSession:  # noqa: D101
                        error_class="LENGTH_SHOULD_BE_THE_SAME",
                        message_parameters={
                            "arg1": f"data{i}",
-                           "arg2": f"data{i + 1}",
+                           "arg2": f"data{i+1}",
                            "arg1_length": str(expected_length),
-                           "arg2_length": str(actual_length),
+                           "arg2_length": str(actual_length)
                        },
                    )
 
@@ -83,16 +83,16 @@ class SparkSession:  # noqa: D101
         data = list(data)
         verify_tuple_integrity(data)
 
-        def construct_query(tuples: Iterable) -> str:
-            def construct_values_list(row: Sized, start_param_idx: int) -> str:
+        def construct_query(tuples) -> str:
+            def construct_values_list(row, start_param_idx):
                 parameter_count = len(row)
-                parameters = [f"${x + start_param_idx}" for x in range(parameter_count)]
-                parameters = "(" + ", ".join(parameters) + ")"
+                parameters = [f'${x+start_param_idx}' for x in range(parameter_count)]
+                parameters = '(' + ', '.join(parameters) + ')'
                 return parameters
 
             row_size = len(tuples[0])
             values_list = [construct_values_list(x, 1 + (i * row_size)) for i, x in enumerate(tuples)]
-            values_list = ", ".join(values_list)
+            values_list = ', '.join(values_list)
 
             query = f"""
                 select * from (values {values_list})
@@ -101,7 +101,7 @@ class SparkSession:  # noqa: D101
 
         query = construct_query(data)
 
-        def construct_parameters(tuples: Iterable) -> list[list]:
+        def construct_parameters(tuples):
            parameters = []
            for row in tuples:
                parameters.extend(list(row))
@@ -112,9 +112,7 @@ class SparkSession:  # noqa: D101
         rel = self.conn.sql(query, params=parameters)
         return DataFrame(rel, self)
 
-    def _createDataFrameFromPandas(
-        self, data: "PandasDataFrame", types: Union[list[str], None], names: Union[list[str], None]
-    ) -> DataFrame:
+    def _createDataFrameFromPandas(self, data: "PandasDataFrame", types, names) -> DataFrame:
         df = self._create_dataframe(data)
 
         # Cast to types
@@ -125,10 +123,10 @@ class SparkSession:  # noqa: D101
             df = df.toDF(*names)
         return df
 
-    def createDataFrame(  # noqa: D102
+    def createDataFrame(
         self,
         data: Union["PandasDataFrame", Iterable[Any]],
-        schema: Optional[Union[StructType, list[str]]] = None,
+        schema: Optional[Union[StructType, List[str]]] = None,
         samplingRatio: Optional[float] = None,
         verifySchema: bool = True,
     ) -> DataFrame:
@@ -177,7 +175,7 @@ class SparkSession:  # noqa: D101
         if is_empty:
             rel = df.relation
             # Add impossible where clause
-            rel = rel.filter("1=0")
+            rel = rel.filter('1=0')
             df = DataFrame(rel, self)
 
         # Cast to types
@@ -188,10 +186,10 @@ class SparkSession:  # noqa: D101
             df = df.toDF(*names)
         return df
 
-    def newSession(self) -> "SparkSession":  # noqa: D102
+    def newSession(self) -> "SparkSession":
         return SparkSession(self._context)
 
-    def range(  # noqa: D102
+    def range(
         self,
         start: int,
         end: Optional[int] = None,
@@ -205,26 +203,26 @@ class SparkSession:  # noqa: D101
             end = start
             start = 0
 
-        return DataFrame(self.conn.table_function("range", parameters=[start, end, step]), self)
+        return DataFrame(self.conn.table_function("range", parameters=[start, end, step]),self)
 
-    def sql(self, sqlQuery: str, **kwargs: Any) -> DataFrame:  # noqa: D102, ANN401
+    def sql(self, sqlQuery: str, **kwargs: Any) -> DataFrame:
         if kwargs:
             raise NotImplementedError
         relation = self.conn.sql(sqlQuery)
         return DataFrame(relation, self)
 
-    def stop(self) -> None:  # noqa: D102
+    def stop(self) -> None:
        self._context.stop()
 
-    def table(self, tableName: str) -> DataFrame:  # noqa: D102
+    def table(self, tableName: str) -> DataFrame:
        relation = self.conn.table(tableName)
        return DataFrame(relation, self)
 
-    def getActiveSession(self) -> "SparkSession":  # noqa: D102
+    def getActiveSession(self) -> "SparkSession":
        return self
 
     @property
-    def catalog(self) -> "Catalog":  # noqa: D102
+    def catalog(self) -> "Catalog":
        if not hasattr(self, "_catalog"):
            from duckdb.experimental.spark.sql.catalog import Catalog
 
@@ -232,62 +230,59 @@ class SparkSession:  # noqa: D101
         return self._catalog
 
     @property
-    def conf(self) -> RuntimeConfig:  # noqa: D102
+    def conf(self) -> RuntimeConfig:
        return self._conf
 
     @property
-    def read(self) -> DataFrameReader:  # noqa: D102
+    def read(self) -> DataFrameReader:
        return DataFrameReader(self)
 
     @property
-    def readStream(self) -> DataStreamReader:  # noqa: D102
+    def readStream(self) -> DataStreamReader:
        return DataStreamReader(self)
 
     @property
-    def sparkContext(self) -> SparkContext:  # noqa: D102
+    def sparkContext(self) -> SparkContext:
        return self._context
 
     @property
-    def streams(self) -> NoReturn:  # noqa: D102
+    def streams(self) -> Any:
        raise ContributionsAcceptedError
 
     @property
-    def udf(self) -> UDFRegistration:  # noqa: D102
+    def udf(self) -> UDFRegistration:
        return UDFRegistration(self)
 
     @property
-    def version(self) -> str:  # noqa: D102
-        return "1.0.0"
+    def version(self) -> str:
+        return '1.0.0'
 
-    class Builder:  # noqa: D106
-        def __init__(self) -> None:  # noqa: D107
+    class Builder:
+        def __init__(self):
            pass
 
-        def master(self, name: str) -> "SparkSession.Builder":  # noqa: D102
+        def master(self, name: str) -> "SparkSession.Builder":
            # no-op
            return self
 
-        def appName(self, name: str) -> "SparkSession.Builder":  # noqa: D102
+        def appName(self, name: str) -> "SparkSession.Builder":
            # no-op
            return self
 
-        def remote(self, url: str) -> "SparkSession.Builder":  # noqa: D102
+        def remote(self, url: str) -> "SparkSession.Builder":
            # no-op
            return self
 
-        def getOrCreate(self) -> "SparkSession":  # noqa: D102
+        def getOrCreate(self) -> "SparkSession":
            context = SparkContext("__ignored__")
            return SparkSession(context)
 
-        def config(  # noqa: D102
-            self,
-            key: Optional[str] = None,
-            value: Optional[Any] = None,  # noqa: ANN401
-            conf: Optional[SparkConf] = None,
+        def config(
+            self, key: Optional[str] = None, value: Optional[Any] = None, conf: Optional[SparkConf] = None
        ) -> "SparkSession.Builder":
            return self
 
-        def enableHiveSupport(self) -> "SparkSession.Builder":  # noqa: D102
+        def enableHiveSupport(self) -> "SparkSession.Builder":
            # no-op
            return self
 
duckdb/experimental/spark/sql/streaming.py

@@ -1,5 +1,4 @@
-from typing import TYPE_CHECKING, Optional, Union  # noqa: D100
-
+from typing import TYPE_CHECKING, Optional, Union
 from .types import StructType
 
 if TYPE_CHECKING:
@@ -10,26 +9,28 @@ PrimitiveType = Union[bool, float, int, str]
 OptionalPrimitiveType = Optional[PrimitiveType]
 
 
-class DataStreamWriter:  # noqa: D101
-    def __init__(self, dataframe: "DataFrame") -> None:  # noqa: D107
+class DataStreamWriter:
+    def __init__(self, dataframe: "DataFrame"):
         self.dataframe = dataframe
 
-    def toTable(self, table_name: str) -> None:  # noqa: D102
+    def toTable(self, table_name: str) -> None:
         # Should we register the dataframe or create a table from the contents?
         raise NotImplementedError
 
 
-class DataStreamReader:  # noqa: D101
-    def __init__(self, session: "SparkSession") -> None:  # noqa: D107
+class DataStreamReader:
+    def __init__(self, session: "SparkSession"):
         self.session = session
 
-    def load(  # noqa: D102
+    def load(
         self,
         path: Optional[str] = None,
         format: Optional[str] = None,
         schema: Union[StructType, str, None] = None,
-        **options: OptionalPrimitiveType,
+        **options: OptionalPrimitiveType
     ) -> "DataFrame":
+        from duckdb.experimental.spark.sql.dataframe import DataFrame
+
         raise NotImplementedError
 
 
duckdb/experimental/spark/sql/type_utils.py

@@ -1,107 +1,105 @@
-from typing import cast  # noqa: D100
-
 from duckdb.typing import DuckDBPyType
-
+from typing import List, Tuple, cast
 from .types import (
-    ArrayType,
+    DataType,
+    StringType,
     BinaryType,
     BitstringType,
+    UUIDType,
     BooleanType,
-    ByteType,
-    DataType,
     DateType,
-    DayTimeIntervalType,
+    TimestampType,
+    TimestampNTZType,
+    TimeType,
+    TimeNTZType,
+    TimestampNanosecondNTZType,
+    TimestampMilisecondNTZType,
+    TimestampSecondNTZType,
     DecimalType,
     DoubleType,
     FloatType,
-    HugeIntegerType,
+    ByteType,
+    UnsignedByteType,
+    ShortType,
+    UnsignedShortType,
     IntegerType,
+    UnsignedIntegerType,
     LongType,
+    UnsignedLongType,
+    HugeIntegerType,
+    UnsignedHugeIntegerType,
+    DayTimeIntervalType,
+    ArrayType,
     MapType,
-    ShortType,
-    StringType,
     StructField,
     StructType,
-    TimeNTZType,
-    TimestampMilisecondNTZType,
-    TimestampNanosecondNTZType,
-    TimestampNTZType,
-    TimestampSecondNTZType,
-    TimestampType,
-    TimeType,
-    UnsignedByteType,
-    UnsignedHugeIntegerType,
-    UnsignedIntegerType,
-    UnsignedLongType,
-    UnsignedShortType,
-    UUIDType,
 )
 
 _sqltype_to_spark_class = {
-    "boolean": BooleanType,
-    "utinyint": UnsignedByteType,
-    "tinyint": ByteType,
-    "usmallint": UnsignedShortType,
-    "smallint": ShortType,
-    "uinteger": UnsignedIntegerType,
-    "integer": IntegerType,
-    "ubigint": UnsignedLongType,
-    "bigint": LongType,
-    "hugeint": HugeIntegerType,
-    "uhugeint": UnsignedHugeIntegerType,
-    "varchar": StringType,
-    "blob": BinaryType,
-    "bit": BitstringType,
-    "uuid": UUIDType,
-    "date": DateType,
-    "time": TimeNTZType,
-    "time with time zone": TimeType,
-    "timestamp": TimestampNTZType,
-    "timestamp with time zone": TimestampType,
-    "timestamp_ms": TimestampNanosecondNTZType,
-    "timestamp_ns": TimestampMilisecondNTZType,
-    "timestamp_s": TimestampSecondNTZType,
-    "interval": DayTimeIntervalType,
-    "list": ArrayType,
-    "struct": StructType,
-    "map": MapType,
+    'boolean': BooleanType,
+    'utinyint': UnsignedByteType,
+    'tinyint': ByteType,
+    'usmallint': UnsignedShortType,
+    'smallint': ShortType,
+    'uinteger': UnsignedIntegerType,
+    'integer': IntegerType,
+    'ubigint': UnsignedLongType,
+    'bigint': LongType,
+    'hugeint': HugeIntegerType,
+    'uhugeint': UnsignedHugeIntegerType,
+    'varchar': StringType,
+    'blob': BinaryType,
+    'bit': BitstringType,
+    'uuid': UUIDType,
+    'date': DateType,
+    'time': TimeNTZType,
+    'time with time zone': TimeType,
+    'timestamp': TimestampNTZType,
+    'timestamp with time zone': TimestampType,
+    'timestamp_ms': TimestampNanosecondNTZType,
+    'timestamp_ns': TimestampMilisecondNTZType,
+    'timestamp_s': TimestampSecondNTZType,
+    'interval': DayTimeIntervalType,
+    'list': ArrayType,
+    'struct': StructType,
+    'map': MapType,
     # union
     # enum
     # null (???)
-    "float": FloatType,
-    "double": DoubleType,
-    "decimal": DecimalType,
+    'float': FloatType,
+    'double': DoubleType,
+    'decimal': DecimalType,
 }
 
 
-def convert_nested_type(dtype: DuckDBPyType) -> DataType:  # noqa: D103
+def convert_nested_type(dtype: DuckDBPyType) -> DataType:
     id = dtype.id
-    if id == "list" or id == "array":
+    if id == 'list' or id == 'array':
         children = dtype.children
         return ArrayType(convert_type(children[0][1]))
-    # TODO: add support for 'union'  # noqa: TD002, TD003
-    if id == "struct":
-        children: list[tuple[str, DuckDBPyType]] = dtype.children
+    # TODO: add support for 'union'
+    if id == 'struct':
+        children: List[Tuple[str, DuckDBPyType]] = dtype.children
         fields = [StructField(x[0], convert_type(x[1])) for x in children]
         return StructType(fields)
-    if id == "map":
+    if id == 'map':
         return MapType(convert_type(dtype.key), convert_type(dtype.value))
     raise NotImplementedError
 
 
-def convert_type(dtype: DuckDBPyType) -> DataType:  # noqa: D103
+def convert_type(dtype: DuckDBPyType) -> DataType:
     id = dtype.id
-    if id in ["list", "struct", "map", "array"]:
+    if id in ['list', 'struct', 'map', 'array']:
         return convert_nested_type(dtype)
-    if id == "decimal":
-        children: list[tuple[str, DuckDBPyType]] = dtype.children
-        precision = cast("int", children[0][1])
-        scale = cast("int", children[1][1])
+    if id == 'decimal':
+        children: List[Tuple[str, DuckDBPyType]] = dtype.children
+        precision = cast(int, children[0][1])
+        scale = cast(int, children[1][1])
         return DecimalType(precision, scale)
     spark_type = _sqltype_to_spark_class[id]
     return spark_type()
 
 
-def duckdb_to_spark_schema(names: list[str], types: list[DuckDBPyType]) -> StructType:  # noqa: D103
+def duckdb_to_spark_schema(names: List[str], types: List[DuckDBPyType]) -> StructType:
     fields = [StructField(name, dtype) for name, dtype in zip(names, [convert_type(x) for x in types])]
     return StructType(fields)
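
The _sqltype_to_spark_class table and convert_type in type_utils.py translate DuckDB type identifiers into Spark SQL types, and duckdb_to_spark_schema builds a StructType from a relation's column names and DuckDB types. A small sketch of how these helpers can be driven from a relation's columns/types properties (not part of the diff; the query and expected output are illustrative only):

    import duckdb
    from duckdb.experimental.spark.sql.type_utils import duckdb_to_spark_schema

    rel = duckdb.sql("select 42 as answer, 'hi' as greeting, [1, 2, 3] as xs")

    # rel.columns is a list of names and rel.types a list of DuckDBPyType objects,
    # so this should yield a StructType with IntegerType, StringType and
    # ArrayType(IntegerType) fields.
    schema = duckdb_to_spark_schema(rel.columns, rel.types)
    print(schema)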