polars-runtime-compat 1.34.0b3-cp39-abi3-macosx_11_0_arm64.whl → 1.34.0b4-cp39-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of polars-runtime-compat might be problematic.

Files changed (203)
  1. _polars_runtime_compat/_polars_runtime_compat.abi3.so +0 -0
  2. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/METADATA +1 -1
  3. polars_runtime_compat-1.34.0b4.dist-info/RECORD +6 -0
  4. polars/__init__.py +0 -528
  5. polars/_cpu_check.py +0 -265
  6. polars/_dependencies.py +0 -355
  7. polars/_plr.py +0 -99
  8. polars/_plr.pyi +0 -2496
  9. polars/_reexport.py +0 -23
  10. polars/_typing.py +0 -478
  11. polars/_utils/__init__.py +0 -37
  12. polars/_utils/async_.py +0 -102
  13. polars/_utils/cache.py +0 -176
  14. polars/_utils/cloud.py +0 -40
  15. polars/_utils/constants.py +0 -29
  16. polars/_utils/construction/__init__.py +0 -46
  17. polars/_utils/construction/dataframe.py +0 -1397
  18. polars/_utils/construction/other.py +0 -72
  19. polars/_utils/construction/series.py +0 -560
  20. polars/_utils/construction/utils.py +0 -118
  21. polars/_utils/convert.py +0 -224
  22. polars/_utils/deprecation.py +0 -406
  23. polars/_utils/getitem.py +0 -457
  24. polars/_utils/logging.py +0 -11
  25. polars/_utils/nest_asyncio.py +0 -264
  26. polars/_utils/parquet.py +0 -15
  27. polars/_utils/parse/__init__.py +0 -12
  28. polars/_utils/parse/expr.py +0 -242
  29. polars/_utils/polars_version.py +0 -19
  30. polars/_utils/pycapsule.py +0 -53
  31. polars/_utils/scan.py +0 -27
  32. polars/_utils/serde.py +0 -63
  33. polars/_utils/slice.py +0 -215
  34. polars/_utils/udfs.py +0 -1251
  35. polars/_utils/unstable.py +0 -63
  36. polars/_utils/various.py +0 -782
  37. polars/_utils/wrap.py +0 -25
  38. polars/api.py +0 -370
  39. polars/catalog/__init__.py +0 -0
  40. polars/catalog/unity/__init__.py +0 -19
  41. polars/catalog/unity/client.py +0 -733
  42. polars/catalog/unity/models.py +0 -152
  43. polars/config.py +0 -1571
  44. polars/convert/__init__.py +0 -25
  45. polars/convert/general.py +0 -1046
  46. polars/convert/normalize.py +0 -261
  47. polars/dataframe/__init__.py +0 -5
  48. polars/dataframe/_html.py +0 -186
  49. polars/dataframe/frame.py +0 -12582
  50. polars/dataframe/group_by.py +0 -1067
  51. polars/dataframe/plotting.py +0 -257
  52. polars/datatype_expr/__init__.py +0 -5
  53. polars/datatype_expr/array.py +0 -56
  54. polars/datatype_expr/datatype_expr.py +0 -304
  55. polars/datatype_expr/list.py +0 -18
  56. polars/datatype_expr/struct.py +0 -69
  57. polars/datatypes/__init__.py +0 -122
  58. polars/datatypes/_parse.py +0 -195
  59. polars/datatypes/_utils.py +0 -48
  60. polars/datatypes/classes.py +0 -1213
  61. polars/datatypes/constants.py +0 -11
  62. polars/datatypes/constructor.py +0 -172
  63. polars/datatypes/convert.py +0 -366
  64. polars/datatypes/group.py +0 -130
  65. polars/exceptions.py +0 -230
  66. polars/expr/__init__.py +0 -7
  67. polars/expr/array.py +0 -964
  68. polars/expr/binary.py +0 -346
  69. polars/expr/categorical.py +0 -306
  70. polars/expr/datetime.py +0 -2620
  71. polars/expr/expr.py +0 -11272
  72. polars/expr/list.py +0 -1408
  73. polars/expr/meta.py +0 -444
  74. polars/expr/name.py +0 -321
  75. polars/expr/string.py +0 -3045
  76. polars/expr/struct.py +0 -357
  77. polars/expr/whenthen.py +0 -185
  78. polars/functions/__init__.py +0 -193
  79. polars/functions/aggregation/__init__.py +0 -33
  80. polars/functions/aggregation/horizontal.py +0 -298
  81. polars/functions/aggregation/vertical.py +0 -341
  82. polars/functions/as_datatype.py +0 -848
  83. polars/functions/business.py +0 -138
  84. polars/functions/col.py +0 -384
  85. polars/functions/datatype.py +0 -121
  86. polars/functions/eager.py +0 -524
  87. polars/functions/escape_regex.py +0 -29
  88. polars/functions/lazy.py +0 -2751
  89. polars/functions/len.py +0 -68
  90. polars/functions/lit.py +0 -210
  91. polars/functions/random.py +0 -22
  92. polars/functions/range/__init__.py +0 -19
  93. polars/functions/range/_utils.py +0 -15
  94. polars/functions/range/date_range.py +0 -303
  95. polars/functions/range/datetime_range.py +0 -370
  96. polars/functions/range/int_range.py +0 -348
  97. polars/functions/range/linear_space.py +0 -311
  98. polars/functions/range/time_range.py +0 -287
  99. polars/functions/repeat.py +0 -301
  100. polars/functions/whenthen.py +0 -353
  101. polars/interchange/__init__.py +0 -10
  102. polars/interchange/buffer.py +0 -77
  103. polars/interchange/column.py +0 -190
  104. polars/interchange/dataframe.py +0 -230
  105. polars/interchange/from_dataframe.py +0 -328
  106. polars/interchange/protocol.py +0 -303
  107. polars/interchange/utils.py +0 -170
  108. polars/io/__init__.py +0 -64
  109. polars/io/_utils.py +0 -317
  110. polars/io/avro.py +0 -49
  111. polars/io/clipboard.py +0 -36
  112. polars/io/cloud/__init__.py +0 -17
  113. polars/io/cloud/_utils.py +0 -80
  114. polars/io/cloud/credential_provider/__init__.py +0 -17
  115. polars/io/cloud/credential_provider/_builder.py +0 -520
  116. polars/io/cloud/credential_provider/_providers.py +0 -618
  117. polars/io/csv/__init__.py +0 -9
  118. polars/io/csv/_utils.py +0 -38
  119. polars/io/csv/batched_reader.py +0 -142
  120. polars/io/csv/functions.py +0 -1495
  121. polars/io/database/__init__.py +0 -6
  122. polars/io/database/_arrow_registry.py +0 -70
  123. polars/io/database/_cursor_proxies.py +0 -147
  124. polars/io/database/_executor.py +0 -578
  125. polars/io/database/_inference.py +0 -314
  126. polars/io/database/_utils.py +0 -144
  127. polars/io/database/functions.py +0 -516
  128. polars/io/delta.py +0 -499
  129. polars/io/iceberg/__init__.py +0 -3
  130. polars/io/iceberg/_utils.py +0 -697
  131. polars/io/iceberg/dataset.py +0 -556
  132. polars/io/iceberg/functions.py +0 -151
  133. polars/io/ipc/__init__.py +0 -8
  134. polars/io/ipc/functions.py +0 -514
  135. polars/io/json/__init__.py +0 -3
  136. polars/io/json/read.py +0 -101
  137. polars/io/ndjson.py +0 -332
  138. polars/io/parquet/__init__.py +0 -17
  139. polars/io/parquet/field_overwrites.py +0 -140
  140. polars/io/parquet/functions.py +0 -722
  141. polars/io/partition.py +0 -491
  142. polars/io/plugins.py +0 -187
  143. polars/io/pyarrow_dataset/__init__.py +0 -5
  144. polars/io/pyarrow_dataset/anonymous_scan.py +0 -109
  145. polars/io/pyarrow_dataset/functions.py +0 -79
  146. polars/io/scan_options/__init__.py +0 -5
  147. polars/io/scan_options/_options.py +0 -59
  148. polars/io/scan_options/cast_options.py +0 -126
  149. polars/io/spreadsheet/__init__.py +0 -6
  150. polars/io/spreadsheet/_utils.py +0 -52
  151. polars/io/spreadsheet/_write_utils.py +0 -647
  152. polars/io/spreadsheet/functions.py +0 -1323
  153. polars/lazyframe/__init__.py +0 -9
  154. polars/lazyframe/engine_config.py +0 -61
  155. polars/lazyframe/frame.py +0 -8564
  156. polars/lazyframe/group_by.py +0 -669
  157. polars/lazyframe/in_process.py +0 -42
  158. polars/lazyframe/opt_flags.py +0 -333
  159. polars/meta/__init__.py +0 -14
  160. polars/meta/build.py +0 -33
  161. polars/meta/index_type.py +0 -27
  162. polars/meta/thread_pool.py +0 -50
  163. polars/meta/versions.py +0 -120
  164. polars/ml/__init__.py +0 -0
  165. polars/ml/torch.py +0 -213
  166. polars/ml/utilities.py +0 -30
  167. polars/plugins.py +0 -155
  168. polars/py.typed +0 -0
  169. polars/pyproject.toml +0 -103
  170. polars/schema.py +0 -265
  171. polars/selectors.py +0 -3117
  172. polars/series/__init__.py +0 -5
  173. polars/series/array.py +0 -776
  174. polars/series/binary.py +0 -254
  175. polars/series/categorical.py +0 -246
  176. polars/series/datetime.py +0 -2275
  177. polars/series/list.py +0 -1087
  178. polars/series/plotting.py +0 -191
  179. polars/series/series.py +0 -9197
  180. polars/series/string.py +0 -2367
  181. polars/series/struct.py +0 -154
  182. polars/series/utils.py +0 -191
  183. polars/sql/__init__.py +0 -7
  184. polars/sql/context.py +0 -677
  185. polars/sql/functions.py +0 -139
  186. polars/string_cache.py +0 -185
  187. polars/testing/__init__.py +0 -13
  188. polars/testing/asserts/__init__.py +0 -9
  189. polars/testing/asserts/frame.py +0 -231
  190. polars/testing/asserts/series.py +0 -219
  191. polars/testing/asserts/utils.py +0 -12
  192. polars/testing/parametric/__init__.py +0 -33
  193. polars/testing/parametric/profiles.py +0 -107
  194. polars/testing/parametric/strategies/__init__.py +0 -22
  195. polars/testing/parametric/strategies/_utils.py +0 -14
  196. polars/testing/parametric/strategies/core.py +0 -615
  197. polars/testing/parametric/strategies/data.py +0 -452
  198. polars/testing/parametric/strategies/dtype.py +0 -436
  199. polars/testing/parametric/strategies/legacy.py +0 -169
  200. polars/type_aliases.py +0 -24
  201. polars_runtime_compat-1.34.0b3.dist-info/RECORD +0 -203
  202. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/WHEEL +0 -0
  203. {polars_runtime_compat-1.34.0b3.dist-info → polars_runtime_compat-1.34.0b4.dist-info}/licenses/LICENSE +0 -0
polars/io/ipc/__init__.py DELETED
@@ -1,8 +0,0 @@
- from polars.io.ipc.functions import read_ipc, read_ipc_schema, read_ipc_stream, scan_ipc
-
- __all__ = [
-     "read_ipc",
-     "read_ipc_schema",
-     "read_ipc_stream",
-     "scan_ipc",
- ]
polars/io/ipc/functions.py DELETED
@@ -1,514 +0,0 @@
- from __future__ import annotations
-
- import contextlib
- import os
- from pathlib import Path
- from typing import IO, TYPE_CHECKING, Any, Literal
-
- import polars._reexport as pl
- import polars.functions as F
- from polars._dependencies import import_optional
- from polars._utils.deprecation import deprecate_renamed_parameter
- from polars._utils.various import (
-     is_path_or_str_sequence,
-     is_str_sequence,
-     normalize_filepath,
- )
- from polars._utils.wrap import wrap_df, wrap_ldf
- from polars.io._utils import (
-     is_glob_pattern,
-     is_local_file,
-     parse_columns_arg,
-     parse_row_index_args,
-     prepare_file_arg,
- )
- from polars.io.cloud.credential_provider._builder import (
-     _init_credential_provider_builder,
- )
-
- with contextlib.suppress(ImportError):  # Module not available when building docs
-     from polars._plr import PyDataFrame, PyLazyFrame
-     from polars._plr import read_ipc_schema as _read_ipc_schema
-
- if TYPE_CHECKING:
-     from collections.abc import Sequence
-
-     from polars import DataFrame, DataType, LazyFrame
-     from polars._typing import SchemaDict
-     from polars.io.cloud import CredentialProviderFunction
-
-
- @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4")
- @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4")
- def read_ipc(
-     source: str | Path | IO[bytes] | bytes,
-     *,
-     columns: list[int] | list[str] | None = None,
-     n_rows: int | None = None,
-     use_pyarrow: bool = False,
-     memory_map: bool = True,
-     storage_options: dict[str, Any] | None = None,
-     row_index_name: str | None = None,
-     row_index_offset: int = 0,
-     rechunk: bool = True,
- ) -> DataFrame:
-     """
-     Read into a DataFrame from Arrow IPC (Feather v2) file.
-
-     See "File or Random Access format" on https://arrow.apache.org/docs/python/ipc.html.
-     Arrow IPC files are also known as Feather (v2) files.
-
-     .. versionchanged:: 0.20.4
-         * The `row_count_name` parameter was renamed `row_index_name`.
-         * The `row_count_offset` parameter was renamed `row_index_offset`.
-
-     Parameters
-     ----------
-     source
-         Path to a file or a file-like object (by "file-like object" we refer to objects
-         that have a `read()` method, such as a file handler like the builtin `open`
-         function, or a `BytesIO` instance). If `fsspec` is installed, it will be used
-         to open remote files. For file-like objects, the stream position may not be
-         updated accordingly after reading.
-     columns
-         Columns to select. Accepts a list of column indices (starting at zero) or a list
-         of column names.
-     n_rows
-         Stop reading from IPC file after reading `n_rows`.
-         Only valid when `use_pyarrow=False`.
-     use_pyarrow
-         Use pyarrow or the native Rust reader.
-     memory_map
-         Try to memory map the file. This can greatly improve performance on repeated
-         queries as the OS may cache pages.
-         Only uncompressed IPC files can be memory mapped.
-     storage_options
-         Extra options that make sense for `fsspec.open()` or a particular storage
-         connection, e.g. host, port, username, password, etc.
-     row_index_name
-         Insert a row index column with the given name into the DataFrame as the first
-         column. If set to `None` (default), no row index column is created.
-     row_index_offset
-         Start the row index at this offset. Cannot be negative.
-         Only used if `row_index_name` is set.
-     rechunk
-         Make sure that all data is contiguous.
-
-     Returns
-     -------
-     DataFrame
-
-     See Also
-     --------
-     scan_ipc : Lazily read from an IPC file or multiple files via glob patterns.
-
-     Warnings
-     --------
-     Calling `read_ipc().lazy()` is an antipattern as this forces Polars to materialize
-     a full csv file and therefore cannot push any optimizations into the reader.
-     Therefore always prefer `scan_ipc` if you want to work with `LazyFrame` s.
-
-     If `memory_map` is set, the bytes on disk are mapped 1:1 to memory.
-     That means that you cannot write to the same filename.
-     E.g. `pl.read_ipc("my_file.arrow").write_ipc("my_file.arrow")` will fail.
-     """
-     if (
-         # Check that it is not a BytesIO object
-         isinstance(v := source, (str, Path))
-     ) and (
-         # HuggingFace only for now ⊂( ◜◒◝ )⊃
-         (is_hf := str(v).startswith("hf://"))
-         # Also dispatch on FORCE_ASYNC, so that this codepath gets run
-         # through by our test suite during CI.
-         or os.getenv("POLARS_FORCE_ASYNC") == "1"
-         # TODO: Dispatch all paths to `scan_ipc` - this will need a breaking
-         # change to the `storage_options` parameter.
-     ):
-         if is_hf and use_pyarrow:
-             msg = "`use_pyarrow=True` is not supported for Hugging Face"
-             raise ValueError(msg)
-
-         lf = scan_ipc(
-             source,
-             n_rows=n_rows,
-             storage_options=storage_options,
-             row_index_name=row_index_name,
-             row_index_offset=row_index_offset,
-             rechunk=rechunk,
-         )
-
-         if columns:
-             if isinstance(columns[0], int):
-                 lf = lf.select(F.nth(columns))  # type: ignore[arg-type]
-             else:
-                 lf = lf.select(columns)
-
-         df = lf.collect()
-
-         return df
-
-     if use_pyarrow and n_rows and not memory_map:
-         msg = "`n_rows` cannot be used with `use_pyarrow=True` and `memory_map=False`"
-         raise ValueError(msg)
-
-     with prepare_file_arg(
-         source, use_pyarrow=use_pyarrow, storage_options=storage_options
-     ) as data:
-         if use_pyarrow:
-             pyarrow_feather = import_optional(
-                 "pyarrow.feather",
-                 err_prefix="",
-                 err_suffix="is required when using 'read_ipc(..., use_pyarrow=True)'",
-             )
-             tbl = pyarrow_feather.read_table(
-                 data,
-                 memory_map=memory_map,
-                 columns=columns,
-             )
-             df = pl.DataFrame._from_arrow(tbl, rechunk=rechunk)
-             if row_index_name is not None:
-                 df = df.with_row_index(row_index_name, row_index_offset)
-             if n_rows is not None:
-                 df = df.slice(0, n_rows)
-             return df
-
-         return _read_ipc_impl(
-             data,
-             columns=columns,
-             n_rows=n_rows,
-             row_index_name=row_index_name,
-             row_index_offset=row_index_offset,
-             rechunk=rechunk,
-             memory_map=memory_map,
-         )
-
-
- def _read_ipc_impl(
-     source: str | Path | IO[bytes] | bytes,
-     *,
-     columns: Sequence[int] | Sequence[str] | None = None,
-     n_rows: int | None = None,
-     row_index_name: str | None = None,
-     row_index_offset: int = 0,
-     rechunk: bool = True,
-     memory_map: bool = True,
- ) -> DataFrame:
-     if isinstance(source, (str, Path)):
-         source = normalize_filepath(source, check_not_directory=False)
-     if isinstance(columns, str):
-         columns = [columns]
-
-     if isinstance(source, str) and is_glob_pattern(source) and is_local_file(source):
-         scan = scan_ipc(
-             source,
-             n_rows=n_rows,
-             rechunk=rechunk,
-             row_index_name=row_index_name,
-             row_index_offset=row_index_offset,
-         )
-         if columns is None:
-             df = scan.collect()
-         elif is_str_sequence(columns, allow_str=False):
-             df = scan.select(columns).collect()
-         else:
-             msg = (
-                 "cannot use glob patterns and integer based projection as `columns` argument"
-                 "\n\nUse columns: List[str]"
-             )
-             raise TypeError(msg)
-         return df
-
-     projection, columns = parse_columns_arg(columns)
-     pydf = PyDataFrame.read_ipc(
-         source,
-         columns,
-         projection,
-         n_rows,
-         parse_row_index_args(row_index_name, row_index_offset),
-         memory_map=memory_map,
-     )
-     return wrap_df(pydf)
-
-
- @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4")
- @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4")
- def read_ipc_stream(
-     source: str | Path | IO[bytes] | bytes,
-     *,
-     columns: list[int] | list[str] | None = None,
-     n_rows: int | None = None,
-     use_pyarrow: bool = False,
-     storage_options: dict[str, Any] | None = None,
-     row_index_name: str | None = None,
-     row_index_offset: int = 0,
-     rechunk: bool = True,
- ) -> DataFrame:
-     """
-     Read into a DataFrame from Arrow IPC record batch stream.
-
-     See "Streaming format" on https://arrow.apache.org/docs/python/ipc.html.
-
-     .. versionchanged:: 0.20.4
-         * The `row_count_name` parameter was renamed `row_index_name`.
-         * The `row_count_offset` parameter was renamed `row_index_offset`.
-
-     Parameters
-     ----------
-     source
-         Path to a file or a file-like object (by "file-like object" we refer to objects
-         that have a `read()` method, such as a file handler like the builtin `open`
-         function, or a `BytesIO` instance). If `fsspec` is installed, it will be used
-         to open remote files. For file-like objects, the stream position may not be
-         updated accordingly after reading.
-     columns
-         Columns to select. Accepts a list of column indices (starting at zero) or a list
-         of column names.
-     n_rows
-         Stop reading from IPC stream after reading `n_rows`.
-         Only valid when `use_pyarrow=False`.
-     use_pyarrow
-         Use pyarrow or the native Rust reader.
-     storage_options
-         Extra options that make sense for `fsspec.open()` or a particular storage
-         connection, e.g. host, port, username, password, etc.
-     row_index_name
-         Insert a row index column with the given name into the DataFrame as the first
-         column. If set to `None` (default), no row index column is created.
-     row_index_offset
-         Start the row index at this offset. Cannot be negative.
-         Only used if `row_index_name` is set.
-     rechunk
-         Make sure that all data is contiguous.
-
-     Returns
-     -------
-     DataFrame
-     """
-     with prepare_file_arg(
-         source, use_pyarrow=use_pyarrow, storage_options=storage_options
-     ) as data:
-         if use_pyarrow:
-             pyarrow_ipc = import_optional(
-                 "pyarrow.ipc",
-                 err_prefix="",
-                 err_suffix="is required when using 'read_ipc_stream(..., use_pyarrow=True)'",
-             )
-             with pyarrow_ipc.RecordBatchStreamReader(data) as reader:
-                 tbl = reader.read_all()
-                 df = pl.DataFrame._from_arrow(tbl, rechunk=rechunk)
-             if row_index_name is not None:
-                 df = df.with_row_index(row_index_name, row_index_offset)
-             if n_rows is not None:
-                 df = df.slice(0, n_rows)
-             return df
-
-         return _read_ipc_stream_impl(
-             data,
-             columns=columns,
-             n_rows=n_rows,
-             row_index_name=row_index_name,
-             row_index_offset=row_index_offset,
-             rechunk=rechunk,
-         )
-
-
- def _read_ipc_stream_impl(
-     source: str | Path | IO[bytes] | bytes,
-     *,
-     columns: Sequence[int] | Sequence[str] | None = None,
-     n_rows: int | None = None,
-     row_index_name: str | None = None,
-     row_index_offset: int = 0,
-     rechunk: bool = True,
- ) -> DataFrame:
-     if isinstance(source, (str, Path)):
-         source = normalize_filepath(source, check_not_directory=False)
-     if isinstance(columns, str):
-         columns = [columns]
-
-     projection, columns = parse_columns_arg(columns)
-     pydf = PyDataFrame.read_ipc_stream(
-         source,
-         columns,
-         projection,
-         n_rows,
-         parse_row_index_args(row_index_name, row_index_offset),
-         rechunk,
-     )
-     return wrap_df(pydf)
-
-
- def read_ipc_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, DataType]:
-     """
-     Get the schema of an IPC file without reading data.
-
-     Parameters
-     ----------
-     source
-         Path to a file or a file-like object (by "file-like object" we refer to objects
-         that have a `read()` method, such as a file handler like the builtin `open`
-         function, or a `BytesIO` instance). For file-like objects, the stream position
-         may not be updated accordingly after reading.
-
-     Returns
-     -------
-     dict
-         Dictionary mapping column names to datatypes
-     """
-     if isinstance(source, (str, Path)):
-         source = normalize_filepath(source, check_not_directory=False)
-
-     return _read_ipc_schema(source)
-
-
- @deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4")
- @deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4")
- def scan_ipc(
-     source: (
-         str
-         | Path
-         | IO[bytes]
-         | bytes
-         | list[str]
-         | list[Path]
-         | list[IO[bytes]]
-         | list[bytes]
-     ),
-     *,
-     n_rows: int | None = None,
-     cache: bool = True,
-     rechunk: bool = False,
-     row_index_name: str | None = None,
-     row_index_offset: int = 0,
-     storage_options: dict[str, Any] | None = None,
-     credential_provider: CredentialProviderFunction | Literal["auto"] | None = "auto",
-     memory_map: bool = True,
-     retries: int = 2,
-     file_cache_ttl: int | None = None,
-     hive_partitioning: bool | None = None,
-     hive_schema: SchemaDict | None = None,
-     try_parse_hive_dates: bool = True,
-     include_file_paths: str | None = None,
- ) -> LazyFrame:
-     """
-     Lazily read from an Arrow IPC (Feather v2) file or multiple files via glob patterns.
-
-     This allows the query optimizer to push down predicates and projections to the scan
-     level, thereby potentially reducing memory overhead.
-
-     .. versionchanged:: 0.20.4
-         * The `row_count_name` parameter was renamed `row_index_name`.
-         * The `row_count_offset` parameter was renamed `row_index_offset`.
-
-     Parameters
-     ----------
-     source
-         Path(s) to a file or directory
-         When needing to authenticate for scanning cloud locations, see the
-         `storage_options` parameter.
-     n_rows
-         Stop reading from IPC file after reading `n_rows`.
-     cache
-         Cache the result after reading.
-     rechunk
-         Reallocate to contiguous memory when all chunks/ files are parsed.
-     row_index_name
-         If not None, this will insert a row index column with give name into the
-         DataFrame
-     row_index_offset
-         Offset to start the row index column (only use if the name is set)
-     storage_options
-         Options that indicate how to connect to a cloud provider.
-
-         The cloud providers currently supported are AWS, GCP, and Azure.
-         See supported keys here:
-
-         * `aws <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html>`_
-         * `gcp <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html>`_
-         * `azure <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html>`_
-         * Hugging Face (`hf://`): Accepts an API key under the `token` parameter: \
-           `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.
-
-         If `storage_options` is not provided, Polars will try to infer the information
-         from environment variables.
-     credential_provider
-         Provide a function that can be called to provide cloud storage
-         credentials. The function is expected to return a dictionary of
-         credential keys along with an optional credential expiry time.
-
-         .. warning::
-             This functionality is considered **unstable**. It may be changed
-             at any point without it being considered a breaking change.
-
-     memory_map
-         Try to memory map the file. This can greatly improve performance on repeated
-         queries as the OS may cache pages.
-         Only uncompressed IPC files can be memory mapped.
-     retries
-         Number of retries if accessing a cloud instance fails.
-     file_cache_ttl
-         Amount of time to keep downloaded cloud files since their last access time,
-         in seconds. Uses the `POLARS_FILE_CACHE_TTL` environment variable
-         (which defaults to 1 hour) if not given.
-     hive_partitioning
-         Infer statistics and schema from Hive partitioned URL and use them
-         to prune reads. This is unset by default (i.e. `None`), meaning it is
-         automatically enabled when a single directory is passed, and otherwise
-         disabled.
-     hive_schema
-         The column names and data types of the columns by which the data is partitioned.
-         If set to `None` (default), the schema of the Hive partitions is inferred.
-
-         .. warning::
-             This functionality is considered **unstable**. It may be changed
-             at any point without it being considered a breaking change.
-     try_parse_hive_dates
-         Whether to try parsing hive values as date/datetime types.
-     include_file_paths
-         Include the path of the source file(s) as a column with this name.
-     """
-     sources: list[str] | list[Path] | list[IO[bytes]] | list[bytes] = []
-     if isinstance(source, (str, Path)):
-         source = normalize_filepath(source, check_not_directory=False)
-     elif isinstance(source, list):
-         if is_path_or_str_sequence(source):
-             sources = [
-                 normalize_filepath(source, check_not_directory=False)
-                 for source in source
-             ]
-         else:
-             sources = source
-
-         source = None  # type: ignore[assignment]
-
-     # Memory Mapping is now a no-op
-     _ = memory_map
-
-     credential_provider_builder = _init_credential_provider_builder(
-         credential_provider, source, storage_options, "scan_parquet"
-     )
-     del credential_provider
-
-     if storage_options:
-         storage_options = list(storage_options.items())  # type: ignore[assignment]
-     else:
-         # Handle empty dict input
-         storage_options = None
-
-     pylf = PyLazyFrame.new_from_ipc(
-         source,
-         sources,
-         n_rows,
-         cache,
-         rechunk,
-         parse_row_index_args(row_index_name, row_index_offset),
-         cloud_options=storage_options,
-         credential_provider=credential_provider_builder,
-         retries=retries,
-         file_cache_ttl=file_cache_ttl,
-         hive_partitioning=hive_partitioning,
-         hive_schema=hive_schema,
-         try_parse_hive_dates=try_parse_hive_dates,
-         include_file_paths=include_file_paths,
-     )
-     return wrap_ldf(pylf)
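
The `read_ipc` docstring above warns against `read_ipc().lazy()` and recommends `scan_ipc` so that projections and slices are pushed into the scan. A minimal sketch of that lazy pattern, assuming a hypothetical local uncompressed IPC file `data.feather` with a column `foo`:

>>> import polars as pl
>>> lf = pl.scan_ipc("data.feather")  # hypothetical path; builds a LazyFrame, nothing is read yet
>>> df = lf.select("foo").head(5).collect()  # only the selected column and first rows are materialized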
polars/io/json/__init__.py DELETED
@@ -1,3 +0,0 @@
- from polars.io.json.read import read_json
-
- __all__ = ["read_json"]
polars/io/json/read.py DELETED
@@ -1,101 +0,0 @@
- from __future__ import annotations
-
- import contextlib
- from io import BytesIO, StringIO
- from pathlib import Path
- from typing import TYPE_CHECKING
-
- from polars._utils.various import normalize_filepath
- from polars._utils.wrap import wrap_df
- from polars.datatypes import N_INFER_DEFAULT
-
- with contextlib.suppress(ImportError):  # Module not available when building docs
-     from polars._plr import PyDataFrame
-
- if TYPE_CHECKING:
-     from io import IOBase
-
-     from polars import DataFrame
-     from polars._typing import SchemaDefinition
-
-
- def read_json(
-     source: str | Path | IOBase | bytes,
-     *,
-     schema: SchemaDefinition | None = None,
-     schema_overrides: SchemaDefinition | None = None,
-     infer_schema_length: int | None = N_INFER_DEFAULT,
- ) -> DataFrame:
-     """
-     Read into a DataFrame from a JSON file.
-
-     Parameters
-     ----------
-     source
-         Path to a file or a file-like object (by "file-like object" we refer to objects
-         that have a `read()` method, such as a file handler like the builtin `open`
-         function, or a `BytesIO` instance). For file-like objects, the stream position
-         may not be updated accordingly after reading.
-     schema : Sequence of str, (str,DataType) pairs, or a {str:DataType,} dict
-         The DataFrame schema may be declared in several ways:
-
-         * As a dict of {name:type} pairs; if type is None, it will be auto-inferred.
-         * As a list of column names; in this case types are automatically inferred.
-         * As a list of (name,type) pairs; this is equivalent to the dictionary form.
-
-         If you supply a list of column names that does not match the names in the
-         underlying data, the names given here will overwrite them. The number
-         of names given in the schema should match the underlying data dimensions.
-     schema_overrides : dict, default None
-         Support type specification or override of one or more columns; note that
-         any dtypes inferred from the schema param will be overridden.
-     infer_schema_length
-         The maximum number of rows to scan for schema inference.
-         If set to `None`, the full data may be scanned *(this is slow)*.
-
-     See Also
-     --------
-     read_ndjson
-
-     Examples
-     --------
-     >>> from io import StringIO
-     >>> json_str = '[{"foo":1,"bar":6},{"foo":2,"bar":7},{"foo":3,"bar":8}]'
-     >>> pl.read_json(StringIO(json_str))
-     shape: (3, 2)
-     ┌─────┬─────┐
-     │ foo ┆ bar │
-     │ --- ┆ --- │
-     │ i64 ┆ i64 │
-     ╞═════╪═════╡
-     │ 1   ┆ 6   │
-     │ 2   ┆ 7   │
-     │ 3   ┆ 8   │
-     └─────┴─────┘
-
-     With the schema defined.
-
-     >>> pl.read_json(StringIO(json_str), schema={"foo": pl.Int64, "bar": pl.Float64})
-     shape: (3, 2)
-     ┌─────┬─────┐
-     │ foo ┆ bar │
-     │ --- ┆ --- │
-     │ i64 ┆ f64 │
-     ╞═════╪═════╡
-     │ 1   ┆ 6.0 │
-     │ 2   ┆ 7.0 │
-     │ 3   ┆ 8.0 │
-     └─────┴─────┘
-     """
-     if isinstance(source, StringIO):
-         source = BytesIO(source.getvalue().encode())
-     elif isinstance(source, (str, Path)):
-         source = normalize_filepath(source)
-
-     pydf = PyDataFrame.read_json(
-         source,
-         infer_schema_length=infer_schema_length,
-         schema=schema,
-         schema_overrides=schema_overrides,
-     )
-     return wrap_df(pydf)