streamlit-nightly 1.36.1.dev20240630__py2.py3-none-any.whl → 1.36.1.dev20240703__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. streamlit/commands/navigation.py +2 -2
  2. streamlit/components/v1/component_arrow.py +16 -11
  3. streamlit/components/v1/custom_component.py +2 -1
  4. streamlit/config.py +1 -136
  5. streamlit/dataframe_util.py +835 -0
  6. streamlit/delta_generator.py +5 -3
  7. streamlit/elements/arrow.py +17 -13
  8. streamlit/elements/dialog_decorator.py +1 -1
  9. streamlit/elements/exception.py +2 -8
  10. streamlit/elements/image.py +2 -1
  11. streamlit/elements/lib/built_in_chart_utils.py +78 -12
  12. streamlit/elements/lib/column_config_utils.py +1 -1
  13. streamlit/elements/lib/pandas_styler_utils.py +2 -2
  14. streamlit/elements/lib/policies.py +20 -2
  15. streamlit/elements/lib/utils.py +100 -10
  16. streamlit/elements/map.py +2 -2
  17. streamlit/elements/media.py +1 -1
  18. streamlit/elements/metric.py +5 -2
  19. streamlit/elements/plotly_chart.py +1 -1
  20. streamlit/elements/pyplot.py +26 -39
  21. streamlit/elements/vega_charts.py +6 -5
  22. streamlit/elements/widgets/button.py +1 -1
  23. streamlit/elements/widgets/camera_input.py +7 -2
  24. streamlit/elements/widgets/chat.py +1 -1
  25. streamlit/elements/widgets/checkbox.py +7 -2
  26. streamlit/elements/widgets/color_picker.py +7 -2
  27. streamlit/elements/widgets/data_editor.py +10 -9
  28. streamlit/elements/widgets/file_uploader.py +7 -2
  29. streamlit/elements/widgets/multiselect.py +6 -7
  30. streamlit/elements/widgets/number_input.py +7 -2
  31. streamlit/elements/widgets/radio.py +6 -7
  32. streamlit/elements/widgets/select_slider.py +6 -7
  33. streamlit/elements/widgets/selectbox.py +6 -7
  34. streamlit/elements/widgets/slider.py +7 -2
  35. streamlit/elements/widgets/text_widgets.py +8 -5
  36. streamlit/elements/widgets/time_widgets.py +7 -2
  37. streamlit/elements/write.py +5 -5
  38. streamlit/errors.py +0 -29
  39. streamlit/navigation/page.py +8 -3
  40. streamlit/proto/NewSession_pb2.pyi +1 -1
  41. streamlit/runtime/app_session.py +0 -4
  42. streamlit/runtime/caching/cache_utils.py +1 -1
  43. streamlit/runtime/scriptrunner/script_runner.py +7 -22
  44. streamlit/runtime/state/common.py +51 -2
  45. streamlit/runtime/state/session_state.py +2 -1
  46. streamlit/runtime/state/session_state_proxy.py +1 -1
  47. streamlit/runtime/state/widgets.py +1 -1
  48. streamlit/static/asset-manifest.json +2 -2
  49. streamlit/static/index.html +1 -1
  50. streamlit/static/static/js/main.28e3c6e9.js +2 -0
  51. streamlit/testing/v1/element_tree.py +3 -3
  52. streamlit/type_util.py +0 -1069
  53. streamlit/watcher/path_watcher.py +1 -2
  54. {streamlit_nightly-1.36.1.dev20240630.dist-info → streamlit_nightly-1.36.1.dev20240703.dist-info}/METADATA +1 -1
  55. {streamlit_nightly-1.36.1.dev20240630.dist-info → streamlit_nightly-1.36.1.dev20240703.dist-info}/RECORD +60 -59
  56. {streamlit_nightly-1.36.1.dev20240630.dist-info → streamlit_nightly-1.36.1.dev20240703.dist-info}/WHEEL +1 -1
  57. streamlit/static/static/js/main.0326e951.js +0 -2
  58. /streamlit/static/static/js/{main.0326e951.js.LICENSE.txt → main.28e3c6e9.js.LICENSE.txt} +0 -0
  59. {streamlit_nightly-1.36.1.dev20240630.data → streamlit_nightly-1.36.1.dev20240703.data}/scripts/streamlit.cmd +0 -0
  60. {streamlit_nightly-1.36.1.dev20240630.dist-info → streamlit_nightly-1.36.1.dev20240703.dist-info}/entry_points.txt +0 -0
  61. {streamlit_nightly-1.36.1.dev20240630.dist-info → streamlit_nightly-1.36.1.dev20240703.dist-info}/top_level.txt +0 -0
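
The bulk of this release is a reshuffle of Streamlit's dataframe handling: the helpers that previously lived in streamlit/type_util.py (-1069 lines) appear to have moved into the new streamlit/dataframe_util.py module (+835 lines), whose full content is the hunk below. As rough orientation, a round trip through the new module could look like the following sketch. This is a minimal, hedged example: streamlit.dataframe_util is an internal module, so the import path and helper names may change between nightlies, and the snippet assumes this nightly build plus pandas are installed.

    import pandas as pd

    from streamlit import dataframe_util

    data = {"a": [1, 2, 3], "b": ["x", "y", None]}

    # Classify the input; a dict of lists maps to DataFormat.COLUMN_VALUE_MAPPING.
    fmt = dataframe_util.determine_data_format(data)

    # Normalize any supported input to a pandas.DataFrame (copied to avoid mutations).
    df = dataframe_util.convert_anything_to_pandas_df(data, ensure_copy=True)

    # Convert the dataframe back into the originally detected format.
    restored = dataframe_util.convert_df_to_data_format(df, fmt)
    print(fmt, df.shape, type(restored))
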
@@ -0,0 +1,835 @@
+ # Copyright (c) Streamlit Inc. (2018-2022) Snowflake Inc. (2022-2024)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """A bunch of useful utilities for dealing with dataframes."""
+
+ from __future__ import annotations
+
+ import contextlib
+ import math
+ from enum import Enum, EnumMeta, auto
+ from typing import (
+     TYPE_CHECKING,
+     Any,
+     Final,
+     Iterable,
+     Protocol,
+     Sequence,
+     Tuple,
+     TypeVar,
+     Union,
+     cast,
+ )
+
+ from typing_extensions import TypeAlias, TypeGuard
+
+ import streamlit as st
+ from streamlit import config, errors, logger, string_util
+ from streamlit.type_util import is_type
+
+ if TYPE_CHECKING:
+     import numpy as np
+     import pyarrow as pa
+     from pandas import DataFrame, Index, Series
+     from pandas.core.indexing import _iLocIndexer
+     from pandas.io.formats.style import Styler
+
+ _LOGGER: Final = logger.get_logger(__name__)
+
+
+ # Maximum number of rows to request from an unevaluated (out-of-core) dataframe
+ MAX_UNEVALUATED_DF_ROWS = 10000
+
+ _PANDAS_DF_TYPE_STR: Final = "pandas.core.frame.DataFrame"
+ _PANDAS_INDEX_TYPE_STR: Final = "pandas.core.indexes.base.Index"
+ _PANDAS_SERIES_TYPE_STR: Final = "pandas.core.series.Series"
+ _PANDAS_STYLER_TYPE_STR: Final = "pandas.io.formats.style.Styler"
+ _NUMPY_ARRAY_TYPE_STR: Final = "numpy.ndarray"
+ _SNOWPARK_DF_TYPE_STR: Final = "snowflake.snowpark.dataframe.DataFrame"
+ _SNOWPARK_DF_ROW_TYPE_STR: Final = "snowflake.snowpark.row.Row"
+ _SNOWPARK_TABLE_TYPE_STR: Final = "snowflake.snowpark.table.Table"
+ _PYSPARK_DF_TYPE_STR: Final = "pyspark.sql.dataframe.DataFrame"
+ _MODIN_DF_TYPE_STR: Final = "modin.pandas.dataframe.DataFrame"
+ _MODIN_SERIES_TYPE_STR: Final = "modin.pandas.series.Series"
+ _SNOWPANDAS_DF_TYPE_STR: Final = "snowflake.snowpark.modin.pandas.dataframe.DataFrame"
+ _SNOWPANDAS_SERIES_TYPE_STR: Final = "snowflake.snowpark.modin.pandas.series.Series"
+
+
+ _DATAFRAME_LIKE_TYPES: Final[tuple[str, ...]] = (
+     _PANDAS_DF_TYPE_STR,
+     _PANDAS_INDEX_TYPE_STR,
+     _PANDAS_SERIES_TYPE_STR,
+     _PANDAS_STYLER_TYPE_STR,
+     _NUMPY_ARRAY_TYPE_STR,
+ )
+
+ # We show a special "UnevaluatedDataFrame" warning for cached funcs
+ # that attempt to return one of these unserializable types:
+ UNEVALUATED_DATAFRAME_TYPES = (
+     _MODIN_DF_TYPE_STR,
+     _MODIN_SERIES_TYPE_STR,
+     _PYSPARK_DF_TYPE_STR,
+     _SNOWPANDAS_DF_TYPE_STR,
+     _SNOWPANDAS_SERIES_TYPE_STR,
+     _SNOWPARK_DF_TYPE_STR,
+     _SNOWPARK_TABLE_TYPE_STR,
+ )
+
+ DataFrameLike: TypeAlias = "Union[DataFrame, Index, Series, Styler]"
+
+ _DATAFRAME_COMPATIBLE_TYPES: Final[tuple[type, ...]] = (
+     dict,
+     list,
+     set,
+     tuple,
+     type(None),
+ )
+
+ _DataFrameCompatible: TypeAlias = Union[dict, list, set, Tuple[Any], None]
+ DataFrameCompatible: TypeAlias = Union[_DataFrameCompatible, DataFrameLike]
+
+ V_co = TypeVar(
+     "V_co",
+     covariant=True,  # https://peps.python.org/pep-0484/#covariance-and-contravariance
+ )
+
+
+ class DataFrameGenericAlias(Protocol[V_co]):
+     """Technically not a GenericAlias, but serves the same purpose in
+     OptionSequence below, in that it is a type which admits DataFrame,
+     but is generic. This allows OptionSequence to be a fully generic type,
+     significantly increasing its usefulness.
+
+     We can't use types.GenericAlias, as it is only available from python>=3.9,
+     and isn't easily back-ported.
+     """
+
+     @property
+     def iloc(self) -> _iLocIndexer: ...
+
+
+ OptionSequence: TypeAlias = Union[
+     Iterable[V_co],
+     DataFrameGenericAlias[V_co],
+ ]
+
+
+ class DataFormat(Enum):
+     """DataFormat is used to determine the format of the data."""
+
+     UNKNOWN = auto()
+     EMPTY = auto()  # None
+     PANDAS_DATAFRAME = auto()  # pd.DataFrame
+     PANDAS_SERIES = auto()  # pd.Series
+     PANDAS_INDEX = auto()  # pd.Index
+     NUMPY_LIST = auto()  # np.array[Scalar]
+     NUMPY_MATRIX = auto()  # np.array[List[Scalar]]
+     PYARROW_TABLE = auto()  # pyarrow.Table
+     SNOWPARK_OBJECT = auto()  # Snowpark DataFrame, Table, List[Row]
+     PYSPARK_OBJECT = auto()  # pyspark.DataFrame
+     MODIN_OBJECT = auto()  # Modin DataFrame, Series
+     SNOWPANDAS_OBJECT = auto()  # Snowpandas DataFrame, Series
+     PANDAS_STYLER = auto()  # pandas Styler
+     LIST_OF_RECORDS = auto()  # List[Dict[str, Scalar]]
+     LIST_OF_ROWS = auto()  # List[List[Scalar]]
+     LIST_OF_VALUES = auto()  # List[Scalar]
+     TUPLE_OF_VALUES = auto()  # Tuple[Scalar]
+     SET_OF_VALUES = auto()  # Set[Scalar]
+     COLUMN_INDEX_MAPPING = auto()  # {column: {index: value}}
+     COLUMN_VALUE_MAPPING = auto()  # {column: List[values]}
+     COLUMN_SERIES_MAPPING = auto()  # {column: Series(values)}
+     KEY_VALUE_DICT = auto()  # {index: value}
+
+
+ def is_dataframe(obj: object) -> TypeGuard[DataFrame]:
+     return is_type(obj, _PANDAS_DF_TYPE_STR)
+
+
+ def is_dataframe_like(obj: object) -> TypeGuard[DataFrameLike]:
+     return any(is_type(obj, t) for t in _DATAFRAME_LIKE_TYPES)
+
+
+ def is_unevaluated_data_object(obj: object) -> bool:
+     """True if the object is one of the supported unevaluated data objects:
+
+     Currently supported objects are:
+     - Snowpark DataFrame / Table
+     - PySpark DataFrame
+     - Modin DataFrame / Series
+     - Snowpandas DataFrame / Series
+
+     Unevaluated means that the data is not yet in the local memory.
+     Unevaluated data objects are treated differently from other data objects by only
+     requesting a subset of the data instead of loading all data into the memory.
+     """
+     return (
+         is_snowpark_data_object(obj)
+         or is_pyspark_data_object(obj)
+         or is_snowpandas_data_object(obj)
+         or is_modin_data_object(obj)
+     )
+
+
+ def is_snowpark_data_object(obj: object) -> bool:
+     """True if obj is a Snowpark DataFrame or Table."""
+     return is_type(obj, _SNOWPARK_TABLE_TYPE_STR) or is_type(obj, _SNOWPARK_DF_TYPE_STR)
+
+
+ def is_snowpark_row_list(obj: object) -> bool:
+     """True if obj is a list of snowflake.snowpark.row.Row."""
+     if not isinstance(obj, list):
+         return False
+     if len(obj) < 1:
+         return False
+     if not hasattr(obj[0], "__class__"):
+         return False
+     return is_type(obj[0], _SNOWPARK_DF_ROW_TYPE_STR)
+
+
+ def is_pyspark_data_object(obj: object) -> bool:
+     """True if obj is of type pyspark.sql.dataframe.DataFrame."""
+     return (
+         is_type(obj, _PYSPARK_DF_TYPE_STR)
+         and hasattr(obj, "toPandas")
+         and callable(obj.toPandas)
+     )
+
+
+ def is_modin_data_object(obj: object) -> bool:
+     """True if obj is a Modin DataFrame or Series."""
+     return is_type(obj, _MODIN_DF_TYPE_STR) or is_type(obj, _MODIN_SERIES_TYPE_STR)
+
+
+ def is_snowpandas_data_object(obj: object) -> bool:
+     """True if obj is a Snowpark Pandas DataFrame or Series."""
+     return is_type(obj, _SNOWPANDAS_DF_TYPE_STR) or is_type(
+         obj, _SNOWPANDAS_SERIES_TYPE_STR
+     )
+
+
+ def is_dataframe_compatible(obj: object) -> TypeGuard[DataFrameCompatible]:
+     """True if obj is of a type that can be passed to convert_anything_to_pandas_df."""
+     return is_dataframe_like(obj) or type(obj) in _DATAFRAME_COMPATIBLE_TYPES
+
+
+ def is_pandas_styler(obj: object) -> TypeGuard[Styler]:
+     return is_type(obj, _PANDAS_STYLER_TYPE_STR)
+
+
+ def convert_anything_to_pandas_df(
+     data: Any,
+     max_unevaluated_rows: int = MAX_UNEVALUATED_DF_ROWS,
+     ensure_copy: bool = False,
+ ) -> DataFrame:
+     """Try to convert different formats to a Pandas Dataframe.
+
+     Parameters
+     ----------
+     data : any
+         The data to convert to a Pandas DataFrame.
+
+     max_unevaluated_rows: int
+         If unevaluated data is detected, this function evaluates it up to
+         max_unevaluated_rows rows (defaults to 10k; st.table uses 100).
+
+     ensure_copy: bool
+         If True, make sure to always return a copy of the data. If False, it depends on
+         the type of the data. For example, a Pandas DataFrame will be returned as-is.
+
+     Returns
+     -------
+     pandas.DataFrame
+
+     """
+     import pandas as pd
+
+     if is_type(data, _PANDAS_DF_TYPE_STR):
+         return data.copy() if ensure_copy else cast(pd.DataFrame, data)
+
+     if is_pandas_styler(data):
+         return cast("DataFrame", data.data.copy() if ensure_copy else data.data)
+
+     if is_type(data, "numpy.ndarray"):
+         return pd.DataFrame([]) if len(data.shape) == 0 else pd.DataFrame(data)
+     if is_modin_data_object(data):
+         data = data.head(max_unevaluated_rows)._to_pandas()
+
+         if isinstance(data, pd.Series):
+             data = data.to_frame()
+
+         if data.shape[0] == max_unevaluated_rows:
+             st.caption(
+                 f"⚠️ Showing only {string_util.simplify_number(max_unevaluated_rows)} "
+                 "rows. Call `_to_pandas()` on the dataframe to show more."
+             )
+         return cast(pd.DataFrame, data)
+
+     if is_pyspark_data_object(data):
+         data = data.limit(max_unevaluated_rows).toPandas()
+         if data.shape[0] == max_unevaluated_rows:
+             st.caption(
+                 f"⚠️ Showing only {string_util.simplify_number(max_unevaluated_rows)} "
+                 "rows. Call `toPandas()` on the dataframe to show more."
+             )
+         return cast(pd.DataFrame, data)
+
+     if is_snowpark_data_object(data):
+         data = data.limit(max_unevaluated_rows).to_pandas()
+         if data.shape[0] == max_unevaluated_rows:
+             st.caption(
+                 f"⚠️ Showing only {string_util.simplify_number(max_unevaluated_rows)} "
+                 "rows. Call `to_pandas()` on the dataframe to show more."
+             )
+         return cast(pd.DataFrame, data)
+
+     if is_snowpandas_data_object(data):
+         data = data.head(max_unevaluated_rows).to_pandas()
+
+         if isinstance(data, pd.Series):
+             data = data.to_frame()
+
+         if data.shape[0] == max_unevaluated_rows:
+             st.caption(
+                 f"⚠️ Showing only {string_util.simplify_number(max_unevaluated_rows)} "
+                 "rows. Call `to_pandas()` on the dataframe to show more."
+             )
+         return cast(pd.DataFrame, data)
+
+     # This is inefficient when data is a pyarrow.Table as it will be converted
+     # back to Arrow when marshalled to protobuf, but area/bar/line charts need
+     # DataFrame magic to generate the correct output.
+     if hasattr(data, "to_pandas"):
+         return cast(pd.DataFrame, data.to_pandas())
+
+     # Try to convert to pandas.DataFrame. This will raise an error if data is not
+     # compatible with the pandas.DataFrame constructor.
+     try:
+         return pd.DataFrame(data)
+
+     except ValueError as ex:
+         if isinstance(data, dict):
+             with contextlib.suppress(ValueError):
+                 # Try to use index orient as back-up to support key-value dicts
+                 return pd.DataFrame.from_dict(data, orient="index")
+         raise errors.StreamlitAPIException(
+             f"""
+ Unable to convert object of type `{type(data)}` to `pandas.DataFrame`.
+ Offending object:
+ ```py
+ {data}
+ ```"""
+         ) from ex
+
+
+ def convert_anything_to_sequence(obj: OptionSequence[V_co]) -> Sequence[V_co]:
+     """Try to convert different formats to an indexable Sequence.
+
+     If the input is a dataframe-like object, we just select the first
+     column to iterate over. If the input cannot be converted to a sequence,
+     a TypeError is raised.
+
+     Parameters
+     ----------
+     obj : OptionSequence
+         The object to convert to a sequence.
+
+     Returns
+     -------
+     Sequence
+         The converted sequence.
+     """
+     if obj is None:
+         return []  # type: ignore
+
+     if isinstance(obj, (str, list, tuple, set, range, EnumMeta)):
+         # This also ensures that the sequence is copied to prevent
+         # potential mutations to the original object.
+         return list(obj)
+
+     if isinstance(obj, dict):
+         return list(obj.keys())
+
+     # Fallback to our DataFrame conversion logic:
+     try:
+         # We use ensure_copy here because the return value of this function is
+         # saved in a widget serde class instance to be used in later script runs,
+         # and we don't want mutations to the options object passed to a
+         # widget to affect the widget.
+         # (See https://github.com/streamlit/streamlit/issues/7534)
+         data_df = convert_anything_to_pandas_df(obj, ensure_copy=True)
+         # Return first column as a list:
+         return (
+             [] if data_df.empty else cast(Sequence[V_co], data_df.iloc[:, 0].to_list())
+         )
+     except errors.StreamlitAPIException as e:
+         raise TypeError(
+             "Object is not an iterable and could not be converted to one. "
+             f"Object type: {type(obj)}"
+         ) from e
+
+
+ def _maybe_truncate_table(
+     table: pa.Table, truncated_rows: int | None = None
+ ) -> pa.Table:
+     """Experimental feature to automatically truncate tables that
+     are larger than the maximum allowed message size. It needs to be enabled
+     via the server.enableArrowTruncation config option.
+
+     Parameters
+     ----------
+     table : pyarrow.Table
+         A table to truncate.
+
+     truncated_rows : int or None
+         The number of rows that have been truncated so far. This is used by
+         the recursion logic to keep track of the total number of truncated
+         rows.
+
+     """
+
+     if config.get_option("server.enableArrowTruncation"):
+         # This is an optimization problem: We don't know at what row
+         # the perfect cut-off is to comply with the max size. But we want to figure
+         # it out in as few iterations as possible. We almost always will cut out
+         # more than required to keep the iterations low.
+
+         # The maximum size allowed for protobuf messages in bytes:
+         max_message_size = int(config.get_option("server.maxMessageSize") * 1e6)
+         # We add 1 MB for other overhead related to the protobuf message.
+         # This is a very conservative estimate, but it should be good enough.
+         table_size = int(table.nbytes + 1 * 1e6)
+         table_rows = table.num_rows
+
+         if table_rows > 1 and table_size > max_message_size:
+             # targeted rows == the number of rows the table should be truncated to.
+             # Calculate an approximation of how many rows we need to truncate to.
+             targeted_rows = math.ceil(table_rows * (max_message_size / table_size))
+             # Make sure to cut out at least a couple of rows to avoid running
+             # this logic too often since it is quite inefficient and could lead
+             # to infinite recursion without these precautions.
+             targeted_rows = math.floor(
+                 max(
+                     min(
+                         # Cut out:
+                         # an additional 5% of the estimated num rows to cut out:
+                         targeted_rows - math.floor((table_rows - targeted_rows) * 0.05),
+                         # at least 1% of table size:
+                         table_rows - (table_rows * 0.01),
+                         # at least 5 rows:
+                         table_rows - 5,
+                     ),
+                     1,  # but it should always have at least 1 row
+                 )
+             )
+             sliced_table = table.slice(0, targeted_rows)
+             return _maybe_truncate_table(
+                 sliced_table, (truncated_rows or 0) + (table_rows - targeted_rows)
+             )
+
+         if truncated_rows:
+             displayed_rows = string_util.simplify_number(table.num_rows)
+             total_rows = string_util.simplify_number(table.num_rows + truncated_rows)
+
+             if displayed_rows == total_rows:
+                 # If the simplified numbers are the same,
+                 # we just display the exact numbers.
+                 displayed_rows = str(table.num_rows)
+                 total_rows = str(table.num_rows + truncated_rows)
+
+             st.caption(
+                 f"⚠️ Showing {displayed_rows} out of {total_rows} "
+                 "rows due to data size limitations."
+             )
+
+     return table
+
+
+ def pyarrow_table_to_bytes(table: pa.Table) -> bytes:
+     """Serialize pyarrow.Table to bytes using Apache Arrow.
+
+     Parameters
+     ----------
+     table : pyarrow.Table
+         A table to convert.
+
+     """
+     try:
+         table = _maybe_truncate_table(table)
+     except RecursionError as err:
+         # This is a very unlikely edge case, but we want to make sure that
+         # it doesn't lead to unexpected behavior.
+         # If there is a recursion error, we just return the table as-is,
+         # which will lead to the normal message-limit-exceeded error.
+         _LOGGER.warning(
+             "Recursion error while truncating Arrow table. This is not "
+             "supposed to happen.",
+             exc_info=err,
+         )
+
+     import pyarrow as pa
+
+     # Convert table to bytes
+     sink = pa.BufferOutputStream()
+     writer = pa.RecordBatchStreamWriter(sink, table.schema)
+     writer.write_table(table)
+     writer.close()
+     return cast(bytes, sink.getvalue().to_pybytes())
+
+
+ def is_colum_type_arrow_incompatible(column: Series[Any] | Index) -> bool:
+     """Return True if the column type is known to cause issues during Arrow conversion."""
+     from pandas.api.types import infer_dtype, is_dict_like, is_list_like
+
+     if column.dtype.kind in [
+         "c",  # complex64, complex128, complex256
+     ]:
+         return True
+
+     if str(column.dtype) in {
+         # These period types are not yet supported by our frontend impl.
+         # See comments in Quiver.ts for more details.
+         "period[B]",
+         "period[N]",
+         "period[ns]",
+         "period[U]",
+         "period[us]",
+     }:
+         return True
+
+     if column.dtype == "object":
+         # The dtype of mixed type columns is always object. The actual type of the column
+         # values can be determined via the infer_dtype function:
+         # https://pandas.pydata.org/docs/reference/api/pandas.api.types.infer_dtype.html
+         inferred_type = infer_dtype(column, skipna=True)
+
+         if inferred_type in [
+             "mixed-integer",
+             "complex",
+         ]:
+             return True
+         elif inferred_type == "mixed":
+             # This includes most of the more complex/custom types (objects, dicts, lists, ...)
+             if len(column) == 0 or not hasattr(column, "iloc"):
+                 # The column seems to be invalid, so we assume it is incompatible.
+                 # But this would most likely never happen since empty columns
+                 # cannot be mixed.
+                 return True
+
+             # Get the first value to check if it is a supported list-like type.
+             first_value = column.iloc[0]
+
+             if (
+                 not is_list_like(first_value)
+                 # dicts are list-like, but have issues in Arrow JS (see comments in Quiver.ts)
+                 or is_dict_like(first_value)
+                 # Frozensets are list-like, but are not compatible with pyarrow.
+                 or isinstance(first_value, frozenset)
+             ):
+                 # This seems to be an incompatible list-like type
+                 return True
+             return False
+     # We did not detect an incompatible type, so we assume it is compatible:
+     return False
+
+
+ def fix_arrow_incompatible_column_types(
+     df: DataFrame, selected_columns: list[str] | None = None
+ ) -> DataFrame:
+     """Fix column types that are not supported by Arrow table.
+
+     This includes mixed types (e.g. mix of integers and strings)
+     as well as complex numbers (complex128 type). These types will cause
+     errors during conversion of the dataframe to an Arrow table.
+     It is fixed by converting all values of the column to strings.
+     This is sufficient for displaying the data on the frontend.
+
+     Parameters
+     ----------
+     df : pandas.DataFrame
+         A dataframe to fix.
+
+     selected_columns: List[str] or None
+         A list of columns to fix. If None, all columns are evaluated.
+
+     Returns
+     -------
+     The fixed dataframe.
+     """
+     import pandas as pd
+
+     # Make a copy, but only initialize if necessary to preserve memory.
+     df_copy: DataFrame | None = None
+     for col in selected_columns or df.columns:
+         if is_colum_type_arrow_incompatible(df[col]):
+             if df_copy is None:
+                 df_copy = df.copy()
+             df_copy[col] = df[col].astype("string")
+
+     # The index can also contain mixed types
+     # causing Arrow issues during conversion.
+     # Skipping multi-indices since they won't return
+     # the correct value from infer_dtype
+     if not selected_columns and (
+         not isinstance(
+             df.index,
+             pd.MultiIndex,
+         )
+         and is_colum_type_arrow_incompatible(df.index)
+     ):
+         if df_copy is None:
+             df_copy = df.copy()
+         df_copy.index = df.index.astype("string")
+     return df_copy if df_copy is not None else df
+
+
+ def data_frame_to_bytes(df: DataFrame) -> bytes:
+     """Serialize pandas.DataFrame to bytes using Apache Arrow.
+
+     Parameters
+     ----------
+     df : pandas.DataFrame
+         A dataframe to convert.
+
+     """
+     import pyarrow as pa
+
+     try:
+         table = pa.Table.from_pandas(df)
+     except (pa.ArrowTypeError, pa.ArrowInvalid, pa.ArrowNotImplementedError) as ex:
+         _LOGGER.info(
+             "Serialization of dataframe to Arrow table was unsuccessful due to: %s. "
+             "Applying automatic fixes for column types to make the dataframe Arrow-compatible.",
+             ex,
+         )
+         df = fix_arrow_incompatible_column_types(df)
+         table = pa.Table.from_pandas(df)
+     return pyarrow_table_to_bytes(table)
+
+
+ def bytes_to_data_frame(source: bytes) -> DataFrame:
+     """Convert bytes to pandas.DataFrame.
+
+     When using this function in production, make sure that
+     the pyarrow version is >= 14.0.1.
+
+     Parameters
+     ----------
+     source : bytes
+         A bytes object to convert.
+
+     """
+     import pyarrow as pa
+
+     reader = pa.RecordBatchStreamReader(source)
+     return reader.read_pandas()
+
+
+ def is_list_of_scalars(data: Iterable[Any]) -> bool:
+     """Check if the list only contains scalar values."""
+     from pandas.api.types import infer_dtype
+
+     # Overview of all values that are interpreted as scalar:
+     # https://pandas.pydata.org/docs/reference/api/pandas.api.types.is_scalar.html
+     return infer_dtype(data, skipna=True) not in ["mixed", "unknown-array"]
+
+
+ def determine_data_format(input_data: Any) -> DataFormat:
+     """Determine the data format of the input data.
+
+     Parameters
+     ----------
+     input_data : Any
+         The input data to determine the data format of.
+
+     Returns
+     -------
+     DataFormat
+         The data format of the input data.
+     """
+     import numpy as np
+     import pandas as pd
+     import pyarrow as pa
+
+     if input_data is None:
+         return DataFormat.EMPTY
+     elif isinstance(input_data, pd.DataFrame):
+         return DataFormat.PANDAS_DATAFRAME
+     elif isinstance(input_data, np.ndarray):
+         if len(input_data.shape) == 1:
+             # For technical reasons, we need to distinguish
+             # one-dimensional numpy arrays from multidimensional ones.
+             return DataFormat.NUMPY_LIST
+         return DataFormat.NUMPY_MATRIX
+     elif isinstance(input_data, pa.Table):
+         return DataFormat.PYARROW_TABLE
+     elif isinstance(input_data, pd.Series):
+         return DataFormat.PANDAS_SERIES
+     elif isinstance(input_data, pd.Index):
+         return DataFormat.PANDAS_INDEX
+     elif is_pandas_styler(input_data):
+         return DataFormat.PANDAS_STYLER
+     elif is_snowpark_data_object(input_data):
+         return DataFormat.SNOWPARK_OBJECT
+     elif is_modin_data_object(input_data):
+         return DataFormat.MODIN_OBJECT
+     elif is_snowpandas_data_object(input_data):
+         return DataFormat.SNOWPANDAS_OBJECT
+     elif is_pyspark_data_object(input_data):
+         return DataFormat.PYSPARK_OBJECT
+     elif isinstance(input_data, (list, tuple, set)):
+         if is_list_of_scalars(input_data):
+             # -> one-dimensional data structure
+             if isinstance(input_data, tuple):
+                 return DataFormat.TUPLE_OF_VALUES
+             if isinstance(input_data, set):
+                 return DataFormat.SET_OF_VALUES
+             return DataFormat.LIST_OF_VALUES
+         else:
+             # -> Multi-dimensional data structure
+             # This should always contain at least one element,
+             # otherwise the values type from infer_dtype would have been empty
+             first_element = next(iter(input_data))
+             if isinstance(first_element, dict):
+                 return DataFormat.LIST_OF_RECORDS
+             if isinstance(first_element, (list, tuple, set)):
+                 return DataFormat.LIST_OF_ROWS
+     elif isinstance(input_data, dict):
+         if not input_data:
+             return DataFormat.KEY_VALUE_DICT
+         if len(input_data) > 0:
+             first_value = next(iter(input_data.values()))
+             if isinstance(first_value, dict):
+                 return DataFormat.COLUMN_INDEX_MAPPING
+             if isinstance(first_value, (list, tuple)):
+                 return DataFormat.COLUMN_VALUE_MAPPING
+             if isinstance(first_value, pd.Series):
+                 return DataFormat.COLUMN_SERIES_MAPPING
+             # In the future, we could potentially also support the tight & split formats here
+             if is_list_of_scalars(input_data.values()):
+                 # Only use the key-value dict format if the values are only scalar values
+                 return DataFormat.KEY_VALUE_DICT
+     return DataFormat.UNKNOWN
+
+
+ def _unify_missing_values(df: DataFrame) -> DataFrame:
+     """Unify all missing values in a DataFrame to None.
+
+     Pandas uses a variety of values to represent missing values, including np.nan,
+     NaT, None, and pd.NA. This function replaces all of these values with None,
+     which is the only missing value type that is supported by all data formats.
+     """
+     import numpy as np
+
+     return df.fillna(np.nan).replace([np.nan], [None])
+
+
+ def convert_df_to_data_format(
+     df: DataFrame, data_format: DataFormat
+ ) -> (
+     DataFrame
+     | Series[Any]
+     | pa.Table
+     | np.ndarray[Any, np.dtype[Any]]
+     | tuple[Any]
+     | list[Any]
+     | set[Any]
+     | dict[str, Any]
+ ):
+     """Convert a dataframe to the specified data format.
+
+     Parameters
+     ----------
+     df : pd.DataFrame
+         The dataframe to convert.
+
+     data_format : DataFormat
+         The data format to convert to.
+
+     Returns
+     -------
+     pd.DataFrame, pd.Series, pyarrow.Table, np.ndarray, list, set, tuple, or dict.
+         The converted dataframe.
+     """
+
+     if data_format in [
+         DataFormat.EMPTY,
+         DataFormat.PANDAS_DATAFRAME,
+         DataFormat.SNOWPARK_OBJECT,
+         DataFormat.PYSPARK_OBJECT,
+         DataFormat.PANDAS_INDEX,
+         DataFormat.PANDAS_STYLER,
+         DataFormat.MODIN_OBJECT,
+         DataFormat.SNOWPANDAS_OBJECT,
+     ]:
+         return df
+     elif data_format == DataFormat.NUMPY_LIST:
+         import numpy as np
+
+         # It's a 1-dimensional array, so we only return
+         # the first column as numpy array
+         # Calling to_numpy() on the full DataFrame would result in:
+         # [[1], [2]] instead of [1, 2]
+         return np.ndarray(0) if df.empty else df.iloc[:, 0].to_numpy()
+     elif data_format == DataFormat.NUMPY_MATRIX:
+         import numpy as np
+
+         return np.ndarray(0) if df.empty else df.to_numpy()
+     elif data_format == DataFormat.PYARROW_TABLE:
+         import pyarrow as pa
+
+         return pa.Table.from_pandas(df)
+     elif data_format == DataFormat.PANDAS_SERIES:
+         # Select first column in dataframe and create a new series based on the values
+         if len(df.columns) != 1:
+             raise ValueError(
+                 f"DataFrame is expected to have a single column but has {len(df.columns)}."
+             )
+         return df[df.columns[0]]
+     elif data_format == DataFormat.LIST_OF_RECORDS:
+         return _unify_missing_values(df).to_dict(orient="records")
+     elif data_format == DataFormat.LIST_OF_ROWS:
+         # to_numpy converts the dataframe to a list of rows
+         return _unify_missing_values(df).to_numpy().tolist()
+     elif data_format == DataFormat.COLUMN_INDEX_MAPPING:
+         return _unify_missing_values(df).to_dict(orient="dict")
+     elif data_format == DataFormat.COLUMN_VALUE_MAPPING:
+         return _unify_missing_values(df).to_dict(orient="list")
+     elif data_format == DataFormat.COLUMN_SERIES_MAPPING:
+         return df.to_dict(orient="series")
+     elif data_format in [
+         DataFormat.LIST_OF_VALUES,
+         DataFormat.TUPLE_OF_VALUES,
+         DataFormat.SET_OF_VALUES,
+     ]:
+         df = _unify_missing_values(df)
+         return_list = []
+         if len(df.columns) == 1:
+             # Get the first column and convert to list
+             return_list = df[df.columns[0]].tolist()
+         elif len(df.columns) >= 1:
+             raise ValueError(
+                 f"DataFrame is expected to have a single column but has {len(df.columns)}."
+             )
+         if data_format == DataFormat.TUPLE_OF_VALUES:
+             return tuple(return_list)
+         if data_format == DataFormat.SET_OF_VALUES:
+             return set(return_list)
+         return return_list
+     elif data_format == DataFormat.KEY_VALUE_DICT:
+         df = _unify_missing_values(df)
+         # The key is expected to be the index -> this will return the first column
+         # as a dict with index as key.
+         return {} if df.empty else df.iloc[:, 0].to_dict()
+
+     raise ValueError(f"Unsupported input data format: {data_format}")
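
The new module also carries the Arrow serialization path that Streamlit's data elements rely on (data_frame_to_bytes, bytes_to_data_frame, and the fix_arrow_incompatible_column_types fallback). Below is a minimal round-trip sketch of that path, assuming pandas and pyarrow are installed and, again, that these internal helpers keep their current names.

    import pandas as pd

    from streamlit import dataframe_util

    # "mixed" holds both an int and a str, which pyarrow cannot serialize directly.
    df = pd.DataFrame({"id": [1, 2], "mixed": [3, "four"]})

    # data_frame_to_bytes retries with fix_arrow_incompatible_column_types when
    # pyarrow rejects a column, then writes the table as an Arrow IPC stream.
    payload = dataframe_util.data_frame_to_bytes(df)

    # bytes_to_data_frame reads the IPC stream back into a pandas.DataFrame.
    restored = dataframe_util.bytes_to_data_frame(payload)
    print(restored.dtypes)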