snowpark-connect 0.33.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of snowpark-connect might be problematic. Click here for more details.

Files changed (39) hide show
  1. snowflake/snowpark_connect/column_name_handler.py +42 -56
  2. snowflake/snowpark_connect/config.py +9 -0
  3. snowflake/snowpark_connect/expression/literal.py +12 -12
  4. snowflake/snowpark_connect/expression/map_sql_expression.py +6 -0
  5. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +147 -63
  6. snowflake/snowpark_connect/expression/map_unresolved_function.py +31 -28
  7. snowflake/snowpark_connect/relation/map_aggregate.py +156 -255
  8. snowflake/snowpark_connect/relation/map_column_ops.py +14 -0
  9. snowflake/snowpark_connect/relation/map_join.py +364 -234
  10. snowflake/snowpark_connect/relation/map_sql.py +309 -150
  11. snowflake/snowpark_connect/relation/read/map_read.py +9 -1
  12. snowflake/snowpark_connect/relation/read/map_read_csv.py +19 -2
  13. snowflake/snowpark_connect/relation/read/map_read_json.py +3 -0
  14. snowflake/snowpark_connect/relation/read/map_read_parquet.py +3 -0
  15. snowflake/snowpark_connect/relation/read/map_read_text.py +4 -0
  16. snowflake/snowpark_connect/relation/read/reader_config.py +10 -0
  17. snowflake/snowpark_connect/relation/read/utils.py +41 -0
  18. snowflake/snowpark_connect/relation/utils.py +4 -2
  19. snowflake/snowpark_connect/relation/write/map_write.py +65 -17
  20. snowflake/snowpark_connect/utils/context.py +0 -14
  21. snowflake/snowpark_connect/utils/expression_transformer.py +163 -0
  22. snowflake/snowpark_connect/utils/session.py +0 -4
  23. snowflake/snowpark_connect/utils/udf_helper.py +1 -0
  24. snowflake/snowpark_connect/utils/udtf_helper.py +3 -0
  25. snowflake/snowpark_connect/version.py +1 -1
  26. {snowpark_connect-0.33.0.dist-info → snowpark_connect-1.0.0.dist-info}/METADATA +2 -2
  27. {snowpark_connect-0.33.0.dist-info → snowpark_connect-1.0.0.dist-info}/RECORD +35 -38
  28. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/__init__.py +0 -16
  29. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/accessors.py +0 -1281
  30. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/functions.py +0 -203
  31. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/utils.py +0 -202
  32. {snowpark_connect-0.33.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-connect +0 -0
  33. {snowpark_connect-0.33.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-session +0 -0
  34. {snowpark_connect-0.33.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-submit +0 -0
  35. {snowpark_connect-0.33.0.dist-info → snowpark_connect-1.0.0.dist-info}/WHEEL +0 -0
  36. {snowpark_connect-0.33.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/LICENSE-binary +0 -0
  37. {snowpark_connect-0.33.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/LICENSE.txt +0 -0
  38. {snowpark_connect-0.33.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/NOTICE-binary +0 -0
  39. {snowpark_connect-0.33.0.dist-info → snowpark_connect-1.0.0.dist-info}/top_level.txt +0 -0
@@ -1,1281 +0,0 @@
1
- #
2
- # Licensed to the Apache Software Foundation (ASF) under one or more
3
- # contributor license agreements. See the NOTICE file distributed with
4
- # this work for additional information regarding copyright ownership.
5
- # The ASF licenses this file to You under the Apache License, Version 2.0
6
- # (the "License"); you may not use this file except in compliance with
7
- # the License. You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
- #
17
-
18
- """
19
- Spark related features. Usually, the features here are missing in pandas
20
- but Spark has it.
21
- """
22
- from abc import ABCMeta, abstractmethod
23
- from typing import TYPE_CHECKING, Callable, Generic, List, Optional, Union
24
-
25
- from pyspark import StorageLevel
26
- from pyspark.sql import Column as PySparkColumn, DataFrame as PySparkDataFrame
27
- from pyspark.sql.types import DataType, StructType
28
-
29
- from pyspark.pandas._typing import IndexOpsLike
30
- from pyspark.pandas.internal import InternalField
31
-
32
- # For Supporting Spark Connect
33
- from pyspark.sql.utils import get_column_class, get_dataframe_class
34
-
35
- if TYPE_CHECKING:
36
- from pyspark.sql._typing import OptionalPrimitiveType
37
- from pyspark._typing import PrimitiveType
38
-
39
- import pyspark.pandas as ps
40
- from pyspark.pandas.frame import CachedDataFrame
41
-
42
-
43
- class SparkIndexOpsMethods(Generic[IndexOpsLike], metaclass=ABCMeta):
44
- """Spark related features. Usually, the features here are missing in pandas
45
- but Spark has it."""
46
-
47
- def __init__(self, data: IndexOpsLike):
48
- self._data = data
49
-
50
- @property
51
- def data_type(self) -> DataType:
52
- """Returns the data type as defined by Spark, as a Spark DataType object."""
53
- return self._data._internal.spark_type_for(self._data._column_label)
54
-
55
- @property
56
- def nullable(self) -> bool:
57
- """Returns the nullability as defined by Spark."""
58
- return self._data._internal.spark_column_nullable_for(self._data._column_label)
59
-
60
- @property
61
- def column(self) -> PySparkColumn:
62
- """
63
- Spark Column object representing the Series/Index.
64
-
65
- .. note:: This Spark Column object is strictly stick to its base DataFrame the Series/Index
66
- was derived from.
67
- """
68
- return self._data._internal.spark_column_for(self._data._column_label)
69
-
70
- def transform(self, func: Callable[[PySparkColumn], PySparkColumn]) -> IndexOpsLike:
71
- """
72
- Applies a function that takes and returns a Spark column. It allows natively
73
- applying a Spark function and column APIs with the Spark column internally used
74
- in Series or Index. The output length of the Spark column should be the same as input's.
75
-
76
- .. note:: It requires to have the same input and output length; therefore,
77
- the aggregate Spark functions such as count does not work.
78
-
79
- Parameters
80
- ----------
81
- func : function
82
- Function to use for transforming the data by using Spark columns.
83
-
84
- Returns
85
- -------
86
- Series or Index
87
-
88
- Raises
89
- ------
90
- ValueError : If the output from the function is not a Spark column.
91
-
92
- Examples
93
- --------
94
- >>> from pyspark.sql.functions import log
95
- >>> df = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, columns=["a", "b"])
96
- >>> df
97
- a b
98
- 0 1 4
99
- 1 2 5
100
- 2 3 6
101
-
102
- >>> df.a.spark.transform(lambda c: log(c))
103
- 0 0.000000
104
- 1 0.693147
105
- 2 1.098612
106
- Name: a, dtype: float64
107
-
108
- >>> df.index.spark.transform(lambda c: c + 10) # doctest: +SKIP
109
- Int64Index([10, 11, 12], dtype='int64')
110
-
111
- >>> df.a.spark.transform(lambda c: c + df.b.spark.column)
112
- 0 5
113
- 1 7
114
- 2 9
115
- Name: a, dtype: int64
116
- """
117
- from pyspark.pandas import MultiIndex
118
-
119
- if isinstance(self._data, MultiIndex):
120
- raise NotImplementedError("MultiIndex does not support spark.transform yet.")
121
- output = func(self._data.spark.column)
122
- Column = get_column_class()
123
- if not isinstance(output, Column):
124
- raise ValueError(
125
- "The output of the function [%s] should be of a "
126
- "pyspark.sql.Column; however, got [%s]." % (func, type(output))
127
- )
128
- # Trigger the resolution so it throws an exception if anything does wrong
129
- # within the function, for example,
130
- # `df1.a.spark.transform(lambda _: F.col("non-existent"))`.
131
- field = InternalField.from_struct_field(
132
- self._data._internal.spark_frame.select(output).schema.fields[0]
133
- )
134
- return self._data._with_new_scol(scol=output, field=field)
135
-
136
- @property
137
- @abstractmethod
138
- def analyzed(self) -> IndexOpsLike:
139
- pass
140
-
141
-
142
- class SparkSeriesMethods(SparkIndexOpsMethods["ps.Series"]):
143
- def apply(self, func: Callable[[PySparkColumn], PySparkColumn]) -> "ps.Series":
144
- """
145
- Applies a function that takes and returns a Spark column. It allows to natively
146
- apply a Spark function and column APIs with the Spark column internally used
147
- in Series or Index.
148
-
149
- .. note:: It forces to lose the index and end up using the default index. It is
150
- preferred to use :meth:`Series.spark.transform` or `:meth:`DataFrame.spark.apply`
151
- with specifying the `index_col`.
152
-
153
- .. note:: It does not require to have the same length of the input and output.
154
- However, it requires to create a new DataFrame internally which will require
155
- to set `compute.ops_on_diff_frames` to compute even with the same origin
156
- DataFrame is expensive, whereas :meth:`Series.spark.transform` does not
157
- require it.
158
-
159
- Parameters
160
- ----------
161
- func : function
162
- Function to apply the function against the data by using Spark columns.
163
-
164
- Returns
165
- -------
166
- Series
167
-
168
- Raises
169
- ------
170
- ValueError : If the output from the function is not a Spark column.
171
-
172
- Examples
173
- --------
174
- >>> from pyspark import pandas as ps
175
- >>> from pyspark.sql.functions import count, lit
176
- >>> df = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, columns=["a", "b"])
177
- >>> df
178
- a b
179
- 0 1 4
180
- 1 2 5
181
- 2 3 6
182
-
183
- >>> df.a.spark.apply(lambda c: count(c))
184
- 0 3
185
- Name: a, dtype: int64
186
-
187
- >>> df.a.spark.apply(lambda c: c + df.b.spark.column)
188
- 0 5
189
- 1 7
190
- 2 9
191
- Name: a, dtype: int64
192
- """
193
- from pyspark.pandas.frame import DataFrame
194
- from pyspark.pandas.series import Series, first_series
195
- from pyspark.pandas.internal import HIDDEN_COLUMNS
196
-
197
- output = func(self._data.spark.column)
198
- Column = get_column_class()
199
- if not isinstance(output, Column):
200
- raise ValueError(
201
- "The output of the function [%s] should be of a "
202
- "pyspark.sql.Column; however, got [%s]." % (func, type(output))
203
- )
204
- assert isinstance(self._data, Series)
205
-
206
- sdf = self._data._internal.spark_frame.drop(*HIDDEN_COLUMNS).select(output)
207
- # Lose index.
208
- return first_series(DataFrame(sdf)).rename(self._data.name)
209
-
210
- @property
211
- def analyzed(self) -> "ps.Series":
212
- """
213
- Returns a new Series with the analyzed Spark DataFrame.
214
-
215
- After multiple operations, the underlying Spark plan could grow huge
216
- and make the Spark planner take a long time to finish the planning.
217
-
218
- This function is for the workaround to avoid it.
219
-
220
- .. note:: After analyzing, operations between the analyzed Series and the original one
221
- will **NOT** work without setting a config `compute.ops_on_diff_frames` to `True`.
222
-
223
- Returns
224
- -------
225
- Series
226
-
227
- Examples
228
- --------
229
- >>> ser = ps.Series([1, 2, 3])
230
- >>> ser
231
- 0 1
232
- 1 2
233
- 2 3
234
- dtype: int64
235
-
236
- The analyzed one should return the same value.
237
-
238
- >>> ser.spark.analyzed
239
- 0 1
240
- 1 2
241
- 2 3
242
- dtype: int64
243
-
244
- However, it won't work with the same anchor Series.
245
-
246
- >>> ser + ser.spark.analyzed
247
- Traceback (most recent call last):
248
- ...
249
- ValueError: ... enable 'compute.ops_on_diff_frames' option.
250
-
251
- >>> with ps.option_context('compute.ops_on_diff_frames', True):
252
- ... (ser + ser.spark.analyzed).sort_index()
253
- 0 2
254
- 1 4
255
- 2 6
256
- dtype: int64
257
- """
258
- from pyspark.pandas.frame import DataFrame
259
- from pyspark.pandas.series import first_series
260
-
261
- return first_series(DataFrame(self._data._internal.resolved_copy))
262
-
263
-
264
- class SparkIndexMethods(SparkIndexOpsMethods["ps.Index"]):
265
- @property
266
- def analyzed(self) -> "ps.Index":
267
- """
268
- Returns a new Index with the analyzed Spark DataFrame.
269
-
270
- After multiple operations, the underlying Spark plan could grow huge
271
- and make the Spark planner take a long time to finish the planning.
272
-
273
- This function is for the workaround to avoid it.
274
-
275
- .. note:: After analyzing, operations between the analyzed Series and the original one
276
- will **NOT** work without setting a config `compute.ops_on_diff_frames` to `True`.
277
-
278
- Returns
279
- -------
280
- Index
281
-
282
- Examples
283
- --------
284
- >>> import pyspark.pandas as ps
285
- >>> idx = ps.Index([1, 2, 3])
286
- >>> idx # doctest: +SKIP
287
- Int64Index([1, 2, 3], dtype='int64')
288
-
289
- The analyzed one should return the same value.
290
-
291
- >>> idx.spark.analyzed # doctest: +SKIP
292
- Int64Index([1, 2, 3], dtype='int64')
293
-
294
- However, it won't work with the same anchor Index.
295
-
296
- >>> idx + idx.spark.analyzed
297
- Traceback (most recent call last):
298
- ...
299
- ValueError: ... enable 'compute.ops_on_diff_frames' option.
300
-
301
- >>> with ps.option_context('compute.ops_on_diff_frames', True):
302
- ... (idx + idx.spark.analyzed).sort_values() # doctest: +SKIP
303
- Int64Index([2, 4, 6], dtype='int64')
304
- """
305
- from pyspark.pandas.frame import DataFrame
306
-
307
- return DataFrame(self._data._internal.resolved_copy).index
308
-
309
-
310
- class SparkFrameMethods:
311
- """Spark related features. Usually, the features here are missing in pandas
312
- but Spark has it."""
313
-
314
- def __init__(self, frame: "ps.DataFrame"):
315
- self._psdf = frame
316
-
317
- def schema(self, index_col: Optional[Union[str, List[str]]] = None) -> StructType:
318
- """
319
- Returns the underlying Spark schema.
320
-
321
- Returns
322
- -------
323
- pyspark.sql.types.StructType
324
- The underlying Spark schema.
325
-
326
- Parameters
327
- ----------
328
- index_col: str or list of str, optional, default: None
329
- Column names to be used in Spark to represent pandas-on-Spark's index. The index name
330
- in pandas-on-Spark is ignored. By default, the index is always lost.
331
-
332
- Examples
333
- --------
334
- >>> df = ps.DataFrame({'a': list('abc'),
335
- ... 'b': list(range(1, 4)),
336
- ... 'c': np.arange(3, 6).astype('i1'),
337
- ... 'd': np.arange(4.0, 7.0, dtype='float64'),
338
- ... 'e': [True, False, True],
339
- ... 'f': pd.date_range('20130101', periods=3)},
340
- ... columns=['a', 'b', 'c', 'd', 'e', 'f'])
341
- >>> df.spark.schema().simpleString()
342
- 'struct<a:string,b:bigint,c:tinyint,d:double,e:boolean,f:timestamp>'
343
- >>> df.spark.schema(index_col='index').simpleString()
344
- 'struct<index:bigint,a:string,b:bigint,c:tinyint,d:double,e:boolean,f:timestamp>'
345
- """
346
- return self.frame(index_col).schema
347
-
348
- def print_schema(self, index_col: Optional[Union[str, List[str]]] = None) -> None:
349
- """
350
- Prints out the underlying Spark schema in the tree format.
351
-
352
- Parameters
353
- ----------
354
- index_col: str or list of str, optional, default: None
355
- Column names to be used in Spark to represent pandas-on-Spark's index. The index name
356
- in pandas-on-Spark is ignored. By default, the index is always lost.
357
-
358
- Returns
359
- -------
360
- None
361
-
362
- Examples
363
- --------
364
- >>> df = ps.DataFrame({'a': list('abc'),
365
- ... 'b': list(range(1, 4)),
366
- ... 'c': np.arange(3, 6).astype('i1'),
367
- ... 'd': np.arange(4.0, 7.0, dtype='float64'),
368
- ... 'e': [True, False, True],
369
- ... 'f': pd.date_range('20130101', periods=3)},
370
- ... columns=['a', 'b', 'c', 'd', 'e', 'f'])
371
- >>> df.spark.print_schema() # doctest: +NORMALIZE_WHITESPACE
372
- root
373
- |-- a: string (nullable = false)
374
- |-- b: long (nullable = false)
375
- |-- c: byte (nullable = false)
376
- |-- d: double (nullable = false)
377
- |-- e: boolean (nullable = false)
378
- |-- f: timestamp (nullable = false)
379
- >>> df.spark.print_schema(index_col='index') # doctest: +NORMALIZE_WHITESPACE
380
- root
381
- |-- index: long (nullable = false)
382
- |-- a: string (nullable = false)
383
- |-- b: long (nullable = false)
384
- |-- c: byte (nullable = false)
385
- |-- d: double (nullable = false)
386
- |-- e: boolean (nullable = false)
387
- |-- f: timestamp (nullable = false)
388
- """
389
- self.frame(index_col).printSchema()
390
-
391
- def frame(self, index_col: Optional[Union[str, List[str]]] = None) -> PySparkDataFrame:
392
- """
393
- Return the current DataFrame as a Spark DataFrame. :meth:`DataFrame.spark.frame` is an
394
- alias of :meth:`DataFrame.to_spark`.
395
-
396
- Parameters
397
- ----------
398
- index_col: str or list of str, optional, default: None
399
- Column names to be used in Spark to represent pandas-on-Spark's index. The index name
400
- in pandas-on-Spark is ignored. By default, the index is always lost.
401
-
402
- See Also
403
- --------
404
- DataFrame.to_spark
405
- DataFrame.pandas_api
406
- DataFrame.spark.frame
407
-
408
- Examples
409
- --------
410
- By default, this method loses the index as below.
411
-
412
- >>> df = ps.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
413
- >>> df.to_spark().show() # doctest: +NORMALIZE_WHITESPACE
414
- +---+---+---+
415
- | a| b| c|
416
- +---+---+---+
417
- | 1| 4| 7|
418
- | 2| 5| 8|
419
- | 3| 6| 9|
420
- +---+---+---+
421
-
422
- >>> df = ps.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
423
- >>> df.spark.frame().show() # doctest: +NORMALIZE_WHITESPACE
424
- +---+---+---+
425
- | a| b| c|
426
- +---+---+---+
427
- | 1| 4| 7|
428
- | 2| 5| 8|
429
- | 3| 6| 9|
430
- +---+---+---+
431
-
432
- If `index_col` is set, it keeps the index column as specified.
433
-
434
- >>> df.to_spark(index_col="index").show() # doctest: +NORMALIZE_WHITESPACE
435
- +-----+---+---+---+
436
- |index| a| b| c|
437
- +-----+---+---+---+
438
- | 0| 1| 4| 7|
439
- | 1| 2| 5| 8|
440
- | 2| 3| 6| 9|
441
- +-----+---+---+---+
442
-
443
- Keeping an index column is useful when you want to call some Spark APIs and
444
- convert it back to pandas-on-Spark DataFrame without creating a default index, which
445
- can affect performance.
446
-
447
- >>> spark_df = df.to_spark(index_col="index")
448
- >>> spark_df = spark_df.filter("a == 2")
449
- >>> spark_df.pandas_api(index_col="index") # doctest: +NORMALIZE_WHITESPACE
450
- a b c
451
- index
452
- 1 2 5 8
453
-
454
- In case of multi-index, specify a list to `index_col`.
455
-
456
- >>> new_df = df.set_index("a", append=True)
457
- >>> new_spark_df = new_df.to_spark(index_col=["index_1", "index_2"])
458
- >>> new_spark_df.show() # doctest: +NORMALIZE_WHITESPACE
459
- +-------+-------+---+---+
460
- |index_1|index_2| b| c|
461
- +-------+-------+---+---+
462
- | 0| 1| 4| 7|
463
- | 1| 2| 5| 8|
464
- | 2| 3| 6| 9|
465
- +-------+-------+---+---+
466
-
467
- Can be converted back to pandas-on-Spark DataFrame.
468
-
469
- >>> new_spark_df.pandas_api(
470
- ... index_col=["index_1", "index_2"]) # doctest: +NORMALIZE_WHITESPACE
471
- b c
472
- index_1 index_2
473
- 0 1 4 7
474
- 1 2 5 8
475
- 2 3 6 9
476
- """
477
- from pyspark.pandas.utils import name_like_string
478
-
479
- psdf = self._psdf
480
-
481
- data_column_names = []
482
- data_columns = []
483
- for i, (label, spark_column, column_name) in enumerate(
484
- zip(
485
- psdf._internal.column_labels,
486
- psdf._internal.data_spark_columns,
487
- psdf._internal.data_spark_column_names,
488
- )
489
- ):
490
- name = str(i) if label is None else name_like_string(label)
491
- data_column_names.append(name)
492
- if column_name != name:
493
- spark_column = spark_column.alias(name)
494
- data_columns.append(spark_column)
495
-
496
- if index_col is None:
497
- return psdf._internal.spark_frame.select(data_columns)
498
- else:
499
- if isinstance(index_col, str):
500
- index_col = [index_col]
501
-
502
- old_index_scols = psdf._internal.index_spark_columns
503
-
504
- if len(index_col) != len(old_index_scols):
505
- raise ValueError(
506
- "length of index columns is %s; however, the length of the given "
507
- "'index_col' is %s." % (len(old_index_scols), len(index_col))
508
- )
509
-
510
- if any(col in data_column_names for col in index_col):
511
- raise ValueError("'index_col' cannot be overlapped with other columns.")
512
-
513
- new_index_scols = [
514
- index_scol.alias(col) for index_scol, col in zip(old_index_scols, index_col)
515
- ]
516
- return psdf._internal.spark_frame.select(new_index_scols + data_columns)
517
-
518
- def cache(self) -> "CachedDataFrame":
519
- """
520
- Yields and caches the current DataFrame.
521
-
522
- The pandas-on-Spark DataFrame is yielded as a protected resource and its corresponding
523
- data is cached which gets uncached after execution goes off the context.
524
-
525
- If you want to specify the StorageLevel manually, use :meth:`DataFrame.spark.persist`
526
-
527
- See Also
528
- --------
529
- DataFrame.spark.persist
530
-
531
- Examples
532
- --------
533
- >>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
534
- ... columns=['dogs', 'cats'])
535
- >>> df
536
- dogs cats
537
- 0 0.2 0.3
538
- 1 0.0 0.6
539
- 2 0.6 0.0
540
- 3 0.2 0.1
541
-
542
- >>> with df.spark.cache() as cached_df:
543
- ... print(cached_df.count())
544
- ...
545
- dogs 4
546
- cats 4
547
- dtype: int64
548
-
549
- >>> df = df.spark.cache()
550
- >>> df.to_pandas().mean(axis=1)
551
- 0 0.25
552
- 1 0.30
553
- 2 0.30
554
- 3 0.15
555
- dtype: float64
556
-
557
- To uncache the dataframe, use `unpersist` function
558
-
559
- >>> df.spark.unpersist()
560
- """
561
- from pyspark.pandas.frame import CachedDataFrame
562
-
563
- self._psdf._update_internal_frame(
564
- self._psdf._internal.resolved_copy, check_same_anchor=False
565
- )
566
- return CachedDataFrame(self._psdf._internal)
567
-
568
- def persist(
569
- self, storage_level: StorageLevel = StorageLevel.MEMORY_AND_DISK
570
- ) -> "CachedDataFrame":
571
- """
572
- Yields and caches the current DataFrame with a specific StorageLevel.
573
- If a StorageLevel is not given, the `MEMORY_AND_DISK` level is used by default like PySpark.
574
-
575
- The pandas-on-Spark DataFrame is yielded as a protected resource and its corresponding
576
- data is cached which gets uncached after execution goes off the context.
577
-
578
- See Also
579
- --------
580
- DataFrame.spark.cache
581
-
582
- Examples
583
- --------
584
- >>> import pyspark
585
- >>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
586
- ... columns=['dogs', 'cats'])
587
- >>> df
588
- dogs cats
589
- 0 0.2 0.3
590
- 1 0.0 0.6
591
- 2 0.6 0.0
592
- 3 0.2 0.1
593
-
594
- Set the StorageLevel to `MEMORY_ONLY`.
595
-
596
- >>> with df.spark.persist(pyspark.StorageLevel.MEMORY_ONLY) as cached_df:
597
- ... print(cached_df.spark.storage_level)
598
- ... print(cached_df.count())
599
- ...
600
- Memory Serialized 1x Replicated
601
- dogs 4
602
- cats 4
603
- dtype: int64
604
-
605
- Set the StorageLevel to `DISK_ONLY`.
606
-
607
- >>> with df.spark.persist(pyspark.StorageLevel.DISK_ONLY) as cached_df:
608
- ... print(cached_df.spark.storage_level)
609
- ... print(cached_df.count())
610
- ...
611
- Disk Serialized 1x Replicated
612
- dogs 4
613
- cats 4
614
- dtype: int64
615
-
616
- If a StorageLevel is not given, it uses `MEMORY_AND_DISK` by default.
617
-
618
- >>> with df.spark.persist() as cached_df:
619
- ... print(cached_df.spark.storage_level)
620
- ... print(cached_df.count())
621
- ...
622
- Disk Memory Serialized 1x Replicated
623
- dogs 4
624
- cats 4
625
- dtype: int64
626
-
627
- >>> df = df.spark.persist()
628
- >>> df.to_pandas().mean(axis=1)
629
- 0 0.25
630
- 1 0.30
631
- 2 0.30
632
- 3 0.15
633
- dtype: float64
634
-
635
- To uncache the dataframe, use `unpersist` function
636
-
637
- >>> df.spark.unpersist()
638
- """
639
- from pyspark.pandas.frame import CachedDataFrame
640
-
641
- self._psdf._update_internal_frame(
642
- self._psdf._internal.resolved_copy, check_same_anchor=False
643
- )
644
- return CachedDataFrame(self._psdf._internal, storage_level=storage_level)
645
-
646
- def hint(self, name: str, *parameters: "PrimitiveType") -> "ps.DataFrame":
647
- """
648
- Specifies some hint on the current DataFrame.
649
-
650
- Parameters
651
- ----------
652
- name : A name of the hint.
653
- parameters : Optional parameters.
654
-
655
- Returns
656
- -------
657
- ret : DataFrame with the hint.
658
-
659
- See Also
660
- --------
661
- broadcast : Marks a DataFrame as small enough for use in broadcast joins.
662
-
663
- Examples
664
- --------
665
- >>> df1 = ps.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
666
- ... 'value': [1, 2, 3, 5]},
667
- ... columns=['lkey', 'value']).set_index('lkey')
668
- >>> df2 = ps.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
669
- ... 'value': [5, 6, 7, 8]},
670
- ... columns=['rkey', 'value']).set_index('rkey')
671
- >>> merged = df1.merge(df2.spark.hint("broadcast"), left_index=True, right_index=True)
672
- >>> merged.spark.explain() # doctest: +ELLIPSIS
673
- == Physical Plan ==
674
- ...
675
- ...BroadcastHashJoin...
676
- ...
677
- """
678
- from pyspark.pandas.frame import DataFrame
679
-
680
- internal = self._psdf._internal.resolved_copy
681
- return DataFrame(internal.with_new_sdf(internal.spark_frame.hint(name, *parameters)))
682
-
683
- def to_table(
684
- self,
685
- name: str,
686
- format: Optional[str] = None,
687
- mode: str = "overwrite",
688
- partition_cols: Optional[Union[str, List[str]]] = None,
689
- index_col: Optional[Union[str, List[str]]] = None,
690
- **options: "OptionalPrimitiveType",
691
- ) -> None:
692
- """
693
- Write the DataFrame into a Spark table. :meth:`DataFrame.spark.to_table`
694
- is an alias of :meth:`DataFrame.to_table`.
695
-
696
- Parameters
697
- ----------
698
- name : str, required
699
- Table name in Spark.
700
- format : string, optional
701
- Specifies the output data source format. Some common ones are:
702
-
703
- - 'delta'
704
- - 'parquet'
705
- - 'orc'
706
- - 'json'
707
- - 'csv'
708
-
709
- mode : str {'append', 'overwrite', 'ignore', 'error', 'errorifexists'}, default
710
- 'overwrite'. Specifies the behavior of the save operation when the table exists
711
- already.
712
-
713
- - 'append': Append the new data to existing data.
714
- - 'overwrite': Overwrite existing data.
715
- - 'ignore': Silently ignore this operation if data already exists.
716
- - 'error' or 'errorifexists': Throw an exception if data already exists.
717
-
718
- partition_cols : str or list of str, optional, default None
719
- Names of partitioning columns
720
- index_col: str or list of str, optional, default: None
721
- Column names to be used in Spark to represent pandas-on-Spark's index. The index name
722
- in pandas-on-Spark is ignored. By default, the index is always lost.
723
- options
724
- Additional options passed directly to Spark.
725
-
726
- Returns
727
- -------
728
- None
729
-
730
- See Also
731
- --------
732
- read_table
733
- DataFrame.to_spark_io
734
- DataFrame.spark.to_spark_io
735
- DataFrame.to_parquet
736
-
737
- Examples
738
- --------
739
- >>> df = ps.DataFrame(dict(
740
- ... date=list(pd.date_range('2012-1-1 12:00:00', periods=3, freq='M')),
741
- ... country=['KR', 'US', 'JP'],
742
- ... code=[1, 2 ,3]), columns=['date', 'country', 'code'])
743
- >>> df
744
- date country code
745
- 0 2012-01-31 12:00:00 KR 1
746
- 1 2012-02-29 12:00:00 US 2
747
- 2 2012-03-31 12:00:00 JP 3
748
-
749
- >>> df.to_table('%s.my_table' % db, partition_cols='date')
750
- """
751
- if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1:
752
- options = options.get("options") # type: ignore[assignment]
753
-
754
- self._psdf.spark.frame(index_col=index_col).write.saveAsTable(
755
- name=name, format=format, mode=mode, partitionBy=partition_cols, **options
756
- )
757
-
758
- def to_spark_io(
759
- self,
760
- path: Optional[str] = None,
761
- format: Optional[str] = None,
762
- mode: str = "overwrite",
763
- partition_cols: Optional[Union[str, List[str]]] = None,
764
- index_col: Optional[Union[str, List[str]]] = None,
765
- **options: "OptionalPrimitiveType",
766
- ) -> None:
767
- """Write the DataFrame out to a Spark data source. :meth:`DataFrame.spark.to_spark_io`
768
- is an alias of :meth:`DataFrame.to_spark_io`.
769
-
770
- Parameters
771
- ----------
772
- path : string, optional
773
- Path to the data source.
774
- format : string, optional
775
- Specifies the output data source format. Some common ones are:
776
-
777
- - 'delta'
778
- - 'parquet'
779
- - 'orc'
780
- - 'json'
781
- - 'csv'
782
- mode : str {'append', 'overwrite', 'ignore', 'error', 'errorifexists'}, default
783
- 'overwrite'. Specifies the behavior of the save operation when data already exists.
784
-
785
- - 'append': Append the new data to existing data.
786
- - 'overwrite': Overwrite existing data.
787
- - 'ignore': Silently ignore this operation if data already exists.
788
- - 'error' or 'errorifexists': Throw an exception if data already exists.
789
- partition_cols : str or list of str, optional
790
- Names of partitioning columns
791
- index_col: str or list of str, optional, default: None
792
- Column names to be used in Spark to represent pandas-on-Spark's index. The index name
793
- in pandas-on-Spark is ignored. By default, the index is always lost.
794
- options : dict
795
- All other options passed directly into Spark's data source.
796
-
797
- Returns
798
- -------
799
- None
800
-
801
- See Also
802
- --------
803
- read_spark_io
804
- DataFrame.to_delta
805
- DataFrame.to_parquet
806
- DataFrame.to_table
807
- DataFrame.to_spark_io
808
- DataFrame.spark.to_spark_io
809
-
810
- Examples
811
- --------
812
- >>> df = ps.DataFrame(dict(
813
- ... date=list(pd.date_range('2012-1-1 12:00:00', periods=3, freq='M')),
814
- ... country=['KR', 'US', 'JP'],
815
- ... code=[1, 2 ,3]), columns=['date', 'country', 'code'])
816
- >>> df
817
- date country code
818
- 0 2012-01-31 12:00:00 KR 1
819
- 1 2012-02-29 12:00:00 US 2
820
- 2 2012-03-31 12:00:00 JP 3
821
-
822
- >>> df.to_spark_io(path='%s/to_spark_io/foo.json' % path, format='json')
823
- """
824
- if "options" in options and isinstance(options.get("options"), dict) and len(options) == 1:
825
- options = options.get("options") # type: ignore[assignment]
826
-
827
- self._psdf.spark.frame(index_col=index_col).write.save(
828
- path=path, format=format, mode=mode, partitionBy=partition_cols, **options
829
- )
830
-
831
- def explain(self, extended: Optional[bool] = None, mode: Optional[str] = None) -> None:
832
- """
833
- Prints the underlying (logical and physical) Spark plans to the console for debugging
834
- purpose.
835
-
836
- Parameters
837
- ----------
838
- extended : boolean, default ``False``.
839
- If ``False``, prints only the physical plan.
840
- mode : string, default ``None``.
841
- The expected output format of plans.
842
-
843
- Returns
844
- -------
845
- None
846
-
847
- Examples
848
- --------
849
- >>> df = ps.DataFrame({'id': range(10)})
850
- >>> df.spark.explain() # doctest: +ELLIPSIS
851
- == Physical Plan ==
852
- ...
853
-
854
- >>> df.spark.explain(True) # doctest: +ELLIPSIS
855
- == Parsed Logical Plan ==
856
- ...
857
- == Analyzed Logical Plan ==
858
- ...
859
- == Optimized Logical Plan ==
860
- ...
861
- == Physical Plan ==
862
- ...
863
-
864
- >>> df.spark.explain("extended") # doctest: +ELLIPSIS
865
- == Parsed Logical Plan ==
866
- ...
867
- == Analyzed Logical Plan ==
868
- ...
869
- == Optimized Logical Plan ==
870
- ...
871
- == Physical Plan ==
872
- ...
873
-
874
- >>> df.spark.explain(mode="extended") # doctest: +ELLIPSIS
875
- == Parsed Logical Plan ==
876
- ...
877
- == Analyzed Logical Plan ==
878
- ...
879
- == Optimized Logical Plan ==
880
- ...
881
- == Physical Plan ==
882
- ...
883
- """
884
- self._psdf._internal.to_internal_spark_frame.explain(extended, mode)
885
-
886
- def apply(
887
- self,
888
- func: Callable[[PySparkDataFrame], PySparkDataFrame],
889
- index_col: Optional[Union[str, List[str]]] = None,
890
- ) -> "ps.DataFrame":
891
- """
892
- Applies a function that takes and returns a Spark DataFrame. It allows natively
893
- apply a Spark function and column APIs with the Spark column internally used
894
- in Series or Index.
895
-
896
- .. note:: set `index_col` and keep the column named as so in the output Spark
897
- DataFrame to avoid using the default index to prevent performance penalty.
898
- If you omit `index_col`, it will use default index which is potentially
899
- expensive in general.
900
-
901
- .. note:: it will lose column labels. This is a synonym of
902
- ``func(psdf.to_spark(index_col)).pandas_api(index_col)``.
903
-
904
- Parameters
905
- ----------
906
- func : function
907
- Function to apply the function against the data by using Spark DataFrame.
908
-
909
- Returns
910
- -------
911
- DataFrame
912
-
913
- Raises
914
- ------
915
- ValueError : If the output from the function is not a Spark DataFrame.
916
-
917
- Examples
918
- --------
919
- >>> psdf = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, columns=["a", "b"])
920
- >>> psdf
921
- a b
922
- 0 1 4
923
- 1 2 5
924
- 2 3 6
925
-
926
- >>> psdf.spark.apply(
927
- ... lambda sdf: sdf.selectExpr("a + b as c", "index"), index_col="index")
928
- ... # doctest: +NORMALIZE_WHITESPACE
929
- c
930
- index
931
- 0 5
932
- 1 7
933
- 2 9
934
-
935
- The case below ends up with using the default index, which should be avoided
936
- if possible.
937
-
938
- >>> psdf.spark.apply(lambda sdf: sdf.groupby("a").count().sort("a"))
939
- a count
940
- 0 1 1
941
- 1 2 1
942
- 2 3 1
943
- """
944
- output = func(self.frame(index_col))
945
- SparkDataFrame = get_dataframe_class()
946
- if not isinstance(output, SparkDataFrame):
947
- raise ValueError(
948
- "The output of the function [%s] should be of a "
949
- "pyspark.sql.DataFrame; however, got [%s]." % (func, type(output))
950
- )
951
- return output.pandas_api(index_col)
952
-
953
- def repartition(self, num_partitions: int) -> "ps.DataFrame":
954
- """
955
- Returns a new DataFrame partitioned by the given partitioning expressions. The
956
- resulting DataFrame is hash partitioned.
957
-
958
- Parameters
959
- ----------
960
- num_partitions : int
961
- The target number of partitions.
962
-
963
- Returns
964
- -------
965
- DataFrame
966
-
967
- Examples
968
- --------
969
- >>> psdf = ps.DataFrame({"age": [5, 5, 2, 2],
970
- ... "name": ["Bob", "Bob", "Alice", "Alice"]}).set_index("age")
971
- >>> psdf.sort_index() # doctest: +NORMALIZE_WHITESPACE
972
- name
973
- age
974
- 2 Alice
975
- 2 Alice
976
- 5 Bob
977
- 5 Bob
978
- >>> new_psdf = psdf.spark.repartition(7)
979
- >>> new_psdf.to_spark().rdd.getNumPartitions()
980
- 7
981
- >>> new_psdf.sort_index() # doctest: +NORMALIZE_WHITESPACE
982
- name
983
- age
984
- 2 Alice
985
- 2 Alice
986
- 5 Bob
987
- 5 Bob
988
- """
989
- from pyspark.pandas.frame import DataFrame
990
-
991
- internal = self._psdf._internal.resolved_copy
992
- repartitioned_sdf = internal.spark_frame.repartition(num_partitions)
993
- return DataFrame(internal.with_new_sdf(repartitioned_sdf))
994
-
995
- def coalesce(self, num_partitions: int) -> "ps.DataFrame":
996
- """
997
- Returns a new DataFrame that has exactly `num_partitions` partitions.
998
-
999
- .. note:: This operation results in a narrow dependency, e.g. if you go from 1000
1000
- partitions to 100 partitions, there will not be a shuffle, instead each of the 100 new
1001
- partitions will claim 10 of the current partitions. If a larger number of partitions is
1002
- requested, it will stay at the current number of partitions. However, if you're doing a
1003
- drastic coalesce, e.g. to num_partitions = 1, this may result in your computation taking
1004
- place on fewer nodes than you like (e.g. one node in the case of num_partitions = 1). To
1005
- avoid this, you can call repartition(). This will add a shuffle step, but means the
1006
- current upstream partitions will be executed in parallel (per whatever the current
1007
- partitioning is).
1008
-
1009
- Parameters
1010
- ----------
1011
- num_partitions : int
1012
- The target number of partitions.
1013
-
1014
- Returns
1015
- -------
1016
- DataFrame
1017
-
1018
- Examples
1019
- --------
1020
- >>> psdf = ps.DataFrame({"age": [5, 5, 2, 2],
1021
- ... "name": ["Bob", "Bob", "Alice", "Alice"]}).set_index("age")
1022
- >>> psdf.sort_index() # doctest: +NORMALIZE_WHITESPACE
1023
- name
1024
- age
1025
- 2 Alice
1026
- 2 Alice
1027
- 5 Bob
1028
- 5 Bob
1029
- >>> new_psdf = psdf.spark.coalesce(1)
1030
- >>> new_psdf.to_spark().rdd.getNumPartitions()
1031
- 1
1032
- >>> new_psdf.sort_index() # doctest: +NORMALIZE_WHITESPACE
1033
- name
1034
- age
1035
- 2 Alice
1036
- 2 Alice
1037
- 5 Bob
1038
- 5 Bob
1039
- """
1040
- from pyspark.pandas.frame import DataFrame
1041
-
1042
- internal = self._psdf._internal.resolved_copy
1043
- coalesced_sdf = internal.spark_frame.coalesce(num_partitions)
1044
- return DataFrame(internal.with_new_sdf(coalesced_sdf))
1045
-
1046
- def checkpoint(self, eager: bool = True) -> "ps.DataFrame":
1047
- """Returns a checkpointed version of this DataFrame.
1048
-
1049
- Checkpointing can be used to truncate the logical plan of this DataFrame, which is
1050
- especially useful in iterative algorithms where the plan may grow exponentially. It will be
1051
- saved to files inside the checkpoint directory set with `SparkContext.setCheckpointDir`.
1052
-
1053
- Parameters
1054
- ----------
1055
- eager : bool
1056
- Whether to checkpoint this DataFrame immediately
1057
-
1058
- Returns
1059
- -------
1060
- DataFrame
1061
-
1062
- Examples
1063
- --------
1064
- >>> psdf = ps.DataFrame({"a": ["a", "b", "c"]})
1065
- >>> psdf
1066
- a
1067
- 0 a
1068
- 1 b
1069
- 2 c
1070
- >>> new_psdf = psdf.spark.checkpoint() # doctest: +SKIP
1071
- >>> new_psdf # doctest: +SKIP
1072
- a
1073
- 0 a
1074
- 1 b
1075
- 2 c
1076
- """
1077
- from pyspark.pandas.frame import DataFrame
1078
-
1079
- internal = self._psdf._internal.resolved_copy
1080
- checkpointed_sdf = internal.spark_frame.checkpoint(eager)
1081
- return DataFrame(internal.with_new_sdf(checkpointed_sdf))
1082
-
1083
- def local_checkpoint(self, eager: bool = True) -> "ps.DataFrame":
1084
- """Returns a locally checkpointed version of this DataFrame.
1085
-
1086
- Checkpointing can be used to truncate the logical plan of this DataFrame, which is
1087
- especially useful in iterative algorithms where the plan may grow exponentially. Local
1088
- checkpoints are stored in the executors using the caching subsystem and therefore they are
1089
- not reliable.
1090
-
1091
- Parameters
1092
- ----------
1093
- eager : bool
1094
- Whether to locally checkpoint this DataFrame immediately
1095
-
1096
- Returns
1097
- -------
1098
- DataFrame
1099
-
1100
- Examples
1101
- --------
1102
- >>> psdf = ps.DataFrame({"a": ["a", "b", "c"]})
1103
- >>> psdf
1104
- a
1105
- 0 a
1106
- 1 b
1107
- 2 c
1108
- >>> new_psdf = psdf.spark.local_checkpoint()
1109
- >>> new_psdf
1110
- a
1111
- 0 a
1112
- 1 b
1113
- 2 c
1114
- """
1115
- from pyspark.pandas.frame import DataFrame
1116
-
1117
- internal = self._psdf._internal.resolved_copy
1118
- checkpointed_sdf = internal.spark_frame.localCheckpoint(eager)
1119
- return DataFrame(internal.with_new_sdf(checkpointed_sdf))
1120
-
1121
- @property
1122
- def analyzed(self) -> "ps.DataFrame":
1123
- """
1124
- Returns a new DataFrame with the analyzed Spark DataFrame.
1125
-
1126
- After multiple operations, the underlying Spark plan could grow huge
1127
- and make the Spark planner take a long time to finish the planning.
1128
-
1129
- This function is for the workaround to avoid it.
1130
-
1131
- .. note:: After analysis, operations between the analyzed DataFrame and the original one
1132
- will **NOT** work without setting a config `compute.ops_on_diff_frames` to `True`.
1133
-
1134
- Returns
1135
- -------
1136
- DataFrame
1137
-
1138
- Examples
1139
- --------
1140
- >>> df = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, columns=["a", "b"])
1141
- >>> df
1142
- a b
1143
- 0 1 4
1144
- 1 2 5
1145
- 2 3 6
1146
-
1147
- The analyzed one should return the same value.
1148
-
1149
- >>> df.spark.analyzed
1150
- a b
1151
- 0 1 4
1152
- 1 2 5
1153
- 2 3 6
1154
-
1155
- However, it won't work with the same anchor Series.
1156
-
1157
- >>> df + df.spark.analyzed
1158
- Traceback (most recent call last):
1159
- ...
1160
- ValueError: ... enable 'compute.ops_on_diff_frames' option.
1161
-
1162
- >>> with ps.option_context('compute.ops_on_diff_frames', True):
1163
- ... (df + df.spark.analyzed).sort_index()
1164
- a b
1165
- 0 2 8
1166
- 1 4 10
1167
- 2 6 12
1168
- """
1169
- from pyspark.pandas.frame import DataFrame
1170
-
1171
- return DataFrame(self._psdf._internal.resolved_copy)
1172
-
1173
-
1174
- class CachedSparkFrameMethods(SparkFrameMethods):
1175
- """Spark related features for cached DataFrame. This is usually created via
1176
- `df.spark.cache()`."""
1177
-
1178
- def __init__(self, frame: "CachedDataFrame"):
1179
- super().__init__(frame)
1180
-
1181
- @property
1182
- def storage_level(self) -> StorageLevel:
1183
- """
1184
- Return the storage level of this cache.
1185
-
1186
- Examples
1187
- --------
1188
- >>> import pyspark.pandas as ps
1189
- >>> import pyspark
1190
- >>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
1191
- ... columns=['dogs', 'cats'])
1192
- >>> df
1193
- dogs cats
1194
- 0 0.2 0.3
1195
- 1 0.0 0.6
1196
- 2 0.6 0.0
1197
- 3 0.2 0.1
1198
-
1199
- >>> with df.spark.cache() as cached_df:
1200
- ... print(cached_df.spark.storage_level)
1201
- ...
1202
- Disk Memory Deserialized 1x Replicated
1203
-
1204
- Set the StorageLevel to `MEMORY_ONLY`.
1205
-
1206
- >>> with df.spark.persist(pyspark.StorageLevel.MEMORY_ONLY) as cached_df:
1207
- ... print(cached_df.spark.storage_level)
1208
- ...
1209
- Memory Serialized 1x Replicated
1210
- """
1211
- return self._psdf._cached.storageLevel
1212
-
1213
- def unpersist(self) -> None:
1214
- """
1215
- The `unpersist` function is used to uncache the pandas-on-Spark DataFrame when it
1216
- is not used with the `with` statement.
1217
-
1218
- Returns
1219
- -------
1220
- None
1221
-
1222
- Examples
1223
- --------
1224
- >>> df = ps.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
1225
- ... columns=['dogs', 'cats'])
1226
- >>> df = df.spark.cache()
1227
-
1228
- To uncache the dataframe, use `unpersist` function
1229
-
1230
- >>> df.spark.unpersist()
1231
- """
1232
- if self._psdf._cached.is_cached:
1233
- self._psdf._cached.unpersist()
1234
-
1235
-
1236
- def _test() -> None:
1237
- import os
1238
- import doctest
1239
- import shutil
1240
- import sys
1241
- import tempfile
1242
- import uuid
1243
- import numpy
1244
- import pandas
1245
- from pyspark.sql import SparkSession
1246
- import pyspark.pandas.spark.accessors
1247
-
1248
- os.chdir(os.environ["SPARK_HOME"])
1249
-
1250
- globs = pyspark.pandas.spark.accessors.__dict__.copy()
1251
- globs["np"] = numpy
1252
- globs["pd"] = pandas
1253
- globs["ps"] = pyspark.pandas
1254
- spark = (
1255
- SparkSession.builder.master("local[4]")
1256
- .appName("pyspark.pandas.spark.accessors tests")
1257
- .getOrCreate()
1258
- )
1259
-
1260
- db_name = "db%s" % str(uuid.uuid4()).replace("-", "")
1261
- spark.sql("CREATE DATABASE %s" % db_name)
1262
- globs["db"] = db_name
1263
-
1264
- path = tempfile.mkdtemp()
1265
- globs["path"] = path
1266
-
1267
- (failure_count, test_count) = doctest.testmod(
1268
- pyspark.pandas.spark.accessors,
1269
- globs=globs,
1270
- optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
1271
- )
1272
-
1273
- shutil.rmtree(path, ignore_errors=True)
1274
- spark.sql("DROP DATABASE IF EXISTS %s CASCADE" % db_name)
1275
- spark.stop()
1276
- if failure_count:
1277
- sys.exit(-1)
1278
-
1279
-
1280
- if __name__ == "__main__":
1281
- _test()