fugue 0.8.2.dev4__py3-none-any.whl → 0.8.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fugue/__init__.py +0 -1
- fugue/_utils/io.py +2 -91
- fugue/api.py +1 -0
- fugue/collections/partition.py +12 -6
- fugue/constants.py +1 -1
- fugue/dataframe/__init__.py +1 -7
- fugue/dataframe/arrow_dataframe.py +1 -1
- fugue/dataframe/function_wrapper.py +2 -3
- fugue/dataframe/utils.py +10 -84
- fugue/execution/api.py +34 -12
- fugue/execution/native_execution_engine.py +33 -19
- fugue/extensions/_builtins/creators.py +4 -2
- fugue/extensions/_builtins/outputters.py +3 -3
- fugue/extensions/_builtins/processors.py +2 -3
- fugue/plugins.py +1 -0
- fugue/workflow/_checkpoint.py +1 -1
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/METADATA +20 -10
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/RECORD +67 -65
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -2
- fugue_contrib/viz/_ext.py +7 -1
- fugue_dask/_io.py +0 -13
- fugue_dask/_utils.py +10 -4
- fugue_dask/execution_engine.py +42 -16
- fugue_duckdb/_utils.py +7 -2
- fugue_duckdb/dask.py +1 -1
- fugue_duckdb/dataframe.py +17 -10
- fugue_duckdb/execution_engine.py +12 -22
- fugue_ibis/dataframe.py +2 -7
- fugue_notebook/env.py +5 -10
- fugue_polars/_utils.py +0 -40
- fugue_polars/polars_dataframe.py +22 -7
- fugue_ray/_constants.py +8 -1
- fugue_ray/_utils/dataframe.py +31 -4
- fugue_ray/_utils/io.py +2 -4
- fugue_ray/dataframe.py +13 -4
- fugue_ray/execution_engine.py +39 -21
- fugue_spark/_utils/convert.py +22 -11
- fugue_spark/_utils/io.py +0 -13
- fugue_spark/_utils/misc.py +27 -0
- fugue_spark/_utils/partition.py +11 -18
- fugue_spark/dataframe.py +24 -19
- fugue_spark/execution_engine.py +61 -35
- fugue_spark/registry.py +15 -3
- fugue_test/builtin_suite.py +7 -9
- fugue_test/dataframe_suite.py +7 -3
- fugue_test/execution_suite.py +100 -122
- fugue_version/__init__.py +1 -1
- tests/fugue/collections/test_partition.py +6 -3
- tests/fugue/dataframe/test_utils.py +2 -43
- tests/fugue/execution/test_naive_execution_engine.py +33 -0
- tests/fugue/utils/test_io.py +0 -80
- tests/fugue_dask/test_execution_engine.py +45 -0
- tests/fugue_dask/test_io.py +0 -55
- tests/fugue_duckdb/test_dataframe.py +2 -2
- tests/fugue_duckdb/test_utils.py +1 -1
- tests/fugue_polars/test_api.py +13 -0
- tests/fugue_polars/test_transform.py +11 -5
- tests/fugue_ray/test_execution_engine.py +32 -1
- tests/fugue_spark/test_dataframe.py +0 -8
- tests/fugue_spark/test_execution_engine.py +48 -10
- tests/fugue_spark/test_importless.py +4 -4
- tests/fugue_spark/test_spark_connect.py +82 -0
- tests/fugue_spark/utils/test_convert.py +6 -8
- tests/fugue_spark/utils/test_io.py +0 -17
- fugue_test/_utils.py +0 -13
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/WHEEL +0 -0
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/top_level.txt +0 -0
fugue_test/execution_suite.py
CHANGED
@@ -26,7 +26,6 @@ from fugue import (
 from fugue.column import all_cols, col, lit
 from fugue.dataframe.utils import _df_eq as df_eq
 from fugue.execution.native_execution_engine import NativeExecutionEngine
-from fugue_test._utils import skip_spark2


 class ExecutionEngineTests(object):
@@ -72,20 +71,20 @@ class ExecutionEngineTests(object):
             )
             # all engines should accept these types of inputs
             # should take fugue.DataFrame
-            df_eq(o,
+            df_eq(o, fa.as_fugue_engine_df(e, o), throw=True)
             # should take array, shema
             df_eq(
                 o,
-
+                fa.as_fugue_engine_df(e, [[1.1, 2.2], [3.3, 4.4]], "a:double,b:double"),
                 throw=True,
             )
             # should take pandas dataframe
             pdf = pd.DataFrame([[1.1, 2.2], [3.3, 4.4]], columns=["a", "b"])
-            df_eq(o,
+            df_eq(o, fa.as_fugue_engine_df(e, pdf), throw=True)

             # should convert string to datetime in to_df
             df_eq(
-
+                fa.as_fugue_engine_df(e, [["2020-01-01"]], "a:datetime"),
                 [[datetime(2020, 1, 1)]],
                 "a:datetime",
                 throw=True,
@@ -95,7 +94,7 @@
             o = ArrayDataFrame([], "a:double,b:str")
             pdf = pd.DataFrame([[0.1, "a"]], columns=["a", "b"])
             pdf = pdf[pdf.a < 0]
-            df_eq(o,
+            df_eq(o, fa.as_fugue_engine_df(e, pdf), throw=True)

         def test_filter(self):
             a = ArrayDataFrame(
@@ -230,7 +229,7 @@
             o = ArrayDataFrame(
                 [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]], "a:double,b:int"
             )
-            a =
+            a = fa.as_fugue_engine_df(e, o)
             # no partition
             c = e.map_engine.map_dataframe(a, noop, a.schema, PartitionSpec())
             df_eq(c, o, throw=True)
@@ -353,9 +352,9 @@

         def test_join_multiple(self):
             e = self.engine
-            a =
-            b =
-            c =
+            a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4]], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [[1, 20], [3, 40]], "a:int,c:int")
+            c = fa.as_fugue_engine_df(e, [[1, 200], [3, 400]], "a:int,d:int")
             d = fa.inner_join(a, b, c)
             df_eq(
                 d,
@@ -366,8 +365,8 @@

         def test__join_cross(self):
             e = self.engine
-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4]], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [[6], [7]], "c:int")
             c = fa.join(a, b, how="Cross")
             df_eq(
                 c,
@@ -376,56 +375,56 @@
                 throw=True,
             )

-            b =
+            b = fa.as_fugue_engine_df(e, [], "c:int")
             c = fa.cross_join(a, b)
             df_eq(c, [], "a:int,b:int,c:int", throw=True)

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [], "c:int")
             c = fa.join(a, b, how="Cross")
             df_eq(c, [], "a:int,b:int,c:int", throw=True)

         def test__join_inner(self):
             e = self.engine
-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4]], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
             c = fa.join(a, b, how="INNER", on=["a"])
             df_eq(c, [[1, 2, 6]], "a:int,b:int,c:int", throw=True)
             c = fa.inner_join(b, a)
             df_eq(c, [[6, 1, 2]], "c:int,a:int,b:int", throw=True)

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [], "c:int,a:int")
             c = fa.join(a, b, how="INNER", on=["a"])
             df_eq(c, [], "a:int,b:int,c:int", throw=True)

         def test__join_outer(self):
             e = self.engine

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [], "c:str,a:int")
             c = fa.left_outer_join(a, b)
             df_eq(c, [], "a:int,b:int,c:str", throw=True)

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [], "a:int,b:str")
+            b = fa.as_fugue_engine_df(e, [], "c:int,a:int")
             c = fa.right_outer_join(a, b)
             df_eq(c, [], "a:int,b:str,c:int", throw=True)

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [], "a:int,b:str")
+            b = fa.as_fugue_engine_df(e, [], "c:str,a:int")
             c = fa.full_outer_join(a, b)
             df_eq(c, [], "a:int,b:str,c:str", throw=True)

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, "2"], [3, "4"]], "a:int,b:str")
+            b = fa.as_fugue_engine_df(e, [["6", 1], ["2", 7]], "c:str,a:int")
             c = fa.join(a, b, how="left_OUTER", on=["a"])
             df_eq(c, [[1, "2", "6"], [3, "4", None]], "a:int,b:str,c:str", throw=True)
             c = fa.join(b, a, how="left_outer", on=["a"])
             df_eq(c, [["6", 1, "2"], ["2", 7, None]], "c:str,a:int,b:str", throw=True)

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, "2"], [3, "4"]], "a:int,b:str")
+            b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:double,a:int")
             c = fa.join(a, b, how="left_OUTER", on=["a"])
             df_eq(
                 c, [[1, "2", 6.0], [3, "4", None]], "a:int,b:str,c:double", throw=True
@@ -436,8 +435,8 @@
                 c, [[6.0, 1, "2"], [2.0, 7, None]], "c:double,a:int,b:str", throw=True
             )

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, "2"], [3, "4"]], "a:int,b:str")
+            b = fa.as_fugue_engine_df(e, [["6", 1], ["2", 7]], "c:str,a:int")
             c = fa.join(a, b, how="right_outer", on=["a"])
             # assert c.as_pandas().values.tolist()[1][1] is None
             df_eq(c, [[1, "2", "6"], [7, None, "2"]], "a:int,b:str,c:str", throw=True)
@@ -453,8 +452,8 @@
         def test__join_outer_pandas_incompatible(self):
             e = self.engine

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, "2"], [3, "4"]], "a:int,b:str")
+            b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
             c = fa.join(a, b, how="left_OUTER", on=["a"])
             df_eq(
                 c,
@@ -465,8 +464,8 @@
             c = fa.join(b, a, how="left_outer", on=["a"])
             df_eq(c, [[6, 1, "2"], [2, 7, None]], "c:int,a:int,b:str", throw=True)

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, "2"], [3, "4"]], "a:int,b:str")
+            b = fa.as_fugue_engine_df(e, [[True, 1], [False, 7]], "c:bool,a:int")
             c = fa.join(a, b, how="left_OUTER", on=["a"])
             df_eq(c, [[1, "2", True], [3, "4", None]], "a:int,b:str,c:bool", throw=True)
             c = fa.join(b, a, how="left_outer", on=["a"])
@@ -476,52 +475,60 @@

         def test__join_semi(self):
             e = self.engine
-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4]], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
             c = fa.join(a, b, how="semi", on=["a"])
             df_eq(c, [[1, 2]], "a:int,b:int", throw=True)
             c = fa.semi_join(b, a)
             df_eq(c, [[6, 1]], "c:int,a:int", throw=True)

-            b =
+            b = fa.as_fugue_engine_df(e, [], "c:int,a:int")
             c = fa.join(a, b, how="semi", on=["a"])
             df_eq(c, [], "a:int,b:int", throw=True)

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [], "c:int,a:int")
             c = fa.join(a, b, how="semi", on=["a"])
             df_eq(c, [], "a:int,b:int", throw=True)

         def test__join_anti(self):
             e = self.engine
-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4]], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
             c = fa.join(a, b, how="anti", on=["a"])
             df_eq(c, [[3, 4]], "a:int,b:int", throw=True)
             c = fa.anti_join(b, a)
             df_eq(c, [[2, 7]], "c:int,a:int", throw=True)

-            b =
+            b = fa.as_fugue_engine_df(e, [], "c:int,a:int")
             c = fa.join(a, b, how="anti", on=["a"])
             df_eq(c, [[1, 2], [3, 4]], "a:int,b:int", throw=True)

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [], "c:int,a:int")
             c = fa.join(a, b, how="anti", on=["a"])
             df_eq(c, [], "a:int,b:int", throw=True)

         def test__join_with_null_keys(self):
             # SQL will not match null values
             e = self.engine
-            a =
-
+            a = fa.as_fugue_engine_df(
+                e, [[1, 2, 3], [4, None, 6]], "a:double,b:double,c:int"
+            )
+            b = fa.as_fugue_engine_df(
+                e, [[1, 2, 33], [4, None, 63]], "a:double,b:double,d:int"
+            )
             c = fa.join(a, b, how="INNER")
             df_eq(c, [[1, 2, 3, 33]], "a:double,b:double,c:int,d:int", throw=True)

         def test_union(self):
             e = self.engine
-            a =
-
+            a = fa.as_fugue_engine_df(
+                e, [[1, 2, 3], [4, None, 6]], "a:double,b:double,c:int"
+            )
+            b = fa.as_fugue_engine_df(
+                e, [[1, 2, 33], [4, None, 6]], "a:double,b:double,c:int"
+            )
             c = fa.union(a, b)
             df_eq(
                 c,
@@ -555,8 +562,12 @@

         def test_subtract(self):
             e = self.engine
-            a =
-
+            a = fa.as_fugue_engine_df(
+                e, [[1, 2, 3], [1, 2, 3], [4, None, 6]], "a:double,b:double,c:int"
+            )
+            b = fa.as_fugue_engine_df(
+                e, [[1, 2, 33], [4, None, 6]], "a:double,b:double,c:int"
+            )
             c = fa.subtract(a, b)
             df_eq(
                 c,
@@ -564,8 +575,8 @@
                 "a:double,b:double,c:int",
                 throw=True,
             )
-            x =
-            y =
+            x = fa.as_fugue_engine_df(e, [[1, 2, 33]], "a:double,b:double,c:int")
+            y = fa.as_fugue_engine_df(e, [[4, None, 6]], "a:double,b:double,c:int")
             z = fa.subtract(a, x, y)
             df_eq(
                 z,
@@ -584,10 +595,11 @@

         def test_intersect(self):
             e = self.engine
-            a =
-                [[1, 2, 3], [4, None, 6], [4, None, 6]], "a:double,b:double,c:int"
+            a = fa.as_fugue_engine_df(
+                e, [[1, 2, 3], [4, None, 6], [4, None, 6]], "a:double,b:double,c:int"
             )
-            b =
+            b = fa.as_fugue_engine_df(
+                e,
                 [[1, 2, 33], [4, None, 6], [4, None, 6], [4, None, 6]],
                 "a:double,b:double,c:int",
             )
@@ -598,11 +610,13 @@
                 "a:double,b:double,c:int",
                 throw=True,
             )
-            x =
+            x = fa.as_fugue_engine_df(
+                e,
                 [[1, 2, 33]],
                 "a:double,b:double,c:int",
             )
-            y =
+            y = fa.as_fugue_engine_df(
+                e,
                 [[4, None, 6], [4, None, 6], [4, None, 6]],
                 "a:double,b:double,c:int",
             )
@@ -624,8 +638,8 @@

         def test_distinct(self):
             e = self.engine
-            a =
-                [[4, None, 6], [1, 2, 3], [4, None, 6]], "a:double,b:double,c:int"
+            a = fa.as_fugue_engine_df(
+                e, [[4, None, 6], [1, 2, 3], [4, None, 6]], "a:double,b:double,c:int"
             )
             c = fa.distinct(a)
             df_eq(
@@ -637,8 +651,10 @@

         def test_dropna(self):
             e = self.engine
-            a =
-
+            a = fa.as_fugue_engine_df(
+                e,
+                [[4, None, 6], [1, 2, 3], [4, None, None]],
+                "a:double,b:double,c:double",
             )
             c = fa.dropna(a)  # default
             d = fa.dropna(a, how="all")
@@ -672,8 +688,10 @@

         def test_fillna(self):
             e = self.engine
-            a =
-
+            a = fa.as_fugue_engine_df(
+                e,
+                [[4, None, 6], [1, 2, 3], [4, None, None]],
+                "a:double,b:double,c:double",
             )
             c = fa.fillna(a, value=1)
             d = fa.fillna(a, {"b": 99, "c": -99})
@@ -703,8 +721,8 @@
             # raises(ValueError, lambda: fa.fillna(a, ["b"]))

         def test_sample(self):
-
-            a =
+            e = self.engine
+            a = fa.as_fugue_engine_df(e, [[x] for x in range(100)], "a:int")

             with raises(ValueError):
                 fa.sample(a)  # must set one
@@ -725,7 +743,8 @@
             e = self.engine
             ps = dict(by=["a"], presort="b DESC,c DESC")
             ps2 = dict(by=["c"], presort="b ASC")
-            a =
+            a = fa.as_fugue_engine_df(
+                e,
                 [
                     ["a", 2, 3],
                     ["a", 3, 4],
@@ -784,8 +803,8 @@
             raises(ValueError, lambda: fa.take(a, n=0.5, presort=None))

         def test_sample_n(self):
-
-            a =
+            e = self.engine
+            a = fa.as_fugue_engine_df(e, [[x] for x in range(100)], "a:int")

             b = fa.sample(a, n=90, replace=False)
             c = fa.sample(a, n=90, replace=True)
@@ -799,7 +818,7 @@

         def test__serialize_by_partition(self):
             e = self.engine
-            a =
+            a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4], [1, 5]], "a:int,b:int")
             s = e._serialize_by_partition(
                 a, PartitionSpec(by=["a"], presort="b"), df_name="_0"
             )
@@ -814,8 +833,8 @@
         def test_zip(self):
             ps = PartitionSpec(by=["a"], presort="b DESC,c DESC")
             e = self.engine
-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4], [1, 5]], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
             sa = e._serialize_by_partition(a, ps, df_name="_0")
             sb = e._serialize_by_partition(b, ps, df_name="_1")
             # test zip with serialized dfs
@@ -874,7 +893,7 @@

         def test_zip_all(self):
             e = self.engine
-            a =
+            a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4], [1, 5]], "a:int,b:int")
             z = fa.persist(e.zip_all(DataFrames(a)))
             assert 1 == z.count()
             assert z.metadata.get("serialized", False)
@@ -890,8 +909,8 @@
             assert z.metadata.get("serialized", False)
             assert z.metadata.get("serialized_has_name", False)

-            b =
-            c =
+            b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
+            c = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "d:int,a:int")
             z = fa.persist(e.zip_all(DataFrames(a, b, c)))
             assert 1 == z.count()
             assert not z.metadata.get("serialized_has_name", False)
@@ -918,8 +937,8 @@
         def test_comap(self):
             ps = PartitionSpec(presort="b,c")
             e = self.engine
-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4], [1, 5]], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
             z1 = fa.persist(e.zip(a, b))
             z2 = fa.persist(e.zip(a, b, partition_spec=ps, how="left_outer"))
             z3 = fa.persist(
@@ -966,9 +985,9 @@

         def test_comap_with_key(self):
             e = self.engine
-            a =
-            b =
-            c =
+            a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4], [1, 5]], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
+            c = fa.as_fugue_engine_df(e, [[6, 1]], "c:int,a:int")
             z1 = fa.persist(e.zip(a, b, df1_name="x", df2_name="y"))
             z2 = fa.persist(e.zip_all(DataFrames(x=a, y=b, z=b)))
             z3 = fa.persist(
@@ -1068,47 +1087,6 @@
             )
             df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:int", throw=True)

-        @skip_spark2
-        def test_save_single_and_load_avro(self):
-            # TODO: switch to c:int,a:long when we can preserve schema to avro
-            e = self.engine
-            b = ArrayDataFrame([[6, 1], [2, 7]], "c:long,a:long")
-            path = os.path.join(self.tmpdir, "a", "b")
-            e.fs.makedirs(path, recreate=True)
-            # over write folder with single file
-            fa.save(b, path, format_hint="avro", force_single=True)
-            assert e.fs.isfile(path)
-            c = fa.load(path, format_hint="avro", columns=["a", "c"], as_fugue=True)
-            df_eq(c, [[1, 6], [7, 2]], "a:long,c:long", throw=True)
-
-            # overwirte single with folder (if applicable)
-            b = ArrayDataFrame([[60, 1], [20, 7]], "c:long,a:long")
-            fa.save(b, path, format_hint="avro", mode="overwrite")
-            c = fa.load(path, format_hint="avro", columns=["a", "c"], as_fugue=True)
-            df_eq(c, [[1, 60], [7, 20]], "a:long,c:long", throw=True)
-
-        @skip_spark2
-        def test_save_and_load_avro(self):
-            # TODO: switch to c:int,a:long when we can preserve schema to avro
-            b = ArrayDataFrame([[6, 1], [2, 7]], "c:long,a:long")
-            path = os.path.join(self.tmpdir, "a", "b")
-            fa.save(b, path, format_hint="avro")
-            c = fa.load(path, format_hint="avro", columns=["a", "c"], as_fugue=True)
-            df_eq(c, [[1, 6], [7, 2]], "a:long,c:long", throw=True)
-
-        @skip_spark2
-        def test_load_avro_folder(self):
-            # TODO: switch to c:int,a:long when we can preserve schema to avro
-            native = NativeExecutionEngine()
-            a = ArrayDataFrame([[6, 1]], "c:long,a:long")
-            b = ArrayDataFrame([[2, 7], [4, 8]], "c:long,a:long")
-            path = os.path.join(self.tmpdir, "a", "b")
-            fa.save(a, os.path.join(path, "a.avro"), engine=native)
-            fa.save(b, os.path.join(path, "b.avro"), engine=native)
-            FileSystem().touch(os.path.join(path, "_SUCCESS"))
-            c = fa.load(path, format_hint="avro", columns=["a", "c"], as_fugue=True)
-            df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:long", throw=True)
-
         def test_save_single_and_load_csv(self):
             e = self.engine
             b = ArrayDataFrame([[6.1, 1.1], [2.1, 7.1]], "c:double,a:double")
@@ -1297,7 +1275,7 @@
             b = ArrayDataFrame([[6, 1], [3, 4], [2, 7], [4, 8], [6, 7]], "c:int,a:long")
             path = os.path.join(self.tmpdir, "a", "b")
             fa.save(
-                e.repartition(
+                e.repartition(fa.as_fugue_engine_df(e, b), PartitionSpec(num=2)),
                 path,
                 format_hint="json",
             )
fugue_version/__init__.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.8.2.dev4"
+__version__ = "0.8.4"
tests/fugue/collections/test_partition.py
CHANGED
@@ -1,7 +1,7 @@
 import json

 from fugue.collections.partition import parse_presort_exp, PartitionSpec
-from fugue.constants import
+from fugue.constants import KEYWORD_PARALLELISM, KEYWORD_ROWCOUNT
 from pytest import raises
 from triad.collections.schema import Schema
 from triad.utils.hash import to_uuid
@@ -148,6 +148,9 @@ def test_partition_spec():
     assert dict(a=True, d=True, e=False) == p.get_sorts(
         Schema("a:int,b:int,d:int,e:int")
     )
+    assert dict(d=True, e=False) == p.get_sorts(
+        Schema("a:int,b:int,d:int,e:int"), with_partition_keys=False
+    )
     p = PartitionSpec(dict(partition_by=["e", "a"], presort="d asc"))
     assert p.get_key_schema(Schema("a:int,b:int,d:int,e:int")) == "e:int,a:int"

@@ -228,9 +231,9 @@ def test_get_num_partitions():
     assert 6 == p.get_num_partitions(x=lambda: 1, Y=lambda: 2)
     raises(Exception, lambda: p.get_num_partitions(x=lambda: 1))

-    p = PartitionSpec(dict(partition_by=["b", "a"], num="min(ROWCOUNT,
+    p = PartitionSpec(dict(partition_by=["b", "a"], num="min(ROWCOUNT,CONCURRENCY)"))
     assert 90 == p.get_num_partitions(
-        **{KEYWORD_ROWCOUNT: lambda: 100,
+        **{KEYWORD_ROWCOUNT: lambda: 100, KEYWORD_PARALLELISM: lambda: 90}
     )


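
The new assertions exercise two `PartitionSpec` behaviors: `get_sorts(..., with_partition_keys=False)` drops the partition keys from the returned sort map, and `get_num_partitions` can evaluate an expression over the `ROWCOUNT` and `CONCURRENCY` keywords. A rough sketch of both; the spec `p` below is hypothetical (the one in the test is built earlier in that function), while the second half mirrors the updated test directly:

```python
from fugue.collections.partition import PartitionSpec
from fugue.constants import KEYWORD_PARALLELISM, KEYWORD_ROWCOUNT
from triad.collections.schema import Schema

# hypothetical spec: partition key "a", presort "d asc,e desc"
p = PartitionSpec(by=["a"], presort="d asc,e desc")
schema = Schema("a:int,b:int,d:int,e:int")
p.get_sorts(schema)                             # {"a": True, "d": True, "e": False}
p.get_sorts(schema, with_partition_keys=False)  # {"d": True, "e": False}

# num can be an expression over runtime keywords, supplied lazily as callables
p2 = PartitionSpec(dict(partition_by=["b", "a"], num="min(ROWCOUNT,CONCURRENCY)"))
p2.get_num_partitions(
    **{KEYWORD_ROWCOUNT: lambda: 100, KEYWORD_PARALLELISM: lambda: 90}
)  # -> 90
```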
tests/fugue/dataframe/test_utils.py
CHANGED
@@ -8,8 +8,7 @@ from triad import FileSystem, Schema
 from triad.collections.schema import SchemaError
 from triad.exceptions import InvalidOperationError, NoneArgumentError

-from fugue import ArrayDataFrame,
-from fugue.dataframe import to_local_bounded_df, to_local_df
+from fugue import ArrayDataFrame, IterableDataFrame, PandasDataFrame
 from fugue.dataframe.utils import _df_eq as df_eq
 from fugue.dataframe.utils import (
     _schema_eq,
@@ -24,46 +23,6 @@ from fugue.dataframe.utils import (
 )


-def test_to_local_df():
-    df = ArrayDataFrame([[0, 1]], "a:int,b:int")
-    pdf = PandasDataFrame(df.as_pandas(), "a:int,b:int")
-    idf = IterableDataFrame([[0, 1]], "a:int,b:int")
-    assert to_local_df(df) is df
-    assert to_local_df(pdf) is pdf
-    assert to_local_df(idf) is idf
-    assert isinstance(to_local_df(df.native, "a:int,b:int"), ArrayDataFrame)
-    assert isinstance(to_local_df(pdf.native, "a:int,b:int"), PandasDataFrame)
-    assert isinstance(to_local_df(idf.native, "a:int,b:int"), IterableDataFrame)
-    raises(ValueError, lambda: to_local_df(123))
-
-    raises(NoneArgumentError, lambda: to_local_df(None))
-    raises(ValueError, lambda: to_local_df(df, "a:int,b:int"))
-
-
-def test_to_local_bounded_df():
-    df = ArrayDataFrame([[0, 1]], "a:int,b:int")
-    idf = IterableDataFrame([[0, 1]], "a:int,b:int")
-    adf = ArrowDataFrame(df.as_array(), "a:int,b:int")
-    assert to_local_bounded_df(df) is df
-    r = to_local_bounded_df(idf)
-    assert r is not idf
-    assert r.as_array() == [[0, 1]]
-    assert r.schema == "a:int,b:int"
-    r = to_local_bounded_df(adf.native)
-    assert isinstance(r, ArrowDataFrame)
-    assert r.as_array() == [[0, 1]]
-    assert r.schema == "a:int,b:int"
-
-    raises(ValueError, lambda: to_local_bounded_df(123))
-
-    def rows():
-        yield [0]
-        yield [1]
-
-    with raises(ValueError):
-        to_local_bounded_df(rows(), schema="a:int")
-
-
 def test_schema_eq():
     assert not _schema_eq(Schema("a:int"), Schema("a:int8"))
     assert not _schema_eq(Schema("a:int"), Schema("b:int"))
@@ -85,7 +44,7 @@ def test_df_eq():
     df1 = ArrayDataFrame([[0, 100.0, "a"]], "a:int,b:double,c:str")
     df2 = ArrayDataFrame([[0, 100.001, "a"]], "a:int,b:double,c:str")
     assert df_eq(df1, df1)
-    assert df_eq(df1, df2, digits=
+    assert df_eq(df1, df2, digits=2)
     # precision
     assert not df_eq(df1, df2, digits=6)
     # no content
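
The remaining change adjusts the precision argument of the comparison helper: `df_eq` compares floating point values only up to the given number of digits, so the 0.001 difference passes at `digits=2` but fails at `digits=6`. For illustration, using the frames from the test:

```python
from fugue import ArrayDataFrame
from fugue.dataframe.utils import _df_eq as df_eq

df1 = ArrayDataFrame([[0, 100.0, "a"]], "a:int,b:double,c:str")
df2 = ArrayDataFrame([[0, 100.001, "a"]], "a:int,b:double,c:str")

assert df_eq(df1, df2, digits=2)      # equal when compared to 2 digits
assert not df_eq(df1, df2, digits=6)  # the 0.001 difference is detected
```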
tests/fugue/execution/test_naive_execution_engine.py
CHANGED
@@ -1,3 +1,5 @@
+from typing import Any, List
+
 import pandas as pd
 import pyarrow as pa

@@ -34,6 +36,37 @@ class NativeExecutionEngineBuiltInQPDTests(BuiltInTests.Tests):
     def test_yield_table(self):
         pass

+    def test_coarse_partition(self):
+        def verify_coarse_partition(df: pd.DataFrame) -> List[List[Any]]:
+            ct = df.a.nunique()
+            s = df.a * 1000 + df.b
+            ordered = ((s - s.shift(1)).dropna() >= 0).all(axis=None)
+            return [[ct, ordered]]
+
+        def assert_(df: pd.DataFrame, rc: int, n: int, check_ordered: bool) -> None:
+            if rc > 0:
+                assert len(df) == rc
+            assert df.ct.sum() == n
+            if check_ordered:
+                assert (df.ordered == True).all()
+
+        gps = 100
+        partition_num = 6
+        df = pd.DataFrame(dict(a=list(range(gps)) * 10, b=range(gps * 10))).sample(
+            frac=1.0
+        )
+        with FugueWorkflow() as dag:
+            a = dag.df(df)
+            c = a.partition(
+                algo="coarse", by="a", presort="b", num=partition_num
+            ).transform(verify_coarse_partition, schema="ct:int,ordered:bool")
+            dag.output(
+                c,
+                using=assert_,
+                params=dict(rc=0, n=gps, check_ordered=True),
+            )
+        dag.run(self.engine)
+

 def test_get_file_threshold():
     assert -1 == _get_file_threshold(None)
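
The added test covers the "coarse" partitioning algorithm: rows sharing a partition key stay in the same physical partition, the number of physical partitions is controlled by `num`, and the presort orders rows within each partition. A hedged sketch of the same idea through `fugue.api.transform` instead of `FugueWorkflow`; the function, frame, and schema here are illustrative:

```python
import pandas as pd

import fugue.api as fa


def summarize(df: pd.DataFrame) -> pd.DataFrame:
    # called once per physical partition; with algo="coarse" each call sees
    # several complete "a" groups, ordered by the presort column "b"
    return pd.DataFrame({"keys": [df.a.nunique()], "rows": [len(df)]})


pdf = pd.DataFrame(dict(a=list(range(100)) * 10, b=range(1000)))
res = fa.transform(
    pdf,
    summarize,
    schema="keys:int,rows:int",
    partition=dict(algo="coarse", by="a", presort="b", num=6),
)
print(res)  # roughly one row per physical partition; the exact count is engine-dependent
```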