fugue 0.8.2.dev4__py3-none-any.whl → 0.8.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. fugue/__init__.py +0 -1
  2. fugue/_utils/io.py +2 -91
  3. fugue/api.py +1 -0
  4. fugue/collections/partition.py +12 -6
  5. fugue/constants.py +1 -1
  6. fugue/dataframe/__init__.py +1 -7
  7. fugue/dataframe/arrow_dataframe.py +1 -1
  8. fugue/dataframe/function_wrapper.py +2 -3
  9. fugue/dataframe/utils.py +10 -84
  10. fugue/execution/api.py +34 -12
  11. fugue/execution/native_execution_engine.py +33 -19
  12. fugue/extensions/_builtins/creators.py +4 -2
  13. fugue/extensions/_builtins/outputters.py +3 -3
  14. fugue/extensions/_builtins/processors.py +2 -3
  15. fugue/plugins.py +1 -0
  16. fugue/workflow/_checkpoint.py +1 -1
  17. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/METADATA +20 -10
  18. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/RECORD +67 -65
  19. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -2
  20. fugue_contrib/viz/_ext.py +7 -1
  21. fugue_dask/_io.py +0 -13
  22. fugue_dask/_utils.py +10 -4
  23. fugue_dask/execution_engine.py +42 -16
  24. fugue_duckdb/_utils.py +7 -2
  25. fugue_duckdb/dask.py +1 -1
  26. fugue_duckdb/dataframe.py +17 -10
  27. fugue_duckdb/execution_engine.py +12 -22
  28. fugue_ibis/dataframe.py +2 -7
  29. fugue_notebook/env.py +5 -10
  30. fugue_polars/_utils.py +0 -40
  31. fugue_polars/polars_dataframe.py +22 -7
  32. fugue_ray/_constants.py +8 -1
  33. fugue_ray/_utils/dataframe.py +31 -4
  34. fugue_ray/_utils/io.py +2 -4
  35. fugue_ray/dataframe.py +13 -4
  36. fugue_ray/execution_engine.py +39 -21
  37. fugue_spark/_utils/convert.py +22 -11
  38. fugue_spark/_utils/io.py +0 -13
  39. fugue_spark/_utils/misc.py +27 -0
  40. fugue_spark/_utils/partition.py +11 -18
  41. fugue_spark/dataframe.py +24 -19
  42. fugue_spark/execution_engine.py +61 -35
  43. fugue_spark/registry.py +15 -3
  44. fugue_test/builtin_suite.py +7 -9
  45. fugue_test/dataframe_suite.py +7 -3
  46. fugue_test/execution_suite.py +100 -122
  47. fugue_version/__init__.py +1 -1
  48. tests/fugue/collections/test_partition.py +6 -3
  49. tests/fugue/dataframe/test_utils.py +2 -43
  50. tests/fugue/execution/test_naive_execution_engine.py +33 -0
  51. tests/fugue/utils/test_io.py +0 -80
  52. tests/fugue_dask/test_execution_engine.py +45 -0
  53. tests/fugue_dask/test_io.py +0 -55
  54. tests/fugue_duckdb/test_dataframe.py +2 -2
  55. tests/fugue_duckdb/test_utils.py +1 -1
  56. tests/fugue_polars/test_api.py +13 -0
  57. tests/fugue_polars/test_transform.py +11 -5
  58. tests/fugue_ray/test_execution_engine.py +32 -1
  59. tests/fugue_spark/test_dataframe.py +0 -8
  60. tests/fugue_spark/test_execution_engine.py +48 -10
  61. tests/fugue_spark/test_importless.py +4 -4
  62. tests/fugue_spark/test_spark_connect.py +82 -0
  63. tests/fugue_spark/utils/test_convert.py +6 -8
  64. tests/fugue_spark/utils/test_io.py +0 -17
  65. fugue_test/_utils.py +0 -13
  66. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
  67. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/WHEEL +0 -0
  68. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/top_level.txt +0 -0
@@ -26,7 +26,6 @@ from fugue import (
26
26
  from fugue.column import all_cols, col, lit
27
27
  from fugue.dataframe.utils import _df_eq as df_eq
28
28
  from fugue.execution.native_execution_engine import NativeExecutionEngine
29
- from fugue_test._utils import skip_spark2
30
29
 
31
30
 
32
31
  class ExecutionEngineTests(object):
@@ -72,20 +71,20 @@ class ExecutionEngineTests(object):
72
71
  )
73
72
  # all engines should accept these types of inputs
74
73
  # should take fugue.DataFrame
75
- df_eq(o, e.to_df(o), throw=True)
74
+ df_eq(o, fa.as_fugue_engine_df(e, o), throw=True)
76
75
  # should take array, schema
77
76
  df_eq(
78
77
  o,
79
- e.to_df([[1.1, 2.2], [3.3, 4.4]], "a:double,b:double"),
78
+ fa.as_fugue_engine_df(e, [[1.1, 2.2], [3.3, 4.4]], "a:double,b:double"),
80
79
  throw=True,
81
80
  )
82
81
  # should take pandas dataframe
83
82
  pdf = pd.DataFrame([[1.1, 2.2], [3.3, 4.4]], columns=["a", "b"])
84
- df_eq(o, e.to_df(pdf), throw=True)
83
+ df_eq(o, fa.as_fugue_engine_df(e, pdf), throw=True)
85
84
 
86
85
  # should convert string to datetime in to_df
87
86
  df_eq(
88
- e.to_df([["2020-01-01"]], "a:datetime"),
87
+ fa.as_fugue_engine_df(e, [["2020-01-01"]], "a:datetime"),
89
88
  [[datetime(2020, 1, 1)]],
90
89
  "a:datetime",
91
90
  throw=True,
@@ -95,7 +94,7 @@ class ExecutionEngineTests(object):
95
94
  o = ArrayDataFrame([], "a:double,b:str")
96
95
  pdf = pd.DataFrame([[0.1, "a"]], columns=["a", "b"])
97
96
  pdf = pdf[pdf.a < 0]
98
- df_eq(o, e.to_df(pdf), throw=True)
97
+ df_eq(o, fa.as_fugue_engine_df(e, pdf), throw=True)
99
98
 
100
99
  def test_filter(self):
101
100
  a = ArrayDataFrame(
@@ -230,7 +229,7 @@ class ExecutionEngineTests(object):
230
229
  o = ArrayDataFrame(
231
230
  [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]], "a:double,b:int"
232
231
  )
233
- a = e.to_df(o)
232
+ a = fa.as_fugue_engine_df(e, o)
234
233
  # no partition
235
234
  c = e.map_engine.map_dataframe(a, noop, a.schema, PartitionSpec())
236
235
  df_eq(c, o, throw=True)
@@ -353,9 +352,9 @@ class ExecutionEngineTests(object):
353
352
 
354
353
  def test_join_multiple(self):
355
354
  e = self.engine
356
- a = e.to_df([[1, 2], [3, 4]], "a:int,b:int")
357
- b = e.to_df([[1, 20], [3, 40]], "a:int,c:int")
358
- c = e.to_df([[1, 200], [3, 400]], "a:int,d:int")
355
+ a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4]], "a:int,b:int")
356
+ b = fa.as_fugue_engine_df(e, [[1, 20], [3, 40]], "a:int,c:int")
357
+ c = fa.as_fugue_engine_df(e, [[1, 200], [3, 400]], "a:int,d:int")
359
358
  d = fa.inner_join(a, b, c)
360
359
  df_eq(
361
360
  d,
@@ -366,8 +365,8 @@ class ExecutionEngineTests(object):
366
365
 
367
366
  def test__join_cross(self):
368
367
  e = self.engine
369
- a = e.to_df([[1, 2], [3, 4]], "a:int,b:int")
370
- b = e.to_df([[6], [7]], "c:int")
368
+ a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4]], "a:int,b:int")
369
+ b = fa.as_fugue_engine_df(e, [[6], [7]], "c:int")
371
370
  c = fa.join(a, b, how="Cross")
372
371
  df_eq(
373
372
  c,
@@ -376,56 +375,56 @@ class ExecutionEngineTests(object):
376
375
  throw=True,
377
376
  )
378
377
 
379
- b = e.to_df([], "c:int")
378
+ b = fa.as_fugue_engine_df(e, [], "c:int")
380
379
  c = fa.cross_join(a, b)
381
380
  df_eq(c, [], "a:int,b:int,c:int", throw=True)
382
381
 
383
- a = e.to_df([], "a:int,b:int")
384
- b = e.to_df([], "c:int")
382
+ a = fa.as_fugue_engine_df(e, [], "a:int,b:int")
383
+ b = fa.as_fugue_engine_df(e, [], "c:int")
385
384
  c = fa.join(a, b, how="Cross")
386
385
  df_eq(c, [], "a:int,b:int,c:int", throw=True)
387
386
 
388
387
  def test__join_inner(self):
389
388
  e = self.engine
390
- a = e.to_df([[1, 2], [3, 4]], "a:int,b:int")
391
- b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
389
+ a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4]], "a:int,b:int")
390
+ b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
392
391
  c = fa.join(a, b, how="INNER", on=["a"])
393
392
  df_eq(c, [[1, 2, 6]], "a:int,b:int,c:int", throw=True)
394
393
  c = fa.inner_join(b, a)
395
394
  df_eq(c, [[6, 1, 2]], "c:int,a:int,b:int", throw=True)
396
395
 
397
- a = e.to_df([], "a:int,b:int")
398
- b = e.to_df([], "c:int,a:int")
396
+ a = fa.as_fugue_engine_df(e, [], "a:int,b:int")
397
+ b = fa.as_fugue_engine_df(e, [], "c:int,a:int")
399
398
  c = fa.join(a, b, how="INNER", on=["a"])
400
399
  df_eq(c, [], "a:int,b:int,c:int", throw=True)
401
400
 
402
401
  def test__join_outer(self):
403
402
  e = self.engine
404
403
 
405
- a = e.to_df([], "a:int,b:int")
406
- b = e.to_df([], "c:str,a:int")
404
+ a = fa.as_fugue_engine_df(e, [], "a:int,b:int")
405
+ b = fa.as_fugue_engine_df(e, [], "c:str,a:int")
407
406
  c = fa.left_outer_join(a, b)
408
407
  df_eq(c, [], "a:int,b:int,c:str", throw=True)
409
408
 
410
- a = e.to_df([], "a:int,b:str")
411
- b = e.to_df([], "c:int,a:int")
409
+ a = fa.as_fugue_engine_df(e, [], "a:int,b:str")
410
+ b = fa.as_fugue_engine_df(e, [], "c:int,a:int")
412
411
  c = fa.right_outer_join(a, b)
413
412
  df_eq(c, [], "a:int,b:str,c:int", throw=True)
414
413
 
415
- a = e.to_df([], "a:int,b:str")
416
- b = e.to_df([], "c:str,a:int")
414
+ a = fa.as_fugue_engine_df(e, [], "a:int,b:str")
415
+ b = fa.as_fugue_engine_df(e, [], "c:str,a:int")
417
416
  c = fa.full_outer_join(a, b)
418
417
  df_eq(c, [], "a:int,b:str,c:str", throw=True)
419
418
 
420
- a = e.to_df([[1, "2"], [3, "4"]], "a:int,b:str")
421
- b = e.to_df([["6", 1], ["2", 7]], "c:str,a:int")
419
+ a = fa.as_fugue_engine_df(e, [[1, "2"], [3, "4"]], "a:int,b:str")
420
+ b = fa.as_fugue_engine_df(e, [["6", 1], ["2", 7]], "c:str,a:int")
422
421
  c = fa.join(a, b, how="left_OUTER", on=["a"])
423
422
  df_eq(c, [[1, "2", "6"], [3, "4", None]], "a:int,b:str,c:str", throw=True)
424
423
  c = fa.join(b, a, how="left_outer", on=["a"])
425
424
  df_eq(c, [["6", 1, "2"], ["2", 7, None]], "c:str,a:int,b:str", throw=True)
426
425
 
427
- a = e.to_df([[1, "2"], [3, "4"]], "a:int,b:str")
428
- b = e.to_df([[6, 1], [2, 7]], "c:double,a:int")
426
+ a = fa.as_fugue_engine_df(e, [[1, "2"], [3, "4"]], "a:int,b:str")
427
+ b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:double,a:int")
429
428
  c = fa.join(a, b, how="left_OUTER", on=["a"])
430
429
  df_eq(
431
430
  c, [[1, "2", 6.0], [3, "4", None]], "a:int,b:str,c:double", throw=True
@@ -436,8 +435,8 @@ class ExecutionEngineTests(object):
436
435
  c, [[6.0, 1, "2"], [2.0, 7, None]], "c:double,a:int,b:str", throw=True
437
436
  )
438
437
 
439
- a = e.to_df([[1, "2"], [3, "4"]], "a:int,b:str")
440
- b = e.to_df([["6", 1], ["2", 7]], "c:str,a:int")
438
+ a = fa.as_fugue_engine_df(e, [[1, "2"], [3, "4"]], "a:int,b:str")
439
+ b = fa.as_fugue_engine_df(e, [["6", 1], ["2", 7]], "c:str,a:int")
441
440
  c = fa.join(a, b, how="right_outer", on=["a"])
442
441
  # assert c.as_pandas().values.tolist()[1][1] is None
443
442
  df_eq(c, [[1, "2", "6"], [7, None, "2"]], "a:int,b:str,c:str", throw=True)
@@ -453,8 +452,8 @@ class ExecutionEngineTests(object):
453
452
  def test__join_outer_pandas_incompatible(self):
454
453
  e = self.engine
455
454
 
456
- a = e.to_df([[1, "2"], [3, "4"]], "a:int,b:str")
457
- b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
455
+ a = fa.as_fugue_engine_df(e, [[1, "2"], [3, "4"]], "a:int,b:str")
456
+ b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
458
457
  c = fa.join(a, b, how="left_OUTER", on=["a"])
459
458
  df_eq(
460
459
  c,
@@ -465,8 +464,8 @@ class ExecutionEngineTests(object):
465
464
  c = fa.join(b, a, how="left_outer", on=["a"])
466
465
  df_eq(c, [[6, 1, "2"], [2, 7, None]], "c:int,a:int,b:str", throw=True)
467
466
 
468
- a = e.to_df([[1, "2"], [3, "4"]], "a:int,b:str")
469
- b = e.to_df([[True, 1], [False, 7]], "c:bool,a:int")
467
+ a = fa.as_fugue_engine_df(e, [[1, "2"], [3, "4"]], "a:int,b:str")
468
+ b = fa.as_fugue_engine_df(e, [[True, 1], [False, 7]], "c:bool,a:int")
470
469
  c = fa.join(a, b, how="left_OUTER", on=["a"])
471
470
  df_eq(c, [[1, "2", True], [3, "4", None]], "a:int,b:str,c:bool", throw=True)
472
471
  c = fa.join(b, a, how="left_outer", on=["a"])
@@ -476,52 +475,60 @@ class ExecutionEngineTests(object):
476
475
 
477
476
  def test__join_semi(self):
478
477
  e = self.engine
479
- a = e.to_df([[1, 2], [3, 4]], "a:int,b:int")
480
- b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
478
+ a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4]], "a:int,b:int")
479
+ b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
481
480
  c = fa.join(a, b, how="semi", on=["a"])
482
481
  df_eq(c, [[1, 2]], "a:int,b:int", throw=True)
483
482
  c = fa.semi_join(b, a)
484
483
  df_eq(c, [[6, 1]], "c:int,a:int", throw=True)
485
484
 
486
- b = e.to_df([], "c:int,a:int")
485
+ b = fa.as_fugue_engine_df(e, [], "c:int,a:int")
487
486
  c = fa.join(a, b, how="semi", on=["a"])
488
487
  df_eq(c, [], "a:int,b:int", throw=True)
489
488
 
490
- a = e.to_df([], "a:int,b:int")
491
- b = e.to_df([], "c:int,a:int")
489
+ a = fa.as_fugue_engine_df(e, [], "a:int,b:int")
490
+ b = fa.as_fugue_engine_df(e, [], "c:int,a:int")
492
491
  c = fa.join(a, b, how="semi", on=["a"])
493
492
  df_eq(c, [], "a:int,b:int", throw=True)
494
493
 
495
494
  def test__join_anti(self):
496
495
  e = self.engine
497
- a = e.to_df([[1, 2], [3, 4]], "a:int,b:int")
498
- b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
496
+ a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4]], "a:int,b:int")
497
+ b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
499
498
  c = fa.join(a, b, how="anti", on=["a"])
500
499
  df_eq(c, [[3, 4]], "a:int,b:int", throw=True)
501
500
  c = fa.anti_join(b, a)
502
501
  df_eq(c, [[2, 7]], "c:int,a:int", throw=True)
503
502
 
504
- b = e.to_df([], "c:int,a:int")
503
+ b = fa.as_fugue_engine_df(e, [], "c:int,a:int")
505
504
  c = fa.join(a, b, how="anti", on=["a"])
506
505
  df_eq(c, [[1, 2], [3, 4]], "a:int,b:int", throw=True)
507
506
 
508
- a = e.to_df([], "a:int,b:int")
509
- b = e.to_df([], "c:int,a:int")
507
+ a = fa.as_fugue_engine_df(e, [], "a:int,b:int")
508
+ b = fa.as_fugue_engine_df(e, [], "c:int,a:int")
510
509
  c = fa.join(a, b, how="anti", on=["a"])
511
510
  df_eq(c, [], "a:int,b:int", throw=True)
512
511
 
513
512
  def test__join_with_null_keys(self):
514
513
  # SQL will not match null values
515
514
  e = self.engine
516
- a = e.to_df([[1, 2, 3], [4, None, 6]], "a:double,b:double,c:int")
517
- b = e.to_df([[1, 2, 33], [4, None, 63]], "a:double,b:double,d:int")
515
+ a = fa.as_fugue_engine_df(
516
+ e, [[1, 2, 3], [4, None, 6]], "a:double,b:double,c:int"
517
+ )
518
+ b = fa.as_fugue_engine_df(
519
+ e, [[1, 2, 33], [4, None, 63]], "a:double,b:double,d:int"
520
+ )
518
521
  c = fa.join(a, b, how="INNER")
519
522
  df_eq(c, [[1, 2, 3, 33]], "a:double,b:double,c:int,d:int", throw=True)
520
523
 
521
524
  def test_union(self):
522
525
  e = self.engine
523
- a = e.to_df([[1, 2, 3], [4, None, 6]], "a:double,b:double,c:int")
524
- b = e.to_df([[1, 2, 33], [4, None, 6]], "a:double,b:double,c:int")
526
+ a = fa.as_fugue_engine_df(
527
+ e, [[1, 2, 3], [4, None, 6]], "a:double,b:double,c:int"
528
+ )
529
+ b = fa.as_fugue_engine_df(
530
+ e, [[1, 2, 33], [4, None, 6]], "a:double,b:double,c:int"
531
+ )
525
532
  c = fa.union(a, b)
526
533
  df_eq(
527
534
  c,
@@ -555,8 +562,12 @@ class ExecutionEngineTests(object):
555
562
 
556
563
  def test_subtract(self):
557
564
  e = self.engine
558
- a = e.to_df([[1, 2, 3], [1, 2, 3], [4, None, 6]], "a:double,b:double,c:int")
559
- b = e.to_df([[1, 2, 33], [4, None, 6]], "a:double,b:double,c:int")
565
+ a = fa.as_fugue_engine_df(
566
+ e, [[1, 2, 3], [1, 2, 3], [4, None, 6]], "a:double,b:double,c:int"
567
+ )
568
+ b = fa.as_fugue_engine_df(
569
+ e, [[1, 2, 33], [4, None, 6]], "a:double,b:double,c:int"
570
+ )
560
571
  c = fa.subtract(a, b)
561
572
  df_eq(
562
573
  c,
@@ -564,8 +575,8 @@ class ExecutionEngineTests(object):
564
575
  "a:double,b:double,c:int",
565
576
  throw=True,
566
577
  )
567
- x = e.to_df([[1, 2, 33]], "a:double,b:double,c:int")
568
- y = e.to_df([[4, None, 6]], "a:double,b:double,c:int")
578
+ x = fa.as_fugue_engine_df(e, [[1, 2, 33]], "a:double,b:double,c:int")
579
+ y = fa.as_fugue_engine_df(e, [[4, None, 6]], "a:double,b:double,c:int")
569
580
  z = fa.subtract(a, x, y)
570
581
  df_eq(
571
582
  z,
@@ -584,10 +595,11 @@ class ExecutionEngineTests(object):
584
595
 
585
596
  def test_intersect(self):
586
597
  e = self.engine
587
- a = e.to_df(
588
- [[1, 2, 3], [4, None, 6], [4, None, 6]], "a:double,b:double,c:int"
598
+ a = fa.as_fugue_engine_df(
599
+ e, [[1, 2, 3], [4, None, 6], [4, None, 6]], "a:double,b:double,c:int"
589
600
  )
590
- b = e.to_df(
601
+ b = fa.as_fugue_engine_df(
602
+ e,
591
603
  [[1, 2, 33], [4, None, 6], [4, None, 6], [4, None, 6]],
592
604
  "a:double,b:double,c:int",
593
605
  )
@@ -598,11 +610,13 @@ class ExecutionEngineTests(object):
598
610
  "a:double,b:double,c:int",
599
611
  throw=True,
600
612
  )
601
- x = e.to_df(
613
+ x = fa.as_fugue_engine_df(
614
+ e,
602
615
  [[1, 2, 33]],
603
616
  "a:double,b:double,c:int",
604
617
  )
605
- y = e.to_df(
618
+ y = fa.as_fugue_engine_df(
619
+ e,
606
620
  [[4, None, 6], [4, None, 6], [4, None, 6]],
607
621
  "a:double,b:double,c:int",
608
622
  )
@@ -624,8 +638,8 @@ class ExecutionEngineTests(object):
624
638
 
625
639
  def test_distinct(self):
626
640
  e = self.engine
627
- a = e.to_df(
628
- [[4, None, 6], [1, 2, 3], [4, None, 6]], "a:double,b:double,c:int"
641
+ a = fa.as_fugue_engine_df(
642
+ e, [[4, None, 6], [1, 2, 3], [4, None, 6]], "a:double,b:double,c:int"
629
643
  )
630
644
  c = fa.distinct(a)
631
645
  df_eq(
@@ -637,8 +651,10 @@ class ExecutionEngineTests(object):
637
651
 
638
652
  def test_dropna(self):
639
653
  e = self.engine
640
- a = e.to_df(
641
- [[4, None, 6], [1, 2, 3], [4, None, None]], "a:double,b:double,c:double"
654
+ a = fa.as_fugue_engine_df(
655
+ e,
656
+ [[4, None, 6], [1, 2, 3], [4, None, None]],
657
+ "a:double,b:double,c:double",
642
658
  )
643
659
  c = fa.dropna(a) # default
644
660
  d = fa.dropna(a, how="all")
@@ -672,8 +688,10 @@ class ExecutionEngineTests(object):
672
688
 
673
689
  def test_fillna(self):
674
690
  e = self.engine
675
- a = e.to_df(
676
- [[4, None, 6], [1, 2, 3], [4, None, None]], "a:double,b:double,c:double"
691
+ a = fa.as_fugue_engine_df(
692
+ e,
693
+ [[4, None, 6], [1, 2, 3], [4, None, None]],
694
+ "a:double,b:double,c:double",
677
695
  )
678
696
  c = fa.fillna(a, value=1)
679
697
  d = fa.fillna(a, {"b": 99, "c": -99})
@@ -703,8 +721,8 @@ class ExecutionEngineTests(object):
703
721
  # raises(ValueError, lambda: fa.fillna(a, ["b"]))
704
722
 
705
723
  def test_sample(self):
706
- engine = self.engine
707
- a = engine.to_df([[x] for x in range(100)], "a:int")
724
+ e = self.engine
725
+ a = fa.as_fugue_engine_df(e, [[x] for x in range(100)], "a:int")
708
726
 
709
727
  with raises(ValueError):
710
728
  fa.sample(a) # must set one
@@ -725,7 +743,8 @@ class ExecutionEngineTests(object):
725
743
  e = self.engine
726
744
  ps = dict(by=["a"], presort="b DESC,c DESC")
727
745
  ps2 = dict(by=["c"], presort="b ASC")
728
- a = e.to_df(
746
+ a = fa.as_fugue_engine_df(
747
+ e,
729
748
  [
730
749
  ["a", 2, 3],
731
750
  ["a", 3, 4],
@@ -784,8 +803,8 @@ class ExecutionEngineTests(object):
784
803
  raises(ValueError, lambda: fa.take(a, n=0.5, presort=None))
785
804
 
786
805
  def test_sample_n(self):
787
- engine = self.engine
788
- a = engine.to_df([[x] for x in range(100)], "a:int")
806
+ e = self.engine
807
+ a = fa.as_fugue_engine_df(e, [[x] for x in range(100)], "a:int")
789
808
 
790
809
  b = fa.sample(a, n=90, replace=False)
791
810
  c = fa.sample(a, n=90, replace=True)
@@ -799,7 +818,7 @@ class ExecutionEngineTests(object):
799
818
 
800
819
  def test__serialize_by_partition(self):
801
820
  e = self.engine
802
- a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int")
821
+ a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4], [1, 5]], "a:int,b:int")
803
822
  s = e._serialize_by_partition(
804
823
  a, PartitionSpec(by=["a"], presort="b"), df_name="_0"
805
824
  )
@@ -814,8 +833,8 @@ class ExecutionEngineTests(object):
814
833
  def test_zip(self):
815
834
  ps = PartitionSpec(by=["a"], presort="b DESC,c DESC")
816
835
  e = self.engine
817
- a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int")
818
- b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
836
+ a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4], [1, 5]], "a:int,b:int")
837
+ b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
819
838
  sa = e._serialize_by_partition(a, ps, df_name="_0")
820
839
  sb = e._serialize_by_partition(b, ps, df_name="_1")
821
840
  # test zip with serialized dfs
@@ -874,7 +893,7 @@ class ExecutionEngineTests(object):
874
893
 
875
894
  def test_zip_all(self):
876
895
  e = self.engine
877
- a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int")
896
+ a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4], [1, 5]], "a:int,b:int")
878
897
  z = fa.persist(e.zip_all(DataFrames(a)))
879
898
  assert 1 == z.count()
880
899
  assert z.metadata.get("serialized", False)
@@ -890,8 +909,8 @@ class ExecutionEngineTests(object):
890
909
  assert z.metadata.get("serialized", False)
891
910
  assert z.metadata.get("serialized_has_name", False)
892
911
 
893
- b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
894
- c = e.to_df([[6, 1], [2, 7]], "d:int,a:int")
912
+ b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
913
+ c = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "d:int,a:int")
895
914
  z = fa.persist(e.zip_all(DataFrames(a, b, c)))
896
915
  assert 1 == z.count()
897
916
  assert not z.metadata.get("serialized_has_name", False)
@@ -918,8 +937,8 @@ class ExecutionEngineTests(object):
918
937
  def test_comap(self):
919
938
  ps = PartitionSpec(presort="b,c")
920
939
  e = self.engine
921
- a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int")
922
- b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
940
+ a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4], [1, 5]], "a:int,b:int")
941
+ b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
923
942
  z1 = fa.persist(e.zip(a, b))
924
943
  z2 = fa.persist(e.zip(a, b, partition_spec=ps, how="left_outer"))
925
944
  z3 = fa.persist(
@@ -966,9 +985,9 @@ class ExecutionEngineTests(object):
966
985
 
967
986
  def test_comap_with_key(self):
968
987
  e = self.engine
969
- a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int")
970
- b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
971
- c = e.to_df([[6, 1]], "c:int,a:int")
988
+ a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4], [1, 5]], "a:int,b:int")
989
+ b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
990
+ c = fa.as_fugue_engine_df(e, [[6, 1]], "c:int,a:int")
972
991
  z1 = fa.persist(e.zip(a, b, df1_name="x", df2_name="y"))
973
992
  z2 = fa.persist(e.zip_all(DataFrames(x=a, y=b, z=b)))
974
993
  z3 = fa.persist(
@@ -1068,47 +1087,6 @@ class ExecutionEngineTests(object):
1068
1087
  )
1069
1088
  df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:int", throw=True)
1070
1089
 
1071
- @skip_spark2
1072
- def test_save_single_and_load_avro(self):
1073
- # TODO: switch to c:int,a:long when we can preserve schema to avro
1074
- e = self.engine
1075
- b = ArrayDataFrame([[6, 1], [2, 7]], "c:long,a:long")
1076
- path = os.path.join(self.tmpdir, "a", "b")
1077
- e.fs.makedirs(path, recreate=True)
1078
- # over write folder with single file
1079
- fa.save(b, path, format_hint="avro", force_single=True)
1080
- assert e.fs.isfile(path)
1081
- c = fa.load(path, format_hint="avro", columns=["a", "c"], as_fugue=True)
1082
- df_eq(c, [[1, 6], [7, 2]], "a:long,c:long", throw=True)
1083
-
1084
- # overwirte single with folder (if applicable)
1085
- b = ArrayDataFrame([[60, 1], [20, 7]], "c:long,a:long")
1086
- fa.save(b, path, format_hint="avro", mode="overwrite")
1087
- c = fa.load(path, format_hint="avro", columns=["a", "c"], as_fugue=True)
1088
- df_eq(c, [[1, 60], [7, 20]], "a:long,c:long", throw=True)
1089
-
1090
- @skip_spark2
1091
- def test_save_and_load_avro(self):
1092
- # TODO: switch to c:int,a:long when we can preserve schema to avro
1093
- b = ArrayDataFrame([[6, 1], [2, 7]], "c:long,a:long")
1094
- path = os.path.join(self.tmpdir, "a", "b")
1095
- fa.save(b, path, format_hint="avro")
1096
- c = fa.load(path, format_hint="avro", columns=["a", "c"], as_fugue=True)
1097
- df_eq(c, [[1, 6], [7, 2]], "a:long,c:long", throw=True)
1098
-
1099
- @skip_spark2
1100
- def test_load_avro_folder(self):
1101
- # TODO: switch to c:int,a:long when we can preserve schema to avro
1102
- native = NativeExecutionEngine()
1103
- a = ArrayDataFrame([[6, 1]], "c:long,a:long")
1104
- b = ArrayDataFrame([[2, 7], [4, 8]], "c:long,a:long")
1105
- path = os.path.join(self.tmpdir, "a", "b")
1106
- fa.save(a, os.path.join(path, "a.avro"), engine=native)
1107
- fa.save(b, os.path.join(path, "b.avro"), engine=native)
1108
- FileSystem().touch(os.path.join(path, "_SUCCESS"))
1109
- c = fa.load(path, format_hint="avro", columns=["a", "c"], as_fugue=True)
1110
- df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:long", throw=True)
1111
-
1112
1090
  def test_save_single_and_load_csv(self):
1113
1091
  e = self.engine
1114
1092
  b = ArrayDataFrame([[6.1, 1.1], [2.1, 7.1]], "c:double,a:double")
@@ -1297,7 +1275,7 @@ class ExecutionEngineTests(object):
1297
1275
  b = ArrayDataFrame([[6, 1], [3, 4], [2, 7], [4, 8], [6, 7]], "c:int,a:long")
1298
1276
  path = os.path.join(self.tmpdir, "a", "b")
1299
1277
  fa.save(
1300
- e.repartition(e.to_df(b), PartitionSpec(num=2)),
1278
+ e.repartition(fa.as_fugue_engine_df(e, b), PartitionSpec(num=2)),
1301
1279
  path,
1302
1280
  format_hint="json",
1303
1281
  )
fugue_version/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.8.2"
1
+ __version__ = "0.8.4"
@@ -1,7 +1,7 @@
1
1
  import json
2
2
 
3
3
  from fugue.collections.partition import parse_presort_exp, PartitionSpec
4
- from fugue.constants import KEYWORD_CORECOUNT, KEYWORD_ROWCOUNT
4
+ from fugue.constants import KEYWORD_PARALLELISM, KEYWORD_ROWCOUNT
5
5
  from pytest import raises
6
6
  from triad.collections.schema import Schema
7
7
  from triad.utils.hash import to_uuid
@@ -148,6 +148,9 @@ def test_partition_spec():
148
148
  assert dict(a=True, d=True, e=False) == p.get_sorts(
149
149
  Schema("a:int,b:int,d:int,e:int")
150
150
  )
151
+ assert dict(d=True, e=False) == p.get_sorts(
152
+ Schema("a:int,b:int,d:int,e:int"), with_partition_keys=False
153
+ )
151
154
  p = PartitionSpec(dict(partition_by=["e", "a"], presort="d asc"))
152
155
  assert p.get_key_schema(Schema("a:int,b:int,d:int,e:int")) == "e:int,a:int"
153
156
 
@@ -228,9 +231,9 @@ def test_get_num_partitions():
228
231
  assert 6 == p.get_num_partitions(x=lambda: 1, Y=lambda: 2)
229
232
  raises(Exception, lambda: p.get_num_partitions(x=lambda: 1))
230
233
 
231
- p = PartitionSpec(dict(partition_by=["b", "a"], num="min(ROWCOUNT,CORECOUNT)"))
234
+ p = PartitionSpec(dict(partition_by=["b", "a"], num="min(ROWCOUNT,CONCURRENCY)"))
232
235
  assert 90 == p.get_num_partitions(
233
- **{KEYWORD_ROWCOUNT: lambda: 100, KEYWORD_CORECOUNT: lambda: 90}
236
+ **{KEYWORD_ROWCOUNT: lambda: 100, KEYWORD_PARALLELISM: lambda: 90}
234
237
  )
235
238
 
236
239
 
@@ -8,8 +8,7 @@ from triad import FileSystem, Schema
8
8
  from triad.collections.schema import SchemaError
9
9
  from triad.exceptions import InvalidOperationError, NoneArgumentError
10
10
 
11
- from fugue import ArrayDataFrame, ArrowDataFrame, IterableDataFrame, PandasDataFrame
12
- from fugue.dataframe import to_local_bounded_df, to_local_df
11
+ from fugue import ArrayDataFrame, IterableDataFrame, PandasDataFrame
13
12
  from fugue.dataframe.utils import _df_eq as df_eq
14
13
  from fugue.dataframe.utils import (
15
14
  _schema_eq,
@@ -24,46 +23,6 @@ from fugue.dataframe.utils import (
24
23
  )
25
24
 
26
25
 
27
- def test_to_local_df():
28
- df = ArrayDataFrame([[0, 1]], "a:int,b:int")
29
- pdf = PandasDataFrame(df.as_pandas(), "a:int,b:int")
30
- idf = IterableDataFrame([[0, 1]], "a:int,b:int")
31
- assert to_local_df(df) is df
32
- assert to_local_df(pdf) is pdf
33
- assert to_local_df(idf) is idf
34
- assert isinstance(to_local_df(df.native, "a:int,b:int"), ArrayDataFrame)
35
- assert isinstance(to_local_df(pdf.native, "a:int,b:int"), PandasDataFrame)
36
- assert isinstance(to_local_df(idf.native, "a:int,b:int"), IterableDataFrame)
37
- raises(ValueError, lambda: to_local_df(123))
38
-
39
- raises(NoneArgumentError, lambda: to_local_df(None))
40
- raises(ValueError, lambda: to_local_df(df, "a:int,b:int"))
41
-
42
-
43
- def test_to_local_bounded_df():
44
- df = ArrayDataFrame([[0, 1]], "a:int,b:int")
45
- idf = IterableDataFrame([[0, 1]], "a:int,b:int")
46
- adf = ArrowDataFrame(df.as_array(), "a:int,b:int")
47
- assert to_local_bounded_df(df) is df
48
- r = to_local_bounded_df(idf)
49
- assert r is not idf
50
- assert r.as_array() == [[0, 1]]
51
- assert r.schema == "a:int,b:int"
52
- r = to_local_bounded_df(adf.native)
53
- assert isinstance(r, ArrowDataFrame)
54
- assert r.as_array() == [[0, 1]]
55
- assert r.schema == "a:int,b:int"
56
-
57
- raises(ValueError, lambda: to_local_bounded_df(123))
58
-
59
- def rows():
60
- yield [0]
61
- yield [1]
62
-
63
- with raises(ValueError):
64
- to_local_bounded_df(rows(), schema="a:int")
65
-
66
-
67
26
  def test_schema_eq():
68
27
  assert not _schema_eq(Schema("a:int"), Schema("a:int8"))
69
28
  assert not _schema_eq(Schema("a:int"), Schema("b:int"))
@@ -85,7 +44,7 @@ def test_df_eq():
85
44
  df1 = ArrayDataFrame([[0, 100.0, "a"]], "a:int,b:double,c:str")
86
45
  df2 = ArrayDataFrame([[0, 100.001, "a"]], "a:int,b:double,c:str")
87
46
  assert df_eq(df1, df1)
88
- assert df_eq(df1, df2, digits=4)
47
+ assert df_eq(df1, df2, digits=2)
89
48
  # precision
90
49
  assert not df_eq(df1, df2, digits=6)
91
50
  # no content
@@ -1,3 +1,5 @@
1
+ from typing import Any, List
2
+
1
3
  import pandas as pd
2
4
  import pyarrow as pa
3
5
 
@@ -34,6 +36,37 @@ class NativeExecutionEngineBuiltInQPDTests(BuiltInTests.Tests):
34
36
  def test_yield_table(self):
35
37
  pass
36
38
 
39
+ def test_coarse_partition(self):
40
+ def verify_coarse_partition(df: pd.DataFrame) -> List[List[Any]]:
41
+ ct = df.a.nunique()
42
+ s = df.a * 1000 + df.b
43
+ ordered = ((s - s.shift(1)).dropna() >= 0).all(axis=None)
44
+ return [[ct, ordered]]
45
+
46
+ def assert_(df: pd.DataFrame, rc: int, n: int, check_ordered: bool) -> None:
47
+ if rc > 0:
48
+ assert len(df) == rc
49
+ assert df.ct.sum() == n
50
+ if check_ordered:
51
+ assert (df.ordered == True).all()
52
+
53
+ gps = 100
54
+ partition_num = 6
55
+ df = pd.DataFrame(dict(a=list(range(gps)) * 10, b=range(gps * 10))).sample(
56
+ frac=1.0
57
+ )
58
+ with FugueWorkflow() as dag:
59
+ a = dag.df(df)
60
+ c = a.partition(
61
+ algo="coarse", by="a", presort="b", num=partition_num
62
+ ).transform(verify_coarse_partition, schema="ct:int,ordered:bool")
63
+ dag.output(
64
+ c,
65
+ using=assert_,
66
+ params=dict(rc=0, n=gps, check_ordered=True),
67
+ )
68
+ dag.run(self.engine)
69
+
37
70
 
38
71
  def test_get_file_threshold():
39
72
  assert -1 == _get_file_threshold(None)