fugue 0.8.7.dev6__py3-none-any.whl → 0.8.7.dev7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fugue_spark/_utils/io.py CHANGED
@@ -4,7 +4,6 @@ import pyspark.sql as ps
 from pyspark.sql import SparkSession
 from triad.collections import Schema
 from triad.collections.dict import ParamDict
-from triad.collections.fs import FileSystem
 from triad.utils.assertion import assert_or_throw

 from fugue._utils.io import FileParser, save_df
@@ -16,9 +15,8 @@ from .convert import to_schema, to_spark_schema


 class SparkIO(object):
-    def __init__(self, spark_session: SparkSession, fs: FileSystem):
+    def __init__(self, spark_session: SparkSession):
         self._session = spark_session
-        self._fs = fs
         self._loads: Dict[str, Callable[..., DataFrame]] = {
             "csv": self._load_csv,
             "parquet": self._load_parquet,
@@ -41,7 +39,7 @@ class SparkIO(object):
             len(fmts) == 1, NotImplementedError("can't support multiple formats")
         )
         fmt = fmts[0]
-        files = [f.uri for f in fp]
+        files = [f.path for f in fp]
         return self._loads[fmt](files, columns, **kwargs)

     def save_df(
@@ -64,7 +62,7 @@ class SparkIO(object):
             ldf = df.as_local()
             if isinstance(ldf, PandasDataFrame) and hasattr(ldf.native, "attrs"):
                 ldf.native.attrs = {}  # pragma: no cover
-            save_df(ldf, uri, format_hint=format_hint, mode=mode, fs=self._fs, **kwargs)
+            save_df(ldf, uri, format_hint=format_hint, mode=mode, **kwargs)

     def _get_writer(
         self, sdf: ps.DataFrame, partition_spec: PartitionSpec
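Illustrative note (not part of the diff): after this change, constructing SparkIO requires only a SparkSession. A minimal sketch follows; the import path is taken from the file header above, and the session setup is assumed.

    # illustrative sketch, not from the package source
    from pyspark.sql import SparkSession
    from fugue_spark._utils.io import SparkIO

    spark = SparkSession.builder.getOrCreate()
    io = SparkIO(spark)  # previously: SparkIO(spark, FileSystem())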
@@ -11,7 +11,7 @@ from pyspark.rdd import RDD
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import broadcast, col, lit, row_number
 from pyspark.sql.window import Window
-from triad import FileSystem, IndexedOrderedDict, ParamDict, Schema, SerializableRLock
+from triad import IndexedOrderedDict, ParamDict, Schema, SerializableRLock
 from triad.utils.assertion import assert_arg_not_none, assert_or_throw
 from triad.utils.hash import to_uuid
 from triad.utils.iter import EmptyAwareIterable
@@ -360,13 +360,12 @@ class SparkExecutionEngine(ExecutionEngine):
         cf.update(ParamDict(conf))
         super().__init__(cf)
         self._lock = SerializableRLock()
-        self._fs = FileSystem()
         self._log = logging.getLogger()
         self._broadcast_func = RunOnce(
             self._broadcast, lambda *args, **kwargs: id(args[0])
         )
         self._persist_func = RunOnce(self._persist, lambda *args, **kwargs: id(args[0]))
-        self._io = SparkIO(self.spark_session, self.fs)
+        self._io = SparkIO(self.spark_session)
         self._registered_dfs: Dict[str, SparkDataFrame] = {}

     def __repr__(self) -> str:
@@ -395,10 +394,6 @@ class SparkExecutionEngine(ExecutionEngine):
     def log(self) -> logging.Logger:
         return self._log

-    @property
-    def fs(self) -> FileSystem:
-        return self._fs
-
     def create_default_sql_engine(self) -> SQLEngine:
         return SparkSQLEngine(self)

@@ -12,11 +12,12 @@ import pickle
 from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional
 from unittest import TestCase
 from uuid import uuid4
-
+from triad.utils.io import write_text, join
 import numpy as np
 import pandas as pd
 import pyarrow as pa
 import pytest
+from fsspec.implementations.local import LocalFileSystem
 from pytest import raises
 from triad import SerializableRLock

@@ -28,7 +29,6 @@ from fugue import (
     DataFrame,
     DataFrames,
     ExecutionEngine,
-    FileSystem,
     FugueWorkflow,
     LocalDataFrame,
     OutputCoTransformer,
@@ -65,6 +65,8 @@ from fugue.exceptions import (
     FugueWorkflowRuntimeValidationError,
 )

+_LOCAL_FS = LocalFileSystem(auto_mkdir=True)
+

 class BuiltInTests(object):
     """Workflow level general test suite. It is a more general end to end
@@ -633,9 +635,8 @@ class BuiltInTests(object):
         tmpdir = str(self.tmpdir)

         def incr():
-            fs = FileSystem(auto_close=False).makedirs(tmpdir, recreate=True)
-            fs.writetext(str(uuid4()) + ".txt", "")
-            return fs.glob("*.txt").count().files
+            write_text(join(tmpdir, str(uuid4()) + ".txt"), "")
+            return len(_LOCAL_FS.glob(join(tmpdir, "*.txt")))

         def t1(df: Iterable[Dict[str, Any]]) -> Iterable[Dict[str, Any]]:
             for row in df:
@@ -717,9 +718,8 @@ class BuiltInTests(object):
         tmpdir = str(self.tmpdir)

         def incr():
-            fs = FileSystem(auto_close=False).makedirs(tmpdir, recreate=True)
-            fs.writetext(str(uuid4()) + ".txt", "")
-            return fs.glob("*.tx" "t").count().files
+            write_text(join(tmpdir, str(uuid4()) + ".txt"), "")
+            return len(_LOCAL_FS.glob(join(tmpdir, "*.txt")))

         def t1(
             df: Iterable[Dict[str, Any]], df2: pd.DataFrame
@@ -1348,7 +1348,7 @@ class BuiltInTests(object):
             b.partition(num=3).save(path, fmt="parquet", single=True)
             b.save(path2, header=True)
         dag.run(self.engine)
-        assert FileSystem().isfile(path)
+        assert _LOCAL_FS.isfile(path)
         with FugueWorkflow() as dag:
             a = dag.load(path, fmt="parquet", columns=["a", "c"])
             a.assert_eq(dag.df([[1, 6], [7, 2]], "a:long,c:int"))
@@ -1359,9 +1359,9 @@
             b = dag.df([[6, 1], [2, 7]], "c:int,a:long")
             b.partition(by="c").save(path3, fmt="parquet", single=False)
         dag.run(self.engine)
-        assert FileSystem().isdir(path3)
-        assert FileSystem().isdir(os.path.join(path3, "c=6"))
-        assert FileSystem().isdir(os.path.join(path3, "c=2"))
+        assert _LOCAL_FS.isdir(path3)
+        assert _LOCAL_FS.isdir(os.path.join(path3, "c=6"))
+        assert _LOCAL_FS.isdir(os.path.join(path3, "c=2"))
         # TODO: in test below, once issue #288 is fixed, use dag.load
         # instead of pd.read_parquet
         pdf = pd.read_parquet(path3).sort_values("a").reset_index(drop=True)
@@ -15,8 +15,8 @@ from unittest import TestCase
 import pandas as pd
 import pytest
 from pytest import raises
-from triad.collections.fs import FileSystem
 from triad.exceptions import InvalidOperationError
+from triad.utils.io import isfile, makedirs, touch

 import fugue.api as fa
 import fugue.column.functions as ff
@@ -62,7 +62,6 @@ class ExecutionEngineTests(object):
     def test_init(self):
         print(self.engine)
         assert self.engine.log is not None
-        assert self.engine.fs is not None
         assert copy.copy(self.engine) is self.engine
         assert copy.deepcopy(self.engine) is self.engine

@@ -985,17 +984,16 @@ class ExecutionEngineTests(object):
         df_eq(res, [[1, "z1"]], "a:int,v:str", throw=True)

     @pytest.fixture(autouse=True)
-    def init_tmpdir(self, tmpdir):
+    def init_tmpdir(self, tmpdir, tmp_mem_dir):
         self.tmpdir = tmpdir

     def test_save_single_and_load_parquet(self):
-        e = self.engine
         b = ArrayDataFrame([[6, 1], [2, 7]], "c:int,a:long")
         path = os.path.join(self.tmpdir, "a", "b")
-        e.fs.makedirs(path, recreate=True)
+        makedirs(path, exist_ok=True)
         # over write folder with single file
         fa.save(b, path, format_hint="parquet", force_single=True)
-        assert e.fs.isfile(path)
+        assert isfile(path)
         c = fa.load(path, format_hint="parquet", columns=["a", "c"], as_fugue=True)
         df_eq(c, [[1, 6], [7, 2]], "a:long,c:int", throw=True)

@@ -1019,7 +1017,7 @@
         path = os.path.join(self.tmpdir, "a", "b")
         fa.save(a, os.path.join(path, "a.parquet"), engine=native)
         fa.save(b, os.path.join(path, "b.parquet"), engine=native)
-        FileSystem().touch(os.path.join(path, "_SUCCESS"))
+        touch(os.path.join(path, "_SUCCESS"))
         c = fa.load(path, format_hint="parquet", columns=["a", "c"], as_fugue=True)
         df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:int", throw=True)

@@ -1038,13 +1036,12 @@
         df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:int", throw=True)

     def test_save_single_and_load_csv(self):
-        e = self.engine
         b = ArrayDataFrame([[6.1, 1.1], [2.1, 7.1]], "c:double,a:double")
         path = os.path.join(self.tmpdir, "a", "b")
-        e.fs.makedirs(path, recreate=True)
+        makedirs(path, exist_ok=True)
         # over write folder with single file
         fa.save(b, path, format_hint="csv", header=True, force_single=True)
-        assert e.fs.isfile(path)
+        assert isfile(path)
         c = fa.load(
             path, format_hint="csv", header=True, infer_schema=False, as_fugue=True
         )
@@ -1099,13 +1096,12 @@
         df_eq(c, [["1.1", "60.1"], ["7.1", "20.1"]], "a:str,c:str", throw=True)

     def test_save_single_and_load_csv_no_header(self):
-        e = self.engine
         b = ArrayDataFrame([[6.1, 1.1], [2.1, 7.1]], "c:double,a:double")
         path = os.path.join(self.tmpdir, "a", "b")
-        e.fs.makedirs(path, recreate=True)
+        makedirs(path, exist_ok=True)
         # over write folder with single file
         fa.save(b, path, format_hint="csv", header=False, force_single=True)
-        assert e.fs.isfile(path)
+        assert isfile(path)

         with raises(ValueError):
             c = fa.load(
@@ -1190,7 +1186,7 @@
             header=True,
             engine=native,
         )
-        FileSystem().touch(os.path.join(path, "_SUCCESS"))
+        touch(os.path.join(path, "_SUCCESS"))
         c = fa.load(
             path,
             format_hint="csv",
@@ -1204,13 +1200,12 @@
         )

     def test_save_single_and_load_json(self):
-        e = self.engine
         b = ArrayDataFrame([[6, 1], [2, 7]], "c:int,a:long")
         path = os.path.join(self.tmpdir, "a", "b")
-        e.fs.makedirs(path, recreate=True)
+        makedirs(path, exist_ok=True)
         # over write folder with single file
         fa.save(b, path, format_hint="json", force_single=True)
-        assert e.fs.isfile(path)
+        assert isfile(path)
         c = fa.load(path, format_hint="json", columns=["a", "c"], as_fugue=True)
         df_eq(c, [[1, 6], [7, 2]], "a:long,c:long", throw=True)

@@ -1241,7 +1236,7 @@
         path = os.path.join(self.tmpdir, "a", "b")
         fa.save(a, os.path.join(path, "a.json"), format_hint="json", engine=native)
         fa.save(b, os.path.join(path, "b.json"), format_hint="json", engine=native)
-        FileSystem().touch(os.path.join(path, "_SUCCESS"))
+        touch(os.path.join(path, "_SUCCESS"))
         c = fa.load(path, format_hint="json", columns=["a", "c"], as_fugue=True)
         df_eq(c, [[1, 6], [7, 2], [8, 4], [4, 3]], "a:long,c:long", throw=True)

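Side note (not part of the diff): the updated suites use plain functions from triad.utils.io together with an fsspec LocalFileSystem in place of triad's FileSystem class. A short illustrative sketch of that pattern, restricted to calls that appear in the hunks above; the temporary directory path is hypothetical.

    # illustrative sketch, not from the package source
    import os
    from uuid import uuid4

    from fsspec.implementations.local import LocalFileSystem
    from triad.utils.io import join, makedirs, touch, write_text

    fs = LocalFileSystem(auto_mkdir=True)
    tmpdir = "/tmp/fugue_demo"  # hypothetical directory

    makedirs(tmpdir, exist_ok=True)                      # create the directory if missing
    write_text(join(tmpdir, str(uuid4()) + ".txt"), "")  # write an empty text file
    touch(os.path.join(tmpdir, "_SUCCESS"))              # create an empty marker file
    print(len(fs.glob(join(tmpdir, "*.txt"))))           # count the matching files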
@@ -0,0 +1,2 @@
+# flake8: noqa
+from .fixtures import tmp_mem_dir
@@ -0,0 +1,18 @@
+import uuid
+
+import pytest
+from triad.utils.io import makedirs, rm
+
+
+@pytest.fixture
+def tmp_mem_dir():
+    uuid_str = str(uuid.uuid4())[:5]
+    path = "memory://test_" + uuid_str
+    makedirs(path)
+    try:
+        yield path
+    finally:
+        try:
+            rm(path, recursive=True)
+        except Exception:  # pragma: no cover
+            pass
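Usage note (not part of the diff): the new tmp_mem_dir fixture yields a unique memory:// path and removes it after the test. A hypothetical usage sketch, assuming the fixture is registered (for example via the plugin import added above) and that write_text/isfile from triad.utils.io accept memory:// URLs just as the fixture's own makedirs/rm calls do.

    # hypothetical usage sketch, not from the package source
    from triad.utils.io import isfile, join, write_text

    def test_write_to_memory_fs(tmp_mem_dir):
        path = join(tmp_mem_dir, "hello.txt")  # file inside the in-memory temp dir
        write_text(path, "hello")
        assert isfile(path)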