daplapath 2.0.8__tar.gz → 2.0.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: daplapath
-Version: 2.0.8
+Version: 2.0.10
 Summary: A pathlib.Path class for dapla
 License: MIT
 Author: ort
@@ -31,6 +31,11 @@ try:
 except ImportError:
     pass

+try:
+    from google.cloud import storage
+except ImportError:
+    pass
+
 # regex with the prefix '_v' followed by an integer of any length
 VERSION_PATTERN = r"_v(\d+)\."
 VERSION_PREFIX = "_v"
@@ -113,15 +118,26 @@ class LocalFileSystem(AbstractFileSystem):
         return os.makedirs(path, exist_ok=exist_ok)


-class GCSFileSystem(gcsfs.GCSFileSystem):
+class MyGCSFileSystem(gcsfs.GCSFileSystem):
     def isdir(self, path: str) -> bool:
         """Check if path is a directory."""
         info = super(gcsfs.GCSFileSystem, self).info(path)
         return info["type"] == "directory"

+    def rmdir(self, path: str) -> None:
+        """Remove contents of a directory in GCS. It might take some time before files are actually deleted."""
+        path = pathlib.Path(path)
+        remaining = self.glob(str(path / "**"))
+        assert all(self.isdir(x) for x in remaining), remaining
+        storage_client = storage.Client()
+        bucket = storage_client.get_bucket(path.parts[0])
+        blobs = bucket.list_blobs(prefix="/".join(path.parts) + "/")
+        for blob in blobs:
+            blob.delete()
+

 if any("dapla" in key.lower() for key in os.environ) and "gcsfs" in locals():
-    _config = Config(GCSFileSystem)
+    _config = Config(MyGCSFileSystem)
 else:
     _config = Config(LocalFileSystem)

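Note: the new MyGCSFileSystem.rmdir bypasses gcsfs and deletes objects directly through the google-cloud-storage client, after asserting that only directory placeholders remain under the path. A minimal sketch of the same prefix-deletion pattern, with a hypothetical bucket name and prefix:

    # Sketch only: delete every object under a "directory" prefix in GCS.
    # "my-bucket" and "data/archive/" are hypothetical names for illustration.
    from google.cloud import storage

    client = storage.Client()
    bucket = client.get_bucket("my-bucket")
    # list_blobs(prefix=...) yields every object whose name starts with the prefix,
    # so deleting them all removes the pseudo-directory and everything below it.
    for blob in bucket.list_blobs(prefix="data/archive/"):
        blob.delete()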
@@ -314,7 +330,14 @@ class Path(str, _PathBase):
         return self._cp_or_mv(destination, "cp")

     def mv(self, destination: "Path | str") -> "Path":
-        return self._cp_or_mv(destination, "mv")
+        was_dir = self.isdir()
+        out_path = self._cp_or_mv(destination, "mv")
+        if was_dir:
+            try:
+                self.file_system.rmdir(str(self))
+            except (FileNotFoundError, NotADirectoryError):
+                pass
+        return out_path

     def versions(self, include_versionless: bool = False) -> "PathSeries":
         """Returns a PathSeries of all versions of the file."""
@@ -599,7 +622,7 @@ class Path(str, _PathBase):

     @property
     def index_column_names(self) -> list[str]:
-        return _get_index_cols(self.schema)
+        return _get_index_cols(self.schema, self)

     @property
     def columns(self) -> pd.Index:
@@ -612,7 +635,7 @@ class Path(str, _PathBase):
             ]
         except (KeyError, TypeError):
             names = schema.names
-        index_cols = _get_index_cols(schema)
+        index_cols = _get_index_cols(schema, self)
         return pd.Index(names).difference(index_cols)

     @property
@@ -621,16 +644,14 @@ class Path(str, _PathBase):
         try:
             with self.open("rb") as file:
                 return get_schema(file)
-        except (
-            Exception
-        ):  # (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
+        except Exception:
             return get_schema(self)

     @property
     def dtypes(self) -> pd.Series:
         """Date types of the file's columns."""
         schema = self.schema
-        index_cols = _get_index_cols(schema)
+        index_cols = _get_index_cols(schema, self)
         return pd.Series(schema.types, index=schema.names).loc[
             lambda x: ~x.index.isin(index_cols)
         ]
@@ -641,9 +662,7 @@ class Path(str, _PathBase):
         try:
             with self.open("rb") as file:
                 return get_shape(file)
-        except (
-            Exception
-        ):  # (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
+        except Exception:
             return get_shape(self)

     @property
@@ -702,7 +721,12 @@ class Path(str, _PathBase):
     def partition_root(self) -> "Path":
         if ".parquet" not in self:
             return self
-        return self.split(".parquet")[0] + ".parquet"
+        return self._new(self.split(".parquet")[0] + ".parquet")
+
+    def is_partitioned(self) -> bool:
+        if ".parquet" not in self or self.isfile() and self.count(".parquet") != 2:
+            return False
+        return bool(len(self.glob("**/*.parquet")))

     def isfile(self) -> bool:
         return not self.isdir()
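Note: the new is_partitioned treats a path as partitioned when it has parquet files nested below it; a regular file only passes the first check when ".parquet" appears twice in its path, i.e. a leaf file inside a ".parquet" dataset directory. A small illustration of the string-level part of that check, using hypothetical paths:

    leaf = "my-bucket/table.parquet/year=2024/part-0.parquet"  # hypothetical partitioned layout
    single = "my-bucket/plain.parquet"                          # hypothetical plain file
    print(leaf.count(".parquet"))    # 2 -> leaf file inside a partitioned dataset
    print(single.count(".parquet"))  # 1 -> plain, unpartitioned file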
@@ -1517,8 +1541,11 @@ def get_path_tree(
     return tree


-def _get_index_cols(schema: pyarrow.Schema) -> list[str]:
-    cols = json.loads(schema.metadata[b"pandas"])["index_columns"]
+def _get_index_cols(schema: pyarrow.Schema, path_or_file: str | Path) -> list[str]:
+    try:
+        cols = json.loads(schema.metadata[b"pandas"])["index_columns"]
+    except KeyError as e:
+        raise KeyError(f"{e}. For {type(path_or_file)}: {path_or_file}")
     return [x for x in cols if not isinstance(x, dict)]


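Note: _get_index_cols reads the index columns that pandas records in the Arrow schema metadata under the b"pandas" key; schemas written without pandas metadata lack that key, which is the KeyError the new message now annotates with the offending path or file object. A self-contained sketch of where that metadata comes from:

    import json

    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({"a": [1, 2]}, index=pd.Index([10, 20], name="idx"))
    table = pa.Table.from_pandas(df)

    # Tables converted from pandas carry a b"pandas" metadata entry listing the index columns.
    meta = json.loads(table.schema.metadata[b"pandas"])
    print(meta["index_columns"])  # ['idx']

    # A schema built directly in pyarrow has no such entry, so the lookup raises KeyError.
    bare = pa.schema([("a", pa.int64())])
    print(bare.metadata)  # None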
@@ -1570,13 +1597,21 @@ def get_schema(file) -> pyarrow.Schema:
         except Exception as e2:
             raise e2.__class__(f"{e2}. {path}") from e

+    child_paths = file_system.glob(file + "/**/*.parquet")
+    if not len(child_paths):
+        raise e.__class__(f"{e}: {file}") from e
+
     with ThreadPoolExecutor() as executor:
-        return pyarrow.unify_schemas(
-            list(
-                executor.map(_get_schema, file_system.glob(file + "/**/*.parquet"))
-            ),
-            promote_options="permissive",
+        schemas: list[pyarrow.Schema] = list(
+            executor.map(_get_schema, file_system.glob(file + "/**/*.parquet"))
         )
+    if not schemas:
+        raise ValueError(f"Couldn't find any schemas among {child_paths}.") from e
+
+    return pyarrow.unify_schemas(
+        schemas,
+        promote_options="permissive",
+    )


 def get_num_rows(file):
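Note: for a partitioned dataset, get_schema now collects one schema per child parquet file and merges them with pyarrow.unify_schemas; "permissive" promotion lets fields whose types differ across files be widened instead of raising. A minimal, self-contained example of that merge:

    import pyarrow as pa

    # Two partitions that disagree on the type of column "x".
    s1 = pa.schema([("x", pa.int32()), ("y", pa.string())])
    s2 = pa.schema([("x", pa.int64())])

    unified = pa.unify_schemas([s1, s2], promote_options="permissive")
    print(unified)  # x: int64, y: string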
@@ -1599,7 +1634,7 @@ def get_num_rows(file):

 def get_shape(file) -> tuple[int, int]:
     schema = get_schema(file)
-    index_cols = _get_index_cols(schema)
+    index_cols = _get_index_cols(schema, file)
     ncol: int = sum(name not in index_cols for name in schema.names)
     nrow: int = get_num_rows(file)
     return nrow, ncol
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "daplapath"
-version = "2.0.8"
+version = "2.0.10"
 description = "A pathlib.Path class for dapla"
 authors = ["ort <ort@ssb.no>"]
 license = "MIT"