daplapath 2.0.8__tar.gz → 2.0.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: daplapath
3
- Version: 2.0.8
3
+ Version: 2.0.9
4
4
  Summary: A pathlib.Path class for dapla
5
5
  License: MIT
6
6
  Author: ort
@@ -599,7 +599,7 @@ class Path(str, _PathBase):
599
599
 
600
600
  @property
601
601
  def index_column_names(self) -> list[str]:
602
- return _get_index_cols(self.schema)
602
+ return _get_index_cols(self.schema, self)
603
603
 
604
604
  @property
605
605
  def columns(self) -> pd.Index:
@@ -612,7 +612,7 @@ class Path(str, _PathBase):
612
612
  ]
613
613
  except (KeyError, TypeError):
614
614
  names = schema.names
615
- index_cols = _get_index_cols(schema)
615
+ index_cols = _get_index_cols(schema, self)
616
616
  return pd.Index(names).difference(index_cols)
617
617
 
618
618
  @property
@@ -621,16 +621,14 @@ class Path(str, _PathBase):
621
621
  try:
622
622
  with self.open("rb") as file:
623
623
  return get_schema(file)
624
- except (
625
- Exception
626
- ): # (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
624
+ except Exception:
627
625
  return get_schema(self)
628
626
 
629
627
  @property
630
628
  def dtypes(self) -> pd.Series:
631
629
  """Data types of the file's columns."""
632
630
  schema = self.schema
633
- index_cols = _get_index_cols(schema)
631
+ index_cols = _get_index_cols(schema, self)
634
632
  return pd.Series(schema.types, index=schema.names).loc[
635
633
  lambda x: ~x.index.isin(index_cols)
636
634
  ]
@@ -641,9 +639,7 @@ class Path(str, _PathBase):
641
639
  try:
642
640
  with self.open("rb") as file:
643
641
  return get_shape(file)
644
- except (
645
- Exception
646
- ): # (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
642
+ except Exception:
647
643
  return get_shape(self)
648
644
 
649
645
  @property
@@ -1517,8 +1513,11 @@ def get_path_tree(
1517
1513
  return tree
1518
1514
 
1519
1515
 
1520
- def _get_index_cols(schema: pyarrow.Schema) -> list[str]:
1521
- cols = json.loads(schema.metadata[b"pandas"])["index_columns"]
1516
+ def _get_index_cols(schema: pyarrow.Schema, path_or_file: str | Path) -> list[str]:
1517
+ try:
1518
+ cols = json.loads(schema.metadata[b"pandas"])["index_columns"]
1519
+ except KeyError as e:
1520
+ raise KeyError(f"{e}. For {type(path_or_file)}: {path_or_file}")
1522
1521
  return [x for x in cols if not isinstance(x, dict)]
1523
1522
 
1524
1523
 
@@ -1570,13 +1569,21 @@ def get_schema(file) -> pyarrow.Schema:
1570
1569
  except Exception as e2:
1571
1570
  raise e2.__class__(f"{e2}. {path}") from e
1572
1571
 
1572
+ child_paths = file_system.glob(file + "/**/*.parquet")
1573
+ if not len(child_paths):
1574
+ raise e.__class__(f"{e}: {file}") from e
1575
+
1573
1576
  with ThreadPoolExecutor() as executor:
1574
- return pyarrow.unify_schemas(
1575
- list(
1576
- executor.map(_get_schema, file_system.glob(file + "/**/*.parquet"))
1577
- ),
1578
- promote_options="permissive",
1577
+ schemas: list[pyarrow.Schema] = list(
1578
+ executor.map(_get_schema, file_system.glob(file + "/**/*.parquet"))
1579
1579
  )
1580
+ if not schemas:
1581
+ raise ValueError(f"Couldn't find any schemas among {child_paths}.") from e
1582
+
1583
+ return pyarrow.unify_schemas(
1584
+ schemas,
1585
+ promote_options="permissive",
1586
+ )
1580
1587
 
1581
1588
 
1582
1589
  def get_num_rows(file):
@@ -1599,7 +1606,7 @@ def get_num_rows(file):
1599
1606
 
1600
1607
  def get_shape(file) -> tuple[int, int]:
1601
1608
  schema = get_schema(file)
1602
- index_cols = _get_index_cols(schema)
1609
+ index_cols = _get_index_cols(schema, file)
1603
1610
  ncol: int = sum(name not in index_cols for name in schema.names)
1604
1611
  nrow: int = get_num_rows(file)
1605
1612
  return nrow, ncol
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "daplapath"
3
- version = "2.0.8"
3
+ version = "2.0.9"
4
4
  description = "A pathlib.Path class for dapla"
5
5
  authors = ["ort <ort@ssb.no>"]
6
6
  license = "MIT"
File without changes
File without changes