daplapath 2.0.8__tar.gz → 2.0.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {daplapath-2.0.8 → daplapath-2.0.9}/PKG-INFO +1 -1
- {daplapath-2.0.8 → daplapath-2.0.9}/daplapath/path.py +24 -17
- {daplapath-2.0.8 → daplapath-2.0.9}/pyproject.toml +1 -1
- {daplapath-2.0.8 → daplapath-2.0.9}/LICENSE.md +0 -0
- {daplapath-2.0.8 → daplapath-2.0.9}/README.md +0 -0
- {daplapath-2.0.8 → daplapath-2.0.9}/daplapath/__init__.py +0 -0
|
@@ -599,7 +599,7 @@ class Path(str, _PathBase):
|
|
|
599
599
|
|
|
600
600
|
@property
|
|
601
601
|
def index_column_names(self) -> list[str]:
|
|
602
|
-
return _get_index_cols(self.schema)
|
|
602
|
+
return _get_index_cols(self.schema, self)
|
|
603
603
|
|
|
604
604
|
@property
|
|
605
605
|
def columns(self) -> pd.Index:
|
|
@@ -612,7 +612,7 @@ class Path(str, _PathBase):
|
|
|
612
612
|
]
|
|
613
613
|
except (KeyError, TypeError):
|
|
614
614
|
names = schema.names
|
|
615
|
-
index_cols = _get_index_cols(schema)
|
|
615
|
+
index_cols = _get_index_cols(schema, self)
|
|
616
616
|
return pd.Index(names).difference(index_cols)
|
|
617
617
|
|
|
618
618
|
@property
|
|
@@ -621,16 +621,14 @@ class Path(str, _PathBase):
|
|
|
621
621
|
try:
|
|
622
622
|
with self.open("rb") as file:
|
|
623
623
|
return get_schema(file)
|
|
624
|
-
except
|
|
625
|
-
Exception
|
|
626
|
-
): # (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
|
|
624
|
+
except Exception:
|
|
627
625
|
return get_schema(self)
|
|
628
626
|
|
|
629
627
|
@property
|
|
630
628
|
def dtypes(self) -> pd.Series:
|
|
631
629
|
"""Data types of the file's columns."""
|
|
632
630
|
schema = self.schema
|
|
633
|
-
index_cols = _get_index_cols(schema)
|
|
631
|
+
index_cols = _get_index_cols(schema, self)
|
|
634
632
|
return pd.Series(schema.types, index=schema.names).loc[
|
|
635
633
|
lambda x: ~x.index.isin(index_cols)
|
|
636
634
|
]
|
|
@@ -641,9 +639,7 @@ class Path(str, _PathBase):
|
|
|
641
639
|
try:
|
|
642
640
|
with self.open("rb") as file:
|
|
643
641
|
return get_shape(file)
|
|
644
|
-
except
|
|
645
|
-
Exception
|
|
646
|
-
): # (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
|
|
642
|
+
except Exception:
|
|
647
643
|
return get_shape(self)
|
|
648
644
|
|
|
649
645
|
@property
|
|
@@ -1517,8 +1513,11 @@ def get_path_tree(
|
|
|
1517
1513
|
return tree
|
|
1518
1514
|
|
|
1519
1515
|
|
|
1520
|
-
def _get_index_cols(schema: pyarrow.Schema) -> list[str]:
|
|
1521
|
-
|
|
1516
|
+
def _get_index_cols(schema: pyarrow.Schema, path_or_file: str | Path) -> list[str]:
|
|
1517
|
+
try:
|
|
1518
|
+
cols = json.loads(schema.metadata[b"pandas"])["index_columns"]
|
|
1519
|
+
except KeyError as e:
|
|
1520
|
+
raise KeyError(f"{e}. For {type(path_or_file)}: {path_or_file}")
|
|
1522
1521
|
return [x for x in cols if not isinstance(x, dict)]
|
|
1523
1522
|
|
|
1524
1523
|
|
|
@@ -1570,13 +1569,21 @@ def get_schema(file) -> pyarrow.Schema:
|
|
|
1570
1569
|
except Exception as e2:
|
|
1571
1570
|
raise e2.__class__(f"{e2}. {path}") from e
|
|
1572
1571
|
|
|
1572
|
+
child_paths = file_system.glob(file + "/**/*.parquet")
|
|
1573
|
+
if not len(child_paths):
|
|
1574
|
+
raise e.__class__(f"{e}: {file}") from e
|
|
1575
|
+
|
|
1573
1576
|
with ThreadPoolExecutor() as executor:
|
|
1574
|
-
|
|
1575
|
-
|
|
1576
|
-
executor.map(_get_schema, file_system.glob(file + "/**/*.parquet"))
|
|
1577
|
-
),
|
|
1578
|
-
promote_options="permissive",
|
|
1577
|
+
schemas: list[pyarrow.Schema] = list(
|
|
1578
|
+
executor.map(_get_schema, file_system.glob(file + "/**/*.parquet"))
|
|
1579
1579
|
)
|
|
1580
|
+
if not schemas:
|
|
1581
|
+
raise ValueError(f"Couldn't find any schemas among {child_paths}.") from e
|
|
1582
|
+
|
|
1583
|
+
return pyarrow.unify_schemas(
|
|
1584
|
+
schemas,
|
|
1585
|
+
promote_options="permissive",
|
|
1586
|
+
)
|
|
1580
1587
|
|
|
1581
1588
|
|
|
1582
1589
|
def get_num_rows(file):
|
|
@@ -1599,7 +1606,7 @@ def get_num_rows(file):
|
|
|
1599
1606
|
|
|
1600
1607
|
def get_shape(file) -> tuple[int, int]:
|
|
1601
1608
|
schema = get_schema(file)
|
|
1602
|
-
index_cols = _get_index_cols(schema)
|
|
1609
|
+
index_cols = _get_index_cols(schema, file)
|
|
1603
1610
|
ncol: int = sum(name not in index_cols for name in schema.names)
|
|
1604
1611
|
nrow: int = get_num_rows(file)
|
|
1605
1612
|
return nrow, ncol
|
|
File without changes
|
|
File without changes
|
|
File without changes
|