daplapath 2.0.7__tar.gz → 2.0.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {daplapath-2.0.7 → daplapath-2.0.9}/PKG-INFO +14 -17
- {daplapath-2.0.7 → daplapath-2.0.9}/README.md +13 -16
- {daplapath-2.0.7 → daplapath-2.0.9}/daplapath/path.py +73 -32
- {daplapath-2.0.7 → daplapath-2.0.9}/pyproject.toml +1 -1
- {daplapath-2.0.7 → daplapath-2.0.9}/LICENSE.md +0 -0
- {daplapath-2.0.7 → daplapath-2.0.9}/daplapath/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: daplapath
|
|
3
|
-
Version: 2.0.7
|
|
3
|
+
Version: 2.0.9
|
|
4
4
|
Summary: A pathlib.Path class for dapla
|
|
5
5
|
License: MIT
|
|
6
6
|
Author: ort
|
|
@@ -35,14 +35,14 @@ from daplapath.path import Path
|
|
|
35
35
|
|
|
36
36
|
|
|
37
37
|
```python
|
|
38
|
-
folder = Path('ssb-
|
|
38
|
+
folder = Path('ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024')
|
|
39
39
|
folder
|
|
40
40
|
```
|
|
41
41
|
|
|
42
42
|
|
|
43
43
|
|
|
44
44
|
|
|
45
|
-
'ssb-
|
|
45
|
+
'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024'
|
|
46
46
|
|
|
47
47
|
|
|
48
48
|
|
|
@@ -107,7 +107,7 @@ file
|
|
|
107
107
|
|
|
108
108
|
|
|
109
109
|
|
|
110
|
-
'ssb-
|
|
110
|
+
'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v1.parquet'
|
|
111
111
|
|
|
112
112
|
|
|
113
113
|
|
|
@@ -119,7 +119,7 @@ file.parent
|
|
|
119
119
|
|
|
120
120
|
|
|
121
121
|
|
|
122
|
-
'ssb-
|
|
122
|
+
'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024'
|
|
123
123
|
|
|
124
124
|
|
|
125
125
|
|
|
@@ -135,8 +135,7 @@ file.columns
|
|
|
135
135
|
|
|
136
136
|
|
|
137
137
|
|
|
138
|
-
Index(['
|
|
139
|
-
'SHAPE_Area', 'geometry'],
|
|
138
|
+
Index(['objtype', 'navn', "komm_nr", "fylke_nr", 'areal_gdb', 'geometry'],
|
|
140
139
|
dtype='object')
|
|
141
140
|
|
|
142
141
|
|
|
@@ -149,13 +148,11 @@ file.dtypes
|
|
|
149
148
|
|
|
150
149
|
|
|
151
150
|
|
|
152
|
-
|
|
153
|
-
|
|
151
|
+
objtype string
|
|
152
|
+
navn string
|
|
154
153
|
komm_nr string
|
|
155
154
|
fylke_nr string
|
|
156
|
-
|
|
157
|
-
SHAPE_Length double
|
|
158
|
-
SHAPE_Area double
|
|
155
|
+
areal_gdb double
|
|
159
156
|
geometry binary
|
|
160
157
|
dtype: object
|
|
161
158
|
|
|
@@ -206,7 +203,7 @@ file.latest_version()
|
|
|
206
203
|
|
|
207
204
|
|
|
208
205
|
|
|
209
|
-
'ssb-
|
|
206
|
+
'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v1.parquet'
|
|
210
207
|
|
|
211
208
|
|
|
212
209
|
|
|
@@ -218,7 +215,7 @@ file.highest_numbered_version()
|
|
|
218
215
|
|
|
219
216
|
|
|
220
217
|
|
|
221
|
-
'ssb-
|
|
218
|
+
'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v1.parquet'
|
|
222
219
|
|
|
223
220
|
|
|
224
221
|
|
|
@@ -231,7 +228,7 @@ file.new_version()
|
|
|
231
228
|
|
|
232
229
|
|
|
233
230
|
|
|
234
|
-
'ssb-
|
|
231
|
+
'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v2.parquet'
|
|
235
232
|
|
|
236
233
|
|
|
237
234
|
|
|
@@ -268,11 +265,11 @@ Filtre med hyperlenke. Gjør at man kopierer stien når man klikker på den.
|
|
|
268
265
|
|
|
269
266
|
```python
|
|
270
267
|
print(
|
|
271
|
-
Path("ssb-
|
|
268
|
+
Path("ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data").tree()
|
|
272
269
|
)
|
|
273
270
|
```
|
|
274
271
|
|
|
275
|
-
ssb-
|
|
272
|
+
ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data /
|
|
276
273
|
└──2000 /
|
|
277
274
|
└──SSB_tettsted_flate_p2000.parquet
|
|
278
275
|
└──SSB_tettsted_flate_p2000_v1.parquet
|
|
@@ -19,14 +19,14 @@ from daplapath.path import Path
|
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
```python
|
|
22
|
-
folder = Path('ssb-
|
|
22
|
+
folder = Path('ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024')
|
|
23
23
|
folder
|
|
24
24
|
```
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
|
|
29
|
-
'ssb-
|
|
29
|
+
'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024'
|
|
30
30
|
|
|
31
31
|
|
|
32
32
|
|
|
@@ -91,7 +91,7 @@ file
|
|
|
91
91
|
|
|
92
92
|
|
|
93
93
|
|
|
94
|
-
'ssb-
|
|
94
|
+
'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v1.parquet'
|
|
95
95
|
|
|
96
96
|
|
|
97
97
|
|
|
@@ -103,7 +103,7 @@ file.parent
|
|
|
103
103
|
|
|
104
104
|
|
|
105
105
|
|
|
106
|
-
'ssb-
|
|
106
|
+
'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024'
|
|
107
107
|
|
|
108
108
|
|
|
109
109
|
|
|
@@ -119,8 +119,7 @@ file.columns
|
|
|
119
119
|
|
|
120
120
|
|
|
121
121
|
|
|
122
|
-
Index(['
|
|
123
|
-
'SHAPE_Area', 'geometry'],
|
|
122
|
+
Index(['objtype', 'navn', "komm_nr", "fylke_nr", 'areal_gdb', 'geometry'],
|
|
124
123
|
dtype='object')
|
|
125
124
|
|
|
126
125
|
|
|
@@ -133,13 +132,11 @@ file.dtypes
|
|
|
133
132
|
|
|
134
133
|
|
|
135
134
|
|
|
136
|
-
|
|
137
|
-
|
|
135
|
+
objtype string
|
|
136
|
+
navn string
|
|
138
137
|
komm_nr string
|
|
139
138
|
fylke_nr string
|
|
140
|
-
|
|
141
|
-
SHAPE_Length double
|
|
142
|
-
SHAPE_Area double
|
|
139
|
+
areal_gdb double
|
|
143
140
|
geometry binary
|
|
144
141
|
dtype: object
|
|
145
142
|
|
|
@@ -190,7 +187,7 @@ file.latest_version()
|
|
|
190
187
|
|
|
191
188
|
|
|
192
189
|
|
|
193
|
-
'ssb-
|
|
190
|
+
'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v1.parquet'
|
|
194
191
|
|
|
195
192
|
|
|
196
193
|
|
|
@@ -202,7 +199,7 @@ file.highest_numbered_version()
|
|
|
202
199
|
|
|
203
200
|
|
|
204
201
|
|
|
205
|
-
'ssb-
|
|
202
|
+
'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v1.parquet'
|
|
206
203
|
|
|
207
204
|
|
|
208
205
|
|
|
@@ -215,7 +212,7 @@ file.new_version()
|
|
|
215
212
|
|
|
216
213
|
|
|
217
214
|
|
|
218
|
-
'ssb-
|
|
215
|
+
'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v2.parquet'
|
|
219
216
|
|
|
220
217
|
|
|
221
218
|
|
|
@@ -252,11 +249,11 @@ Filtre med hyperlenke. Gjør at man kopierer stien når man klikker på den.
|
|
|
252
249
|
|
|
253
250
|
```python
|
|
254
251
|
print(
|
|
255
|
-
Path("ssb-
|
|
252
|
+
Path("ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data").tree()
|
|
256
253
|
)
|
|
257
254
|
```
|
|
258
255
|
|
|
259
|
-
ssb-
|
|
256
|
+
ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data /
|
|
260
257
|
└──2000 /
|
|
261
258
|
└──SSB_tettsted_flate_p2000.parquet
|
|
262
259
|
└──SSB_tettsted_flate_p2000_v1.parquet
|
|
@@ -96,7 +96,7 @@ class LocalFileSystem(AbstractFileSystem):
|
|
|
96
96
|
return cls.cp_file(source, destination, **kwargs)
|
|
97
97
|
|
|
98
98
|
@staticmethod
|
|
99
|
-
def cp_file(
|
|
99
|
+
def cp_file(source, destination, **kwargs):
|
|
100
100
|
os.makedirs(pathlib.Path(destination).parent, exist_ok=True)
|
|
101
101
|
return shutil.copy2(source, destination, **kwargs)
|
|
102
102
|
|
|
@@ -252,11 +252,16 @@ class Path(str, _PathBase):
|
|
|
252
252
|
def rglob(self, pattern: str, **kwargs) -> "PathSeries":
|
|
253
253
|
return self.glob(pattern, recursive=True, **kwargs)
|
|
254
254
|
|
|
255
|
-
def glob(
|
|
255
|
+
def glob(
|
|
256
|
+
self, pattern: str | None = None, recursive: bool = True, **kwargs
|
|
257
|
+
) -> "PathSeries":
|
|
256
258
|
"""Create PathSeries of files/directories that match the pattern."""
|
|
257
259
|
recursive = kwargs.get("recurse_symlinks", recursive)
|
|
258
260
|
|
|
259
|
-
|
|
261
|
+
if pattern:
|
|
262
|
+
pattern = str(self / pattern)
|
|
263
|
+
else:
|
|
264
|
+
pattern = str(self)
|
|
260
265
|
|
|
261
266
|
# pop kwargs going into PathSeries initialiser.
|
|
262
267
|
iterable_init_args = get_arguments(self._iterable_type)
|
|
@@ -300,6 +305,17 @@ class Path(str, _PathBase):
|
|
|
300
305
|
"""
|
|
301
306
|
return self.glob("**", recursive=recursive, **kwargs)
|
|
302
307
|
|
|
308
|
+
def rmdir(self) -> None:
|
|
309
|
+
files = self.glob("**").files
|
|
310
|
+
with ThreadPoolExecutor() as executor:
|
|
311
|
+
list(executor.map(self.file_system.rm_file, files))
|
|
312
|
+
|
|
313
|
+
def cp(self, destination: "Path | str") -> "Path":
|
|
314
|
+
return self._cp_or_mv(destination, "cp")
|
|
315
|
+
|
|
316
|
+
def mv(self, destination: "Path | str") -> "Path":
|
|
317
|
+
return self._cp_or_mv(destination, "mv")
|
|
318
|
+
|
|
303
319
|
def versions(self, include_versionless: bool = False) -> "PathSeries":
|
|
304
320
|
"""Returns a PathSeries of all versions of the file."""
|
|
305
321
|
files_in_folder: Iterable[Path] = self.parent.glob("**", recursive=False)
|
|
@@ -583,7 +599,7 @@ class Path(str, _PathBase):
|
|
|
583
599
|
|
|
584
600
|
@property
|
|
585
601
|
def index_column_names(self) -> list[str]:
|
|
586
|
-
return _get_index_cols(self.schema)
|
|
602
|
+
return _get_index_cols(self.schema, self)
|
|
587
603
|
|
|
588
604
|
@property
|
|
589
605
|
def columns(self) -> pd.Index:
|
|
@@ -596,7 +612,7 @@ class Path(str, _PathBase):
|
|
|
596
612
|
]
|
|
597
613
|
except (KeyError, TypeError):
|
|
598
614
|
names = schema.names
|
|
599
|
-
index_cols = _get_index_cols(schema)
|
|
615
|
+
index_cols = _get_index_cols(schema, self)
|
|
600
616
|
return pd.Index(names).difference(index_cols)
|
|
601
617
|
|
|
602
618
|
@property
|
|
@@ -605,16 +621,14 @@ class Path(str, _PathBase):
|
|
|
605
621
|
try:
|
|
606
622
|
with self.open("rb") as file:
|
|
607
623
|
return get_schema(file)
|
|
608
|
-
except
|
|
609
|
-
Exception
|
|
610
|
-
): # (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
|
|
624
|
+
except Exception:
|
|
611
625
|
return get_schema(self)
|
|
612
626
|
|
|
613
627
|
@property
|
|
614
628
|
def dtypes(self) -> pd.Series:
|
|
615
629
|
"""Date types of the file's columns."""
|
|
616
630
|
schema = self.schema
|
|
617
|
-
index_cols = _get_index_cols(schema)
|
|
631
|
+
index_cols = _get_index_cols(schema, self)
|
|
618
632
|
return pd.Series(schema.types, index=schema.names).loc[
|
|
619
633
|
lambda x: ~x.index.isin(index_cols)
|
|
620
634
|
]
|
|
@@ -625,9 +639,7 @@ class Path(str, _PathBase):
|
|
|
625
639
|
try:
|
|
626
640
|
with self.open("rb") as file:
|
|
627
641
|
return get_shape(file)
|
|
628
|
-
except
|
|
629
|
-
Exception
|
|
630
|
-
): # (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
|
|
642
|
+
except Exception:
|
|
631
643
|
return get_shape(self)
|
|
632
644
|
|
|
633
645
|
@property
|
|
@@ -722,10 +734,10 @@ class Path(str, _PathBase):
|
|
|
722
734
|
|
|
723
735
|
Example
|
|
724
736
|
-------
|
|
725
|
-
>>> folder = 'ssb-
|
|
737
|
+
>>> folder = 'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2023'
|
|
726
738
|
>>> file_path = folder / "ABAS_kommune_flate_p2023_v1.parquet"
|
|
727
739
|
>>> file_path
|
|
728
|
-
'ssb-
|
|
740
|
+
'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2023/ABAS_kommune_flate_p2023_v1.parquet'
|
|
729
741
|
"""
|
|
730
742
|
if not isinstance(other, (str, PurePath, os.PathLike)):
|
|
731
743
|
raise TypeError(
|
|
@@ -783,6 +795,27 @@ class Path(str, _PathBase):
|
|
|
783
795
|
def _new(self, new_path: str | Path) -> "Path":
|
|
784
796
|
return self.__class__(new_path, self.file_system)
|
|
785
797
|
|
|
798
|
+
def _cp_or_mv(self, destination: "Path | str", attr: str) -> "Path":
|
|
799
|
+
func: Callable = getattr(self.file_system, attr)
|
|
800
|
+
try:
|
|
801
|
+
func(self, destination)
|
|
802
|
+
except FileNotFoundError:
|
|
803
|
+
destination = self.__class__(destination)
|
|
804
|
+
sources = list(self.glob("**").files)
|
|
805
|
+
destinations = [path.replace(self, destination) for path in sources]
|
|
806
|
+
with ThreadPoolExecutor() as executor:
|
|
807
|
+
list(executor.map(func, sources, destinations))
|
|
808
|
+
self._new(destination)
|
|
809
|
+
|
|
810
|
+
def keep_newest_partitions(self) -> "Path":
|
|
811
|
+
def _keep_newest(path):
|
|
812
|
+
while True:
|
|
813
|
+
if path.isfile():
|
|
814
|
+
pass
|
|
815
|
+
|
|
816
|
+
with ThreadPoolExecutor() as executor:
|
|
817
|
+
list(executor.map(_keep_newest, self.ls()))
|
|
818
|
+
|
|
786
819
|
|
|
787
820
|
class PathSeries(pd.Series, _PathBase):
|
|
788
821
|
"""A pandas Series for working with GCS (Google Cloud Storage) paths.
|
|
@@ -1480,8 +1513,11 @@ def get_path_tree(
|
|
|
1480
1513
|
return tree
|
|
1481
1514
|
|
|
1482
1515
|
|
|
1483
|
-
def _get_index_cols(schema: pyarrow.Schema) -> list[str]:
|
|
1484
|
-
|
|
1516
|
+
def _get_index_cols(schema: pyarrow.Schema, path_or_file: str | Path) -> list[str]:
|
|
1517
|
+
try:
|
|
1518
|
+
cols = json.loads(schema.metadata[b"pandas"])["index_columns"]
|
|
1519
|
+
except KeyError as e:
|
|
1520
|
+
raise KeyError(f"{e}. For {type(path_or_file)}: {path_or_file}")
|
|
1485
1521
|
return [x for x in cols if not isinstance(x, dict)]
|
|
1486
1522
|
|
|
1487
1523
|
|
|
@@ -1526,29 +1562,34 @@ def get_schema(file) -> pyarrow.Schema:
|
|
|
1526
1562
|
def _get_schema(path):
|
|
1527
1563
|
try:
|
|
1528
1564
|
return pq.read_schema(path)
|
|
1529
|
-
except FileNotFoundError:
|
|
1530
|
-
|
|
1531
|
-
|
|
1565
|
+
except FileNotFoundError as e:
|
|
1566
|
+
try:
|
|
1567
|
+
with file_system.open(path, "rb") as f:
|
|
1568
|
+
return pq.read_schema(f)
|
|
1569
|
+
except Exception as e2:
|
|
1570
|
+
raise e2.__class__(f"{e2}. {path}") from e
|
|
1571
|
+
|
|
1572
|
+
child_paths = file_system.glob(file + "/**/*.parquet")
|
|
1573
|
+
if not len(child_paths):
|
|
1574
|
+
raise e.__class__(f"{e}: {file}") from e
|
|
1532
1575
|
|
|
1533
1576
|
with ThreadPoolExecutor() as executor:
|
|
1534
|
-
|
|
1535
|
-
|
|
1536
|
-
executor.map(_get_schema, file_system.glob(file + "/**/*.parquet"))
|
|
1537
|
-
),
|
|
1538
|
-
promote_options="permissive",
|
|
1577
|
+
schemas: list[pyarrow.Schema] = list(
|
|
1578
|
+
executor.map(_get_schema, file_system.glob(file + "/**/*.parquet"))
|
|
1539
1579
|
)
|
|
1580
|
+
if not schemas:
|
|
1581
|
+
raise ValueError(f"Couldn't find any schemas among {child_paths}.") from e
|
|
1582
|
+
|
|
1583
|
+
return pyarrow.unify_schemas(
|
|
1584
|
+
schemas,
|
|
1585
|
+
promote_options="permissive",
|
|
1586
|
+
)
|
|
1540
1587
|
|
|
1541
1588
|
|
|
1542
1589
|
def get_num_rows(file):
|
|
1543
1590
|
try:
|
|
1544
1591
|
return pq.read_metadata(file).num_rows
|
|
1545
|
-
except
|
|
1546
|
-
PermissionError,
|
|
1547
|
-
pyarrow.ArrowInvalid,
|
|
1548
|
-
FileNotFoundError,
|
|
1549
|
-
TypeError,
|
|
1550
|
-
OSError,
|
|
1551
|
-
) as e:
|
|
1592
|
+
except Exception as e:
|
|
1552
1593
|
try:
|
|
1553
1594
|
return ds.dataset(file).count_rows()
|
|
1554
1595
|
except Exception as e2:
|
|
@@ -1565,7 +1606,7 @@ def get_num_rows(file):
|
|
|
1565
1606
|
|
|
1566
1607
|
def get_shape(file) -> tuple[int, int]:
|
|
1567
1608
|
schema = get_schema(file)
|
|
1568
|
-
index_cols = _get_index_cols(schema)
|
|
1609
|
+
index_cols = _get_index_cols(schema, file)
|
|
1569
1610
|
ncol: int = sum(name not in index_cols for name in schema.names)
|
|
1570
1611
|
nrow: int = get_num_rows(file)
|
|
1571
1612
|
return nrow, ncol
|
|
File without changes
|
|
File without changes
|