daplapath 2.0.7__tar.gz → 2.0.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: daplapath
3
- Version: 2.0.7
3
+ Version: 2.0.9
4
4
  Summary: A pathlib.Path class for dapla
5
5
  License: MIT
6
6
  Author: ort
@@ -35,14 +35,14 @@ from daplapath.path import Path
35
35
 
36
36
 
37
37
  ```python
38
- folder = Path('ssb-kart-data-delt-geo-prod/analyse_data/klargjorte-data/2024')
38
+ folder = Path('ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024')
39
39
  folder
40
40
  ```
41
41
 
42
42
 
43
43
 
44
44
 
45
- 'ssb-kart-data-delt-geo-prod/analyse_data/klargjorte-data/2024'
45
+ 'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024'
46
46
 
47
47
 
48
48
 
@@ -107,7 +107,7 @@ file
107
107
 
108
108
 
109
109
 
110
- 'ssb-kart-data-delt-geo-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v1.parquet'
110
+ 'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v1.parquet'
111
111
 
112
112
 
113
113
 
@@ -119,7 +119,7 @@ file.parent
119
119
 
120
120
 
121
121
 
122
- 'ssb-kart-data-delt-geo-prod/analyse_data/klargjorte-data/2024'
122
+ 'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024'
123
123
 
124
124
 
125
125
 
@@ -135,8 +135,7 @@ file.columns
135
135
 
136
136
 
137
137
 
138
- Index(['OBJTYPE', 'NAVN', "komm_nr", "fylke_nr", 'AREAL_GDB', 'SHAPE_Length',
139
- 'SHAPE_Area', 'geometry'],
138
+ Index(['objtype', 'navn', "komm_nr", "fylke_nr", 'areal_gdb', 'geometry'],
140
139
  dtype='object')
141
140
 
142
141
 
@@ -149,13 +148,11 @@ file.dtypes
149
148
 
150
149
 
151
150
 
152
- OBJTYPE string
153
- NAVN string
151
+ objtype string
152
+ navn string
154
153
  komm_nr string
155
154
  fylke_nr string
156
- AREAL_GDB double
157
- SHAPE_Length double
158
- SHAPE_Area double
155
+ areal_gdb double
159
156
  geometry binary
160
157
  dtype: object
161
158
 
@@ -206,7 +203,7 @@ file.latest_version()
206
203
 
207
204
 
208
205
 
209
- 'ssb-kart-data-delt-geo-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v1.parquet'
206
+ 'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v1.parquet'
210
207
 
211
208
 
212
209
 
@@ -218,7 +215,7 @@ file.highest_numbered_version()
218
215
 
219
216
 
220
217
 
221
- 'ssb-kart-data-delt-geo-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v1.parquet'
218
+ 'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v1.parquet'
222
219
 
223
220
 
224
221
 
@@ -231,7 +228,7 @@ file.new_version()
231
228
 
232
229
 
233
230
 
234
- 'ssb-kart-data-delt-geo-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v2.parquet'
231
+ 'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v2.parquet'
235
232
 
236
233
 
237
234
 
@@ -268,11 +265,11 @@ Filtre med hyperlenke. Gjør at man kopierer stien når man klikker på den.
268
265
 
269
266
  ```python
270
267
  print(
271
- Path("ssb-kart-data-delt-geo-prod/analyse_data/klargjorte-data").tree()
268
+ Path("ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data").tree()
272
269
  )
273
270
  ```
274
271
 
275
- ssb-kart-data-delt-geo-prod/analyse_data/klargjorte-data /
272
+ ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data /
276
273
  └──2000 /
277
274
  └──SSB_tettsted_flate_p2000.parquet
278
275
  └──SSB_tettsted_flate_p2000_v1.parquet
@@ -19,14 +19,14 @@ from daplapath.path import Path
19
19
 
20
20
 
21
21
  ```python
22
- folder = Path('ssb-kart-data-delt-geo-prod/analyse_data/klargjorte-data/2024')
22
+ folder = Path('ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024')
23
23
  folder
24
24
  ```
25
25
 
26
26
 
27
27
 
28
28
 
29
- 'ssb-kart-data-delt-geo-prod/analyse_data/klargjorte-data/2024'
29
+ 'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024'
30
30
 
31
31
 
32
32
 
@@ -91,7 +91,7 @@ file
91
91
 
92
92
 
93
93
 
94
- 'ssb-kart-data-delt-geo-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v1.parquet'
94
+ 'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v1.parquet'
95
95
 
96
96
 
97
97
 
@@ -103,7 +103,7 @@ file.parent
103
103
 
104
104
 
105
105
 
106
- 'ssb-kart-data-delt-geo-prod/analyse_data/klargjorte-data/2024'
106
+ 'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024'
107
107
 
108
108
 
109
109
 
@@ -119,8 +119,7 @@ file.columns
119
119
 
120
120
 
121
121
 
122
- Index(['OBJTYPE', 'NAVN', "komm_nr", "fylke_nr", 'AREAL_GDB', 'SHAPE_Length',
123
- 'SHAPE_Area', 'geometry'],
122
+ Index(['objtype', 'navn', "komm_nr", "fylke_nr", 'areal_gdb', 'geometry'],
124
123
  dtype='object')
125
124
 
126
125
 
@@ -133,13 +132,11 @@ file.dtypes
133
132
 
134
133
 
135
134
 
136
- OBJTYPE string
137
- NAVN string
135
+ objtype string
136
+ navn string
138
137
  komm_nr string
139
138
  fylke_nr string
140
- AREAL_GDB double
141
- SHAPE_Length double
142
- SHAPE_Area double
139
+ areal_gdb double
143
140
  geometry binary
144
141
  dtype: object
145
142
 
@@ -190,7 +187,7 @@ file.latest_version()
190
187
 
191
188
 
192
189
 
193
- 'ssb-kart-data-delt-geo-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v1.parquet'
190
+ 'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v1.parquet'
194
191
 
195
192
 
196
193
 
@@ -202,7 +199,7 @@ file.highest_numbered_version()
202
199
 
203
200
 
204
201
 
205
- 'ssb-kart-data-delt-geo-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v1.parquet'
202
+ 'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v1.parquet'
206
203
 
207
204
 
208
205
 
@@ -215,7 +212,7 @@ file.new_version()
215
212
 
216
213
 
217
214
 
218
- 'ssb-kart-data-delt-geo-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v2.parquet'
215
+ 'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v2.parquet'
219
216
 
220
217
 
221
218
 
@@ -252,11 +249,11 @@ Filtre med hyperlenke. Gjør at man kopierer stien når man klikker på den.
252
249
 
253
250
  ```python
254
251
  print(
255
- Path("ssb-kart-data-delt-geo-prod/analyse_data/klargjorte-data").tree()
252
+ Path("ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data").tree()
256
253
  )
257
254
  ```
258
255
 
259
- ssb-kart-data-delt-geo-prod/analyse_data/klargjorte-data /
256
+ ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data /
260
257
  └──2000 /
261
258
  └──SSB_tettsted_flate_p2000.parquet
262
259
  └──SSB_tettsted_flate_p2000_v1.parquet
@@ -96,7 +96,7 @@ class LocalFileSystem(AbstractFileSystem):
96
96
  return cls.cp_file(source, destination, **kwargs)
97
97
 
98
98
  @staticmethod
99
- def cp_file(self, path1, path2, **kwargs):
99
+ def cp_file(source, destination, **kwargs):
100
100
  os.makedirs(pathlib.Path(destination).parent, exist_ok=True)
101
101
  return shutil.copy2(source, destination, **kwargs)
102
102
 
@@ -252,11 +252,16 @@ class Path(str, _PathBase):
252
252
  def rglob(self, pattern: str, **kwargs) -> "PathSeries":
253
253
  return self.glob(pattern, recursive=True, **kwargs)
254
254
 
255
- def glob(self, pattern: str, recursive: bool = True, **kwargs) -> "PathSeries":
255
+ def glob(
256
+ self, pattern: str | None = None, recursive: bool = True, **kwargs
257
+ ) -> "PathSeries":
256
258
  """Create PathSeries of files/directories that match the pattern."""
257
259
  recursive = kwargs.get("recurse_symlinks", recursive)
258
260
 
259
- pattern = str(self / pattern)
261
+ if pattern:
262
+ pattern = str(self / pattern)
263
+ else:
264
+ pattern = str(self)
260
265
 
261
266
  # pop kwargs going into PathSeries initialiser.
262
267
  iterable_init_args = get_arguments(self._iterable_type)
@@ -300,6 +305,17 @@ class Path(str, _PathBase):
300
305
  """
301
306
  return self.glob("**", recursive=recursive, **kwargs)
302
307
 
308
+ def rmdir(self) -> None:
309
+ files = self.glob("**").files
310
+ with ThreadPoolExecutor() as executor:
311
+ list(executor.map(self.file_system.rm_file, files))
312
+
313
+ def cp(self, destination: "Path | str") -> "Path":
314
+ return self._cp_or_mv(destination, "cp")
315
+
316
+ def mv(self, destination: "Path | str") -> "Path":
317
+ return self._cp_or_mv(destination, "mv")
318
+
303
319
  def versions(self, include_versionless: bool = False) -> "PathSeries":
304
320
  """Returns a PathSeries of all versions of the file."""
305
321
  files_in_folder: Iterable[Path] = self.parent.glob("**", recursive=False)
@@ -583,7 +599,7 @@ class Path(str, _PathBase):
583
599
 
584
600
  @property
585
601
  def index_column_names(self) -> list[str]:
586
- return _get_index_cols(self.schema)
602
+ return _get_index_cols(self.schema, self)
587
603
 
588
604
  @property
589
605
  def columns(self) -> pd.Index:
@@ -596,7 +612,7 @@ class Path(str, _PathBase):
596
612
  ]
597
613
  except (KeyError, TypeError):
598
614
  names = schema.names
599
- index_cols = _get_index_cols(schema)
615
+ index_cols = _get_index_cols(schema, self)
600
616
  return pd.Index(names).difference(index_cols)
601
617
 
602
618
  @property
@@ -605,16 +621,14 @@ class Path(str, _PathBase):
605
621
  try:
606
622
  with self.open("rb") as file:
607
623
  return get_schema(file)
608
- except (
609
- Exception
610
- ): # (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
624
+ except Exception:
611
625
  return get_schema(self)
612
626
 
613
627
  @property
614
628
  def dtypes(self) -> pd.Series:
615
629
  """Date types of the file's columns."""
616
630
  schema = self.schema
617
- index_cols = _get_index_cols(schema)
631
+ index_cols = _get_index_cols(schema, self)
618
632
  return pd.Series(schema.types, index=schema.names).loc[
619
633
  lambda x: ~x.index.isin(index_cols)
620
634
  ]
@@ -625,9 +639,7 @@ class Path(str, _PathBase):
625
639
  try:
626
640
  with self.open("rb") as file:
627
641
  return get_shape(file)
628
- except (
629
- Exception
630
- ): # (PermissionError, FileNotFoundError, TypeError, IsADirectoryError):
642
+ except Exception:
631
643
  return get_shape(self)
632
644
 
633
645
  @property
@@ -722,10 +734,10 @@ class Path(str, _PathBase):
722
734
 
723
735
  Example
724
736
  -------
725
- >>> folder = 'ssb-kart-data-delt-geo-prod/analyse_data/klargjorte-data/2023'
737
+ >>> folder = 'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2023'
726
738
  >>> file_path = folder / "ABAS_kommune_flate_p2023_v1.parquet"
727
739
  >>> file_path
728
- 'ssb-kart-data-delt-geo-prod/analyse_data/klargjorte-data/2023/ABAS_kommune_flate_p2023_v1.parquet'
740
+ 'ssb-areal-data-delt-kart-prod/analyse_data/klargjorte-data/2023/ABAS_kommune_flate_p2023_v1.parquet'
729
741
  """
730
742
  if not isinstance(other, (str, PurePath, os.PathLike)):
731
743
  raise TypeError(
@@ -783,6 +795,27 @@ class Path(str, _PathBase):
783
795
  def _new(self, new_path: str | Path) -> "Path":
784
796
  return self.__class__(new_path, self.file_system)
785
797
 
798
+ def _cp_or_mv(self, destination: "Path | str", attr: str) -> "Path":
799
+ func: Callable = getattr(self.file_system, attr)
800
+ try:
801
+ func(self, destination)
802
+ except FileNotFoundError:
803
+ destination = self.__class__(destination)
804
+ sources = list(self.glob("**").files)
805
+ destinations = [path.replace(self, destination) for path in sources]
806
+ with ThreadPoolExecutor() as executor:
807
+ list(executor.map(func, sources, destinations))
808
+ self._new(destination)
809
+
810
+ def keep_newest_partitions(self) -> "Path":
811
+ def _keep_newest(path):
812
+ while True:
813
+ if path.isfile():
814
+ pass
815
+
816
+ with ThreadPoolExecutor() as executor:
817
+ list(executor.map(_keep_newest, self.ls()))
818
+
786
819
 
787
820
  class PathSeries(pd.Series, _PathBase):
788
821
  """A pandas Series for working with GCS (Google Cloud Storage) paths.
@@ -1480,8 +1513,11 @@ def get_path_tree(
1480
1513
  return tree
1481
1514
 
1482
1515
 
1483
- def _get_index_cols(schema: pyarrow.Schema) -> list[str]:
1484
- cols = json.loads(schema.metadata[b"pandas"])["index_columns"]
1516
+ def _get_index_cols(schema: pyarrow.Schema, path_or_file: str | Path) -> list[str]:
1517
+ try:
1518
+ cols = json.loads(schema.metadata[b"pandas"])["index_columns"]
1519
+ except KeyError as e:
1520
+ raise KeyError(f"{e}. For {type(path_or_file)}: {path_or_file}")
1485
1521
  return [x for x in cols if not isinstance(x, dict)]
1486
1522
 
1487
1523
 
@@ -1526,29 +1562,34 @@ def get_schema(file) -> pyarrow.Schema:
1526
1562
  def _get_schema(path):
1527
1563
  try:
1528
1564
  return pq.read_schema(path)
1529
- except FileNotFoundError:
1530
- with file_system.open(path, "rb") as f:
1531
- return pq.read_schema(f)
1565
+ except FileNotFoundError as e:
1566
+ try:
1567
+ with file_system.open(path, "rb") as f:
1568
+ return pq.read_schema(f)
1569
+ except Exception as e2:
1570
+ raise e2.__class__(f"{e2}. {path}") from e
1571
+
1572
+ child_paths = file_system.glob(file + "/**/*.parquet")
1573
+ if not len(child_paths):
1574
+ raise e.__class__(f"{e}: {file}") from e
1532
1575
 
1533
1576
  with ThreadPoolExecutor() as executor:
1534
- return pyarrow.unify_schemas(
1535
- list(
1536
- executor.map(_get_schema, file_system.glob(file + "/**/*.parquet"))
1537
- ),
1538
- promote_options="permissive",
1577
+ schemas: list[pyarrow.Schema] = list(
1578
+ executor.map(_get_schema, file_system.glob(file + "/**/*.parquet"))
1539
1579
  )
1580
+ if not schemas:
1581
+ raise ValueError(f"Couldn't find any schemas among {child_paths}.") from e
1582
+
1583
+ return pyarrow.unify_schemas(
1584
+ schemas,
1585
+ promote_options="permissive",
1586
+ )
1540
1587
 
1541
1588
 
1542
1589
  def get_num_rows(file):
1543
1590
  try:
1544
1591
  return pq.read_metadata(file).num_rows
1545
- except (
1546
- PermissionError,
1547
- pyarrow.ArrowInvalid,
1548
- FileNotFoundError,
1549
- TypeError,
1550
- OSError,
1551
- ) as e:
1592
+ except Exception as e:
1552
1593
  try:
1553
1594
  return ds.dataset(file).count_rows()
1554
1595
  except Exception as e2:
@@ -1565,7 +1606,7 @@ def get_num_rows(file):
1565
1606
 
1566
1607
  def get_shape(file) -> tuple[int, int]:
1567
1608
  schema = get_schema(file)
1568
- index_cols = _get_index_cols(schema)
1609
+ index_cols = _get_index_cols(schema, file)
1569
1610
  ncol: int = sum(name not in index_cols for name in schema.names)
1570
1611
  nrow: int = get_num_rows(file)
1571
1612
  return nrow, ncol
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "daplapath"
3
- version = "2.0.7"
3
+ version = "2.0.9"
4
4
  description = "A pathlib.Path class for dapla"
5
5
  authors = ["ort <ort@ssb.no>"]
6
6
  license = "MIT"
File without changes