daplapath 1.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Statistics Norway
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,606 @@
1
+ Metadata-Version: 2.1
2
+ Name: daplapath
3
+ Version: 1.0.2
4
+ Summary: A pathlib.Path class for dapla
5
+ License: MIT
6
+ Author: ort
7
+ Author-email: ort@ssb.no
8
+ Requires-Python: >=3.10,<4
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Requires-Dist: dapla-toolbelt (>=1.3.2)
15
+ Description-Content-Type: text/markdown
16
+
17
+ # dapla-path
18
+
19
+ pathlib.Path for dapla
20
+
21
+ Opprettet av:
22
+ ort <ort@ssb.no>
23
+
24
+ ---
25
+
26
+ # Path (dapla)
27
+
28
+
29
+ ```python
30
+ import dapla as dp
31
+ import pandas as pd
32
+
33
+ from daplapath.path import Path
34
+ ```
35
+
36
+
37
+ ```python
38
+ folder = Path('ssb-kart-data-delt-prod/analyse_data/klargjorte-data/2024')
39
+ folder
40
+ ```
41
+
42
+
43
+
44
+
45
+ 'ssb-kart-data-delt-prod/analyse_data/klargjorte-data/2024'
46
+
47
+
48
+
49
+ ## Fungerer som tekst
50
+
51
+
52
+ ```python
53
+ folder.startswith("ssb")
54
+ ```
55
+
56
+
57
+
58
+
59
+ True
60
+
61
+
62
+
63
+
64
+ ```python
65
+ dp.FileClient.get_gcs_file_system().exists(folder)
66
+ ```
67
+
68
+
69
+
70
+
71
+ True
72
+
73
+
74
+
75
+ ## Med metoder og attributter à la pathlib.Path
76
+
77
+
78
+ ```python
79
+ folder.exists()
80
+ ```
81
+
82
+
83
+
84
+
85
+ True
86
+
87
+
88
+
89
+
90
+ ```python
91
+ folder.is_dir()
92
+ ```
93
+
94
+
95
+
96
+
97
+ True
98
+
99
+
100
+
101
+
102
+ ```python
103
+ file = folder / "ABAS_kommune_utenhav_p2024_v1.parquet"
104
+ file
105
+ ```
106
+
107
+
108
+
109
+
110
+ 'ssb-kart-data-delt-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v1.parquet'
111
+
112
+
113
+
114
+
115
+ ```python
116
+ file.parent
117
+ ```
118
+
119
+
120
+
121
+
122
+ 'ssb-kart-data-delt-prod/analyse_data/klargjorte-data/2024'
123
+
124
+
125
+
126
+ ## Og noen pandas-attributter
127
+
128
+ Uten å lese filen
129
+
130
+
131
+ ```python
132
+ file.columns
133
+ ```
134
+
135
+
136
+
137
+
138
+ Index(['OBJTYPE', 'NAVN', 'KOMMUNENR', 'FYLKE', 'AREAL_GDB', 'SHAPE_Length',
139
+ 'SHAPE_Area', 'geometry'],
140
+ dtype='object')
141
+
142
+
143
+
144
+
145
+ ```python
146
+ file.dtypes
147
+ ```
148
+
149
+
150
+
151
+
152
+ OBJTYPE string
153
+ NAVN string
154
+ KOMMUNENR string
155
+ FYLKE string
156
+ AREAL_GDB double
157
+ SHAPE_Length double
158
+ SHAPE_Area double
159
+ geometry binary
160
+ dtype: object
161
+
162
+
163
+
164
+
165
+ ```python
166
+ file.shape
167
+ ```
168
+
169
+
170
+
171
+
172
+ (481, 8)
173
+
174
+
175
+
176
+ ## Versjonering
177
+
178
+
179
+ ```python
180
+ file.version_number
181
+ ```
182
+
183
+
184
+
185
+
186
+ 1
187
+
188
+
189
+
190
+
191
+ ```python
192
+ print(file.versions())
193
+ ```
194
+
195
+ timestamp mb (int)
196
+ 2024-05-19 12:31:02 941 .../ABAS_kommune_utenhav_p2024.parquet
197
+ 2024-08-16 16:15:10 941 .../ABAS_kommune_utenhav_p2024_v1.parquet
198
+ Name: path, dtype: object
199
+
200
+
201
+
202
+ ```python
203
+ file.latest_version()
204
+ ```
205
+
206
+
207
+
208
+
209
+ 'ssb-kart-data-delt-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v1.parquet'
210
+
211
+
212
+
213
+
214
+ ```python
215
+ file.highest_numbered_version()
216
+ ```
217
+
218
+
219
+
220
+
221
+ 'ssb-kart-data-delt-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v1.parquet'
222
+
223
+
224
+
225
+
226
+ ```python
227
+ # highest_numbered_version + 1
228
+ file.new_version()
229
+ ```
230
+
231
+
232
+
233
+
234
+ 'ssb-kart-data-delt-prod/analyse_data/klargjorte-data/2024/ABAS_kommune_utenhav_p2024_v2.parquet'
235
+
236
+
237
+
238
+
239
+ ```python
240
+ # alltid False
241
+ file.new_version().exists()
242
+ ```
243
+
244
+
245
+
246
+
247
+ False
248
+
249
+
250
+
251
+
252
+ ```python
253
+ # finner/fjerner versjonsnummer med regex-søk
254
+ file._version_pattern
255
+ ```
256
+
257
+
258
+
259
+
260
+ '_v(\\d+)'
261
+
262
+
263
+
264
+ ## Branch tree
265
+
266
+ Fil-tre med hyperlenker. Stien kopieres når man klikker på den.
267
+
268
+
269
+ ```python
270
+ print(
271
+ Path("ssb-kart-data-delt-prod/analyse_data/klargjorte-data").tree()
272
+ )
273
+ ```
274
+
275
+ ssb-kart-data-delt-prod/analyse_data/klargjorte-data /
276
+ └──2000 /
277
+ └──SSB_tettsted_flate_p2000.parquet
278
+ └──SSB_tettsted_flate_p2000_v1.parquet
279
+ └──2002 /
280
+ └──SSB_tettsted_flate_p2002.parquet
281
+ └──SSB_tettsted_flate_p2002_v1.parquet
282
+ └──2003 /
283
+ └──SSB_tettsted_flate_p2003.parquet
284
+ └──SSB_tettsted_flate_p2003_v1.parquet
285
+ └──2004 /
286
+ └──SSB_tettsted_flate_p2004.parquet
287
+ └──SSB_tettsted_flate_p2004_v1.parquet
288
+ └──2005 /
289
+ └──SSB_tettsted_flate_p2005.parquet
290
+ └──SSB_tettsted_flate_p2005_v1.parquet
291
+ └──2006 /
292
+ └──SSB_tettsted_flate_p2006.parquet
293
+ └──SSB_tettsted_flate_p2006_v1.parquet
294
+ └──2007 /
295
+ └──SSB_tettsted_flate_p2007.parquet
296
+ └──SSB_tettsted_flate_p2007_v1.parquet
297
+ └──2008 /
298
+ └──SSB_tettsted_flate_p2008.parquet
299
+ └──SSB_tettsted_flate_p2008_v1.parquet
300
+ └──SSB_tettsted_ringbuffer_p2008.parquet
301
+ └──(...)
302
+ └──2009 /
303
+ └──SSB_tettsted_flate_p2009.parquet
304
+ └──SSB_tettsted_flate_p2009_v1.parquet
305
+ └──2010 /
306
+ └──SOL_arealressurs_flate_p2010.parquet
307
+ └──SOL_arealressurs_flate_p2010_v1.parquet
308
+ └──2011 /
309
+ └──SOL_Arstat_flate_p2011.parquet
310
+ └──SOL_Arstat_flate_p2011_v1.parquet
311
+ └──SSB_tettsted_flate_p2011.parquet
312
+ └──(...)
313
+ └──2012 /
314
+ └──ABAS_fylke_flate_p2012_v1.parquet
315
+ └──ABAS_fylke_linje_p2012_v1.parquet
316
+ └──ABAS_grunnkrets_flate_p2012_v1.parquet
317
+ └──(...)
318
+ └──2013 /
319
+ └──ABAS_fylke_flate_p2013_v1.parquet
320
+ └──ABAS_kommune_flate_p2013_v1.parquet
321
+ └──DEK_eiendom_flate_p2013_v1.parquet
322
+ └──(...)
323
+ └──2014 /
324
+ └──DEK_eiendom_flate_p2014_v1.parquet
325
+ └──FKB_anlegg_flate_p2014_v1.parquet
326
+ └──FKB_anlegg_linje_p2014_v1.parquet
327
+ └──(...)
328
+ └──2015 /
329
+ └──ABAS_grunnkrets_flate_p2015_v1.parquet
330
+ └──ABAS_grunnkrets_utenhav_p2015_v1.parquet
331
+ └──ABAS_kommune_flate_p2015_v1.parquet
332
+ └──(...)
333
+ └──2016 /
334
+ └──ABAS_fylke_flate_p2016_v1.parquet
335
+ └──ABAS_grunnkrets_flate_p2016_v1.parquet
336
+ └──ABAS_grunnkrets_utenhav_p2016_v1.parquet
337
+ └──(...)
338
+ └──2017 /
339
+ └──ABAS_fylke_flate_p2017_v1.parquet
340
+ └──ABAS_grunnkrets_flate_p2017_v1.parquet
341
+ └──ABAS_grunnkrets_utenhav_p2017_v1.parquet
342
+ └──(...)
343
+ └──2018 /
344
+ └──ABAS_fylke_flate_p2018_v1.parquet
345
+ └──ABAS_grunnkrets_flate_p2018_v1.parquet
346
+ └──ABAS_grunnkrets_utenhav_p2018_v1.parquet
347
+ └──(...)
348
+ └──2019 /
349
+ └──ABAS_fylke_flate_p2019_v1.parquet
350
+ └──ABAS_grunnkrets_flate_p2019_v1.parquet
351
+ └──ABAS_grunnkrets_utenhav_p2019_v1.parquet
352
+ └──(...)
353
+ └──2020 /
354
+ └──ABAS_fylke_flate_p2020_v1.parquet
355
+ └──ABAS_grunnkrets_flate_p2020_v1.parquet
356
+ └──ABAS_grunnkrets_utenhav_p2020_v1.parquet
357
+ └──(...)
358
+ └──2021 /
359
+ └──ABAS_fylke_flate_p2021_v1.parquet
360
+ └──ABAS_grunnkrets_flate_p2021_v1.parquet
361
+ └──ABAS_grunnkrets_utenhav_p2021_v1.parquet
362
+ └──(...)
363
+ └──2022 /
364
+ └──ABAS_fylke_flate_p2022_v1.parquet
365
+ └──ABAS_grunnkrets_flate_p2022_v1.parquet
366
+ └──ABAS_grunnkrets_utenhav_p2022_v1.parquet
367
+ └──(...)
368
+ └──2023 /
369
+ └──ABAS_KnrGamle_p2023_v1.parquet
370
+ └──ABAS_fylke_flate_p2023_v1.parquet
371
+ └──ABAS_grunnkrets_flate_p2023_v1.parquet
372
+ └──(...)
373
+ └──2024 /
374
+ └──ABAS_fylke_flate_p2024_v1.parquet
375
+ └──ABAS_grunnkrets_flate_p2024_v1.parquet
376
+ └──ABAS_grunnkrets_utenhav_p2024_v1.parquet
377
+ └──(...)
378
+
379
+
380
+ ## ls - få filstier, timestamp og størrelse
381
+
382
+ Med stier som kopieres (som ctrl + c) når man klikker på stien.
383
+
384
+
385
+ ```python
386
+ files_in_dir = file.parent.ls()
387
+ print(files_in_dir)
388
+ ```
389
+
390
+ timestamp mb (int)
391
+ 2024-04-19 11:44:12 11 .../ABAS_kommune_flate_p2024_v1.parquet
392
+ 2024-04-19 11:45:47 0 .../N50_JernbaneStasjon_punkt_p2024.parquet
393
+ 0 .../N50_JernbaneStasjon_punkt_p2024_v1.parquet
394
+ 0 .../N50_lufthavn_punkt_p2024.parquet
395
+ 0 .../N50_lufthavn_punkt_p2024_v1.parquet
396
+ ...
397
+ 2024-08-21 14:47:12 861 .../SSB_hav_flate_p2024.parquet
398
+ 2024-08-23 14:59:30 152 .../SSB_tettsted_flate_p2024_v1.parquet
399
+ 2024-08-23 14:59:36 152 .../SSB_tettsted_kommune_flate_p2024_v1.parquet
400
+ 2024-08-23 15:34:21 1122 .../SSB_tettsted_kommune_ringbuffer_p2024_v1.parquet
401
+ 2024-08-23 17:11:32 740 .../NVDB_veg_linje_p2024_v1.parquet
402
+ Name: path, Length: 127, dtype: object
403
+
404
+
405
+
406
+ ```python
407
+ # subclass av pandas.Series
408
+ type(files_in_dir)
409
+ ```
410
+
411
+
412
+
413
+
414
+ daplapath.path.PathSeries
415
+
416
+
417
+
418
+
419
+ ```python
420
+ print(files_in_dir.loc[lambda x: x.gb > 10].keep_latest_versions())
421
+ ```
422
+
423
+ timestamp mb (int)
424
+ 2024-07-18 00:13:09 17646 .../FKB_arealressurs_flate_p2024_v1.parquet
425
+ 2024-08-20 14:03:16 19717 .../FKB_gronnstruktur_flate_p2024_v1.parquet
426
+ Name: path, dtype: object
427
+
428
+
429
+
430
+ ```python
431
+ # stiene er fortsatt Path
432
+ type(files_in_dir.iloc[0])
433
+ ```
434
+
435
+
436
+
437
+
438
+ daplapath.path.Path
439
+
440
+
441
+
442
+
443
+ ```python
444
+ # velg ut filene
445
+ print(folder.ls().files)
446
+ ```
447
+
448
+ timestamp mb (int)
449
+ 2024-04-19 11:44:12 11 .../ABAS_kommune_flate_p2024_v1.parquet
450
+ 2024-04-19 11:45:47 0 .../N50_JernbaneStasjon_punkt_p2024.parquet
451
+ 0 .../N50_JernbaneStasjon_punkt_p2024_v1.parquet
452
+ 0 .../N50_lufthavn_punkt_p2024.parquet
453
+ 0 .../N50_lufthavn_punkt_p2024_v1.parquet
454
+ ...
455
+ 2024-08-21 14:47:12 861 .../SSB_hav_flate_p2024.parquet
456
+ 2024-08-23 14:59:30 152 .../SSB_tettsted_flate_p2024_v1.parquet
457
+ 2024-08-23 14:59:36 152 .../SSB_tettsted_kommune_flate_p2024_v1.parquet
458
+ 2024-08-23 15:34:21 1122 .../SSB_tettsted_kommune_ringbuffer_p2024_v1.parquet
459
+ 2024-08-23 17:11:32 740 .../NVDB_veg_linje_p2024_v1.parquet
460
+ Name: path, Length: 127, dtype: object
461
+
462
+
463
+
464
+ ```python
465
+ print(folder.ls().dirs)
466
+ ```
467
+
468
+ Series([], Name: path, dtype: object)
469
+
470
+
471
+
472
+ ```python
473
+ # samme som .loc med x.str.contains
474
+ print(folder.ls().containing("kommune"))
475
+ ```
476
+
477
+ timestamp mb (int)
478
+ 2024-04-19 11:44:12 11 .../ABAS_kommune_flate_p2024_v1.parquet
479
+ 2024-05-19 12:31:02 941 .../ABAS_kommune_utenhav_p2024.parquet
480
+ 2024-06-24 14:25:14 11 .../ABAS_kommune_flate_p2024.parquet
481
+ 2024-08-16 16:15:10 941 .../ABAS_kommune_utenhav_p2024_v1.parquet
482
+ 2024-08-23 14:59:36 152 .../SSB_tettsted_kommune_flate_p2024_v1.parquet
483
+ 2024-08-23 15:34:21 1122 .../SSB_tettsted_kommune_ringbuffer_p2024_v1.parquet
484
+ Name: path, dtype: object
485
+
486
+
487
+
488
+ ```python
489
+ print(file.parent.parent.ls(recursive=True).files)
490
+ ```
491
+
492
+ timestamp mb (int)
493
+ 2024-04-19 11:43:21 0 .../2022/N50_JernbaneStasjon_punkt_p2022_v1.parquet
494
+ 2024-04-19 11:43:22 0 .../2022/N50_lufthavn_punkt_p2022_v1.parquet
495
+ 2024-04-19 11:43:23 0 .../2022/NVE_Vindturbin_punkt_p2022_v1.parquet
496
+ 0 .../2022/NVE_Trafostasjon_punkt_p2022_v1.parquet
497
+ 2024-04-19 11:43:24 0 .../2022/S100_TekniskSit_flate_p2022_v1.parquet
498
+ ...
499
+ 2024-08-21 14:47:12 861 .../2024/SSB_hav_flate_p2024.parquet
500
+ 2024-08-23 14:59:30 152 .../2024/SSB_tettsted_flate_p2024_v1.parquet
501
+ 2024-08-23 14:59:36 152 .../2024/SSB_tettsted_kommune_flate_p2024_v1.parquet
502
+ 2024-08-23 15:34:21 1122 .../2024/SSB_tettsted_kommune_ringbuffer_p2024_v1.parquet
503
+ 2024-08-23 17:11:32 740 .../2024/NVDB_veg_linje_p2024_v1.parquet
504
+ Length: 1323, dtype: object
505
+
506
+
507
+ ## Write to testpath
508
+
509
+
510
+ ```python
511
+ testpath = Path('ssb-areal-data-produkt-prod/arealstat/temp/test_df_p2023_v1.parquet')
512
+
513
+ # delete files first
514
+ for version in testpath.versions():
515
+ version.rm_file()
516
+
517
+ testpath.exists()
518
+ ```
519
+
520
+
521
+
522
+
523
+ False
524
+
525
+
526
+
527
+
528
+ ```python
529
+ df = pd.DataFrame({"x": [1,2,3], "y": [*"abc"]})
530
+
531
+ dp.write_pandas(df, testpath)
532
+
533
+ testpath.exists()
534
+ ```
535
+
536
+
537
+
538
+
539
+ True
540
+
541
+
542
+
543
+
544
+ ```python
545
+ testpath.latest_version()
546
+ ```
547
+
548
+
549
+
550
+
551
+ 'ssb-areal-data-produkt-prod/arealstat/temp/test_df_p2023_v1.parquet'
552
+
553
+
554
+
555
+
556
+ ```python
557
+ # highest_numbered_version + 1
558
+ testpath.new_version()
559
+ ```
560
+
561
+
562
+ ---------------------------------------------------------------------------
563
+
564
+ ValueError Traceback (most recent call last)
565
+
566
+ Cell In[31], line 2
567
+ 1 # highest_numbered_version + 1
568
+ ----> 2 testpath.new_version()
569
+
570
+
571
+ File ~/daplapath/daplapath/path.py:805, in Path.new_version(self, timeout)
572
+ 803 time_should_be_at_least = pd.Timestamp.now() - pd.Timedelta(minutes=timeout)
573
+ 804 if timestamp[0] > time_should_be_at_least:
574
+ --> 805 raise ValueError(
575
+ 806 f"Latest version of the file was updated {timestamp[0]}, which "
576
+ 807 f"is less than the timeout period of {timeout} minutes. "
577
+ 808 "Change the timeout argument, but be sure to not save new "
578
+ 809 "versions in a loop."
579
+ 810 )
580
+ 812 return highest_numbered.add_to_version_number(1)
581
+
582
+
583
+ ValueError: Latest version of the file was updated 2024-08-28 15:09:47, which is less than the timeout period of 30 minutes. Change the timeout argument, but be sure to not save new versions in a loop.
584
+
585
+
586
+
587
+ ```python
588
+ dp.write_pandas(df, testpath.new_version(timeout=0.01))
589
+ ```
590
+
591
+
592
+ ```python
593
+ print(testpath.versions())
594
+ ```
595
+
596
+ timestamp mb (int)
597
+ 2024-08-28 15:09:47 0 ssb-areal-data-produkt-prod/arealstat/temp/test_df_p2023_v1.parquet
598
+ 2024-08-28 15:09:52 0 ssb-areal-data-produkt-prod/arealstat/temp/test_df_p2023_v2.parquet
599
+ dtype: object
600
+
601
+
602
+
603
+ ```python
604
+
605
+ ```
606
+