climate_ref-0.5.0-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (44)
  1. climate_ref/__init__.py +30 -0
  2. climate_ref/_config_helpers.py +214 -0
  3. climate_ref/alembic.ini +114 -0
  4. climate_ref/cli/__init__.py +138 -0
  5. climate_ref/cli/_utils.py +68 -0
  6. climate_ref/cli/config.py +28 -0
  7. climate_ref/cli/datasets.py +205 -0
  8. climate_ref/cli/executions.py +201 -0
  9. climate_ref/cli/providers.py +84 -0
  10. climate_ref/cli/solve.py +23 -0
  11. climate_ref/config.py +475 -0
  12. climate_ref/constants.py +8 -0
  13. climate_ref/database.py +223 -0
  14. climate_ref/dataset_registry/obs4ref_reference.txt +2 -0
  15. climate_ref/dataset_registry/sample_data.txt +60 -0
  16. climate_ref/datasets/__init__.py +40 -0
  17. climate_ref/datasets/base.py +214 -0
  18. climate_ref/datasets/cmip6.py +202 -0
  19. climate_ref/datasets/obs4mips.py +224 -0
  20. climate_ref/datasets/pmp_climatology.py +15 -0
  21. climate_ref/datasets/utils.py +16 -0
  22. climate_ref/executor/__init__.py +274 -0
  23. climate_ref/executor/local.py +89 -0
  24. climate_ref/migrations/README +22 -0
  25. climate_ref/migrations/env.py +139 -0
  26. climate_ref/migrations/script.py.mako +26 -0
  27. climate_ref/migrations/versions/2025-05-02T1418_341a4aa2551e_regenerate.py +292 -0
  28. climate_ref/models/__init__.py +33 -0
  29. climate_ref/models/base.py +42 -0
  30. climate_ref/models/dataset.py +206 -0
  31. climate_ref/models/diagnostic.py +61 -0
  32. climate_ref/models/execution.py +306 -0
  33. climate_ref/models/metric_value.py +195 -0
  34. climate_ref/models/provider.py +39 -0
  35. climate_ref/provider_registry.py +146 -0
  36. climate_ref/py.typed +0 -0
  37. climate_ref/solver.py +395 -0
  38. climate_ref/testing.py +109 -0
  39. climate_ref-0.5.0.dist-info/METADATA +97 -0
  40. climate_ref-0.5.0.dist-info/RECORD +44 -0
  41. climate_ref-0.5.0.dist-info/WHEEL +4 -0
  42. climate_ref-0.5.0.dist-info/entry_points.txt +2 -0
  43. climate_ref-0.5.0.dist-info/licenses/LICENCE +201 -0
  44. climate_ref-0.5.0.dist-info/licenses/NOTICE +3 -0
climate_ref/dataset_registry/sample_data.txt
@@ -0,0 +1,60 @@
1
+ CMIP6/C4MIP/CSIRO/ACCESS-ESM1-5/esm-1pct-brch-1000PgC/r1i1p1f1/Amon/tas/gn/v20191206/tas_Amon_ACCESS-ESM1-5_esm-1pct-brch-1000PgC_r1i1p1f1_gn_016801-026812.nc 32f8e310d1da60516972d509419a7f81709e6f3e60fd3923c2a212ec06a62af5
2
+ CMIP6/C4MIP/CSIRO/ACCESS-ESM1-5/esm-1pct-brch-1000PgC/r1i1p1f1/fx/areacella/gn/v20191206/areacella_fx_ACCESS-ESM1-5_esm-1pct-brch-1000PgC_r1i1p1f1_gn.nc 826b3f5bcf9cd320ad302780eeb9f704e1090188be1090cdfa91778e2352202c
3
+ CMIP6/C4MIP/MPI-M/MPI-ESM1-2-LR/esm-1pctCO2/r1i1p1f1/Amon/fco2antt/gn/v20190815/fco2antt_Amon_MPI-ESM1-2-LR_esm-1pctCO2_r1i1p1f1_gn_185001-186912.nc c18f42666e91852cb82d03e4d8d764dd4e12dab1776f70a5527621d3d6e05d8c
4
+ CMIP6/C4MIP/MPI-M/MPI-ESM1-2-LR/esm-1pctCO2/r1i1p1f1/Amon/fco2antt/gn/v20190815/fco2antt_Amon_MPI-ESM1-2-LR_esm-1pctCO2_r1i1p1f1_gn_187001-188912.nc a3fbd25e8069b6ac612ef79e07551b79dee6a96640294be3f5521223bf372add
5
+ CMIP6/C4MIP/MPI-M/MPI-ESM1-2-LR/esm-1pctCO2/r1i1p1f1/Amon/fco2antt/gn/v20190815/fco2antt_Amon_MPI-ESM1-2-LR_esm-1pctCO2_r1i1p1f1_gn_189001-190912.nc e5cede6894466d3229f7d4abd135065f930f73803f17cbb23ddc673b3aeead6d
6
+ CMIP6/C4MIP/MPI-M/MPI-ESM1-2-LR/esm-1pctCO2/r1i1p1f1/Amon/fco2antt/gn/v20190815/fco2antt_Amon_MPI-ESM1-2-LR_esm-1pctCO2_r1i1p1f1_gn_191001-191412.nc 3739b234b0733ae812aee17d0ffeb2fa03b2293bb2f13e789e3c266e2e7b9c9f
7
+ CMIP6/C4MIP/MPI-M/MPI-ESM1-2-LR/esm-1pctCO2/r1i1p1f1/Amon/tas/gn/v20190815/tas_Amon_MPI-ESM1-2-LR_esm-1pctCO2_r1i1p1f1_gn_185001-186912.nc 14ab558cd768d22d6f78990327d24d4369647a73fe508723946a00532896a43f
8
+ CMIP6/C4MIP/MPI-M/MPI-ESM1-2-LR/esm-1pctCO2/r1i1p1f1/Amon/tas/gn/v20190815/tas_Amon_MPI-ESM1-2-LR_esm-1pctCO2_r1i1p1f1_gn_187001-188912.nc d7086d4bd6d7934f8f194da01ad668840e8be1e5c1fa5056e81be410d3571856
9
+ CMIP6/C4MIP/MPI-M/MPI-ESM1-2-LR/esm-1pctCO2/r1i1p1f1/Amon/tas/gn/v20190815/tas_Amon_MPI-ESM1-2-LR_esm-1pctCO2_r1i1p1f1_gn_189001-190912.nc b0556ce505d39d62d01d4a62933476dbc001715e2a331edcba3a6170db075d89
10
+ CMIP6/C4MIP/MPI-M/MPI-ESM1-2-LR/esm-1pctCO2/r1i1p1f1/Amon/tas/gn/v20190815/tas_Amon_MPI-ESM1-2-LR_esm-1pctCO2_r1i1p1f1_gn_191001-191412.nc 46e8f4e5b5f2ff9cc018d6713a0e8d4666925e803a0e17886ff7137006791c04
11
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_1pctCO2_r1i1p1f1_gn_010101-025012.nc d29c1e0651d6c179ad5dc8ac8961ade5cd9456f506d9743255664e44360bac62
12
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/1pctCO2/r1i1p1f1/fx/areacella/gn/v20191115/areacella_fx_ACCESS-ESM1-5_1pctCO2_r1i1p1f1_gn.nc 07ae2f59188889030a7c453bca5f8c6a19f22f1b544b3987ba50a7a4f306c82d
13
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/abrupt-4xCO2/r1i1p1f1/Amon/rlut/gn/v20191115/rlut_Amon_ACCESS-ESM1-5_abrupt-4xCO2_r1i1p1f1_gn_010101-012512.nc 0da7114197033589e61a7ed6f53412e0727b540e5da9d1b7ed6a51ee2a4629c6
14
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/abrupt-4xCO2/r1i1p1f1/Amon/rsdt/gn/v20191115/rsdt_Amon_ACCESS-ESM1-5_abrupt-4xCO2_r1i1p1f1_gn_010101-012512.nc 3b00f242368a30fabe0db6a8789cf06cacaa0a3ff3726ade731f2ee488a751c5
15
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/abrupt-4xCO2/r1i1p1f1/Amon/rsut/gn/v20191115/rsut_Amon_ACCESS-ESM1-5_abrupt-4xCO2_r1i1p1f1_gn_010101-012512.nc 4303d70390ef0e1dac94cbe4cf6354e452cda9a7892eb06dd6c6b834ff09bd86
16
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/abrupt-4xCO2/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_abrupt-4xCO2_r1i1p1f1_gn_010101-012512.nc 0c71cbeb2667a00a452cd7acb2380a162b90abbc5413d31c103941b3bde1882a
17
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/abrupt-4xCO2/r1i1p1f1/fx/areacella/gn/v20191115/areacella_fx_ACCESS-ESM1-5_abrupt-4xCO2_r1i1p1f1_gn.nc 3a1846b06105c44c93d4612518fc7f068e67a115f69b21b6cd81225fe82e4f60
18
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/pr/gn/v20191115/pr_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_185001-201412.nc acc821dc400f53166379d2e23095bc2690d7ca7db6c7a6f88ae29a8771b3c65a
19
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/psl/gn/v20191115/psl_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc b63a3d4051cf17568df808836b189826da580ca8e1db949b1e93a71c80756c8d
20
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/rlut/gn/v20191115/rlut_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc 44a3c90a41744101afb00344f50947fe46444fe5d6bd3623c0c19aa02a378c86
21
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/rsdt/gn/v20191115/rsdt_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc a4e1fc3a4a4d00c2fa18ec616338426eb3d91165db3bc57e565ffdc8d6bd9d34
22
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/rsut/gn/v20191115/rsut_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc 8d492ef1f2bb654220fe64977d9942a33af0962ee9afa4017dcc75b6f0103015
23
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/tas/gn/v20191115/tas_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_185001-201412.nc 38e055e57aea5a9ae76ed3fc5325be6783b5694a9edc28aafd24dd462b32e5ce
24
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Amon/ts/gn/v20191115/ts_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc e02530449c92e0ffc72e9edeba57f5d38ab8652a28486c1c2b9ddada1f38fbd9
25
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Lmon/gpp/gn/v20191115/gpp_Lmon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc da36ed1653f7aafe40a4fc9b99004a46cb45231697ce6b3413dfc171980c37df
26
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Ofx/areacello/gn/v20191115/areacello_Ofx_ACCESS-ESM1-5_historical_r1i1p1f1_gn.nc 6808b64c7328bd118537bfb7cfd35748b4e84cae3f6a5586403aa9d8040e4d0b
27
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/Omon/tos/gn/v20191115/tos_Omon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-201412.nc 94121a2233aff78ef8799c5d59b6b6f3e7d3f2fb7ceb3a4a1645943ef3e88040
28
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/SImon/siconc/gn/v20200817/siconc_SImon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_197901-201412.nc 886c62a24797e893fe11b8de4d16c8a277bdee931b692d533f2fb3fa39820aa1
29
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/fx/areacella/gn/v20191115/areacella_fx_ACCESS-ESM1-5_historical_r1i1p1f1_gn.nc 4587a7b0110a226a805f681ee9fe456d20ec310302b2c120334c21595e4e96cb
30
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/fx/sftlf/gn/v20191115/sftlf_fx_ACCESS-ESM1-5_historical_r1i1p1f1_gn.nc bf9b256dc97cf72956ed14880e5e85a10e57f10956fb66e21192ec4cfffbdca7
31
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r2i1p1f1/Amon/psl/gn/v20191128/psl_Amon_ACCESS-ESM1-5_historical_r2i1p1f1_gn_200001-201412.nc 0abba30a4b21f1ddd4273eb8684e42a23a217d8be2360f6cfc348f89ebe0237e
32
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r2i1p1f1/Amon/ts/gn/v20191128/ts_Amon_ACCESS-ESM1-5_historical_r2i1p1f1_gn_200001-201412.nc 81c8e83d405ef92cb149a272386b89d2f135d2b620c64924ff77ed55f217fb5a
33
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r2i1p1f1/fx/areacella/gn/v20191128/areacella_fx_ACCESS-ESM1-5_historical_r2i1p1f1_gn.nc dc449fdd4a7677e681b79ce185a95f92be3620036eca0a6b6e8a2ab320d199c4
34
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/rlut/gn/v20210316/rlut_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc 375990b89a38ab390826d3c3efeef4e9295299164eba119e4545165079b86942
35
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/rsdt/gn/v20210316/rsdt_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc e647cd4f0cb0ff9e2727f1a5f8a636ddad6c62bded06c415d28f6d1c0632c471
36
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/rsut/gn/v20210316/rsut_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-012512.nc 63cc0aa1927ded178e79f836ac9f2a058ca96b4cf901339754440bf9a0c55d04
37
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Amon/tas/gn/v20210316/tas_Amon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-018012.nc 28267d35d304d3f3d4bb222eb2a0631a951ed3aaa626e3d0364f83e9ad6e0554
38
+ CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/fx/areacella/gn/v20210316/areacella_fx_ACCESS-ESM1-5_piControl_r1i1p1f1_gn.nc 0eeabbcf35b548cb943e3f45befadf8c4c605e1ad097996cd04cf95ea073b706
39
+ CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/esm-piControl/r1i1p1f1/Amon/tas/gn/v20190815/tas_Amon_MPI-ESM1-2-LR_esm-piControl_r1i1p1f1_gn_185001-186912.nc 98dc5c8453e98e008b63b73a3004d984644d45ceaad9776534693f209e96deed
40
+ CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/esm-piControl/r1i1p1f1/Amon/tas/gn/v20190815/tas_Amon_MPI-ESM1-2-LR_esm-piControl_r1i1p1f1_gn_187001-188912.nc 7d6b9aeaeb3b77dfee3a9239dfd49db5e6525582d7080251072f07208c560caf
41
+ CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/esm-piControl/r1i1p1f1/Amon/tas/gn/v20190815/tas_Amon_MPI-ESM1-2-LR_esm-piControl_r1i1p1f1_gn_189001-190912.nc b712699e038f83bf9ea72acb111857ebd3de01a5c6c2f34f5573d998a08f9d17
42
+ CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/esm-piControl/r1i1p1f1/Amon/tas/gn/v20190815/tas_Amon_MPI-ESM1-2-LR_esm-piControl_r1i1p1f1_gn_191001-191512.nc 55b2f09622e999f172b26cc4fbe35ab75cb4a52677f8f4795777604ace566f79
43
+ CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/esm-piControl/r1i1p1f1/fx/areacella/gn/v20190815/areacella_fx_MPI-ESM1-2-LR_esm-piControl_r1i1p1f1_gn.nc 048a705763d6fb08376539095e1092291278d9ad8ada7d3ccc431ae682000f03
44
+ CMIP6/DAMIP/CSIRO/ACCESS-ESM1-5/hist-GHG/r1i1p1f1/Amon/psl/gn/v20200615/psl_Amon_ACCESS-ESM1-5_hist-GHG_r1i1p1f1_gn_200001-202012.nc f5474029b4fc0822382b8024b0d074fc9f706b5f8b9fc1b86c8516b020bfcd1c
45
+ CMIP6/DAMIP/CSIRO/ACCESS-ESM1-5/hist-GHG/r1i1p1f1/Amon/ts/gn/v20200615/ts_Amon_ACCESS-ESM1-5_hist-GHG_r1i1p1f1_gn_200001-202012.nc 90cfaa8bbb4d1207e35a1a8ae2c535d756b3efa6996b53446104e7c6d920260f
46
+ CMIP6/DAMIP/CSIRO/ACCESS-ESM1-5/hist-GHG/r1i1p1f1/fx/areacella/gn/v20200615/areacella_fx_ACCESS-ESM1-5_hist-GHG_r1i1p1f1_gn.nc 10bb46e5bf3da739389f17239a5d128aa9fd419fdef7e96fc1e9ba50a49e0efb
47
+ CMIP6/DAMIP/CSIRO/ACCESS-ESM1-5/hist-GHG/r2i1p1f1/Amon/psl/gn/v20200615/psl_Amon_ACCESS-ESM1-5_hist-GHG_r2i1p1f1_gn_200001-202012.nc 85a58bb08630b87bc0aecf65f064a6b5d8355b07aa01e5f6517e8f356eb55c0c
48
+ CMIP6/DAMIP/CSIRO/ACCESS-ESM1-5/hist-GHG/r2i1p1f1/Amon/ts/gn/v20200615/ts_Amon_ACCESS-ESM1-5_hist-GHG_r2i1p1f1_gn_200001-202012.nc 5bcaa3c3f887baf1cf3edc9c891fa5c600489ce23983136cbd98226eeab46822
49
+ CMIP6/DAMIP/CSIRO/ACCESS-ESM1-5/hist-GHG/r2i1p1f1/fx/areacella/gn/v20200615/areacella_fx_ACCESS-ESM1-5_hist-GHG_r2i1p1f1_gn.nc f5d513c3ebdbe88320794a36cbd229a68aab4ff0a13e240072e8b5ca09410ea7
50
+ CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/pr/gn/v20210318/pr_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-210012.nc 0fe1c4b7c49ce1d7e7213c5bb5ea7b2597f68aef50f1795deefe07a5bafbc67c
51
+ CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsdt/gn/v20210318/rsdt_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc 3afba9008a6b334d2bc44b4038b012ae1eca95ab1c886936a7d07bbb2070a9c8
52
+ CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/rsut/gn/v20210318/rsut_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc b6f624150e1bfe987d10ef750b9ae72e2486927496285defc2a686ffaa5387bc
53
+ CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Amon/tas/gn/v20210318/tas_Amon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-210012.nc 640678c83d60c562651fa409f09df8bb7ce560576938fdfd7c932ea10e585db6
54
+ CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/Omon/tos/gn/v20210318/tos_Omon_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn_201501-202512.nc 31a85fade7f921d2650fbcd43f3886f7111d64e65d9c9b32d61e184efdd042bc
55
+ CMIP6/ScenarioMIP/CSIRO/ACCESS-ESM1-5/ssp126/r1i1p1f1/fx/areacella/gn/v20210318/areacella_fx_ACCESS-ESM1-5_ssp126_r1i1p1f1_gn.nc b5ed05309c4a3000b551b1548d88cf1b910ad23347bc39f0094a935e26d3afe6
56
+ obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/taNobs_AIRS-2-1_gn_200209-201609.nc 3489895fc6cdd936ae64fa64fa221474e50f6b6bf347458c82d9a61f945f2d9d
57
+ obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/taStderr_AIRS-2-1_gn_200209-201609.nc 81e12ba5c6b058ace93737a3b69b317d2beb17e07fd6aa9f709b3e528ebfb4a2
58
+ obs4MIPs/NASA-JPL/AIRS-2-1/ta/gn/v20201110/ta_AIRS-2-1_gn_200209-201609.nc a72d7172cd0c9df9eb0199082b196655490e5628fbb6a61ed1e7f8f83c610c0b
59
+ obs4REF/obs4MIPs_PCMDI_monthly/MOHC/HadISST-1-1/mon/ts/gn/v20210727/ts_mon_HadISST-1-1_PCMDI_gn_187001-201907.nc 4f9a9270d001fc30488b49cdafe28e77db88e78e981ab580f0fae209f849a2da
60
+ obs4REF/obs4MIPs_PCMDI_monthly/NOAA-ESRL-PSD/20CR/mon/psl/gn/v20210727/psl_mon_20CR_PCMDI_gn_187101-201212.nc 357e8915cc2ad30af1dd02cbecfb55f3083c13f54a11912e2f28396ccc84bd9c
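Each registry line above pairs a relative dataset path with its SHA-256 checksum, the same `path hash` convention used by pooch-style data registries. This diff does not show how climate-ref itself downloads these files, so the sketch below is only an illustration of how such a registry file can be consumed with pooch; the base URL is a placeholder, not the real data location.

```python
import pooch

# Placeholder base URL -- substitute the actual host of the sample data.
SAMPLE_DATA_URL = "https://example.org/climate-ref-sample-data/"

# Build a fetcher whose registry is the path -> sha256 mapping shown above
# (the file ships in the package as climate_ref/dataset_registry/sample_data.txt).
fetcher = pooch.create(
    path=pooch.os_cache("climate_ref_sample_data"),
    base_url=SAMPLE_DATA_URL,
    registry=None,
)
fetcher.load_registry("sample_data.txt")

# Download a single file on demand; pooch verifies it against the SHA-256 above.
local_path = fetcher.fetch(
    "CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r1i1p1f1/fx/sftlf/gn/v20191115/"
    "sftlf_fx_ACCESS-ESM1-5_historical_r1i1p1f1_gn.nc"
)
print(local_path)
```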
climate_ref/datasets/__init__.py
@@ -0,0 +1,40 @@
1
+ """
2
+ Dataset handling utilities
3
+ """
4
+
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ from climate_ref_core.datasets import SourceDatasetType
8
+
9
+ if TYPE_CHECKING:
10
+ from climate_ref.datasets.base import DatasetAdapter
11
+
12
+
13
+ def get_dataset_adapter(source_type: str, **kwargs: Any) -> "DatasetAdapter":
14
+ """
15
+ Get the appropriate adapter for the specified source type
16
+
17
+ Parameters
18
+ ----------
19
+ source_type
20
+ Type of source dataset
21
+
22
+ Returns
23
+ -------
24
+ :
25
+ DatasetAdapter instance
26
+ """
27
+ if source_type.lower() == SourceDatasetType.CMIP6.value:
28
+ from climate_ref.datasets.cmip6 import CMIP6DatasetAdapter
29
+
30
+ return CMIP6DatasetAdapter(**kwargs)
31
+ elif source_type.lower() == SourceDatasetType.obs4MIPs.value.lower():
32
+ from climate_ref.datasets.obs4mips import Obs4MIPsDatasetAdapter
33
+
34
+ return Obs4MIPsDatasetAdapter(**kwargs)
35
+ elif source_type.lower() == SourceDatasetType.PMPClimatology.value.lower():
36
+ from climate_ref.datasets.pmp_climatology import PMPClimatologyDatasetAdapter
37
+
38
+ return PMPClimatologyDatasetAdapter(**kwargs)
39
+ else:
40
+ raise ValueError(f"Unknown source type: {source_type}")
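`get_dataset_adapter` is a small factory keyed on `SourceDatasetType` that defers the heavier adapter imports until they are actually needed and forwards any keyword arguments to the adapter's constructor. A minimal usage sketch, assuming the enum values match their names case-insensitively (as the lower-cased comparisons above suggest):

```python
from climate_ref.datasets import get_dataset_adapter

# Keyword arguments are forwarded to the adapter's constructor;
# the CMIP6 adapter (shown later in this diff) accepts n_jobs for parallel parsing.
cmip6_adapter = get_dataset_adapter("cmip6", n_jobs=4)
obs4mips_adapter = get_dataset_adapter("obs4mips")

# Unknown source types raise a ValueError.
try:
    get_dataset_adapter("not-a-source")
except ValueError as exc:
    print(exc)
```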
climate_ref/datasets/base.py
@@ -0,0 +1,214 @@
1
+ from pathlib import Path
2
+ from typing import Protocol
3
+
4
+ import pandas as pd
5
+ from loguru import logger
6
+ from sqlalchemy.orm import joinedload
7
+
8
+ from climate_ref.config import Config
9
+ from climate_ref.database import Database
10
+ from climate_ref.datasets.utils import validate_path
11
+ from climate_ref.models.dataset import Dataset, DatasetFile
12
+ from climate_ref_core.exceptions import RefException
13
+
14
+
15
+ def _log_duplicate_metadata(
16
+ data_catalog: pd.DataFrame, unique_metadata: pd.DataFrame, slug_column: str
17
+ ) -> None:
18
+ # Drop out the rows where the values are the same
19
+ invalid_datasets = unique_metadata[unique_metadata.gt(1).any(axis=1)]
20
+ # Drop out the columns where the values are the same
21
+ invalid_datasets = invalid_datasets[invalid_datasets.columns[invalid_datasets.gt(1).any(axis=0)]]
22
+
23
+ for instance_id in invalid_datasets.index:
24
+ # Get the columns where the values are different
25
+ invalid_dataset_nunique = invalid_datasets.loc[instance_id]
26
+ invalid_dataset_columns = invalid_dataset_nunique[invalid_dataset_nunique.gt(1)].index.tolist()
27
+
28
+ # Include time_range in the list of invalid columns to make debugging easier
29
+ invalid_dataset_columns.append("time_range")
30
+
31
+ data_catalog_subset = data_catalog[data_catalog[slug_column] == instance_id]
32
+
33
+ logger.error(
34
+ f"Dataset {instance_id} has varying metadata:\n{data_catalog_subset[invalid_dataset_columns]}"
35
+ )
36
+
37
+
38
+ class DatasetAdapter(Protocol):
39
+ """
40
+ An adapter to provide a common interface for different dataset types
41
+
42
+ This allows the same code to work with different dataset types.
43
+ """
44
+
45
+ dataset_cls: type[Dataset]
46
+ slug_column: str
47
+ dataset_specific_metadata: tuple[str, ...]
48
+ file_specific_metadata: tuple[str, ...] = ()
49
+
50
+ def pretty_subset(self, data_catalog: pd.DataFrame) -> pd.DataFrame:
51
+ """
52
+ Get a subset of the data_catalog to pretty print
53
+ """
54
+ ...
55
+
56
+ def find_local_datasets(self, file_or_directory: Path) -> pd.DataFrame:
57
+ """
58
+ Generate a data catalog from the specified file or directory
59
+
60
+ This data catalog should contain all the metadata needed by the database.
61
+ The index of the data catalog should be the dataset slug.
62
+ """
63
+ ...
64
+
65
+ def validate_data_catalog(self, data_catalog: pd.DataFrame, skip_invalid: bool = False) -> pd.DataFrame:
66
+ """
67
+ Validate a data catalog
68
+
69
+ Parameters
70
+ ----------
71
+ data_catalog
72
+ Data catalog to validate
73
+ skip_invalid
74
+ If True, ignore datasets with invalid metadata and remove them from the resulting data catalog.
75
+
76
+ Raises
77
+ ------
78
+ ValueError
79
+ If `skip_invalid` is False (default) and the data catalog contains validation errors.
80
+
81
+ Returns
82
+ -------
83
+ :
84
+ Validated data catalog
85
+ """
86
+ # Check if the data catalog contains the required columns
87
+ missing_columns = set(self.dataset_specific_metadata + self.file_specific_metadata) - set(
88
+ data_catalog.columns
89
+ )
90
+ if missing_columns:
91
+ raise ValueError(f"Data catalog is missing required columns: {missing_columns}")
92
+
93
+ # Verify that the dataset specific columns don't vary by dataset by counting the unique values
94
+ # for each dataset and checking if there are any that have more than one unique value.
95
+ unique_metadata = (
96
+ data_catalog[list(self.dataset_specific_metadata)].groupby(self.slug_column).nunique()
97
+ )
98
+ if unique_metadata.gt(1).any(axis=1).any():
99
+ _log_duplicate_metadata(data_catalog, unique_metadata, self.slug_column)
100
+
101
+ if skip_invalid:
102
+ data_catalog = data_catalog[
103
+ ~data_catalog[self.slug_column].isin(
104
+ unique_metadata[unique_metadata.gt(1).any(axis=1)].index
105
+ )
106
+ ]
107
+ else:
108
+ raise ValueError("Dataset specific metadata varies by dataset")
109
+
110
+ return data_catalog
111
+
112
+ def register_dataset(
113
+ self, config: Config, db: Database, data_catalog_dataset: pd.DataFrame
114
+ ) -> Dataset | None:
115
+ """
116
+ Register a dataset in the database using the data catalog
117
+
118
+ Parameters
119
+ ----------
120
+ config
121
+ Configuration object
122
+ db
123
+ Database instance
124
+ data_catalog_dataset
125
+ A subset of the data catalog containing the metadata for a single dataset
126
+
127
+ Returns
128
+ -------
129
+ :
130
+ Registered dataset if successful, else None
131
+ """
132
+ DatasetModel = self.dataset_cls
133
+
134
+ self.validate_data_catalog(data_catalog_dataset)
135
+ unique_slugs = data_catalog_dataset[self.slug_column].unique()
136
+ if len(unique_slugs) != 1:
137
+ raise RefException(f"Found multiple datasets in the same directory: {unique_slugs}")
138
+ slug = unique_slugs[0]
139
+
140
+ dataset_metadata = data_catalog_dataset[list(self.dataset_specific_metadata)].iloc[0].to_dict()
141
+ dataset, created = db.get_or_create(DatasetModel, slug=slug, **dataset_metadata)
142
+ if not created:
143
+ logger.warning(f"{dataset} already exists in the database. Skipping")
144
+ return None
145
+ db.session.flush()
146
+ for dataset_file in data_catalog_dataset.to_dict(orient="records"):
147
+ path = validate_path(dataset_file.pop("path"))
148
+
149
+ db.session.add(
150
+ DatasetFile(
151
+ path=str(path),
152
+ dataset_id=dataset.id,
153
+ start_time=dataset_file.pop("start_time"),
154
+ end_time=dataset_file.pop("end_time"),
155
+ )
156
+ )
157
+ return dataset
158
+
159
+ def load_catalog(
160
+ self, db: Database, include_files: bool = True, limit: int | None = None
161
+ ) -> pd.DataFrame:
162
+ """
163
+ Load the data catalog containing the currently tracked datasets/files from the database
164
+
165
+ Iterating over different datasets within the data catalog can be done using a `groupby`
166
+ operation for the `instance_id` column.
167
+
168
+ The index of the data catalog is the primary key of the dataset.
169
+ This should be maintained during any processing.
170
+
171
+ Returns
172
+ -------
173
+ :
174
+ Data catalog containing the metadata for the currently ingested datasets
175
+ """
176
+ DatasetModel = self.dataset_cls
177
+ dataset_type = DatasetModel.__mapper_args__["polymorphic_identity"]
178
+ # TODO: Paginate this query to avoid loading all the data at once
179
+ if include_files:
180
+ result = (
181
+ db.session.query(DatasetFile)
182
+ # The join is necessary to be able to order by the dataset columns
183
+ .join(DatasetFile.dataset)
184
+ .where(Dataset.dataset_type == dataset_type)
185
+ # The joinedload is necessary to avoid N+1 queries (one for each dataset)
186
+ # https://docs.sqlalchemy.org/en/14/orm/loading_relationships.html#the-zen-of-joined-eager-loading
187
+ .options(joinedload(DatasetFile.dataset.of_type(DatasetModel)))
188
+ .order_by(Dataset.updated_at.desc())
189
+ .limit(limit)
190
+ .all()
191
+ )
192
+
193
+ return pd.DataFrame(
194
+ [
195
+ {
196
+ **{k: getattr(file, k) for k in self.file_specific_metadata},
197
+ **{k: getattr(file.dataset, k) for k in self.dataset_specific_metadata},
198
+ }
199
+ for file in result
200
+ ],
201
+ index=[file.dataset.id for file in result],
202
+ )
203
+ else:
204
+ result_datasets = (
205
+ db.session.query(DatasetModel).order_by(Dataset.updated_at.desc()).limit(limit).all()
206
+ )
207
+
208
+ return pd.DataFrame(
209
+ [
210
+ {k: getattr(dataset, k) for k in self.dataset_specific_metadata}
211
+ for dataset in result_datasets
212
+ ],
213
+ index=[file.id for file in result_datasets],
214
+ )
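The core of `validate_data_catalog` is the groupby/nunique check: within a single dataset slug, every dataset-specific column must take exactly one value across all of that dataset's files. A minimal, self-contained sketch of the same check on a toy catalog with made-up values:

```python
import pandas as pd

# Toy catalog: two files share the slug "ds-a" but disagree on grid_label.
catalog = pd.DataFrame(
    {
        "instance_id": ["ds-a", "ds-a", "ds-b"],
        "grid_label": ["gn", "gr", "gn"],
        "source_id": ["MODEL-1", "MODEL-1", "MODEL-2"],
        "path": ["a_1.nc", "a_2.nc", "b_1.nc"],
    }
)

dataset_specific = ["instance_id", "grid_label", "source_id"]

# Count unique values of each dataset-level column per dataset...
unique_metadata = catalog[dataset_specific].groupby("instance_id").nunique()

# ...and flag any dataset where a dataset-level column varies between files.
invalid = unique_metadata[unique_metadata.gt(1).any(axis=1)]
print(invalid)  # "ds-a" is flagged: grid_label has 2 unique values

# With skip_invalid=True the offending datasets are dropped from the catalog.
clean = catalog[~catalog["instance_id"].isin(invalid.index)]
print(clean)
```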
climate_ref/datasets/cmip6.py
@@ -0,0 +1,202 @@
1
+ from __future__ import annotations
2
+
3
+ import warnings
4
+ from datetime import datetime
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import ecgtools.parsers
9
+ import pandas as pd
10
+ from ecgtools import Builder
11
+
12
+ from climate_ref.datasets.base import DatasetAdapter
13
+ from climate_ref.models.dataset import CMIP6Dataset
14
+
15
+
16
+ def _parse_datetime(dt_str: pd.Series[str]) -> pd.Series[datetime | Any]:
17
+ """
18
+ Pandas tries to coerce everything to their own datetime format, which is not what we want here.
19
+ """
20
+
21
+ def _inner(date_string: str | None) -> datetime | None:
22
+ if not date_string:
23
+ return None
24
+
25
+ # Try to parse the date string with and without milliseconds
26
+ try:
27
+ dt = datetime.strptime(date_string, "%Y-%m-%d %H:%M:%S")
28
+ except ValueError:
29
+ dt = datetime.strptime(date_string, "%Y-%m-%d %H:%M:%S.%f")
30
+
31
+ return dt
32
+
33
+ return pd.Series(
34
+ [_inner(dt) for dt in dt_str],
35
+ index=dt_str.index,
36
+ dtype="object",
37
+ )
38
+
39
+
40
+ def _apply_fixes(data_catalog: pd.DataFrame) -> pd.DataFrame:
41
+ def _fix_parent_variant_label(group: pd.DataFrame) -> pd.DataFrame:
42
+ if group["parent_variant_label"].nunique() == 1:
43
+ return group
44
+ group["parent_variant_label"] = group["variant_label"].iloc[0]
45
+
46
+ return group
47
+
48
+ data_catalog = (
49
+ data_catalog.groupby("instance_id")
50
+ .apply(_fix_parent_variant_label, include_groups=False)
51
+ .reset_index(level="instance_id")
52
+ )
53
+
54
+ if "branch_time_in_child" in data_catalog:
55
+ data_catalog["branch_time_in_child"] = _clean_branch_time(data_catalog["branch_time_in_child"])
56
+ if "branch_time_in_parent" in data_catalog:
57
+ data_catalog["branch_time_in_parent"] = _clean_branch_time(data_catalog["branch_time_in_parent"])
58
+
59
+ return data_catalog
60
+
61
+
62
+ def _clean_branch_time(branch_time: pd.Series[str]) -> pd.Series[float]:
63
+ # EC-Earth3 uses "D" as a suffix for the branch_time_in_child and branch_time_in_parent columns
64
+ # Handle missing values (these result in nan values)
65
+ return pd.to_numeric(branch_time.astype(str).str.replace("D", ""), errors="coerce")
66
+
67
+
68
+ class CMIP6DatasetAdapter(DatasetAdapter):
69
+ """
70
+ Adapter for CMIP6 datasets
71
+ """
72
+
73
+ dataset_cls = CMIP6Dataset
74
+ slug_column = "instance_id"
75
+
76
+ dataset_specific_metadata = (
77
+ "activity_id",
78
+ "branch_method",
79
+ "branch_time_in_child",
80
+ "branch_time_in_parent",
81
+ "experiment",
82
+ "experiment_id",
83
+ "frequency",
84
+ "grid",
85
+ "grid_label",
86
+ "institution_id",
87
+ "nominal_resolution",
88
+ "parent_activity_id",
89
+ "parent_experiment_id",
90
+ "parent_source_id",
91
+ "parent_time_units",
92
+ "parent_variant_label",
93
+ "product",
94
+ "realm",
95
+ "source_id",
96
+ "source_type",
97
+ "sub_experiment",
98
+ "sub_experiment_id",
99
+ "table_id",
100
+ "variable_id",
101
+ "variant_label",
102
+ "member_id",
103
+ "vertical_levels",
104
+ "version",
105
+ # Variable identifiers
106
+ "standard_name",
107
+ "long_name",
108
+ "units",
109
+ slug_column,
110
+ )
111
+
112
+ file_specific_metadata = ("start_time", "end_time", "path")
113
+
114
+ def __init__(self, n_jobs: int = 1):
115
+ self.n_jobs = n_jobs
116
+
117
+ def pretty_subset(self, data_catalog: pd.DataFrame) -> pd.DataFrame:
118
+ """
119
+ Get a subset of the data_catalog to pretty print
120
+
121
+ This is particularly useful for CMIP6 datasets, which have a lot of metadata columns.
122
+
123
+ Parameters
124
+ ----------
125
+ data_catalog
126
+ Data catalog to subset
127
+
128
+ Returns
129
+ -------
130
+ :
131
+ Subset of the data catalog to pretty print
132
+
133
+ """
134
+ return data_catalog[
135
+ [
136
+ "activity_id",
137
+ "institution_id",
138
+ "source_id",
139
+ "experiment_id",
140
+ "member_id",
141
+ "table_id",
142
+ "variable_id",
143
+ "grid_label",
144
+ "version",
145
+ ]
146
+ ]
147
+
148
+ def find_local_datasets(self, file_or_directory: Path) -> pd.DataFrame:
149
+ """
150
+ Generate a data catalog from the specified file or directory
151
+
152
+ Each dataset may contain multiple files, which are represented as rows in the data catalog.
153
+ Each dataset has a unique identifier, which is in `slug_column`.
154
+
155
+ Parameters
156
+ ----------
157
+ file_or_directory
158
+ File or directory containing the datasets
159
+
160
+ Returns
161
+ -------
162
+ :
163
+ Data catalog containing the metadata for the dataset
164
+ """
165
+ with warnings.catch_warnings():
166
+ # Ignore the DeprecationWarning from xarray
167
+ warnings.simplefilter("ignore", DeprecationWarning)
168
+
169
+ builder = Builder(
170
+ paths=[str(file_or_directory)],
171
+ depth=10,
172
+ include_patterns=["*.nc"],
173
+ joblib_parallel_kwargs={"n_jobs": self.n_jobs},
174
+ ).build(parsing_func=ecgtools.parsers.parse_cmip6)
175
+
176
+ datasets: pd.DataFrame = builder.df.drop(["init_year"], axis=1)
177
+
178
+ # Convert the start_time and end_time columns to datetime objects
179
+ # We don't know the calendar used in the dataset (TODO: Check what ecgtools does)
180
+ datasets["start_time"] = _parse_datetime(datasets["start_time"])
181
+ datasets["end_time"] = _parse_datetime(datasets["end_time"])
182
+
183
+ drs_items = [
184
+ "activity_id",
185
+ "institution_id",
186
+ "source_id",
187
+ "experiment_id",
188
+ "member_id",
189
+ "table_id",
190
+ "variable_id",
191
+ "grid_label",
192
+ "version",
193
+ ]
194
+ datasets["instance_id"] = datasets.apply(
195
+ lambda row: "CMIP6." + ".".join([row[item] for item in drs_items]), axis=1
196
+ )
197
+
198
+ # Temporary fix for some datasets
199
+ # TODO: Replace with a standalone package that contains metadata fixes for CMIP6 datasets
200
+ datasets = _apply_fixes(datasets)
201
+
202
+ return datasets
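Putting the pieces together, `CMIP6DatasetAdapter.find_local_datasets` walks a DRS directory tree with ecgtools, builds one catalog row per file, and derives a dot-separated `instance_id` from the DRS components. A minimal end-to-end sketch (the input path is a placeholder, and constructing a `Config` and `Database` for `register_dataset` is elided):

```python
from pathlib import Path

from climate_ref.datasets.cmip6 import CMIP6DatasetAdapter

adapter = CMIP6DatasetAdapter(n_jobs=4)

# Walk a local DRS tree (placeholder path) and build the file-level catalog.
catalog = adapter.find_local_datasets(Path("/data/CMIP6"))
catalog = adapter.validate_data_catalog(catalog, skip_invalid=True)

# One row per file; one group per dataset, keyed by the DRS-style instance_id,
# e.g. "CMIP6.CMIP.CSIRO.ACCESS-ESM1-5.historical.r1i1p1f1.Amon.tas.gn.v20191115".
for instance_id, dataset_files in catalog.groupby(adapter.slug_column):
    print(instance_id, len(dataset_files), "file(s)")
    # adapter.register_dataset(config, db, dataset_files)  # given a Config and Database
```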