the-datagarden 0.1.0__py3-none-any.whl → 1.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- the_datagarden/__init__.py +8 -0
- the_datagarden/abc/__init__.py +3 -0
- the_datagarden/abc/api.py +19 -0
- the_datagarden/abc/authentication.py +42 -0
- the_datagarden/api/__init__.py +5 -0
- the_datagarden/api/authentication/__init__.py +112 -0
- the_datagarden/api/authentication/credentials/__init__.py +120 -0
- the_datagarden/api/authentication/environment/__init__.py +13 -0
- the_datagarden/api/authentication/settings.py +54 -0
- the_datagarden/api/base/__init__.py +215 -0
- the_datagarden/api/regions/__init__.py +4 -0
- the_datagarden/api/regions/base/__init__.py +108 -0
- the_datagarden/api/regions/base/settings.py +19 -0
- the_datagarden/api/regions/continent.py +9 -0
- the_datagarden/api/regions/country.py +9 -0
- the_datagarden/models/__init__.py +9 -0
- the_datagarden/models/geojson.py +179 -0
- the_datagarden/models/regional_data.py +411 -0
- the_datagarden/version.py +1 -0
- the_datagarden-1.2.3.dist-info/METADATA +253 -0
- the_datagarden-1.2.3.dist-info/RECORD +25 -0
- {the_datagarden-0.1.0.dist-info → the_datagarden-1.2.3.dist-info}/WHEEL +1 -1
- the_datagarden-0.1.0.dist-info/METADATA +0 -18
- the_datagarden-0.1.0.dist-info/RECORD +0 -7
- {the_datagarden-0.1.0.dist-info → the_datagarden-1.2.3.dist-info}/entry_points.txt +0 -0
- {the_datagarden-0.1.0.dist-info → the_datagarden-1.2.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,411 @@
|
|
1
|
+
import pandas as pd
|
2
|
+
import polars as pl
|
3
|
+
from datagarden_models import DataGardenModel, DatagardenModels, DataGardenSubModel, RegionalDataStats
|
4
|
+
from datagarden_models.models.base.legend import Legend
|
5
|
+
from pydantic import BaseModel
|
6
|
+
|
7
|
+
from the_datagarden.api.base import BaseApi
|
8
|
+
|
9
|
+
# Record attributes whose values are joined (in sorted key order) by
# RegionalDataRecord.record_hash() to build the key under which a record is stored.
UNIQUE_FIELDS = [
    "region_type",
    "un_region_code",
    "iso_cc_2",
    "local_region_code",
    "local_region_code_type",
    "region_level",
    "period",
    "period_type",
    "source_name",
]
# Columns that TheDataGardenRegionalDataModel.describe() leaves out by default.
# These are the identifying/descriptive record fields, not model data.
DEFAULT_COLUMNS_TO_EXCLUDE = [
    "datagarden_model_version",
    "name",
    "region_type",
    "un_region_code",
    "iso_cc_2",
    "local_region_code",
    "local_region_code_type",
    "parent_region_code",
    "parent_region_code_type",
    "parent_region_type",
    "region_level",
    "source_name",
    "data_model_name",
    "period",
    "period_type",
]
|
37
|
+
|
38
|
+
|
39
|
+
class RegionalDataRecord(BaseModel):
    """
    One regional data record: the fields identifying the region, source and
    period, plus the data model instance that holds the actual data.
    """

    # Region identification (field order is kept: it determines the order of
    # the keys returned by model_dump()).
    name: str | None = None
    region_type: str | None = None
    un_region_code: str | None = None
    iso_cc_2: str | None = None
    local_region_code: str | None = None
    local_region_code_type: str | None = None
    parent_region_code: str | None = None
    parent_region_code_type: str | None = None
    parent_region_type: str | None = None
    region_level: int = 0
    # Source and period the data applies to
    source_name: str | None = None
    period: str | None = None
    period_type: str | None = None
    # Name of the data model (or sub model) stored in `model`
    data_model_name: str | None = None
    model: DataGardenSubModel

    def record_hash(self) -> str:
        """
        Return a key built from the values of the UNIQUE_FIELDS attributes.

        NOTE: the builtin ``hash`` of a string is salted per interpreter process,
        so the returned value is only stable within the running process.
        """
        hash_str = ".".join(str(getattr(self, key)) for key in sorted(UNIQUE_FIELDS))
        return str(hash(hash_str))

    def __str__(self):
        return (
            f"RegionalDataRecord: {self.name} ({self.data_model_name} for {self.period}, {self.period_type})"
        )

    @property
    def datgarden_model_class(self) -> type[DataGardenModel]:
        """Class of the model instance held by this record."""
        return self.model.__class__

    def record_for_sub_model(self, sub_model_name: str) -> "RegionalDataRecord":
        """
        Return a new record with the same descriptive fields, holding only the
        sub model `sub_model_name` of this record's model.

        Raises:
            ValueError: if `sub_model_name` is not listed in the sub model names
                of the model's legends.
        """
        if sub_model_name not in self.datgarden_model_class.legends().sub_model_names:
            raise ValueError(f"Sub model `{sub_model_name}` not found in {self.datgarden_model_class}")
        # The model itself is replaced below, so do not serialise it first.
        child_record = self.model_dump(exclude={"model"})
        child_record["data_model_name"] = sub_model_name
        child_record["model"] = getattr(self.model, sub_model_name)
        return RegionalDataRecord(**child_record)
|
76
|
+
|
77
|
+
|
78
|
+
class TheDataGardenRegionalDataModel:
|
79
|
+
"""
|
80
|
+
Model to hold response data from The Data Garden API Regional Data endpoint.
|
81
|
+
|
82
|
+
The model holds a list of regional_data records, each containing a regional data model
|
83
|
+
for the region for a specific set of sources, periods and period types.
|
84
|
+
|
85
|
+
The data can be converted to Polars and Pandas dataframes by the following
|
86
|
+
methods:
|
87
|
+
- to_polars(model_convertors: dict | None = None) -> pl.DataFrame
|
88
|
+
model_convertors dict will be used to convert specific model fields to dataframe
|
89
|
+
columns.
|
90
|
+
- full_model_to_polars() -> pl.DataFrame
|
91
|
+
|
92
|
+
For pandas dataframes you can use the same methods:
|
93
|
+
- to_pandas(model_convertors: dict | None = None) -> pd.DataFrame
|
94
|
+
- full_model_to_pandas() -> pd.DataFrame
|
95
|
+
"""
|
96
|
+
|
97
|
+
def __init__(
    self,
    api: "BaseApi",
    model_name: str,
    region_url: str,
    meta_data: BaseModel,
    is_sub_model: bool = False,
    model: type[DataGardenSubModel] | None = None,
):
    """
    Set up an empty container for regional data of one data model.

    When no `model` class is passed, it is looked up on `DatagardenModels`
    using the upper-cased `model_name`.
    """
    self._api: BaseApi = api
    self._model_name: str = model_name
    self._region_url: str = region_url
    # Hashes of the request parameter sets that were already sent to the API
    self._request_params_hashes: list[str] = []
    # Retrieved records, keyed by RegionalDataRecord.record_hash()
    self._data_records: dict[str, RegionalDataRecord] = {}
    self.meta_data: BaseModel = meta_data
    if model:
        self._model: DataGardenModel = model
    else:
        self._model: DataGardenModel = getattr(DatagardenModels, model_name.upper())
    self._is_sub_model: bool = is_sub_model
|
114
|
+
|
115
|
+
def __str__(self):
|
116
|
+
return f"TheDataGardenRegionalDataModel : {self._model_name} : (count={len(self._data_records)})"
|
117
|
+
|
118
|
+
def __repr__(self):
|
119
|
+
return self.__str__()
|
120
|
+
|
121
|
+
def __call__(self, **kwargs) -> "TheDataGardenRegionalDataModel":
|
122
|
+
if self._is_sub_model:
|
123
|
+
raise TypeError(
|
124
|
+
"Sub model data cannot be used to retrieve data. "
|
125
|
+
"Use the main model data object to make calls to The-Datagarden API"
|
126
|
+
)
|
127
|
+
request_hash = self.request_hash(**kwargs)
|
128
|
+
if request_hash not in self._request_params_hashes:
|
129
|
+
regional_data = self.regional_paginated_data_from_api(**kwargs)
|
130
|
+
if regional_data:
|
131
|
+
self.set_items(regional_data)
|
132
|
+
self._request_params_hashes.append(request_hash)
|
133
|
+
return self
|
134
|
+
|
135
|
+
def __getattr__(self, attribute: str) -> "TheDataGardenRegionalDataModel":
|
136
|
+
if attribute not in self._model.legends().sub_model_names:
|
137
|
+
raise ValueError(f"Attribute {attribute} is not a sub-model of {self._model_name}")
|
138
|
+
sub_model = getattr(self._model.legends(), attribute).model
|
139
|
+
regional_data_for_attribute = TheDataGardenRegionalDataModel(
|
140
|
+
api=self._api,
|
141
|
+
model_name=attribute,
|
142
|
+
region_url=self._region_url,
|
143
|
+
meta_data=self.meta_data,
|
144
|
+
is_sub_model=True,
|
145
|
+
model=sub_model,
|
146
|
+
)
|
147
|
+
regional_data_for_attribute._data_records = {
|
148
|
+
key: value.record_for_sub_model(attribute) for key, value in self._data_records.items()
|
149
|
+
}
|
150
|
+
return regional_data_for_attribute
|
151
|
+
|
152
|
+
@property
|
153
|
+
def model_attributes(self) -> list[str]:
|
154
|
+
return self._model.legends().attributes
|
155
|
+
|
156
|
+
def model_attribute_legend(self, attribute: str) -> Legend:
    """Return the legend entry stored under `attribute` on the model's legends."""
    legends = self._model.legends()
    return getattr(legends, attribute)
|
158
|
+
|
159
|
+
def request_hash(self, **kwargs) -> str:
    """
    Return a key for a set of request parameters that does not depend on the
    order in which the keyword arguments were given.
    """
    # Keyword names are unique, so sorting by name equals sorting the items.
    hash_str = ",".join(f"{name}:{kwargs[name]}" for name in sorted(kwargs))
    return str(hash(hash_str))
|
163
|
+
|
164
|
+
def _response_has_next_page(self, model_data_resp: dict) -> bool:
|
165
|
+
pagination = model_data_resp.get("pagination", None)
|
166
|
+
if not pagination:
|
167
|
+
return False
|
168
|
+
return pagination.get("next_page", None) is not None
|
169
|
+
|
170
|
+
def _next_page_pagination(self, model_data_resp: dict) -> dict | None:
|
171
|
+
pagination = model_data_resp.pop("pagination", None)
|
172
|
+
if not pagination:
|
173
|
+
return None
|
174
|
+
next_page = pagination.get("next_page", None)
|
175
|
+
if not next_page:
|
176
|
+
return None
|
177
|
+
return {"page": next_page}
|
178
|
+
|
179
|
+
def regional_paginated_data_from_api(self, **kwargs) -> dict:
|
180
|
+
model_data_resp = self.regional_data_from_api(**kwargs)
|
181
|
+
if not model_data_resp:
|
182
|
+
return {}
|
183
|
+
while self._response_has_next_page(model_data_resp):
|
184
|
+
next_page_pagination = self._next_page_pagination(model_data_resp)
|
185
|
+
if next_page_pagination:
|
186
|
+
next_page_resp = self.regional_data_from_api(pagination=next_page_pagination, **kwargs)
|
187
|
+
if next_page_resp:
|
188
|
+
model_data_resp["data_by_region"].extend(next_page_resp["data_by_region"])
|
189
|
+
model_data_resp["pagination"] = next_page_resp["pagination"]
|
190
|
+
|
191
|
+
return model_data_resp
|
192
|
+
|
193
|
+
def regional_data_from_api(self, **kwargs) -> dict:
|
194
|
+
model_data_resp = self._api.retrieve_from_api(
|
195
|
+
url_extension=self._region_url + "regional_data/",
|
196
|
+
method="POST",
|
197
|
+
payload={"model": self._model_name, **kwargs},
|
198
|
+
)
|
199
|
+
if model_data_resp:
|
200
|
+
return model_data_resp.json()
|
201
|
+
return {}
|
202
|
+
|
203
|
+
def set_items(self, data: dict):
    """
    Convert the `data_by_region` entries of an API response into
    RegionalDataRecord objects and store them keyed by their record hash.

    Afterwards the model name is set to the `data_model_name` of the first
    stored record.

    Raises:
        ValueError: if the first stored record has no `data_model_name`.
    """
    # Region fields copied one-to-one from the response (missing -> None)
    region_keys = (
        "region_type",
        "un_region_code",
        "iso_cc_2",
        "local_region_code",
        "local_region_code_type",
        "parent_region_code",
        "parent_region_code_type",
        "parent_region_type",
    )
    for regional_data in data["data_by_region"]:
        base_items = {"name": regional_data.get("region_name", None)}
        base_items.update({key: regional_data.get(key, None) for key in region_keys})
        base_items["region_level"] = regional_data.get("region_level", 0)

        # All records of a region are built before any of them is stored.
        region_records = [
            RegionalDataRecord(**base_items, **self._record_items(data_obj))
            for data_obj in regional_data["data_objects_for_region"]
        ]
        self._data_records.update({record.record_hash(): record for record in region_records})

    if not self._data_records:
        return
    first_record = next(iter(self._data_records.values()))
    model_name = first_record.data_model_name
    if not model_name:
        raise ValueError("data_model_name is required")
    self._model_name = model_name
|
231
|
+
|
232
|
+
def _record_items(self, data: dict):
    """
    Build the source, period and model items of a RegionalDataRecord from one
    data object of the API response.

    Raises:
        ValueError: if the data object has no `data_type`, or if no model with
            that (upper-cased) name exists on DatagardenModels.
    """
    model_name = data.get("data_type", None)
    if not model_name:
        raise ValueError("data_model_name is required")

    # Default of None: without it an unknown name raises AttributeError here
    # and the intended ValueError below can never be reached.
    model = getattr(DatagardenModels, model_name.upper(), None)
    if not model:
        raise ValueError(f"model {model_name} not found in DatagardenModels")
    return {
        "source_name": data.get("source_name", None),
        "period": data.get("period", None),
        "period_type": data.get("period_type", None),
        "data_model_name": model_name,
        # `or {}` also covers a "data" entry that is present but null
        "model": model(**(data.get("data") or {})),
    }
|
247
|
+
|
248
|
+
def to_polars(self, model_convertors: dict | None = None) -> pl.DataFrame:
    """
    Convert the data to a polars dataframe using a dictionary of model attributes to convert to columns

    Each row holds the record fields (without the model). For every
    `{column_name: attribute_path}` item in `model_convertors` the value found
    by following the dotted `attribute_path` on the record's model is added as
    column `column_name`. When the path contains `__flatten`, the value is
    flattened into one column per nested key instead.
    """
    model_convertors = model_convertors or {}
    converted_records = []
    for record in self._data_records.values():
        model = record.model
        # The model is only read through `model_convertors`; do not serialise it.
        record_dict = record.model_dump(exclude={"model"})

        for new_col, model_attr in model_convertors.items():
            flatten = "__flatten" in model_attr
            # Handle nested attributes using split by dots
            attrs = model_attr.replace("__flatten", "").split(".")
            value = getattr(model, attrs[0])
            for attr in attrs[1:]:
                value = getattr(value, attr, None)

            # Skip missing or empty data, but keep legitimate 0 / False values.
            if value is None or (isinstance(value, (str, list, tuple, dict, set)) and not value):
                continue

            model_data = value.model_dump() if isinstance(value, BaseModel) else value
            if flatten and isinstance(model_data, dict):
                record_dict.update(self.flatten_dict(model_data, {}))
            else:
                # Also used for `__flatten` on a value that has nothing to flatten
                record_dict[new_col] = value
        converted_records.append(record_dict)
    return pl.from_records(converted_records)
|
276
|
+
|
277
|
+
def flatten_dict(self, dict_to_flatten: dict, flattened_dict: dict, prefix: str = "") -> dict:
|
278
|
+
for key, value in dict_to_flatten.items():
|
279
|
+
new_key = f"{prefix}.{key}" if prefix else key
|
280
|
+
if isinstance(value, dict):
|
281
|
+
flattened_dict.update(self.flatten_dict(value, flattened_dict, new_key))
|
282
|
+
else:
|
283
|
+
flattened_dict[new_key] = value
|
284
|
+
|
285
|
+
return flattened_dict
|
286
|
+
|
287
|
+
def full_model_to_polars(self):
    """
    Convert the data to a polars dataframe, flattening all nested dictionaries

    Every row combines the record fields (without the model) with the
    flattened content of the record's model.
    """
    converted_records = [
        {
            **record.model_dump(exclude={"model"}),
            **self.flatten_dict(record.model.model_dump(), {}),
        }
        for record in self._data_records.values()
    ]
    return pl.from_records(converted_records)
|
301
|
+
|
302
|
+
def to_pandas(self, model_convertors: dict | None = None) -> pd.DataFrame:
|
303
|
+
"""
|
304
|
+
Convert the data to a pandas dataframe using a dictionary of model attributes to convert to columns
|
305
|
+
"""
|
306
|
+
return self.to_polars(model_convertors).to_pandas()
|
307
|
+
|
308
|
+
def full_model_to_pandas(self) -> pd.DataFrame:
|
309
|
+
"""
|
310
|
+
Convert the data to a pandas dataframe, flattening all nested dictionaries
|
311
|
+
"""
|
312
|
+
return self.full_model_to_polars().to_pandas()
|
313
|
+
|
314
|
+
def __iter__(self):
|
315
|
+
"""Makes the class iterable over the values in _data_records"""
|
316
|
+
return iter(self._data_records.values())
|
317
|
+
|
318
|
+
def __len__(self):
|
319
|
+
"""Returns the number of records"""
|
320
|
+
return len(self._data_records)
|
321
|
+
|
322
|
+
@property
def data_records(self) -> list[RegionalDataRecord]:
    """The stored records as a new list."""
    return [*self._data_records.values()]
|
325
|
+
|
326
|
+
def regional_availability(self) -> dict[str, RegionalDataStats | None]:
    """
    Return, for every region type in the meta data, the statistics the meta data
    holds for this data model, or None when there are none for that region type.
    """
    availability_per_region = self.meta_data.statistics_for_data_model(model_name=self._model_name)
    return {
        region_type: (
            availability_per_region[region_type]
            if region_type in availability_per_region.keys()
            else None
        )
        for region_type in self.meta_data.region_types
    }
|
335
|
+
|
336
|
+
@property
|
337
|
+
def regions_with_model_data(self) -> list[str]:
|
338
|
+
return [region for region in self.regional_availability() if self.regional_availability()[region]]
|
339
|
+
|
340
|
+
def show_summary(self):
|
341
|
+
"""
|
342
|
+
Outputs a summary of the model's structure (submodels and attributes)
|
343
|
+
"""
|
344
|
+
self._model.legends().show_summary()
|
345
|
+
|
346
|
+
def summary(self) -> dict:
|
347
|
+
"""
|
348
|
+
return model's structure (submodels and attributes)
|
349
|
+
"""
|
350
|
+
return self._model.legends().summary()
|
351
|
+
|
352
|
+
def describe(
    self,
    include_attributes: list[str] | None = None,
    exclude_attributes: list[str] | None = None,
    filter_expr: pl.Expr | None = None,
) -> pl.DataFrame:
    """
    Return the polars `describe()` statistics of the fully flattened data.

    Args:
        include_attributes: when given, only these columns are described and
            `exclude_attributes` is ignored.
        exclude_attributes: columns to leave out in addition to
            DEFAULT_COLUMNS_TO_EXCLUDE.
        filter_expr: optional polars expression to filter the rows first.

    Raises:
        ValueError: if no data has been loaded yet.
    """
    df = self.full_model_to_polars()
    if df.is_empty():
        raise ValueError("No data loaded for this model. Data is needed to describe the model.")

    if filter_expr is not None:
        df = df.filter(filter_expr)

    if include_attributes:
        selected = df.select(include_attributes)
        return selected.describe()

    attributes_to_exclude = [*DEFAULT_COLUMNS_TO_EXCLUDE, *(exclude_attributes or [])]
    columns_to_describe = [col for col in df.columns if col not in attributes_to_exclude]
    return df.select(columns_to_describe).describe()
|
372
|
+
|
373
|
+
def data_availability_per_attribute(
    self, include_attributes: list[str] | None = None, filter_expr: pl.Expr | None = None
):
    """
    Return the `describe()` dataframe for the given attributes (or for all
    columns except DEFAULT_COLUMNS_TO_EXCLUDE), with a cast to Int64 applied to
    the values in the `count` and `null_count` rows.
    """
    if include_attributes:
        column_selection = {"include_attributes": include_attributes}
    else:
        column_selection = {"exclude_attributes": DEFAULT_COLUMNS_TO_EXCLUDE}
    describe_df = self.describe(filter_expr=filter_expr, **column_selection)

    is_count_row = pl.col("statistic").is_in(["count", "null_count"])
    describe_df = describe_df.with_columns(
        pl.when(is_count_row)
        .then(pl.all().exclude("statistic").cast(pl.Int64))
        .otherwise(pl.all().exclude("statistic"))
    )

    return describe_df
|
390
|
+
|
391
|
+
def show_data_availability_per_attribute(
    self, include_attributes: list[str] | None = None, filter_expr: pl.Expr | None = None
):
    """
    Print, for every described attribute, the total number of rows, the number
    of rows with data and the percentage of rows with data.

    Args:
        include_attributes: attributes to report on; passed on to
            `data_availability_per_attribute`.
        filter_expr: optional polars expression to filter the rows first.
    """
    describe_df = self.data_availability_per_attribute(include_attributes, filter_expr)
    stats_by_column = {
        column: dict(
            zip(describe_df.get_column("statistic"), describe_df.get_column(column), strict=True)
        )
        for column in describe_df.columns
        if column != "statistic"
    }
    if not stats_by_column:
        # Nothing to report; max() below would fail on an empty sequence.
        return

    max_column_length = max(len(column) for column in stats_by_column)

    for column, stats in stats_by_column.items():
        # Convert each count separately before adding them.
        # NOTE(review): for non-numeric columns the values may arrive as
        # strings, in which case `+` would concatenate them - confirm.
        with_data = int(stats["count"])
        total = with_data + int(stats["null_count"])
        percentage = with_data / total * 100 if total else 0
        # The padding is built outside the f-string: reusing the enclosing
        # quote type inside a replacement field requires Python 3.12+.
        padding = " " * (max_column_length + 3 - len(column))
        print(f"{column} : {padding}{total} of which with data: {with_data} ({percentage:.0f}%)")
|
@@ -0,0 +1 @@
|
|
1
|
+
# Keep in sync with the version in the distribution metadata.
__version__ = "1.2.3"
|
@@ -0,0 +1,253 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: the-datagarden
|
3
|
+
Version: 1.2.3
|
4
|
+
Summary: Public data made easy.
|
5
|
+
Author-email: Maarten de Ruyter <info@the-datagarden.io>
|
6
|
+
License: MIT
|
7
|
+
Project-URL: Read the Docs, https://dg-the-datagarden.readthedocs.io/en/stable/
|
8
|
+
Project-URL: The-DataGarden, https://www.the-datagarden.io/
|
9
|
+
Project-URL: API documentation, https://www.the-datagarden.io/api-docs
|
10
|
+
Project-URL: Source, https://github.com/the-datagarden/the-datagarden
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
17
|
+
Classifier: Operating System :: OS Independent
|
18
|
+
Classifier: Development Status :: 4 - Beta
|
19
|
+
Classifier: Intended Audience :: Developers
|
20
|
+
Classifier: Intended Audience :: Financial and Insurance Industry
|
21
|
+
Classifier: Intended Audience :: Science/Research
|
22
|
+
Classifier: Intended Audience :: Healthcare Industry
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: GIS
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
26
|
+
Classifier: Topic :: Scientific/Engineering :: Visualization
|
27
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
28
|
+
Classifier: Topic :: Utilities
|
29
|
+
Requires-Python: >=3.10
|
30
|
+
Description-Content-Type: text/x-rst
|
31
|
+
Requires-Dist: click>=8.1.7
|
32
|
+
Requires-Dist: pandas>=2.2.3
|
33
|
+
Requires-Dist: polars>=1.15.0
|
34
|
+
Requires-Dist: pydantic>=2.9.2
|
35
|
+
Requires-Dist: pyjwt>=2.10.0
|
36
|
+
Requires-Dist: python-decouple>=3.8
|
37
|
+
Requires-Dist: requests>=2.32.3
|
38
|
+
Requires-Dist: the-datagarden-models>=1.6.3
|
39
|
+
|
40
|
+
==================
|
41
|
+
the-datagarden SDK
|
42
|
+
==================
|
43
|
+
|
44
|
+
The-datagarden package is a Python SDK built on top of The-DataGarden API. The SDK provides easy access to continent and country regional hierarchies,
|
45
|
+
as well as public data related to these regions. All data from The-DataGarden API is stored in normalized datamodels like ``Demographics``, ``Health``
|
46
|
+
or ``Economics``. This allows you as a data professional to create value from this data without having to worry about the (varying) data structure and
|
47
|
+
APIs of the sources.
|
48
|
+
|
49
|
+
Additionally, The-DataGarden API also provides country and regional GeoJSONs. The SDK makes it easy for you to combine public data and your own data and merge them into
|
50
|
+
GeoJSON feature collections, making geographic visualisation easy.
|
51
|
+
|
52
|
+
|
53
|
+
The-DataGarden SDK main use case
|
54
|
+
--------------------------------
|
55
|
+
The SDK is designed to make it easy to access and work with the DataGarden data. After initializing the SDK you simply
|
56
|
+
retrieve data for a specific continent, country or subregion by calling the appropriate datamodel.
|
57
|
+
|
58
|
+
.. code-block:: python
|
59
|
+
|
60
|
+
# initialize a country object and retrieve the demographics attribute
|
61
|
+
>>> nl = the_datagarden_api.netherlands # or nl = the_datagarden_api.NL
|
62
|
+
>>> nl_demographics = nl.demographics()
|
63
|
+
TheDataGardenRegionalDataModel : Demographics : (count=5)
|
64
|
+
|
65
|
+
In this example the `nl_demographics` object holds 5 records. Each record contains demographic data for the Netherlands for a specific
|
66
|
+
period and period type combination. The data can be made accessible in a tabular format by converting the object to a pandas or polars dataframe.
|
67
|
+
|
68
|
+
.. code-block:: python
|
69
|
+
|
70
|
+
# convert demographics data to a polars dataframe
|
71
|
+
>>> dataframe = nl_demographics.full_model_to_polars()
|
72
|
+
>>> print(dataframe["period", "source_name", "data_model_name", "population.total", "population.total_male", "population.total_female"])
|
73
|
+
|
74
|
+
.. code-block:: text
|
75
|
+
|
76
|
+
┌───────────────┬────────────┬─────────────────┬──────────────────┬───────────────────────┬─────────────────────────┐
|
77
|
+
│ period ┆ source_name┆ data_model_name ┆ population.total ┆ population.total_male ┆ population.total_female │
|
78
|
+
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
79
|
+
│ str ┆ str ┆ str ┆ f64 ┆ f64 ┆ f64 │
|
80
|
+
╞═══════════════╪════════════╪═════════════════╪══════════════════╪═══════════════════════╪═════════════════════════╡
|
81
|
+
│ 2022-01-01T0Z ┆ Eurostat ┆ Demographics ┆ null ┆ 8.745468e6 ┆ 8.845204e6 │
|
82
|
+
│ 2022-01-01T0Z ┆ United Nat ┆ Demographics ┆ 1.7789347e7 ┆ 8.890013e6 ┆ 9.014408e6 │
|
83
|
+
│ 2023-01-01T0Z ┆ Eurostat ┆ Demographics ┆ null ┆ 8.850309e6 ┆ 8.960982e6 │
|
84
|
+
│ 2023-01-01T0Z ┆ United Nat ┆ Demographics ┆ 1.8019495e7 ┆ 8.986255e6 ┆ 9.106269e6 │
|
85
|
+
│ 2024-01-01T0Z ┆ United Nat ┆ Demographics ┆ 1.8165554e7 ┆ 9.055978e6 ┆ 9.172763e6 │
|
86
|
+
└───────────────┴────────────┴─────────────────┴──────────────────┴───────────────────────┴─────────────────────────┘
|
87
|
+
|
88
|
+
The demographics model holds lots of submodels and attributes. In this example only a limited number of attributes are listed
|
89
|
+
as the dataframe is way too large to display. For all models and their details see the model data documentation at
|
90
|
+
`The DataGarden Data Documentation <https://www.the-datagarden.io/data-docs>`_.
|
91
|
+
|
92
|
+
Getting started with the SDK
|
93
|
+
----------------------------
|
94
|
+
You can start using the SDK out of the box by simply instantiating the TheDataGardenAPI object:
|
95
|
+
|
96
|
+
.. code-block:: python
|
97
|
+
|
98
|
+
# Starting with the datagarden API
|
99
|
+
>>> from the_datagarden import TheDataGardenAPI
|
100
|
+
>>> the_datagarden_api = TheDataGardenAPI()
|
101
|
+
|
102
|
+
.. code-block:: console
|
103
|
+
|
104
|
+
Welcome to The Data Garden API.
|
105
|
+
|
106
|
+
You can start using the API with an account from The-Datagarden.io.
|
107
|
+
Please provide your credentials or create a new account.
|
108
|
+
Check www.the-datagarden.io for more information.
|
109
|
+
|
110
|
+
Do you want to (1) create a new account or (2) provide existing credentials? Enter 1 or 2:
|
111
|
+
|
112
|
+
|
113
|
+
Simply select 1 to create a new account.
|
114
|
+
|
115
|
+
.. code-block:: console
|
116
|
+
|
117
|
+
Enrolling in The Data Garden API...
|
118
|
+
|
119
|
+
Enter your email: <your-email>
|
120
|
+
Enter your password: <your-password>
|
121
|
+
Confirm your password: <your-password>
|
122
|
+
|
123
|
+
Successfully enrolled in The Data Garden API.
|
124
|
+
Initializing : TheDatagardenEnvironment
|
125
|
+
At: https://www.the-datagarden.io/
|
126
|
+
|
127
|
+
If you already have an account at the-datagarden.io, you can either select option 2 or directly provide your credentials
|
128
|
+
when creating the TheDataGardenAPI object:
|
129
|
+
|
130
|
+
.. code-block:: python
|
131
|
+
|
132
|
+
# Retrieve a country object from the datagarden API
|
133
|
+
>>> from the_datagarden import TheDataGardenAPI
|
134
|
+
>>> the_datagarden_api = TheDataGardenAPI(email='your-email@example.com', password='your-password')
|
135
|
+
|
136
|
+
.. code-block:: console
|
137
|
+
|
138
|
+
Initializing : TheDatagardenEnvironment
|
139
|
+
At: https://www.the-datagarden.io/
|
140
|
+
|
141
|
+
A 3rd way to initialize the SDK is adding your credentials to the ``.env`` file.
|
142
|
+
|
143
|
+
|
144
|
+
Getting your first data from The-DataGarden API
|
145
|
+
-----------------------------------------------
|
146
|
+
Now that you have initialized the SDK, you can start retrieving data from The-DataGarden API.
|
147
|
+
For example, you can retrieve the demographics data for the Netherlands:
|
148
|
+
|
149
|
+
.. code-block:: python
|
150
|
+
|
151
|
+
# initialize a country object and retrieve the demographics attribute
|
152
|
+
>>> nl = the_datagarden_api.netherlands
|
153
|
+
>>> nl_demographics = nl.demographics
|
154
|
+
TheDataGardenRegionalDataModel : Demographics : (count=0)
|
155
|
+
|
156
|
+
This creates a country object ``nl`` for the Netherlands, which serves as your gateway to all Netherlands-related
|
157
|
+
data and its regional subdivisions.
|
158
|
+
|
159
|
+
In this getting started section we will work with a demographics object retrieved from the `nl` country object.
|
160
|
+
As shown in the example, the ``nl_demographics`` object can be retrieved by simply calling the `demographics`
|
161
|
+
attribute on the `nl` country object
|
162
|
+
|
163
|
+
The `nl_demographics` object starts empty (count=0). To populate it with data, simply call it as a function:
|
164
|
+
|
165
|
+
.. code-block:: python
|
166
|
+
|
167
|
+
# Calling the demographics attribute will populate it with demographics data from the API
|
168
|
+
>>> nl_demographics()
|
169
|
+
>>> nl_demographics
|
170
|
+
TheDataGardenRegionalDataModel : Demographics : (count=5)
|
171
|
+
|
172
|
+
When called without parameters, the API returns data using default settings, which in this case yields 5 records.
|
173
|
+
You can customize your data retrieval by specifying parameters such as time periods, period types, and data sources.
|
174
|
+
|
175
|
+
|
176
|
+
The DataGarden Regional DataModel
|
177
|
+
---------------------------------
|
178
|
+
When you retrieve data like ``nl_demographics``, you're working with a ``TheDataGardenRegionalDataModel`` object. This object acts as a container that holds:
|
179
|
+
|
180
|
+
1. A collection of ``RegionalDataRecord`` objects
|
181
|
+
2. Metadata about the records (region, time period, data source, etc.)
|
182
|
+
|
183
|
+
You can easily transform this data into pandas or polars DataFrames for analysis. Here's an example showing population data for the Netherlands:
|
184
|
+
|
185
|
+
.. code-block:: python
|
186
|
+
|
187
|
+
>>> nl = the_datagarden_api.netherlands
|
188
|
+
>>> nl_demographics = nl.demographics(period_from="2010-01-01", source="united nations")
|
189
|
+
>>> # Convert to DataFrame, mapping 'population.total' to column name 'pop_count'
|
190
|
+
>>> df = nl_demographics.to_polars({"pop_count": "population.total"}) # or to_pandas(...)
|
191
|
+
>>> df["name", "source_name", "period", "data_model_name", "pop_count"] # for readability only a limited number of columns are displayed
|
192
|
+
┌─────────────┬────────────────┬─────────────────┬─────────────────┬─────────────┐
|
193
|
+
│ name ┆ source_name ┆ period ┆ data_model_name ┆ pop_count │
|
194
|
+
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
195
|
+
│ str ┆ str ┆ str ┆ str ┆ f64 │
|
196
|
+
╞═════════════╪════════════════╪═════════════════╪═════════════════╪═════════════╡
|
197
|
+
│ Netherlands ┆ United Nations ┆ 2010-01-010:00Z ┆ Demographics ┆ 1.6729801e7 │
|
198
|
+
│ Netherlands ┆ United Nations ┆ 2011-01-010:00Z ┆ Demographics ┆ 1.6812669e7 │
|
199
|
+
│ … ┆ … ┆ … ┆ … ┆ … │
|
200
|
+
│ Netherlands ┆ United Nations ┆ 2023-01-010:00Z ┆ Demographics ┆ 1.8019495e7 │
|
201
|
+
│ Netherlands ┆ United Nations ┆ 2024-01-010:00Z ┆ Demographics ┆ 1.8165554e7 │
|
202
|
+
└─────────────┴────────────────┴─────────────────┴─────────────────┴─────────────┘
|
203
|
+
|
204
|
+
Each time you call the ``nl_demographics`` object with different parameters,
|
205
|
+
new demographic records for the specified subregions, periods, and/or sources are added to the existing ``nl_demographics`` object.
|
206
|
+
After you've gathered all the records you need, you can convert the entire collection into a dataframe for further analysis.
|
207
|
+
|
208
|
+
|
209
|
+
Retrieving GeoJSON data
|
210
|
+
-----------------------
|
211
|
+
Retrieving the GeoJSON for the Netherlands and its provinces is straightforward as well:
|
212
|
+
|
213
|
+
.. code-block:: python
|
214
|
+
|
215
|
+
>>> nl_geojson = nl.geojsons()
|
216
|
+
>>> nl_geojson
|
217
|
+
TheDataGardenRegionGeoJSONModel : GeoJSON : (count=1)
|
218
|
+
>>> nl_geojson(region_level=2) # Retrieve GeoJSON for 2nd regional level (provinces)
|
219
|
+
TheDataGardenRegionGeoJSONModel : GeoJSON : (count=13) # 12 provinces + 1 country
|
220
|
+
>>> df = nl_geojson.to_polars()
|
221
|
+
>>> df["name", "region_type", "local_region_code", "region_level", "feature"]
|
222
|
+
┌───────────────┬─────────────┬───────────────┬──────────────┬────────────────────────┐
|
223
|
+
│ name ┆ region_type ┆ local_region_c┆ region_level ┆ feature │
|
224
|
+
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
225
|
+
│ str ┆ str ┆ str ┆ i64 ┆ struct[3] │
|
226
|
+
╞═══════════════╪═════════════╪═══════════════╪══════════════╪════════════════════════╡
|
227
|
+
│ Netherlands ┆ country ┆ 528 ┆ 0 ┆ {"Feature",{"Netherland│
|
228
|
+
│ Drenthe ┆ province ┆ NL13 ┆ 2 ┆ {"Feature",{"Drenthe",2│
|
229
|
+
│ … ┆ … ┆ … ┆ … ┆ … │
|
230
|
+
│ Zuid-Holland ┆ province ┆ NL33 ┆ 2 ┆ {"Feature",{"Zuid-Holla│
|
231
|
+
└───────────────┴─────────────┴───────────────┴──────────────┴────────────────────────┘
|
232
|
+
|
233
|
+
For readability, only a limited number of dataframe columns are displayed in the output.
|
234
|
+
Attributes in both the demographics and geojson dataframes are available to connect the geojson to
|
235
|
+
the demographics data. This allows you to quickly create data sets that contain both demographics and geojson data
|
236
|
+
for further analysis or visualisation in map applications.
|
237
|
+
|
238
|
+
|
239
|
+
Read more
|
240
|
+
---------
|
241
|
+
|
242
|
+
* `The DataGarden Website <https://www.the-datagarden.io>`_
|
243
|
+
* `API Documentation <https://www.the-datagarden.io/api-docs>`_
|
244
|
+
* `The Datagarden Models <https://www.the-datagarden.io/data-docs>`_
|
245
|
+
* `GitHub Repository <https://github.com/MaartendeRuyter/dg-the-datagarden>`_
|
246
|
+
|
247
|
+
|
248
|
+
Access to The DataGarden API
|
249
|
+
----------------------------
|
250
|
+
To use the DataGarden SDK, you need access to The DataGarden API. Simply register for free at https://www.the-datagarden.io
|
251
|
+
and you will have an initial free access account to the API with access to country and continent data.
|
252
|
+
|
253
|
+
Visit https://www.the-datagarden.io to register for free.
|