the-datagarden 0.1.0__py3-none-any.whl → 1.2.3__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- the_datagarden/__init__.py +8 -0
- the_datagarden/abc/__init__.py +3 -0
- the_datagarden/abc/api.py +19 -0
- the_datagarden/abc/authentication.py +42 -0
- the_datagarden/api/__init__.py +5 -0
- the_datagarden/api/authentication/__init__.py +112 -0
- the_datagarden/api/authentication/credentials/__init__.py +120 -0
- the_datagarden/api/authentication/environment/__init__.py +13 -0
- the_datagarden/api/authentication/settings.py +54 -0
- the_datagarden/api/base/__init__.py +215 -0
- the_datagarden/api/regions/__init__.py +4 -0
- the_datagarden/api/regions/base/__init__.py +108 -0
- the_datagarden/api/regions/base/settings.py +19 -0
- the_datagarden/api/regions/continent.py +9 -0
- the_datagarden/api/regions/country.py +9 -0
- the_datagarden/models/__init__.py +9 -0
- the_datagarden/models/geojson.py +179 -0
- the_datagarden/models/regional_data.py +411 -0
- the_datagarden/version.py +1 -0
- the_datagarden-1.2.3.dist-info/METADATA +253 -0
- the_datagarden-1.2.3.dist-info/RECORD +25 -0
- {the_datagarden-0.1.0.dist-info → the_datagarden-1.2.3.dist-info}/WHEEL +1 -1
- the_datagarden-0.1.0.dist-info/METADATA +0 -18
- the_datagarden-0.1.0.dist-info/RECORD +0 -7
- {the_datagarden-0.1.0.dist-info → the_datagarden-1.2.3.dist-info}/entry_points.txt +0 -0
- {the_datagarden-0.1.0.dist-info → the_datagarden-1.2.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,411 @@
|
|
1
|
+
import pandas as pd
|
2
|
+
import polars as pl
|
3
|
+
from datagarden_models import DataGardenModel, DatagardenModels, DataGardenSubModel, RegionalDataStats
|
4
|
+
from datagarden_models.models.base.legend import Legend
|
5
|
+
from pydantic import BaseModel
|
6
|
+
|
7
|
+
from the_datagarden.api.base import BaseApi
|
8
|
+
|
9
|
+
# Record attributes whose combined values identify one regional data record.
# RegionalDataRecord.record_hash() builds its key from exactly these fields.
UNIQUE_FIELDS = [
    # which region the record belongs to
    "region_type", "un_region_code", "iso_cc_2",
    "local_region_code", "local_region_code_type", "region_level",
    # which period and source the data covers
    "period", "period_type", "source_name",
]
|
20
|
+
# Identification/bookkeeping columns that are left out by default when the
# flattened dataframe is summarised (see the describe() method in this module).
DEFAULT_COLUMNS_TO_EXCLUDE = [
    "datagarden_model_version", "name",
    # region identification
    "region_type", "un_region_code", "iso_cc_2",
    "local_region_code", "local_region_code_type",
    # parent region identification
    "parent_region_code", "parent_region_code_type", "parent_region_type",
    "region_level",
    # source, model and period identification
    "source_name", "data_model_name", "period", "period_type",
]
|
37
|
+
|
38
|
+
|
39
|
+
class RegionalDataRecord(BaseModel):
    """
    A single data model instance together with the identification of the
    region, period and source it belongs to.
    """

    # Region identification
    name: str | None = None
    region_type: str | None = None
    un_region_code: str | None = None
    iso_cc_2: str | None = None
    local_region_code: str | None = None
    local_region_code_type: str | None = None
    # Parent region identification
    parent_region_code: str | None = None
    parent_region_code_type: str | None = None
    parent_region_type: str | None = None
    region_level: int = 0
    # Source, period and model identification
    source_name: str | None = None
    period: str | None = None
    period_type: str | None = None
    data_model_name: str | None = None
    # The actual data (a full model or one of its sub models)
    model: DataGardenSubModel

    def record_hash(self) -> str:
        """
        Return a key built from the values of all UNIQUE_FIELDS.

        Records with equal values for every field in UNIQUE_FIELDS get the
        same key. The key is derived with the built-in ``hash()``, which for
        strings is randomised per interpreter process by default, so it should
        only be compared within one running process.
        """
        hash_str = ".".join(str(getattr(self, key)) for key in sorted(UNIQUE_FIELDS))
        return str(hash(hash_str))

    def __str__(self):
        return (
            f"RegionalDataRecord: {self.name} ({self.data_model_name} for {self.period}, {self.period_type})"
        )

    @property
    def datgarden_model_class(self) -> type[DataGardenModel]:
        """Class of the model instance held by this record."""
        return self.model.__class__

    def record_for_sub_model(self, sub_model_name: str) -> "RegionalDataRecord":
        """
        Return a new record holding only the sub model `sub_model_name`.

        All identification fields are copied; `data_model_name` is set to the
        sub model name and `model` to the matching attribute of this record's
        model.

        Raises:
            ValueError: if `sub_model_name` is not listed as a sub model in
                the legends of this record's model class.
        """
        if sub_model_name not in self.datgarden_model_class.legends().sub_model_names:
            raise ValueError(f"Sub model `{sub_model_name}` not found in {self.datgarden_model_class}")
        # The model is replaced below, so skip serialising it: dumping the
        # full nested model only to overwrite it is wasted work.
        child_record = self.model_dump(exclude={"model"})
        child_record["data_model_name"] = sub_model_name
        child_record["model"] = getattr(self.model, sub_model_name)
        return RegionalDataRecord(**child_record)
|
76
|
+
|
77
|
+
|
78
|
+
class TheDataGardenRegionalDataModel:
    """
    Model to hold response data from The Data Garden API Regional Data endpoint.

    The model holds a collection of regional_data records, each containing a
    regional data model for the region for a specific set of sources, periods
    and period types.

    The data can be converted to Polars and Pandas dataframes by the following
    methods:
    - to_polars(model_convertors: dict | None = None) -> pl.DataFrame
        The model_convertors dict will be used to convert specific model fields
        to dataframe columns.
    - full_model_to_polars() -> pl.DataFrame

    For pandas dataframes you can use the same methods:
    - to_pandas(model_convertors: dict | None = None) -> pd.DataFrame
    - full_model_to_pandas() -> pd.DataFrame
    """
|
96
|
+
|
97
|
+
def __init__(
    self,
    api: "BaseApi",
    model_name: str,
    region_url: str,
    meta_data: BaseModel,
    is_sub_model: bool = False,
    model: type[DataGardenSubModel] | None = None,
):
    """
    Create an empty container for regional data of one data model.

    Args:
        api: API object used to retrieve the data.
        model_name: name of the data model; when `model` is not given it is
            upper-cased and looked up on `DatagardenModels`.
        region_url: URL part of the region the data is requested for.
        meta_data: meta data object for the region.
        is_sub_model: True when the object holds data of a sub model.
        model: model class to use instead of the lookup by `model_name`.
    """
    # How and where to request data.
    self._api: BaseApi = api
    self._region_url: str = region_url
    self.meta_data: BaseModel = meta_data

    # Which model this container represents.
    self._model_name: str = model_name
    self._is_sub_model: bool = is_sub_model
    self._model: DataGardenModel = model if model else getattr(DatagardenModels, model_name.upper())

    # Retrieved records and the requests that produced them.
    self._request_params_hashes: list[str] = []
    self._data_records: dict[str, RegionalDataRecord] = {}
|
114
|
+
|
115
|
+
def __str__(self):
    """Return the model name and the number of records currently held."""
    record_count = len(self._data_records)
    return f"TheDataGardenRegionalDataModel : {self._model_name} : (count={record_count})"
|
117
|
+
|
118
|
+
def __repr__(self):
    """Use the same text as ``__str__`` for the interactive representation."""
    return str(self)
|
120
|
+
|
121
|
+
def __call__(self, **kwargs) -> "TheDataGardenRegionalDataModel":
    """
    Retrieve regional data for the given request parameters and add it to
    the records already held by this object.

    A request with parameters that were already retrieved successfully is
    not sent again.

    Args:
        **kwargs: request parameters passed on to the API.

    Returns:
        This object, so the call can be chained or assigned.

    Raises:
        TypeError: when called on a sub model data object.
    """
    if self._is_sub_model:
        raise TypeError(
            "Sub model data cannot be used to retrieve data. "
            "Use the main model data object to make calls to The-Datagarden API"
        )

    request_hash = self.request_hash(**kwargs)
    if request_hash in self._request_params_hashes:
        return self

    regional_data = self.regional_paginated_data_from_api(**kwargs)
    if regional_data:
        self.set_items(regional_data)
        # Only remember requests that produced a response. An empty result
        # means the API call returned nothing usable; remembering it would
        # prevent the same request from ever being retried.
        self._request_params_hashes.append(request_hash)
    return self
|
134
|
+
|
135
|
+
def __getattr__(self, attribute: str) -> "TheDataGardenRegionalDataModel":
    """
    Return a sub model data object for the sub model named `attribute`.

    The returned object holds, for every record in this object, the record
    narrowed down to that sub model. It is flagged as sub model data.

    Raises:
        AttributeError: for private and dunder names (names starting with
            an underscore).
        ValueError: when `attribute` is not a sub model of this model.
    """
    # __getattr__ is only called when normal lookup fails. Names starting with
    # an underscore are never sub models; they are protocol probes (copy,
    # pickle, hasattr, ...) or instance attributes that are not set yet.
    # Those callers expect AttributeError. Without this guard a missing
    # `_model` would also make the lookup below recurse endlessly.
    if attribute.startswith("_"):
        raise AttributeError(f"{type(self).__name__!r} object has no attribute {attribute!r}")

    if attribute not in self._model.legends().sub_model_names:
        raise ValueError(f"Attribute {attribute} is not a sub-model of {self._model_name}")
    sub_model = getattr(self._model.legends(), attribute).model
    regional_data_for_attribute = TheDataGardenRegionalDataModel(
        api=self._api,
        model_name=attribute,
        region_url=self._region_url,
        meta_data=self.meta_data,
        is_sub_model=True,
        model=sub_model,
    )
    # Keep the keys of the parent records so parent and sub model records
    # can be related to each other.
    regional_data_for_attribute._data_records = {
        key: value.record_for_sub_model(attribute) for key, value in self._data_records.items()
    }
    return regional_data_for_attribute
|
151
|
+
|
152
|
+
@property
def model_attributes(self) -> list[str]:
    """Attribute names as listed by the legends of the model."""
    legends = self._model.legends()
    return legends.attributes
|
155
|
+
|
156
|
+
def model_attribute_legend(self, attribute: str) -> Legend:
    """Return the legend entry named `attribute` from the model's legends."""
    legends = self._model.legends()
    return getattr(legends, attribute)
|
158
|
+
|
159
|
+
def request_hash(self, **kwargs) -> str:
    """
    Return a key for a set of request parameters.

    The key does not depend on the order in which the parameters are given.
    """
    # Keyword names are unique, so ordering by name equals ordering the items.
    fingerprint = ",".join(f"{name}:{kwargs[name]}" for name in sorted(kwargs))
    return str(hash(fingerprint))
|
163
|
+
|
164
|
+
def _response_has_next_page(self, model_data_resp: dict) -> bool:
    """Tell whether the response's pagination info names a next page."""
    pagination = model_data_resp.get("pagination", None)
    return bool(pagination) and pagination.get("next_page", None) is not None
|
169
|
+
|
170
|
+
def _next_page_pagination(self, model_data_resp: dict) -> dict | None:
|
171
|
+
pagination = model_data_resp.pop("pagination", None)
|
172
|
+
if not pagination:
|
173
|
+
return None
|
174
|
+
next_page = pagination.get("next_page", None)
|
175
|
+
if not next_page:
|
176
|
+
return None
|
177
|
+
return {"page": next_page}
|
178
|
+
|
179
|
+
def regional_paginated_data_from_api(self, **kwargs) -> dict:
    """
    Retrieve regional data and follow the pagination until all pages are in.

    The "data_by_region" entries of the follow-up pages are appended to those
    of the first page.

    Args:
        **kwargs: request parameters passed on to the API.

    Returns:
        The first response extended with the data of all further pages, or an
        empty dict when the first request returned nothing.
    """
    model_data_resp = self.regional_data_from_api(**kwargs)
    if not model_data_resp:
        return {}

    while self._response_has_next_page(model_data_resp):
        next_page_pagination = self._next_page_pagination(model_data_resp)
        if not next_page_pagination:
            break
        next_page_resp = self.regional_data_from_api(pagination=next_page_pagination, **kwargs)
        if not next_page_resp:
            break

        # Use .get so that a page without "data_by_region" or "pagination"
        # simply ends the loop instead of raising KeyError; this matches how
        # the pagination helpers read the response.
        model_data_resp.setdefault("data_by_region", []).extend(next_page_resp.get("data_by_region") or [])
        next_pagination = next_page_resp.get("pagination")
        if next_pagination is not None:
            model_data_resp["pagination"] = next_pagination

    return model_data_resp
|
192
|
+
|
193
|
+
def regional_data_from_api(self, **kwargs) -> dict:
    """
    POST a single regional data request and return the decoded response.

    Returns an empty dict when the API call yields a falsy response.
    """
    request_payload = {"model": self._model_name, **kwargs}
    endpoint = self._region_url + "regional_data/"
    model_data_resp = self._api.retrieve_from_api(
        url_extension=endpoint,
        method="POST",
        payload=request_payload,
    )
    return model_data_resp.json() if model_data_resp else {}
|
202
|
+
|
203
|
+
def set_items(self, data: dict):
    """
    Convert an API response into records and add them to this object.

    For every region in ``data["data_by_region"]`` one record is created per
    entry in its "data_objects_for_region" list. Records are stored under
    their record hash, so a record with the same hash replaces the earlier
    one. Afterwards the model name is set to the data model name of the first
    stored record.

    Raises:
        ValueError: when the first stored record has no data model name.
    """
    # Region fields copied one-to-one from the response ("name" and
    # "region_level" are handled separately below).
    copied_region_keys = (
        "region_type",
        "un_region_code",
        "iso_cc_2",
        "local_region_code",
        "local_region_code_type",
        "parent_region_code",
        "parent_region_code_type",
        "parent_region_type",
    )
    for regional_data in data["data_by_region"]:
        base_items = {"name": regional_data.get("region_name", None)}
        for region_key in copied_region_keys:
            base_items[region_key] = regional_data.get(region_key, None)
        base_items["region_level"] = regional_data.get("region_level", 0)

        # Build all records of the region first, then store them.
        data_records = [
            RegionalDataRecord(**base_items, **self._record_items(data_obj))
            for data_obj in regional_data["data_objects_for_region"]
        ]
        for data_record in data_records:
            self._data_records[data_record.record_hash()] = data_record

    if not self._data_records:
        return
    first_record = next(iter(self._data_records.values()))
    if not first_record.data_model_name:
        raise ValueError("data_model_name is required")
    self._model_name = first_record.data_model_name
|
231
|
+
|
232
|
+
def _record_items(self, data: dict):
    """
    Build the source, period and model part of a record from one data object
    of the API response.

    The "data_type" entry names the model; it is upper-cased and looked up on
    `DatagardenModels`, and the model is instantiated with the "data" entry.

    Raises:
        ValueError: when "data_type" is missing or empty, or when no model
            with that name exists on `DatagardenModels`.
    """
    model_name = data.get("data_type", None)
    if not model_name:
        raise ValueError("data_model_name is required")

    # Default to None so that an unknown model name reaches the ValueError
    # below instead of escaping as an AttributeError from getattr.
    model = getattr(DatagardenModels, model_name.upper(), None)
    if not model:
        raise ValueError(f"model {model_name} not found in DatagardenModels")
    return {
        "source_name": data.get("source_name", None),
        "period": data.get("period", None),
        "period_type": data.get("period_type", None),
        "data_model_name": model_name,
        # `or {}` also covers an explicit "data": None in the response.
        "model": model(**(data.get("data") or {})),
    }
|
247
|
+
|
248
|
+
def to_polars(self, model_convertors: dict | None = None) -> pl.DataFrame:
    """
    Convert the data to a polars dataframe using a dictionary of model attributes to convert to columns

    Args:
        model_convertors: maps a new column name to a dotted attribute path on
            the record's model (e.g. ``{"pop_count": "population.total"}``).
            When the path contains ``__flatten`` the value found at the path
            is flattened into one column per nested key instead of being
            stored under the new column name.

    Returns:
        A dataframe with one row per record: the record's identification
        fields plus the requested columns.
    """
    model_convertors = model_convertors or {}
    converted_records = []
    for record in self._data_records.values():
        model = record.model
        # The model itself is not a column; only selected attributes are added.
        record_dict = record.model_dump(exclude={"model"})

        for new_col, model_attr in model_convertors.items():
            model_attr_flatten = "__flatten" in model_attr
            # Handle nested attributes using split by dots
            attrs = model_attr.replace("__flatten", "").split(".")
            value = getattr(model, attrs[0])
            for attr in attrs[1:]:
                value = getattr(value, attr, None)

            # Skip missing values and empty containers, but keep falsy
            # scalars: 0, 0.0 and False are legitimate data values.
            if value is None or (isinstance(value, (str, dict, list, tuple, set)) and not value):
                continue

            if model_attr_flatten:
                model_data = value.model_dump() if isinstance(value, BaseModel) else value
                if isinstance(model_data, dict):
                    record_dict.update(self.flatten_dict(model_data, {}))
                else:
                    # Nothing to flatten for a plain value.
                    record_dict[new_col] = model_data
            else:
                record_dict[new_col] = value
        converted_records.append(record_dict)
    return pl.from_records(converted_records)
|
276
|
+
|
277
|
+
def flatten_dict(self, dict_to_flatten: dict, flattened_dict: dict, prefix: str = "") -> dict:
    """
    Flatten a nested dict into `flattened_dict` using dotted keys.

    Nested keys are joined with a dot (``{"a": {"b": 1}}`` gives ``"a.b"``),
    optionally preceded by `prefix`. `flattened_dict` is updated in place and
    also returned. Keys are added depth first, in the order of the input.
    """
    # Each entry is (remaining items of a dict, key prefix for those items).
    pending = [(iter(dict_to_flatten.items()), prefix)]
    while pending:
        entries, current_prefix = pending[-1]
        for key, value in entries:
            dotted_key = f"{current_prefix}.{key}" if current_prefix else key
            if isinstance(value, dict):
                # Descend first; the remaining entries of this level are
                # picked up again once the nested dict is done.
                pending.append((iter(value.items()), dotted_key))
                break
            flattened_dict[dotted_key] = value
        else:
            pending.pop()

    return flattened_dict
|
286
|
+
|
287
|
+
def full_model_to_polars(self):
    """
    Convert the data to a polars dataframe, flattening all nested dictionaries
    """
    # Per record: all record fields except the model, followed by the model
    # data flattened to dotted column names.
    converted_records = [
        {
            **record.model_dump(exclude={"model"}),
            **self.flatten_dict(record.model.model_dump(), {}),
        }
        for record in self._data_records.values()
    ]
    return pl.from_records(converted_records)
|
301
|
+
|
302
|
+
def to_pandas(self, model_convertors: dict | None = None) -> pd.DataFrame:
|
303
|
+
"""
|
304
|
+
Convert the data to a pandas dataframe using a dictionary of model attributes to convert to columns
|
305
|
+
"""
|
306
|
+
return self.to_polars(model_convertors).to_pandas()
|
307
|
+
|
308
|
+
def full_model_to_pandas(self) -> pd.DataFrame:
    """
    Convert the data to a pandas dataframe, flattening all nested dictionaries
    """
    # Built via polars and then converted.
    polars_frame = self.full_model_to_polars()
    return polars_frame.to_pandas()
|
313
|
+
|
314
|
+
def __iter__(self):
    """Iterate over the records held in _data_records"""
    records = self._data_records.values()
    return iter(records)
|
317
|
+
|
318
|
+
def __len__(self):
    """Number of records currently held"""
    record_count = len(self._data_records)
    return record_count
|
321
|
+
|
322
|
+
@property
def data_records(self) -> list[RegionalDataRecord]:
    """All records currently held, as a new list."""
    return [*self._data_records.values()]
|
325
|
+
|
326
|
+
def regional_availability(self) -> dict[str, RegionalDataStats | None]:
    """
    Return the data statistics of this model per region type.

    Every region type listed in the meta data gets an entry; region types
    without statistics for this model map to None.
    """
    availability_per_region = self.meta_data.statistics_for_data_model(model_name=self._model_name)
    return {
        region_type: (
            availability_per_region[region_type] if region_type in availability_per_region.keys() else None
        )
        for region_type in self.meta_data.region_types
    }
|
335
|
+
|
336
|
+
@property
def regions_with_model_data(self) -> list[str]:
    """Region types for which statistics are available for this model."""
    # Compute the availability once; it was previously recomputed for every
    # region type while filtering.
    availability = self.regional_availability()
    return [region for region, stats in availability.items() if stats]
|
339
|
+
|
340
|
+
def show_summary(self):
    """
    Outputs a summary of the model's structure (submodels and attributes)
    """
    legends = self._model.legends()
    legends.show_summary()
|
345
|
+
|
346
|
+
def summary(self) -> dict:
    """
    Return the model's structure (submodels and attributes)
    """
    legends = self._model.legends()
    return legends.summary()
|
351
|
+
|
352
|
+
def describe(
    self,
    include_attributes: list[str] | None = None,
    exclude_attributes: list[str] | None = None,
    filter_expr: pl.Expr | None = None,
) -> pl.DataFrame:
    """
    Return descriptive statistics for the columns of the flattened data.

    Args:
        include_attributes: columns to describe. When given, only these
            columns are used and `exclude_attributes` is ignored.
        exclude_attributes: columns to leave out in addition to
            DEFAULT_COLUMNS_TO_EXCLUDE.
        filter_expr: optional polars expression to filter the rows first.

    Raises:
        ValueError: when no data has been loaded yet.
    """
    frame = self.full_model_to_polars()
    if frame.is_empty():
        raise ValueError("No data loaded for this model. Data is needed to describe the model.")

    if filter_expr is not None:
        frame = frame.filter(filter_expr)

    if include_attributes:
        selected_columns = include_attributes
    else:
        excluded = [*DEFAULT_COLUMNS_TO_EXCLUDE, *(exclude_attributes or [])]
        selected_columns = [column for column in frame.columns if column not in excluded]
    return frame.select(selected_columns).describe()
|
372
|
+
|
373
|
+
def data_availability_per_attribute(
    self, include_attributes: list[str] | None = None, filter_expr: pl.Expr | None = None
):
    """
    Return the describe() statistics per attribute, with the values of the
    "count" and "null_count" rows cast to integers.

    Args:
        include_attributes: columns to report on; when omitted all columns
            except DEFAULT_COLUMNS_TO_EXCLUDE are used.
        filter_expr: optional polars expression to filter the rows first.
    """
    column_selection = (
        {"include_attributes": include_attributes}
        if include_attributes
        else {"exclude_attributes": DEFAULT_COLUMNS_TO_EXCLUDE}
    )
    describe_df = self.describe(filter_expr=filter_expr, **column_selection)

    is_count_row = pl.col("statistic").is_in(["count", "null_count"])
    value_columns = pl.all().exclude("statistic")
    return describe_df.with_columns(
        pl.when(is_count_row).then(value_columns.cast(pl.Int64)).otherwise(value_columns)
    )
|
390
|
+
|
391
|
+
def show_data_availability_per_attribute(
    self, include_attributes: list[str] | None = None, filter_expr: pl.Expr | None = None
):
    """
    Print, per attribute, the number of values and how many of them hold data.

    Each line shows the attribute name, the total number of values (count plus
    null count), the number of non-null values and that share as a percentage.

    Args:
        include_attributes: columns to report on; when omitted the default
            selection of data_availability_per_attribute is used.
        filter_expr: optional polars expression to filter the rows first.
    """
    describe_df = self.data_availability_per_attribute(include_attributes, filter_expr)
    statistic_names = describe_df.get_column("statistic")
    stats_by_column = {
        column: dict(zip(statistic_names, describe_df.get_column(column), strict=True))
        for column in describe_df.columns
        if column != "statistic"
    }

    # default=0 keeps this a no-op (instead of a ValueError) without columns.
    max_column_length = max((len(column) for column in stats_by_column), default=0)

    for column, stats in stats_by_column.items():
        with_data = int(stats["count"])
        total = int(stats["count"] + stats["null_count"])
        # No rows at all (e.g. everything filtered out): report 0%.
        percentage = with_data / total * 100 if total else 0.0
        # The padding is computed outside the f-string: reusing the enclosing
        # quote character inside an f-string expression is only valid syntax
        # from Python 3.12 on.
        padding = " " * (max_column_length + 3 - len(column))
        print(f"{column} : {padding}{total} of which with data: {with_data} ({percentage:.0f}%)")
|
@@ -0,0 +1 @@
|
|
1
|
+
# Keep in sync with the distribution version (Version: 1.2.3 in METADATA).
__version__ = "1.2.3"
|
@@ -0,0 +1,253 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: the-datagarden
|
3
|
+
Version: 1.2.3
|
4
|
+
Summary: Public data made easy.
|
5
|
+
Author-email: Maarten de Ruyter <info@the-datagarden.io>
|
6
|
+
License: MIT
|
7
|
+
Project-URL: Read the Docs, https://dg-the-datagarden.readthedocs.io/en/stable/
|
8
|
+
Project-URL: The-DataGarden, https://www.the-datagarden.io/
|
9
|
+
Project-URL: API documentation, https://www.the-datagarden.io/api-docs
|
10
|
+
Project-URL: Source, https://github.com/the-datagarden/the-datagarden
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
17
|
+
Classifier: Operating System :: OS Independent
|
18
|
+
Classifier: Development Status :: 4 - Beta
|
19
|
+
Classifier: Intended Audience :: Developers
|
20
|
+
Classifier: Intended Audience :: Financial and Insurance Industry
|
21
|
+
Classifier: Intended Audience :: Science/Research
|
22
|
+
Classifier: Intended Audience :: Healthcare Industry
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: GIS
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
26
|
+
Classifier: Topic :: Scientific/Engineering :: Visualization
|
27
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
28
|
+
Classifier: Topic :: Utilities
|
29
|
+
Requires-Python: >=3.10
|
30
|
+
Description-Content-Type: text/x-rst
|
31
|
+
Requires-Dist: click>=8.1.7
|
32
|
+
Requires-Dist: pandas>=2.2.3
|
33
|
+
Requires-Dist: polars>=1.15.0
|
34
|
+
Requires-Dist: pydantic>=2.9.2
|
35
|
+
Requires-Dist: pyjwt>=2.10.0
|
36
|
+
Requires-Dist: python-decouple>=3.8
|
37
|
+
Requires-Dist: requests>=2.32.3
|
38
|
+
Requires-Dist: the-datagarden-models>=1.6.3
|
39
|
+
|
40
|
+
==================
|
41
|
+
the-datagarden SDK
|
42
|
+
==================
|
43
|
+
|
44
|
+
The-datagarden package is a Python SDK built on top of The-DataGarden API. The SDK provides easy access to continent and country regional hierarchies,
|
45
|
+
as well as public data related to these regions. All data from The-DataGarden API is stored in normalized datamodels like ``Demographics``, ``Health``
|
46
|
+
or ``Economics``. This allows you as a data professional to create value from this data without having to worry about the (varying) data structure and
|
47
|
+
APIs of the sources.
|
48
|
+
|
49
|
+
Additionally, The-DataGarden API also provides country and regional GeoJSONs. The SDK makes it easy for you to combine public data and your own data and merge them into
|
50
|
+
GeoJSON feature collections, making geographic visualisation easy.
|
51
|
+
|
52
|
+
|
53
|
+
The-DataGarden SDK main use case
|
54
|
+
--------------------------------
|
55
|
+
The SDK is designed to make it easy to access and work with the DataGarden data. After initializing the SDK you simply
|
56
|
+
retrieve data for a specific continent, country or subregion by calling the appropriate datamodel.
|
57
|
+
|
58
|
+
.. code-block:: python
|
59
|
+
|
60
|
+
# initialize a country object and retrieve the demographics attribute
|
61
|
+
>>> nl = the_datagarden_api.netherlands # or nl = the_datagarden_api.NL
|
62
|
+
>>> nl_demographics = nl.demographics()
|
63
|
+
TheDataGardenRegionalDataModel : Demographics : (count=5)
|
64
|
+
|
65
|
+
In this example the `nl_demographics` object holds 5 records. Each record contains demographic data for the Netherlands for a specific
|
66
|
+
period and period type combination. The data can be made accessible in a tabular format by converting the object to a pandas or polars dataframe.
|
67
|
+
|
68
|
+
.. code-block:: python
|
69
|
+
|
70
|
+
# convert demographics data to a polars dataframe
|
71
|
+
>>> dataframe = nl_demographics.full_model_to_polars()
|
72
|
+
>>> print(dataframe["period", "source_name", "data_model_name", "population.total", "population.total_male", "population.total_female"])
|
73
|
+
|
74
|
+
.. code-block:: text
|
75
|
+
|
76
|
+
┌───────────────┬────────────┬─────────────────┬──────────────────┬───────────────────────┬─────────────────────────┐
|
77
|
+
│ period ┆ source_name┆ data_model_name ┆ population.total ┆ population.total_male ┆ population.total_female │
|
78
|
+
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
79
|
+
│ str ┆ str ┆ str ┆ f64 ┆ f64 ┆ f64 │
|
80
|
+
╞═══════════════╪════════════╪═════════════════╪══════════════════╪═══════════════════════╪═════════════════════════╡
|
81
|
+
│ 2022-01-01T0Z ┆ Eurostat ┆ Demographics ┆ null ┆ 8.745468e6 ┆ 8.845204e6 │
|
82
|
+
│ 2022-01-01T0Z ┆ United Nat ┆ Demographics ┆ 1.7789347e7 ┆ 8.890013e6 ┆ 9.014408e6 │
|
83
|
+
│ 2023-01-01T0Z ┆ Eurostat ┆ Demographics ┆ null ┆ 8.850309e6 ┆ 8.960982e6 │
|
84
|
+
│ 2023-01-01T0Z ┆ United Nat ┆ Demographics ┆ 1.8019495e7 ┆ 8.986255e6 ┆ 9.106269e6 │
|
85
|
+
│ 2024-01-01T0Z ┆ United Nat ┆ Demographics ┆ 1.8165554e7 ┆ 9.055978e6 ┆ 9.172763e6 │
|
86
|
+
└───────────────┴────────────┴─────────────────┴──────────────────┴───────────────────────┴─────────────────────────┘
|
87
|
+
|
88
|
+
The demographics model holds lots of submodels and attributes. In this example only a limited number of attributes are listed
|
89
|
+
as the dataframe is way too large to display. For all models and their details see the model data documentation at
|
90
|
+
`The DataGarden Data Documentation <https://www.the-datagarden.io/data-docs>`_.
|
91
|
+
|
92
|
+
Getting started with the SDK
|
93
|
+
----------------------------
|
94
|
+
You can start using the SDK out of the box by simply instantiating the TheDataGardenAPI object:
|
95
|
+
|
96
|
+
.. code-block:: python
|
97
|
+
|
98
|
+
# Starting with the datagarden API
|
99
|
+
>>> from the_datagarden import TheDataGardenAPI
|
100
|
+
>>> the_datagarden_api = TheDataGardenAPI()
|
101
|
+
|
102
|
+
.. code-block:: console
|
103
|
+
|
104
|
+
Welcome to The Data Garden API.
|
105
|
+
|
106
|
+
You can start using the API with an account from The-Datagarden.io.
|
107
|
+
Please provide your credentials or create a new account.
|
108
|
+
Check www.the-datagarden.io for more information.
|
109
|
+
|
110
|
+
Do you want to (1) create a new account or (2) provide existing credentials? Enter 1 or 2:
|
111
|
+
|
112
|
+
|
113
|
+
Simply select 1 to create a new account.
|
114
|
+
|
115
|
+
.. code-block:: console
|
116
|
+
|
117
|
+
Enrolling in The Data Garden API...
|
118
|
+
|
119
|
+
Enter your email: <your-email>
|
120
|
+
Enter your password: <your-password>
|
121
|
+
Confirm your password: <your-password>
|
122
|
+
|
123
|
+
Successfully enrolled in The Data Garden API.
|
124
|
+
Initializing : TheDatagardenEnvironment
|
125
|
+
At: https://www.the-datagarden.io/
|
126
|
+
|
127
|
+
If you already have an account at the-datagarden.io, you can either select option 2 or directly provide your credentials
|
128
|
+
when creating the TheDataGardenAPI object:
|
129
|
+
|
130
|
+
.. code-block:: python
|
131
|
+
|
132
|
+
# Retrieve a country object from the datagarden API
|
133
|
+
>>> from the_datagarden import TheDataGardenAPI
|
134
|
+
>>> the_datagarden_api = TheDataGardenAPI(email='your-email@example.com', password='your-password')
|
135
|
+
|
136
|
+
.. code-block:: console
|
137
|
+
|
138
|
+
Initializing : TheDatagardenEnvironment
|
139
|
+
At: https://www.the-datagarden.io/
|
140
|
+
|
141
|
+
A 3rd way to initialize the SDK is adding your credentials to the ``.env`` file.
|
142
|
+
|
143
|
+
|
144
|
+
Getting your first data from The-DataGarden API
|
145
|
+
-----------------------------------------------
|
146
|
+
Now that you have initialized the SDK, you can start retrieving data from The-DataGarden API.
|
147
|
+
For example, you can retrieve the demographics data for the Netherlands:
|
148
|
+
|
149
|
+
.. code-block:: python
|
150
|
+
|
151
|
+
# initialize a country object and retrieve the demographics attribute
|
152
|
+
>>> nl = the_datagarden_api.netherlands
|
153
|
+
>>> nl_demographics = nl.demographics
|
154
|
+
TheDataGardenRegionalDataModel : Demographics : (count=0)
|
155
|
+
|
156
|
+
This creates a country object ``nl`` for the Netherlands, which serves as your gateway to all Netherlands-related
|
157
|
+
data and its regional subdivisions.
|
158
|
+
|
159
|
+
In this getting started section we will work with a demographics object retrieved from the `nl` country object.
|
160
|
+
As shown in the example, the ``nl_demographics`` object can be retrieved by simply calling the `demographics`
|
161
|
+
attribute on the `nl` country object.
|
162
|
+
|
163
|
+
The `nl_demographics` object starts empty (count=0). To populate it with data, simply call it as a function:
|
164
|
+
|
165
|
+
.. code-block:: python
|
166
|
+
|
167
|
+
# Calling the demographics attribute will populate it with demographics data from the API
|
168
|
+
>>> nl_demographics()
|
169
|
+
>>> nl_demographics
|
170
|
+
TheDataGardenRegionalDataModel : Demographics : (count=5)
|
171
|
+
|
172
|
+
When called without parameters, the API returns data using default settings, which in this case yields 5 records.
|
173
|
+
You can customize your data retrieval by specifying parameters such as time periods, period types, and data sources.
|
174
|
+
|
175
|
+
|
176
|
+
The DataGarden Regional DataModel
|
177
|
+
---------------------------------
|
178
|
+
When you retrieve data like ``nl_demographics``, you're working with a ``TheDataGardenRegionalDataModel`` object. This object acts as a container that holds:
|
179
|
+
|
180
|
+
1. A collection of ``TheDataGardenRegionalDataRecord`` objects
|
181
|
+
2. Metadata about the records (region, time period, data source, etc.)
|
182
|
+
|
183
|
+
You can easily transform this data into pandas or polars DataFrames for analysis. Here's an example showing population data for the Netherlands:
|
184
|
+
|
185
|
+
.. code-block:: python
|
186
|
+
|
187
|
+
>>> nl = the_datagarden_api.netherlands
|
188
|
+
>>> nl_demographics = nl.demographics(period_from="2010-01-01", source="united nations")
|
189
|
+
>>> # Convert to DataFrame, mapping 'population.total' to column name 'pop_count'
|
190
|
+
>>> df = nl_demographics.to_polars({"pop_count": "population.total"}) # or to_pandas(...)
|
191
|
+
>>> df["name", "source_name", "period", "data_model_name", "pop_count"] # for readability only a limited number of columns are displayed
|
192
|
+
┌─────────────┬────────────────┬─────────────────┬─────────────────┬─────────────┐
|
193
|
+
│ name ┆ source_name ┆ period ┆ data_model_name ┆ pop_count │
|
194
|
+
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
195
|
+
│ str ┆ str ┆ str ┆ str ┆ f64 │
|
196
|
+
╞═════════════╪════════════════╪═════════════════╪═════════════════╪═════════════╡
|
197
|
+
│ Netherlands ┆ United Nations ┆ 2010-01-010:00Z ┆ Demographics ┆ 1.6729801e7 │
|
198
|
+
│ Netherlands ┆ United Nations ┆ 2011-01-010:00Z ┆ Demographics ┆ 1.6812669e7 │
|
199
|
+
│ … ┆ … ┆ … ┆ … ┆ … │
|
200
|
+
│ Netherlands ┆ United Nations ┆ 2023-01-010:00Z ┆ Demographics ┆ 1.8019495e7 │
|
201
|
+
│ Netherlands ┆ United Nations ┆ 2024-01-010:00Z ┆ Demographics ┆ 1.8165554e7 │
|
202
|
+
└─────────────┴────────────────┴─────────────────┴─────────────────┴─────────────┘
|
203
|
+
|
204
|
+
Each time you call the ``nl_demographics`` object with different parameters,
|
205
|
+
new demographic records for the specified subregions, periods, and/or sources are added to the existing ``nl_demographics`` object.
|
206
|
+
After you've gathered all the records you need, you can convert the entire collection into a dataframe for further analysis.
|
207
|
+
|
208
|
+
|
209
|
+
Retrieving GeoJSON data
|
210
|
+
-----------------------
|
211
|
+
Retrieving the GeoJSON for the Netherlands and its provinces is straightforward as well:
|
212
|
+
|
213
|
+
.. code-block:: python
|
214
|
+
|
215
|
+
>>> nl_geojson = nl.geojsons()
|
216
|
+
>>> nl_geojson
|
217
|
+
TheDataGardenRegionGeoJSONModel : GeoJSON : (count=1)
|
218
|
+
>>> nl_geojson(region_level=2) # Retrieve GeoJSON for 2nd regional level (provinces)
|
219
|
+
TheDataGardenRegionGeoJSONModel : GeoJSON : (count=13) # 12 provinces + 1 country
|
220
|
+
>>> df = nl_geojson.to_polars()
|
221
|
+
>>> df["name", "region_type", "local_region_code", "region_level", "feature"]
|
222
|
+
┌───────────────┬─────────────┬───────────────┬──────────────┬────────────────────────┐
|
223
|
+
│ name ┆ region_type ┆ local_region_c┆ region_level ┆ feature │
|
224
|
+
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
|
225
|
+
│ str ┆ str ┆ str ┆ i64 ┆ struct[3] │
|
226
|
+
╞═══════════════╪═════════════╪═══════════════╪══════════════╪════════════════════════╡
|
227
|
+
│ Netherlands ┆ country ┆ 528 ┆ 0 ┆ {"Feature",{"Netherland│
|
228
|
+
│ Drenthe ┆ province ┆ NL13 ┆ 2 ┆ {"Feature",{"Drenthe",2│
|
229
|
+
│ … ┆ … ┆ … ┆ … ┆ … │
|
230
|
+
│ Zuid-Holland ┆ province ┆ NL33 ┆ 2 ┆ {"Feature",{"Zuid-Holla│
|
231
|
+
└───────────────┴─────────────┴───────────────┴──────────────┴────────────────────────┘
|
232
|
+
|
233
|
+
For readability, only a limited number of dataframe columns are displayed in the output.
|
234
|
+
Attributes in both the demographics and geojson dataframes are available to connect the geojson to
|
235
|
+
the demographics data. This allows you to quickly make data sets that contain both demographics and geojson data
|
236
|
+
for further analysis or visualisation in map applications.
|
237
|
+
|
238
|
+
|
239
|
+
Read more
|
240
|
+
---------
|
241
|
+
|
242
|
+
* `The DataGarden Website <https://www.the-datagarden.io>`_
|
243
|
+
* `API Documentation <https://www.the-datagarden.io/api-docs>`_
|
244
|
+
* `The Datagarden Models <https://www.the-datagarden.io/data-docs>`_
|
245
|
+
* `GitHub Repository <https://github.com/MaartendeRuyter/dg-the-datagarden>`_
|
246
|
+
|
247
|
+
|
248
|
+
Access to The DataGarden API
|
249
|
+
----------------------------
|
250
|
+
To use the DataGarden SDK, you need access to The DataGarden API. Simply register for free at https://www.the-datagarden.io
|
251
|
+
and you will have an initial free access account to the API with access to country and continent data.
|
252
|
+
|
253
|
+
Visit https://www.the-datagarden.io to register for free.
|