notoecd 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- notoecd/__init__.py +1 -1
- notoecd/calls.py +61 -8
- notoecd/datasets.py +4 -3
- notoecd/structure.py +5 -1
- {notoecd-0.1.3.dist-info → notoecd-0.1.5.dist-info}/METADATA +19 -19
- notoecd-0.1.5.dist-info/RECORD +9 -0
- notoecd-0.1.3.dist-info/RECORD +0 -9
- {notoecd-0.1.3.dist-info → notoecd-0.1.5.dist-info}/WHEEL +0 -0
- {notoecd-0.1.3.dist-info → notoecd-0.1.5.dist-info}/licenses/LICENSE +0 -0
- {notoecd-0.1.3.dist-info → notoecd-0.1.5.dist-info}/top_level.txt +0 -0
notoecd/__init__.py
CHANGED
notoecd/calls.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
|
-
from typing import Union
|
|
3
2
|
from functools import lru_cache
|
|
3
|
+
from typing import Union, Optional
|
|
4
4
|
from .structure import get_structure
|
|
5
5
|
|
|
6
6
|
@lru_cache(maxsize=64)
|
|
@@ -12,26 +12,58 @@ def _clean(s: str) -> str:
|
|
|
12
12
|
return str(s).strip().lower()
|
|
13
13
|
|
|
14
14
|
|
|
15
|
+
def _clean_dict(d: dict) -> dict:
    """
    Return a copy of ``d`` with keys and values normalised through ``_clean``.

    Args:
        d (dict): Mapping to normalise. Values may be single items or a
            list/tuple/set of items.

    Returns:
        dict: A new dictionary. Every key is passed through ``_clean``;
            list, tuple and set values become a list of cleaned items,
            and any other value is cleaned as a single item.
    """
    def _normalise(value):
        # Collections are cleaned element-wise and always come back as a list.
        if isinstance(value, (list, tuple, set)):
            return [_clean(item) for item in value]
        return _clean(value)

    return {_clean(key): _normalise(value) for key, value in d.items()}
|
|
24
|
+
|
|
25
|
+
|
|
15
26
|
def _build_filter_expression(
    agencyID: str,
    dataflowID: str,
    filters: dict,
) -> str:
    """
    Translate a filter dictionary into an OECD SDMX filter expression.

    One slot is produced per dimension, in the order of the dataflow
    structure's ``toc.title``. Dimensions without a filter get an empty
    slot; multiple values for one dimension are joined with ``+`` and the
    slots are joined with ``.``. Keys and values are compared after
    normalisation through ``_clean``, and the final expression is
    upper-cased. Filter keys that match no dimension are ignored.

    Args:
        agencyID (str): The data provider agency identifier.
        dataflowID (str): The dataflow identifier within the agency.
        filters (dict): Dictionary with dimension names as keys and
            either codes or labels as values (a single value or a
            collection of values).

    Returns:
        str: A valid OECD SDMX filter expression.

    Raises:
        ValueError: If a value is neither a key nor a value of the
            mapping returned by ``explain_vals`` for its dimension.
    """
    structure = get_structure(agencyID, dataflowID)
    requested = _clean_dict(filters)

    segments = []
    for title in structure.toc.title:
        dimension = _clean(title)

        if dimension not in requested:
            # No filter for this dimension: keep its slot empty.
            segments.append("")
            continue

        selection = requested[dimension]
        code_to_label = _clean_dict(structure.explain_vals(dimension))
        # Reverse lookup so callers can pass a label instead of a code.
        label_to_code = {label: code for code, label in code_to_label.items()}

        if isinstance(selection, str):
            selection = [selection]
        selection = [_clean(item) for item in selection]

        codes = []
        for item in selection:
            if item in code_to_label:
                codes.append(item)
            elif item in label_to_code:
                codes.append(label_to_code[item])
            else:
                raise ValueError(f"Invalid value '{item}' for dimension '{dimension}'. ")

        segments.append("+".join(codes))

    return ".".join(segments).upper()
|
|
36
68
|
|
|
37
69
|
|
|
@@ -40,8 +72,26 @@ def get_df(
|
|
|
40
72
|
dataflowID: str,
|
|
41
73
|
filters: Union[str, dict] = "",
|
|
42
74
|
version: str = "",
|
|
75
|
+
startYear: Optional[int] = None,
|
|
76
|
+
endYear: Optional[int] = None,
|
|
43
77
|
) -> pd.DataFrame:
|
|
44
|
-
|
|
78
|
+
"""
|
|
79
|
+
Fetch data from the OECD SDMX API and return it as a pandas DataFrame.
|
|
80
|
+
|
|
81
|
+
Args:
|
|
82
|
+
agencyID (str): The data provider agency identifier.
|
|
83
|
+
dataflowID (str): The dataflow identifier within the agency.
|
|
84
|
+
filters (Union[str, dict], optional): Either a preformatted SDMX filter
|
|
85
|
+
string or a dictionary of filters.
|
|
86
|
+
version (str, optional): The dataflow version. Use an empty string for
|
|
87
|
+
the latest version.
|
|
88
|
+
startYear (int, optional): Start year (inclusive).
|
|
89
|
+
endYear (int, optional): End year (inclusive).
|
|
90
|
+
|
|
91
|
+
Returns:
|
|
92
|
+
pd.DataFrame: The resulting dataset.
|
|
93
|
+
"""
|
|
94
|
+
|
|
45
95
|
if isinstance(filters, dict):
|
|
46
96
|
filter_expression = _build_filter_expression(agencyID, dataflowID, filters)
|
|
47
97
|
else:
|
|
@@ -50,8 +100,11 @@ def get_df(
|
|
|
50
100
|
url = (
|
|
51
101
|
f"https://sdmx.oecd.org/public/rest/data/"
|
|
52
102
|
f"{agencyID},{dataflowID},{version}/{filter_expression}"
|
|
53
|
-
"?dimensionAtObservation=AllDimensions&format=csvfile"
|
|
103
|
+
f"?dimensionAtObservation=AllDimensions&format=csvfile"
|
|
54
104
|
)
|
|
55
105
|
|
|
106
|
+
if startYear is not None: url += f"&startPeriod={startYear}"
|
|
107
|
+
if endYear is not None: url += f"&endPeriod={endYear}"
|
|
108
|
+
|
|
56
109
|
base_df = _fetch_df(url)
|
|
57
110
|
return base_df.copy()
|
notoecd/datasets.py
CHANGED
|
@@ -68,7 +68,7 @@ def search_keywords(*keywords: str) -> pd.DataFrame:
|
|
|
68
68
|
Searches OECD datasets for a set of keywords.
|
|
69
69
|
|
|
70
70
|
Args:
|
|
71
|
-
keywords (str
|
|
71
|
+
*keywords (str): One or more keywords. Acts as OR.
|
|
72
72
|
|
|
73
73
|
Returns:
|
|
74
74
|
pd.DataFrame: Matching rows.
|
|
@@ -89,13 +89,13 @@ def search_keywords(*keywords: str) -> pd.DataFrame:
|
|
|
89
89
|
)
|
|
90
90
|
)
|
|
91
91
|
|
|
92
|
-
# Combined normalized text for each row
|
|
93
92
|
text = (
|
|
94
93
|
datasets["name"].fillna("").astype(str)
|
|
95
94
|
+ " "
|
|
96
95
|
+ datasets["description"].fillna("").astype(str)
|
|
97
96
|
)
|
|
98
97
|
text_norm = _normalize_series(text)
|
|
98
|
+
name_norm = _normalize_series(datasets["name"])
|
|
99
99
|
|
|
100
100
|
def _normalize_kw(kw: str) -> str:
|
|
101
101
|
kw = unicodedata.normalize("NFKD", kw.lower())
|
|
@@ -108,8 +108,9 @@ def search_keywords(*keywords: str) -> pd.DataFrame:
|
|
|
108
108
|
|
|
109
109
|
for kw in norm_keywords:
|
|
110
110
|
m = text_norm.str.contains(kw, na=False, regex=False)
|
|
111
|
+
mt = name_norm.str.contains(kw, na=False, regex=False)
|
|
111
112
|
overall_mask |= m
|
|
112
|
-
score = score.add(m.astype("int8"), fill_value=0)
|
|
113
|
+
score = score.add(m.astype("int8"), fill_value=0) + mt.astype("int8")
|
|
113
114
|
|
|
114
115
|
result = datasets.loc[overall_mask].copy()
|
|
115
116
|
result["_match_score"] = score.loc[overall_mask]
|
notoecd/structure.py
CHANGED
|
@@ -138,7 +138,11 @@ class Structure():
|
|
|
138
138
|
print(f"Could not find explanation for {unclean_values}")
|
|
139
139
|
|
|
140
140
|
return {i:explanation[i] for i in clean_values}
|
|
141
|
+
|
|
142
|
+
def explain_columns(self, df: pd.DataFrame) -> dict:
    """
    Look up the DataFrame's columns in this structure's dimension concepts.

    Args:
        df (pd.DataFrame): DataFrame whose column names are looked up.

    Returns:
        dict: For every column of ``df`` that is a key of
            ``self.concepts['DIMENSIONS']``, the column name mapped to its
            entry there, in column order. Other columns are omitted.
    """
    dimensions = self.concepts['DIMENSIONS']
    described = {}
    for column in df.columns:
        if column in dimensions:
            described[column] = dimensions[column]
    return described
|
|
141
145
|
|
|
142
|
-
@lru_cache(maxsize=
|
|
146
|
+
@lru_cache(maxsize=64)
def get_structure(agencyID: str, dataflowID: str) -> Structure:
    """
    Return the ``Structure`` for a dataflow, reusing cached instances.

    Results are memoised with ``lru_cache`` (up to 64 distinct
    ``(agencyID, dataflowID)`` pairs), so repeated calls with the same
    arguments return the same ``Structure`` object.

    Args:
        agencyID (str): The data provider agency identifier.
        dataflowID (str): The dataflow identifier within the agency.

    Returns:
        Structure: The structure built from ``agencyID`` and ``dataflowID``.
    """
    structure = Structure(agencyID, dataflowID)
    return structure
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: notoecd
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.5
|
|
4
4
|
Summary: Unofficial library for interacting with the OECD Data Explorer through Python.
|
|
5
5
|
Author-email: Daniel Vegara Balsa <daniel.vegarabalsa@oecd.org>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -16,16 +16,16 @@ Dynamic: license-file
|
|
|
16
16
|
|
|
17
17
|
⚠️ **Unofficial package, not endorsed by the OECD.**
|
|
18
18
|
|
|
19
|
-
A lightweight Python interface for exploring OECD SDMX structures and downloading OECD datasets.
|
|
19
|
+
A lightweight Python interface for exploring OECD SDMX structures and downloading OECD datasets.
|
|
20
20
|
|
|
21
21
|
The package provides utilities for:
|
|
22
22
|
|
|
23
|
-
- Discovering dataset metadata
|
|
24
|
-
- Searching for relevant datasets using keyword matching
|
|
25
|
-
- Exploring the structure and code lists of a dataset
|
|
26
|
-
- Fetching filtered SDMX data directly into a pandas DataFrame
|
|
23
|
+
- Discovering dataset metadata
|
|
24
|
+
- Searching for relevant datasets using keyword matching
|
|
25
|
+
- Exploring the structure and code lists of a dataset
|
|
26
|
+
- Fetching filtered SDMX data directly into a pandas DataFrame
|
|
27
27
|
|
|
28
|
-
|
|
28
|
+
---
|
|
29
29
|
|
|
30
30
|
## Installation
|
|
31
31
|
|
|
@@ -33,7 +33,7 @@ You can install the package by running:
|
|
|
33
33
|
|
|
34
34
|
pip install notoecd
|
|
35
35
|
|
|
36
|
-
|
|
36
|
+
---
|
|
37
37
|
|
|
38
38
|
## Quick Start
|
|
39
39
|
|
|
@@ -45,30 +45,30 @@ The main functions in this module are:
|
|
|
45
45
|
get_structure(agencyID, dataflowID) -> Structure
|
|
46
46
|
get_df(agencyID, dataflowID, filters) -> pd.DataFrame
|
|
47
47
|
|
|
48
|
-
|
|
48
|
+
---
|
|
49
49
|
|
|
50
50
|
## Searching for datasets
|
|
51
51
|
|
|
52
52
|
`search_keywords` performs:
|
|
53
53
|
|
|
54
|
-
- Normalized text matching
|
|
55
|
-
- Accent-insensitive search
|
|
56
|
-
- Multi-keyword OR matching
|
|
57
|
-
- Ranking by number of matched keywords
|
|
54
|
+
- Normalized text matching
|
|
55
|
+
- Accent-insensitive search
|
|
56
|
+
- Multi-keyword OR matching
|
|
57
|
+
- Ranking by number of matched keywords
|
|
58
58
|
|
|
59
59
|
Example:
|
|
60
60
|
|
|
61
|
-
hits = notoecd.search_keywords(
|
|
61
|
+
hits = notoecd.search_keywords('gross domestic product', 'tl2', 'tl3')
|
|
62
62
|
|
|
63
63
|
This returns datasets that mention GDP and regional levels (TL2/TL3). It gives their name, description, and identifiers (agencyID and dataflowID), which we will need for the next step.
|
|
64
64
|
|
|
65
|
-
|
|
65
|
+
---
|
|
66
66
|
|
|
67
67
|
## Inspecting dataset structure
|
|
68
68
|
|
|
69
69
|
Once a dataset is identified, load its SDMX structure:
|
|
70
70
|
|
|
71
|
-
|
|
71
|
+
# Gross domestic product - Regions
|
|
72
72
|
agencyID = 'OECD.CFE.EDS'
|
|
73
73
|
dataflowID = 'DSD_REG_ECO@DF_GDP'
|
|
74
74
|
|
|
@@ -87,7 +87,7 @@ This shows all filters and their available values.
|
|
|
87
87
|
|
|
88
88
|
This shows the available measures and units used in the dataset.
|
|
89
89
|
|
|
90
|
-
|
|
90
|
+
---
|
|
91
91
|
|
|
92
92
|
## Filtering and downloading data
|
|
93
93
|
|
|
@@ -103,12 +103,12 @@ Keys correspond to SDMX dimensions, values are strings or lists (for multiple va
|
|
|
103
103
|
|
|
104
104
|
Fetch the filtered dataset:
|
|
105
105
|
|
|
106
|
-
df = notoecd.get_df(
|
|
106
|
+
df = notoecd.get_df(agencyID, dataflowID, filters)
|
|
107
107
|
df.head()
|
|
108
108
|
|
|
109
109
|
The returned object is a pandas DataFrame containing the requested subset of OECD SDMX data.
|
|
110
110
|
|
|
111
|
-
|
|
111
|
+
---
|
|
112
112
|
|
|
113
113
|
## Examples
|
|
114
114
|
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
notoecd/__init__.py,sha256=v397Mb7CzA-dmcZYLK6AiEesjCf3ugEv0keEVFh8bz4,189
|
|
2
|
+
notoecd/calls.py,sha256=wy7ELlMLl6X5bEN7V4Pn-AcV8YYqAUPWzL-9uT-1NzM,3400
|
|
3
|
+
notoecd/datasets.py,sha256=BTDgdhOK6tJrU0fp_7GZQOoTpOf4CRQrqDVkXcvpAcU,3818
|
|
4
|
+
notoecd/structure.py,sha256=ur8kkdCL2zRjCVrw1grtyCRNZ-aqHKRgv9X_a_6qtDQ,7349
|
|
5
|
+
notoecd-0.1.5.dist-info/licenses/LICENSE,sha256=jb9xgeCKfW-VCXFQtYmiM_SZ9tC2zPGtOIVsE5G17W8,1076
|
|
6
|
+
notoecd-0.1.5.dist-info/METADATA,sha256=hnnWK8pgdyajFl71f3hLsS3-QP_vYdapjYnQK-E9YNM,2755
|
|
7
|
+
notoecd-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
8
|
+
notoecd-0.1.5.dist-info/top_level.txt,sha256=GrcbH10OAguGh5dkpzst216N_C-NtZ-QF1nlXiUpeLs,8
|
|
9
|
+
notoecd-0.1.5.dist-info/RECORD,,
|
notoecd-0.1.3.dist-info/RECORD
DELETED
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
notoecd/__init__.py,sha256=gW8v_eB09ROOISMPTDD8pwSg0xtvNRZ-06lD2Q76Xb8,189
|
|
2
|
-
notoecd/calls.py,sha256=SFM4kerc-K43Yo6oDBCsnvCIpN2Bg0-sHKpRfAujS-o,1496
|
|
3
|
-
notoecd/datasets.py,sha256=c8iz2HzWyCGGQINNnzlHG-kJMqsDKFbDObvK11QZU0Y,3751
|
|
4
|
-
notoecd/structure.py,sha256=sq6HrjNLfK-UWr9Cuqxun_DhHLPdegX7j7pKYcEYcJM,7169
|
|
5
|
-
notoecd-0.1.3.dist-info/licenses/LICENSE,sha256=jb9xgeCKfW-VCXFQtYmiM_SZ9tC2zPGtOIVsE5G17W8,1076
|
|
6
|
-
notoecd-0.1.3.dist-info/METADATA,sha256=oHUyrIqCgzELJHZPrJ3cQKvpwWbR25d9YItSy5X8k6c,3123
|
|
7
|
-
notoecd-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
8
|
-
notoecd-0.1.3.dist-info/top_level.txt,sha256=GrcbH10OAguGh5dkpzst216N_C-NtZ-QF1nlXiUpeLs,8
|
|
9
|
-
notoecd-0.1.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|