notoecd 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- notoecd/__init__.py +7 -0
- notoecd/calls.py +57 -0
- notoecd/datasets.py +122 -0
- notoecd/structure.py +144 -0
- {notoecd-0.1.0.dist-info → notoecd-0.1.2.dist-info}/METADATA +4 -5
- notoecd-0.1.2.dist-info/RECORD +8 -0
- notoecd-0.1.2.dist-info/top_level.txt +1 -0
- notoecd-0.1.0.dist-info/RECORD +0 -4
- notoecd-0.1.0.dist-info/top_level.txt +0 -1
- {notoecd-0.1.0.dist-info → notoecd-0.1.2.dist-info}/WHEEL +0 -0
notoecd/__init__.py
ADDED
notoecd/calls.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from typing import Union
|
|
3
|
+
from functools import lru_cache
|
|
4
|
+
from .structure import get_structure
|
|
5
|
+
|
|
6
|
+
@lru_cache(maxsize=256)
def _fetch_df(url: str) -> pd.DataFrame:
    """Download a CSV from *url* and memoize the resulting DataFrame.

    The cache is keyed on the full URL string. Callers must not mutate
    the returned frame directly; ``get_df`` hands out a copy for that
    reason.
    """
    storage = {"User-Agent": "Mozilla/5.0"}
    return pd.read_csv(url, storage_options=storage)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _clean(s: str) -> str:
|
|
12
|
+
return str(s).strip().lower()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _build_filter_expression(
    agencyID: str,
    dataflowID: str,
    filters: dict,
) -> str:
    """Translate a {dimension: value(s)} mapping into an SDMX key string.

    The dataflow's structure is consulted to order the segments; a
    dimension absent from *filters* contributes an empty segment, which
    the SDMX API treats as a wildcard.
    """
    structure = get_structure(agencyID, dataflowID)
    normalized = {_clean(key): value for key, value in filters.items()}

    segments = []
    for title in structure.toc.title:
        key = _clean(title)
        if key not in normalized:
            segments.append("")
            continue
        selection = normalized[key]
        # A single string is treated as a one-element selection.
        values = [selection] if isinstance(selection, str) else selection
        segments.append("+".join(_clean(v) for v in values))

    return ".".join(segments).upper()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def get_df(
    agencyID: str,
    dataflowID: str,
    filters: Union[str, dict],
    version: str = "",
) -> pd.DataFrame:
    """Download a subset of an OECD dataflow as a pandas DataFrame.

    Args:
        agencyID: Agency that owns the dataflow.
        dataflowID: Identifier of the dataflow.
        filters: Either a ready-made SDMX filter expression string or a
            mapping of dimension name -> value(s) used to build one.
        version: Dataflow version; empty string lets the server pick.

    Returns:
        pd.DataFrame: A private copy of the (cached) downloaded data.
    """
    if isinstance(filters, dict):
        expression = _build_filter_expression(agencyID, dataflowID, filters)
    else:
        expression = _clean(filters).upper()

    query = (
        "https://sdmx.oecd.org/public/rest/data/"
        f"{agencyID},{dataflowID},{version}/{expression}"
        "?dimensionAtObservation=AllDimensions&format=csvfile"
    )

    # Copy so callers cannot mutate the lru_cache'd frame in place.
    return _fetch_df(query).copy()
|
notoecd/datasets.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import html
|
|
3
|
+
import requests
|
|
4
|
+
import unicodedata
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from typing import Union, List
|
|
7
|
+
import xml.etree.ElementTree as ET
|
|
8
|
+
|
|
9
|
+
# Endpoint listing every publicly available OECD dataflow (SDMX-ML 2.1).
url = "https://sdmx.oecd.org/public/rest/dataflow/all"

# XML namespace map used for all ElementTree lookups on the SDMX response.
NS = {
    "message": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message",
    "structure": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure",
    "common": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common",
    "xml": "http://www.w3.org/XML/1998/namespace",
}
|
|
17
|
+
|
|
18
|
+
_ws_re = re.compile(r"\s+")
|
|
19
|
+
_tag_re = re.compile(r"<[^>]+>")
|
|
20
|
+
|
|
21
|
+
def _clean(s: str | None) -> str | None:
|
|
22
|
+
if s is None: return None
|
|
23
|
+
s = html.unescape(s)
|
|
24
|
+
s = _tag_re.sub("", s)
|
|
25
|
+
s = _ws_re.sub(" ", s).strip()
|
|
26
|
+
return s or None
|
|
27
|
+
|
|
28
|
+
# Module-level cache: populated by the first call to _load_datasets().
_datasets: pd.DataFrame | None = None


def _load_datasets() -> pd.DataFrame:
    """Fetch the OECD dataflow catalogue once and keep it in memory.

    Returns:
        pd.DataFrame: Columns dataflowID, agencyID, name, description.
    """
    global _datasets
    if _datasets is not None:
        return _datasets

    headers = {"Accept": "application/vnd.sdmx.structure+xml;version=2.1"}
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    root = ET.fromstring(response.content)

    records = []
    for flow in root.findall(".//structure:Dataflow", NS):
        name_el = flow.find("common:Name[@xml:lang='en']", NS)
        desc_el = flow.find("common:Description[@xml:lang='en']", NS)
        records.append(
            {
                "dataflowID": flow.attrib.get("id"),
                "agencyID": flow.attrib.get("agencyID"),
                # itertext() flattens nested markup inside the element.
                "name": _clean("".join(name_el.itertext())) if name_el is not None else None,
                "description": _clean("".join(desc_el.itertext())) if desc_el is not None else None,
            }
        )

    _datasets = pd.DataFrame(records)
    return _datasets
|
|
66
|
+
|
|
67
|
+
def search_keywords(keywords: Union[str, List[str]]) -> pd.DataFrame:
    """Search OECD dataset names/descriptions for keywords (OR semantics).

    Args:
        keywords (str | list[str]): One keyword or several; a row matches
            if any keyword occurs in its name or description.

    Returns:
        pd.DataFrame: Matching rows, most keyword hits first.

    Raises:
        TypeError: If *keywords* is neither a string nor a list.
        ValueError: If no usable keyword remains after cleaning.
    """
    datasets = _load_datasets()

    # Accept a bare string as a one-element list.
    if isinstance(keywords, str):
        keywords = [keywords]
    elif not isinstance(keywords, list):
        raise TypeError("keywords must be a string or list of strings")

    # Drop non-strings and blank entries.
    keywords = [k for k in keywords if isinstance(k, str) and k.strip()]
    if not keywords:
        raise ValueError("No valid keywords provided.")

    def _fold(value: str) -> str:
        # Lowercase and strip diacritics so accented text still matches.
        decomposed = unicodedata.normalize("NFKD", value.lower())
        return "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    # One searchable, accent-folded string per dataset row.
    haystack = (
        datasets["name"].fillna("").astype(str)
        + " "
        + datasets["description"].fillna("").astype(str)
    )
    haystack_norm = haystack.map(_fold)

    folded_keywords = [_fold(k) for k in keywords]

    # Count matching keywords per row (plain substring search, no regex).
    hits = pd.Series(0, index=datasets.index, dtype="int64")
    for kw in folded_keywords:
        if not kw:
            continue
        matches = haystack_norm.str.contains(kw, na=False, regex=False)
        hits = hits + matches.astype("int64")

    matched = hits > 0
    result = datasets.loc[matched].copy()
    result["_match_score"] = hits.loc[matched]
    result = result.sort_values("_match_score", ascending=False)

    return result[['agencyID', 'dataflowID', 'name', 'description']]
|
notoecd/structure.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from functools import lru_cache
|
|
4
|
+
import xml.etree.ElementTree as ET
|
|
5
|
+
|
|
6
|
+
class Structure():
    """Parsed SDMX structure metadata for one OECD dataflow.

    Downloads the dataflow definition (with all references) on
    construction and exposes:

    - ``concepts``: dimension names plus per-codelist code descriptions,
    - ``values``: allowed values per dimension (from content constraints),
    - ``toc``: a DataFrame of dimensions in positional order,
    - ``params``: the list of constrained dimension ids.
    """

    def __init__(self, agencyID: str, dataflowID: str):
        # Namespace map for ElementTree lookups on the SDMX-ML 2.1 payload.
        self.ns = {
            'message': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message',
            'common': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common',
            'structure': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure',
            'xml': 'http://www.w3.org/XML/1998/namespace'
        }

        url = f"https://sdmx.oecd.org/public/rest/dataflow/{agencyID}/{dataflowID}/?references=all"
        # timeout added (matching datasets.py) so a stalled server cannot
        # hang the constructor indefinitely.
        response = requests.get(url, timeout=30)
        tree = ET.ElementTree(ET.fromstring(response.content))
        self.root = tree.getroot()

        self.concepts = self.build_concepts_dict()
        self.values = self.build_values_dict()
        self.toc = self.build_toc()
        self.params = list(self.values.keys())

    def build_concepts_dict(self) -> dict:
        """Map dimension ids to names and codelist codes to descriptions.

        Returns:
            dict: {'DIMENSIONS': {dim_id: name},
                   'CODELISTS': {dim_id: codelist_key},
                   <codelist_key>: {code_id: code_name}, ...}
            where codelist_key is the codelist id with its "CL_" prefix
            removed, matching the keys produced from the Codelists section.
        """
        concepts = {'DIMENSIONS':{}, 'CODELISTS':{}}

        # Dimension names come from the ConceptScheme section.
        conceptScheme_section = self.root.findall('.//message:Structures/structure:Concepts/structure:ConceptScheme', self.ns)
        if conceptScheme_section:
            for concept_scheme in conceptScheme_section:
                for concept in concept_scheme.findall('structure:Concept', self.ns):
                    concept_id = concept.get('id')
                    name_elem = concept.find('common:Name[@{http://www.w3.org/XML/1998/namespace}lang="en"]', self.ns)
                    if concept_id and name_elem is not None:
                        concepts['DIMENSIONS'][concept_id] = name_elem.text
                    for core_representation in concept.findall('structure:CoreRepresentation', self.ns):
                        for enumeration in core_representation.findall('structure:Enumeration', self.ns):
                            # Ref carries no namespace in the SDMX payload.
                            ref_elem = enumeration.find('Ref')
                            if ref_elem is not None:
                                codelist_id = ref_elem.get('id')
                                if concept_id and codelist_id:
                                    # Strip only the leading "CL_": the previous
                                    # split("CL_")[1] truncated ids containing
                                    # "CL_" more than once.
                                    concepts['CODELISTS'][concept_id] = (
                                        codelist_id[3:] if codelist_id.startswith("CL_")
                                        else codelist_id
                                    )

        # Code descriptions come from the Codelists section; keys are the
        # codelist ids with their 3-character "CL_" prefix removed.
        codes_section = self.root.findall('.//message:Structures/structure:Codelists/structure:Codelist', self.ns)
        if codes_section:
            for code_scheme in codes_section:
                dimension = code_scheme.get('id')[3:]
                concepts[dimension] = {}
                for code in code_scheme.findall('structure:Code', self.ns):
                    code_id = code.get('id')
                    name_elem = code.find('common:Name[@{http://www.w3.org/XML/1998/namespace}lang="en"]', self.ns)
                    if code_id and name_elem is not None:
                        concepts[dimension][code_id] = name_elem.text
        return concepts

    def build_values_dict(self) -> dict:
        """Extract allowed values per dimension from the first ContentConstraint.

        Returns:
            dict: {dimension_id: [allowed value, ...]}; empty if the
            payload carries no constraints.
        """
        values = {}
        constraints_section = self.root.findall('.//message:Structures/structure:Constraints/structure:ContentConstraint', self.ns)
        if constraints_section:
            # Only the first constraint is used; OECD dataflows publish one.
            constraint = constraints_section[0]
            cube_region = constraint.find('structure:CubeRegion', self.ns)
            if cube_region is not None:
                for key_value in cube_region.findall('common:KeyValue', self.ns):
                    key = key_value.get('id')
                    value_elems = key_value.findall('common:Value', self.ns)
                    if key is not None and value_elems:
                        values[key] = [value_elem.text for value_elem in value_elems if value_elem.text]
        return values

    def build_toc(self) -> pd.DataFrame:
        """Build the table of contents: one row per dimension.

        Returns:
            pd.DataFrame: Columns 'id' (position), 'title' (dimension id)
            and 'values' (allowed values, [] when unconstrained), sorted
            by position.
        """
        data_structure = self.root.find('.//message:Structures/structure:DataStructures/structure:DataStructure', self.ns)
        dimension_list = data_structure.find('.//structure:DataStructureComponents/structure:DimensionList', self.ns)
        rows = []

        if dimension_list is not None:
            dimensions = dimension_list.findall('.//structure:Dimension', self.ns)

            # Sort dimensions by their 'position' to ensure correct order.
            sorted_dimensions = sorted(dimensions, key=lambda dim: int(dim.get('position')))

            for dimension in sorted_dimensions:
                dimension_id = dimension.get('id')
                position = dimension.get('position')

                # Allowed values for the dimension, if constrained.
                dimension_values = self.values.get(dimension_id, [])

                rows.append({
                    'id': int(position),
                    'title': dimension_id,
                    'values': dimension_values
                })

        if not rows:
            # No dimensions: return an empty frame with the expected columns
            # instead of letting sort_values fail on a missing 'id' column.
            return pd.DataFrame(columns=['id', 'title', 'values'])

        df = pd.DataFrame(rows)
        df = df.sort_values(by='id').reset_index(drop=True)
        return df

    def explain_vals(self, dimension: str) -> dict:
        """Explain the allowed values of *dimension* as {code: description}.

        Raises:
            KeyError: If *dimension* has no entry in ``self.values``.
        """
        dimension = dimension.strip().upper()
        concepts = self.concepts
        values = self.values[dimension]
        codelists = concepts['CODELISTS']

        def fallback(dimension, concepts):
            # Heuristic: match the dimension to the codelist whose keys are
            # a superset of its allowed values.
            # NOTE(review): [0] assumes at least one codelist matches, and the
            # subset test could in principle hit the DIMENSIONS/CODELISTS
            # bookkeeping entries — confirm against real payloads.
            toc = self.toc
            concept_mapping = {}
            for title in toc.title:
                vals = set(toc[toc['title'] == title].values[0][2])
                concept_mapping[title] = [i for i in concepts.keys() if vals.issubset(set(concepts[i].keys()))][0]
            concept_mapping['FREQ'] = 'FREQ'
            if (dimension not in concepts) and (dimension in concept_mapping):
                dimension = concept_mapping[dimension]
            concepts = concepts[dimension]
            return {i: concepts[i] for i in values}

        if dimension not in codelists:
            if dimension in concepts:
                return concepts[dimension]
            return fallback(dimension, concepts)

        dim_codelist = codelists[dimension]
        explanation = concepts[dim_codelist]
        clean_values = [i for i in values if i in explanation]

        if len(clean_values) == 0:
            return fallback(dimension, concepts)

        unclean_values = [i for i in values if i not in explanation]
        if len(unclean_values) > 0:
            print(f"Could not find explanation for {unclean_values}")

        return {i:explanation[i] for i in clean_values}
|
|
141
|
+
|
|
142
|
+
@lru_cache(maxsize=128)
def get_structure(agencyID: str, dataflowID: str) -> Structure:
    """Return a cached Structure for the given agency/dataflow pair."""
    structure = Structure(agencyID, dataflowID)
    return structure
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: notoecd
|
|
3
|
-
Version: 0.1.
|
|
4
|
-
Summary:
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: Unofficial library for interacting with the OECD Data Explorer through Python.
|
|
5
5
|
Author-email: Daniel Vegara Balsa <daniel.vegarabalsa@oecd.org>
|
|
6
6
|
License-Expression: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/dani-37/notoecd
|
|
@@ -14,7 +14,8 @@ Requires-Dist: requests>=2.31
|
|
|
14
14
|
|
|
15
15
|
⚠️ **Unofficial package, not endorsed by the OECD.**
|
|
16
16
|
|
|
17
|
-
A lightweight Python interface for exploring OECD SDMX structures and downloading OECD
|
|
17
|
+
A lightweight Python interface for exploring OECD SDMX structures and downloading OECD datasets.
|
|
18
|
+
|
|
18
19
|
The package provides utilities for:
|
|
19
20
|
|
|
20
21
|
- Discovering dataset metadata
|
|
@@ -110,5 +111,3 @@ The returned object is a pandas DataFrame containing the requested subset of OEC
|
|
|
110
111
|
## Examples
|
|
111
112
|
|
|
112
113
|
You can see this full example as a notebook called example.ipynb.
|
|
113
|
-
|
|
114
|
-
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
notoecd/__init__.py,sha256=J3-DA__Z9hHzp6eOszRMFh4VE6tk1YtSNigW7el4qDM,189
|
|
2
|
+
notoecd/calls.py,sha256=CDx-1wJ4myXtoihIfTvjHoXBvIwylvv7AdN_UL5gnF4,1492
|
|
3
|
+
notoecd/datasets.py,sha256=a1L45vie6q4cjNXAued2gQ4oE4Fbpk25kdhDTexMuAI,4158
|
|
4
|
+
notoecd/structure.py,sha256=sq6HrjNLfK-UWr9Cuqxun_DhHLPdegX7j7pKYcEYcJM,7169
|
|
5
|
+
notoecd-0.1.2.dist-info/METADATA,sha256=kt6OjNXdlGpn5K-_WQm-z-GLKbbPLgF3mq0C7YSGNho,3192
|
|
6
|
+
notoecd-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
7
|
+
notoecd-0.1.2.dist-info/top_level.txt,sha256=GrcbH10OAguGh5dkpzst216N_C-NtZ-QF1nlXiUpeLs,8
|
|
8
|
+
notoecd-0.1.2.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
notoecd
|
notoecd-0.1.0.dist-info/RECORD
DELETED
|
@@ -1,4 +0,0 @@
|
|
|
1
|
-
notoecd-0.1.0.dist-info/METADATA,sha256=UcQeYIOzhpdpZE5HcFVm_7DCjxdJmLkcCdbSDsrPTHY,3191
|
|
2
|
-
notoecd-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
3
|
-
notoecd-0.1.0.dist-info/top_level.txt,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
4
|
-
notoecd-0.1.0.dist-info/RECORD,,
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
|
|
File without changes
|