notoecd 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
notoecd/__init__.py ADDED
@@ -0,0 +1,7 @@
1
# Package version — keep in sync with the wheel METADATA (currently 0.1.1).
__version__ = "0.1.1"

from .calls import get_df
from .structure import get_structure
from .datasets import search_keywords

# Names exported via `from notoecd import *` — the package's public API.
__all__ = ["get_df", "get_structure", "search_keywords"]
notoecd/calls.py ADDED
@@ -0,0 +1,57 @@
1
+ import pandas as pd
2
+ from typing import Union
3
+ from functools import lru_cache
4
+ from .structure import get_structure
5
+
6
@lru_cache(maxsize=256)
def _fetch_df(url: str) -> pd.DataFrame:
    """Download a CSV from *url* and memoize the parsed frame by URL.

    The cache entry is the frame itself, so callers that may mutate the
    result should copy it first (get_df does exactly that).
    """
    # A browser-like User-Agent keeps the OECD endpoint from rejecting the
    # request; storage_options forwards it as an HTTP header via fsspec.
    request_options = {"User-Agent": "Mozilla/5.0"}
    return pd.read_csv(url, storage_options=request_options)
9
+
10
+
11
+ def _clean(s: str) -> str:
12
+ return str(s).strip().lower()
13
+
14
+
15
def _build_filter_expression(
    agencyID: str,
    dataflowID: str,
    filters: dict,
) -> str:
    """Translate a {dimension: value(s)} mapping into an SDMX key string.

    Dimensions are emitted in the order defined by the dataflow's structure
    (its table of contents). A dimension missing from *filters* becomes an
    empty segment (match-all); a string value is treated as a single-element
    list, and multiple values are joined with '+'. The final dot-separated
    expression is upper-cased, as the OECD endpoint expects.
    """
    structure = get_structure(agencyID, dataflowID)
    # Case-insensitive lookup: normalise the caller's dimension names once.
    normalized = {_clean(key): value for key, value in filters.items()}

    segments = []
    for title in structure.toc.title:
        key = _clean(title)
        if key not in normalized:
            segments.append("")
            continue
        value = normalized[key]
        candidates = [value] if isinstance(value, str) else value
        segments.append("+".join(_clean(item) for item in candidates))

    return ".".join(segments).upper()
36
+
37
+
38
def get_df(
    agencyID: str,
    dataflowID: str,
    filters: Union[str, dict],
    version: str = "",
) -> pd.DataFrame:
    """Download an OECD dataset subset as a pandas DataFrame.

    Args:
        agencyID: SDMX agency identifier (e.g. 'OECD.CFE.EDS').
        dataflowID: Dataflow identifier within that agency.
        filters: Either a ready-made SDMX key string, or a dict of
            {dimension: value or list-of-values} translated via the
            dataflow's structure.
        version: Dataflow version; empty string lets the server choose.

    Returns:
        pd.DataFrame: the requested observations in CSV layout.
    """
    if isinstance(filters, dict):
        key = _build_filter_expression(agencyID, dataflowID, filters)
    else:
        key = _clean(filters).upper()

    url = (
        f"https://sdmx.oecd.org/public/rest/data/"
        f"{agencyID},{dataflowID},{version}/{key}"
        "?dimensionAtObservation=AllDimensions&format=csvfile"
    )

    # _fetch_df memoizes by URL; return a copy so callers may mutate freely.
    return _fetch_df(url).copy()
notoecd/datasets.py ADDED
@@ -0,0 +1,115 @@
1
+ import re
2
+ import html
3
+ import requests
4
+ import unicodedata
5
+ import pandas as pd
6
+ from typing import Union, List
7
+ import xml.etree.ElementTree as ET
8
+
9
# Endpoint listing every public OECD dataflow (SDMX 2.1 structure query).
url = "https://sdmx.oecd.org/public/rest/dataflow/all"

# XML namespace map used by all findall/find calls on the SDMX-ML document.
NS = {
    "message": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message",
    "structure": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure",
    "common": "http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common",
    "xml": "http://www.w3.org/XML/1998/namespace",
}
17
+
18
+ _ws_re = re.compile(r"\s+")
19
+ _tag_re = re.compile(r"<[^>]+>")
20
+
21
+ def _clean_text(s: str | None) -> str | None:
22
+ if s is None: return None
23
+ s = html.unescape(s)
24
+ s = _tag_re.sub("", s)
25
+ s = _ws_re.sub(" ", s).strip()
26
+ return s or None
27
+
28
# Request an SDMX-ML 2.1 structure document explicitly.
headers = {
    "Accept": "application/vnd.sdmx.structure+xml;version=2.1"
}

# NOTE(review): this HTTP request runs at import time, so importing the
# package requires network access and adds startup latency; consider
# deferring the fetch behind a cached loader in a future release.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()
root = ET.fromstring(r.content)

# Flatten every <structure:Dataflow> element into a row of plain values.
rows = []
for df in root.findall(".//structure:Dataflow", NS):
    dataflow_id = df.attrib.get("id")
    agency_id = df.attrib.get("agencyID")

    # English name/description; either element may be absent.
    name_elem = df.find("common:Name[@xml:lang='en']", NS)
    desc_elem = df.find("common:Description[@xml:lang='en']", NS)

    # itertext() concatenates text across nested child markup.
    name = _clean_text("".join(name_elem.itertext())) if name_elem is not None else None
    desc_raw = "".join(desc_elem.itertext()) if desc_elem is not None else None
    desc = _clean_text(desc_raw)

    rows.append(
        {
            "dataflowID": dataflow_id,
            "agencyID": agency_id,
            "name": name,
            "description": desc,
        }
    )

# Catalogue of all public OECD dataflows; searched by search_keywords().
datasets = pd.DataFrame(rows)
58
+
59
def search_keywords(keywords: Union[str, List[str]]) -> pd.DataFrame:
    """
    Searches OECD datasets for a set of keywords.

    Matching is case- and accent-insensitive, over the concatenation of each
    dataset's name and description. Keywords combine with OR; results are
    ordered by how many keywords matched.

    Args:
        keywords (str | list[str]): Single keyword or list of keywords. Acts as OR.

    Returns:
        pd.DataFrame: Matching rows with columns agencyID, dataflowID,
            name, description.

    Raises:
        TypeError: if *keywords* is neither a string nor a list.
        ValueError: if no non-empty string keyword remains after cleaning.
    """
    # Accept a bare string as a one-element list; reject other types.
    if isinstance(keywords, str):
        keywords = [keywords]
    elif not isinstance(keywords, list):
        raise TypeError("keywords must be a string or list of strings")

    # Keep only non-empty string keywords.
    keywords = [kw for kw in keywords if isinstance(kw, str) and kw.strip()]
    if not keywords:
        raise ValueError("No valid keywords provided.")

    def _strip_accents(value: str) -> str:
        # Decompose to NFKD and drop combining marks (é -> e, ü -> u).
        decomposed = unicodedata.normalize("NFKD", value)
        return "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    # One normalized haystack per row: lower-cased, accent-free text.
    haystack = (
        datasets["name"].fillna("").astype(str)
        + " "
        + datasets["description"].fillna("").astype(str)
    )
    haystack = haystack.fillna("").astype(str).str.lower().map(_strip_accents)

    # Normalize keywords the same way so comparisons line up.
    needles = [_strip_accents(kw.lower()) for kw in keywords]

    # OR across keywords; score counts how many keywords hit each row.
    matched = pd.Series(False, index=datasets.index)
    hits = pd.Series(0, index=datasets.index, dtype="int64")

    for needle in needles:
        if not needle:
            continue
        # Plain substring search — keywords are not regexes.
        found = haystack.str.contains(needle, na=False, regex=False)
        matched |= found
        hits = hits.add(found.astype("int8"), fill_value=0)

    # Keep matches only, most relevant first; drop the helper score column.
    result = datasets.loc[matched].copy()
    result["_match_score"] = hits.loc[matched]
    result = result.sort_values("_match_score", ascending=False)

    return result[['agencyID', 'dataflowID', 'name', 'description']]
notoecd/structure.py ADDED
@@ -0,0 +1,144 @@
1
+ import requests
2
+ import pandas as pd
3
+ from functools import lru_cache
4
+ import xml.etree.ElementTree as ET
5
+
6
class Structure():
    """Parsed SDMX structure metadata for one OECD dataflow.

    Downloads the dataflow's structure document (with all references) at
    construction time and exposes:

    - concepts: dimension names, codelist links, and code -> label maps
    - values: allowed codes per dimension (from the content constraint)
    - toc: DataFrame of dimensions in call order with their allowed values
    - params: the dimension ids that appear in ``values``
    """

    def __init__(self, agencyID:str, dataflowID:str):
        # XML namespace map for the SDMX-ML 2.1 document.
        self.ns = {
            'message': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message',
            'common': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common',
            'structure': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure',
            'xml': 'http://www.w3.org/XML/1998/namespace'
        }

        # references=all pulls concepts, codelists and constraints in one call.
        # NOTE(review): no timeout and no raise_for_status here — a failed
        # request surfaces later as an XML parse error; consider hardening.
        url = f"https://sdmx.oecd.org/public/rest/dataflow/{agencyID}/{dataflowID}/?references=all"
        response = requests.get(url)
        tree = ET.ElementTree(ET.fromstring(response.content))
        self.root = tree.getroot()

        # Derived views over the parsed document, built once up front.
        self.concepts = self.build_concepts_dict()
        self.values = self.build_values_dict()
        self.toc = self.build_toc()
        self.params = list(self.values.keys())

    def build_concepts_dict(self) -> dict:
        """Collect concept names, codelist links and code labels.

        Returns a dict with:
        - 'DIMENSIONS': concept id -> English name
        - 'CODELISTS': concept id -> codelist id (leading 'CL_' stripped)
        - one key per codelist (id minus its first 3 chars) mapping
          code -> English label
        """
        # Extract dimensions from the Concepts section
        concepts = {'DIMENSIONS':{}, 'CODELISTS':{}}
        conceptScheme_section = self.root.findall('.//message:Structures/structure:Concepts/structure:ConceptScheme', self.ns)
        if conceptScheme_section:
            for concept_scheme in conceptScheme_section:
                for concept in concept_scheme.findall('structure:Concept', self.ns):
                    concept_id = concept.get('id')
                    name_elem = concept.find('common:Name[@{http://www.w3.org/XML/1998/namespace}lang="en"]', self.ns)
                    if concept_id and name_elem is not None:
                        concepts['DIMENSIONS'][concept_id] = name_elem.text
                    # A CoreRepresentation/Enumeration/Ref points at the
                    # codelist that enumerates this concept's values.
                    for core_representation in concept.findall('structure:CoreRepresentation', self.ns):
                        for enumeration in core_representation.findall('structure:Enumeration', self.ns):
                            # Ref elements carry no namespace prefix.
                            ref_elem = enumeration.find('Ref')
                            if ref_elem is not None:
                                codelist_id = ref_elem.get('id')
                                if concept_id and codelist_id:
                                    # Strip the conventional 'CL_' prefix so the
                                    # key matches the codelist keys stored below.
                                    concepts['CODELISTS'][concept_id] = (
                                        codelist_id.split("CL_")[1] if codelist_id.startswith("CL_")
                                        else codelist_id
                                    )

        # Extract codes from the Codelists section
        codes_section = self.root.findall('.//message:Structures/structure:Codelists/structure:Codelist', self.ns)
        if codes_section:
            for code_scheme in codes_section:
                # Drop the first three characters (the 'CL_' prefix).
                dimension = code_scheme.get('id')[3:]
                concepts[dimension] = {}
                for code in code_scheme.findall('structure:Code', self.ns):
                    code_id = code.get('id')
                    name_elem = code.find('common:Name[@{http://www.w3.org/XML/1998/namespace}lang="en"]', self.ns)
                    if code_id and name_elem is not None:
                        concepts[dimension][code_id] = name_elem.text
        return concepts

    def build_values_dict(self) -> dict:
        """Map dimension id -> list of allowed codes.

        Reads the CubeRegion of the first ContentConstraint only; returns an
        empty dict when the document carries no constraints.
        """
        # Extract values from Constraints section
        values = {}
        constraints_section = self.root.findall('.//message:Structures/structure:Constraints/structure:ContentConstraint', self.ns)
        if constraints_section:
            # Only the first constraint is inspected.
            constraint = constraints_section[0]
            cube_region = constraint.find('structure:CubeRegion', self.ns)
            if cube_region is not None:
                for key_value in cube_region.findall('common:KeyValue', self.ns):
                    key = key_value.get('id')
                    value_elems = key_value.findall('common:Value', self.ns)
                    if key is not None and value_elems:
                        # Skip Value elements with empty text.
                        values[key] = [value_elem.text for value_elem in value_elems if value_elem.text]
        return values

    def build_toc(self) -> pd.DataFrame:
        """Build the table of contents: one row per call dimension.

        Columns: 'id' (1-based position), 'title' (dimension id), and
        'values' (allowed codes, [] when unconstrained), ordered by position.
        """
        # Builds a DataFrame containing the call dimensions and options (table of contents)
        data_structure = self.root.find('.//message:Structures/structure:DataStructures/structure:DataStructure', self.ns)
        dimension_list = data_structure.find('.//structure:DataStructureComponents/structure:DimensionList', self.ns)
        rows = []

        if dimension_list is not None:
            dimensions = dimension_list.findall('.//structure:Dimension', self.ns)

            # Sort dimensions by their 'position' to ensure correct order
            sorted_dimensions = sorted(dimensions, key=lambda dim: int(dim.get('position')))

            for dimension in sorted_dimensions:
                dimension_id = dimension.get('id')
                position = dimension.get('position')

                # Get the corresponding values for the dimension (if present in the 'values' dictionary)
                dimension_values = self.values.get(dimension_id, [])

                rows.append({
                    'id': int(position),
                    'title': dimension_id,
                    'values': dimension_values
                })

        df = pd.DataFrame(rows)
        df = df.sort_values(by='id').reset_index(drop=True)
        return df

    def explain_vals(self, dimension:str) -> dict:
        """Return {code: label} for the allowed values of *dimension*.

        Resolves the dimension through the CODELISTS mapping; when that
        yields nothing usable, falls back to a heuristic that matches the
        dimension's allowed codes against every known codelist. Prints a
        warning for codes that have no label.

        Raises:
            KeyError: if *dimension* has no entry in ``self.values``.
        """
        # Explains the parameters of a given dimension
        dimension = dimension.strip().upper()
        concepts = self.concepts
        values = self.values[dimension]
        codelists = concepts['CODELISTS']

        def fallback(dimension, concepts):
            # Heuristic: pick the first codelist whose codes form a superset
            # of each dimension's allowed values.
            # NOTE(review): the [0] below raises IndexError when no codelist
            # matches — confirm whether that can occur for real dataflows.
            # Look for subset of keys
            toc = self.toc
            concept_mapping = {}
            for title in toc.title:
                vals = set(toc[toc['title'] == title].values[0][2])
                concept_mapping[title] = [i for i in concepts.keys() if vals.issubset(set(concepts[i].keys()))][0]
            concept_mapping['FREQ'] = 'FREQ'
            if (dimension not in concepts) and (dimension in concept_mapping):
                dimension = concept_mapping[dimension]
            concepts = concepts[dimension]
            return {i: concepts[i] for i in values}

        if dimension not in codelists:
            if dimension in concepts:
                return concepts[dimension]
            return fallback(dimension, concepts)

        dim_codelist = codelists[dimension]
        explanation = concepts[dim_codelist]
        # Keep only the codes the codelist can actually explain.
        clean_values = [i for i in values if i in explanation]

        if len(clean_values) == 0:
            return fallback(dimension, concepts)

        unclean_values = [i for i in values if i not in explanation]
        if len(unclean_values) > 0:
            print(f"Could not find explanation for {unclean_values}")

        return {i:explanation[i] for i in clean_values}
141
+
142
@lru_cache(maxsize=128)
def get_structure(agencyID: str, dataflowID: str) -> Structure:
    """Return a cached Structure for the given agency/dataflow pair.

    The cache avoids re-downloading and re-parsing the SDMX structure
    document on repeated calls; all callers share the same instance.
    """
    structure = Structure(agencyID, dataflowID)
    return structure
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: notoecd
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: Library for interacting with the OECD Data Explorer through Python
5
5
  Author-email: Daniel Vegara Balsa <daniel.vegarabalsa@oecd.org>
6
6
  License-Expression: MIT
@@ -14,7 +14,7 @@ Requires-Dist: requests>=2.31
14
14
 
15
15
  ⚠️ **Unofficial package, not endorsed by the OECD.**
16
16
 
17
- A lightweight Python interface for exploring OECD SDMX structures and downloading OECD regional datasets.
17
+ A lightweight Python interface for exploring OECD SDMX structures and downloading OECD datasets.
18
18
  The package provides utilities for:
19
19
 
20
20
  - Discovering dataset metadata
@@ -111,4 +111,3 @@ The returned object is a pandas DataFrame containing the requested subset of OEC
111
111
 
112
112
  You can see this full example as a notebook called example.ipynb.
113
113
 
114
-
@@ -0,0 +1,8 @@
1
+ notoecd/__init__.py,sha256=134t-qMBmEQN_mtpGOSfQ5zr128wWT68J0e5RIBzN0g,189
2
+ notoecd/calls.py,sha256=CDx-1wJ4myXtoihIfTvjHoXBvIwylvv7AdN_UL5gnF4,1492
3
+ notoecd/datasets.py,sha256=nxLNP0L28mXTKYpdR6BQN5Tk6CKoQS7dygm8twnIVSg,3845
4
+ notoecd/structure.py,sha256=sq6HrjNLfK-UWr9Cuqxun_DhHLPdegX7j7pKYcEYcJM,7169
5
+ notoecd-0.1.1.dist-info/METADATA,sha256=YphZ-il5WvyxK0u1w3cI_vdH2uUdCLaWd3ms981SovI,3180
6
+ notoecd-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
+ notoecd-0.1.1.dist-info/top_level.txt,sha256=GrcbH10OAguGh5dkpzst216N_C-NtZ-QF1nlXiUpeLs,8
8
+ notoecd-0.1.1.dist-info/RECORD,,
@@ -0,0 +1 @@
1
+ notoecd
@@ -1,4 +0,0 @@
1
- notoecd-0.1.0.dist-info/METADATA,sha256=UcQeYIOzhpdpZE5HcFVm_7DCjxdJmLkcCdbSDsrPTHY,3191
2
- notoecd-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
3
- notoecd-0.1.0.dist-info/top_level.txt,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
4
- notoecd-0.1.0.dist-info/RECORD,,
@@ -1 +0,0 @@
1
-