notoecd 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- notoecd/__init__.py +1 -1
- notoecd/calls.py +2 -2
- notoecd/datasets.py +50 -47
- {notoecd-0.1.1.dist-info → notoecd-0.1.3.dist-info}/METADATA +115 -113
- notoecd-0.1.3.dist-info/RECORD +9 -0
- notoecd-0.1.3.dist-info/licenses/LICENSE +21 -0
- notoecd-0.1.1.dist-info/RECORD +0 -8
- {notoecd-0.1.1.dist-info → notoecd-0.1.3.dist-info}/WHEEL +0 -0
- {notoecd-0.1.1.dist-info → notoecd-0.1.3.dist-info}/top_level.txt +0 -0
notoecd/__init__.py
CHANGED
notoecd/calls.py
CHANGED
|
@@ -3,7 +3,7 @@ from typing import Union
|
|
|
3
3
|
from functools import lru_cache
|
|
4
4
|
from .structure import get_structure
|
|
5
5
|
|
|
6
|
-
@lru_cache(maxsize=
|
|
6
|
+
@lru_cache(maxsize=64)
|
|
7
7
|
def _fetch_df(url: str) -> pd.DataFrame:
|
|
8
8
|
return pd.read_csv(url, storage_options={"User-Agent": "Mozilla/5.0"})
|
|
9
9
|
|
|
@@ -38,7 +38,7 @@ def _build_filter_expression(
|
|
|
38
38
|
def get_df(
|
|
39
39
|
agencyID: str,
|
|
40
40
|
dataflowID: str,
|
|
41
|
-
filters: Union[str, dict],
|
|
41
|
+
filters: Union[str, dict] = "",
|
|
42
42
|
version: str = "",
|
|
43
43
|
) -> pd.DataFrame:
|
|
44
44
|
|
notoecd/datasets.py
CHANGED
|
@@ -3,7 +3,6 @@ import html
|
|
|
3
3
|
import requests
|
|
4
4
|
import unicodedata
|
|
5
5
|
import pandas as pd
|
|
6
|
-
from typing import Union, List
|
|
7
6
|
import xml.etree.ElementTree as ET
|
|
8
7
|
|
|
9
8
|
url = "https://sdmx.oecd.org/public/rest/dataflow/all"
|
|
@@ -18,45 +17,53 @@ NS = {
|
|
|
18
17
|
_ws_re = re.compile(r"\s+")
|
|
19
18
|
_tag_re = re.compile(r"<[^>]+>")
|
|
20
19
|
|
|
21
|
-
def
|
|
20
|
+
def _clean(s: str | None) -> str | None:
|
|
22
21
|
if s is None: return None
|
|
23
22
|
s = html.unescape(s)
|
|
24
23
|
s = _tag_re.sub("", s)
|
|
25
24
|
s = _ws_re.sub(" ", s).strip()
|
|
26
25
|
return s or None
|
|
27
26
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
}
|
|
27
|
+
# Cache
|
|
28
|
+
_datasets: pd.DataFrame | None = None
|
|
31
29
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
30
|
+
def _load_datasets() -> pd.DataFrame:
|
|
31
|
+
"""
|
|
32
|
+
Loads OECD datasets and keeps them in memory.
|
|
33
|
+
"""
|
|
34
|
+
global _datasets
|
|
35
|
+
if _datasets is not None: return _datasets
|
|
36
|
+
|
|
37
|
+
headers = {"Accept": "application/vnd.sdmx.structure+xml;version=2.1"}
|
|
38
|
+
r = requests.get(url, headers=headers, timeout=30)
|
|
39
|
+
r.raise_for_status()
|
|
40
|
+
root = ET.fromstring(r.content)
|
|
41
|
+
|
|
42
|
+
rows = []
|
|
43
|
+
for df in root.findall(".//structure:Dataflow", NS):
|
|
44
|
+
dataflow_id = df.attrib.get("id")
|
|
45
|
+
agency_id = df.attrib.get("agencyID")
|
|
46
|
+
|
|
47
|
+
name_elem = df.find("common:Name[@xml:lang='en']", NS)
|
|
48
|
+
desc_elem = df.find("common:Description[@xml:lang='en']", NS)
|
|
49
|
+
|
|
50
|
+
name = _clean("".join(name_elem.itertext())) if name_elem is not None else None
|
|
51
|
+
desc_raw = "".join(desc_elem.itertext()) if desc_elem is not None else None
|
|
52
|
+
desc = _clean(desc_raw)
|
|
53
|
+
|
|
54
|
+
rows.append(
|
|
55
|
+
{
|
|
56
|
+
"dataflowID": dataflow_id,
|
|
57
|
+
"agencyID": agency_id,
|
|
58
|
+
"name": name,
|
|
59
|
+
"description": desc,
|
|
60
|
+
}
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
_datasets = pd.DataFrame(rows)
|
|
64
|
+
return _datasets
|
|
58
65
|
|
|
59
|
-
def search_keywords(keywords:
|
|
66
|
+
def search_keywords(*keywords: str) -> pd.DataFrame:
|
|
60
67
|
"""
|
|
61
68
|
Searches OECD datasets for a set of keywords.
|
|
62
69
|
|
|
@@ -66,14 +73,12 @@ def search_keywords(keywords: Union[str, List[str]]) -> pd.DataFrame:
|
|
|
66
73
|
Returns:
|
|
67
74
|
pd.DataFrame: Matching rows.
|
|
68
75
|
"""
|
|
76
|
+
datasets = _load_datasets()
|
|
69
77
|
|
|
70
|
-
#
|
|
71
|
-
if isinstance(keywords, str): keywords = [keywords]
|
|
72
|
-
elif not isinstance(keywords, list): raise TypeError("keywords must be a string or list of strings")
|
|
73
|
-
|
|
74
|
-
# Clean and drop empty keywords
|
|
78
|
+
# Clean and validate keywords
|
|
75
79
|
keywords = [k for k in keywords if isinstance(k, str) and k.strip()]
|
|
76
|
-
if not keywords:
|
|
80
|
+
if not keywords:
|
|
81
|
+
raise ValueError("No valid keywords provided.")
|
|
77
82
|
|
|
78
83
|
def _normalize_series(s: pd.Series) -> pd.Series:
|
|
79
84
|
s = s.fillna("").astype(str).str.lower()
|
|
@@ -85,31 +90,29 @@ def search_keywords(keywords: Union[str, List[str]]) -> pd.DataFrame:
|
|
|
85
90
|
)
|
|
86
91
|
|
|
87
92
|
# Combined normalized text for each row
|
|
88
|
-
text =
|
|
93
|
+
text = (
|
|
94
|
+
datasets["name"].fillna("").astype(str)
|
|
95
|
+
+ " "
|
|
96
|
+
+ datasets["description"].fillna("").astype(str)
|
|
97
|
+
)
|
|
89
98
|
text_norm = _normalize_series(text)
|
|
90
99
|
|
|
91
|
-
# Normalize keywords similarly
|
|
92
100
|
def _normalize_kw(kw: str) -> str:
|
|
93
|
-
kw = kw.lower()
|
|
94
|
-
kw = unicodedata.normalize("NFKD", kw)
|
|
101
|
+
kw = unicodedata.normalize("NFKD", kw.lower())
|
|
95
102
|
return "".join(ch for ch in kw if not unicodedata.combining(ch))
|
|
96
103
|
|
|
97
104
|
norm_keywords = [_normalize_kw(k) for k in keywords]
|
|
98
105
|
|
|
99
|
-
# Vectorized OR search + simple score = count of matching keywords
|
|
100
106
|
overall_mask = pd.Series(False, index=datasets.index)
|
|
101
107
|
score = pd.Series(0, index=datasets.index, dtype="int64")
|
|
102
108
|
|
|
103
109
|
for kw in norm_keywords:
|
|
104
|
-
if not kw: continue
|
|
105
|
-
# plain substring search, no regex
|
|
106
110
|
m = text_norm.str.contains(kw, na=False, regex=False)
|
|
107
111
|
overall_mask |= m
|
|
108
112
|
score = score.add(m.astype("int8"), fill_value=0)
|
|
109
113
|
|
|
110
|
-
# Filter and sort by relevance
|
|
111
114
|
result = datasets.loc[overall_mask].copy()
|
|
112
115
|
result["_match_score"] = score.loc[overall_mask]
|
|
113
116
|
result = result.sort_values("_match_score", ascending=False)
|
|
114
117
|
|
|
115
|
-
return result[[
|
|
118
|
+
return result[["agencyID", "dataflowID", "name", "description"]]
|
|
@@ -1,113 +1,115 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: notoecd
|
|
3
|
-
Version: 0.1.
|
|
4
|
-
Summary:
|
|
5
|
-
Author-email: Daniel Vegara Balsa <daniel.vegarabalsa@oecd.org>
|
|
6
|
-
License-Expression: MIT
|
|
7
|
-
Project-URL: Homepage, https://github.com/dani-37/notoecd
|
|
8
|
-
Requires-Python: >=3.10
|
|
9
|
-
Description-Content-Type: text/markdown
|
|
10
|
-
|
|
11
|
-
Requires-Dist:
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
'
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: notoecd
|
|
3
|
+
Version: 0.1.3
|
|
4
|
+
Summary: Unofficial library for interacting with the OECD Data Explorer through Python.
|
|
5
|
+
Author-email: Daniel Vegara Balsa <daniel.vegarabalsa@oecd.org>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/dani-37/notoecd
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Dist: pandas>=2.0
|
|
12
|
+
Requires-Dist: requests>=2.31
|
|
13
|
+
Dynamic: license-file
|
|
14
|
+
|
|
15
|
+
# notoecd
|
|
16
|
+
|
|
17
|
+
⚠️ **Unofficial package, not endorsed by the OECD.**
|
|
18
|
+
|
|
19
|
+
A lightweight Python interface for exploring OECD SDMX structures and downloading OECD datasets.
|
|
20
|
+
|
|
21
|
+
The package provides utilities for:
|
|
22
|
+
|
|
23
|
+
- Discovering dataset metadata
|
|
24
|
+
- Searching for relevant datasets using keyword matching
|
|
25
|
+
- Exploring the structure and code lists of a dataset
|
|
26
|
+
- Fetching filtered SDMX data directly into a pandas DataFrame
|
|
27
|
+
|
|
28
|
+
------------------------------------------------------------
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
You can install the package by running:
|
|
33
|
+
|
|
34
|
+
pip install notoecd
|
|
35
|
+
|
|
36
|
+
------------------------------------------------------------
|
|
37
|
+
|
|
38
|
+
## Quick Start
|
|
39
|
+
|
|
40
|
+
import notoecd
|
|
41
|
+
|
|
42
|
+
The main functions in this module are:
|
|
43
|
+
|
|
44
|
+
search_keywords(*keywords) -> pd.DataFrame
|
|
45
|
+
get_structure(agencyID, dataflowID) -> Structure
|
|
46
|
+
get_df(agencyID, dataflowID, filters) -> pd.DataFrame
|
|
47
|
+
|
|
48
|
+
------------------------------------------------------------
|
|
49
|
+
|
|
50
|
+
## Searching for datasets
|
|
51
|
+
|
|
52
|
+
`search_keywords` performs:
|
|
53
|
+
|
|
54
|
+
- Normalized text matching
|
|
55
|
+
- Accent-insensitive search
|
|
56
|
+
- Multi-keyword OR matching
|
|
57
|
+
- Ranking by number of matched keywords
|
|
58
|
+
|
|
59
|
+
Example:
|
|
60
|
+
|
|
61
|
+
hits = notoecd.search_keywords('gross domestic product', 'tl2', 'tl3')
|
|
62
|
+
|
|
63
|
+
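This returns datasets that mention any of these keywords, ranked by the number of keywords matched, so datasets covering gross domestic product at regional levels (TL2/TL3) appear first. It gives their name, description, and identifiers (agencyID and dataflowID), which we will need for the next step.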
|
|
64
|
+
|
|
65
|
+
------------------------------------------------------------
|
|
66
|
+
|
|
67
|
+
## Inspecting dataset structure
|
|
68
|
+
|
|
69
|
+
Once a dataset is identified, load its SDMX structure:
|
|
70
|
+
|
|
71
|
+
dataset = 'Gross domestic product - Regions'
|
|
72
|
+
agencyID = 'OECD.CFE.EDS'
|
|
73
|
+
dataflowID = 'DSD_REG_ECO@DF_GDP'
|
|
74
|
+
|
|
75
|
+
s = notoecd.get_structure(agencyID, dataflowID)
|
|
76
|
+
|
|
77
|
+
### Table of contents
|
|
78
|
+
|
|
79
|
+
s.toc
|
|
80
|
+
|
|
81
|
+
This shows all filters and their available values.
|
|
82
|
+
|
|
83
|
+
### Exploring code values
|
|
84
|
+
|
|
85
|
+
s.explain_vals('MEASURE')
|
|
86
|
+
s.explain_vals('UNIT_MEASURE')
|
|
87
|
+
|
|
88
|
+
This shows the available measures and units used in the dataset.
|
|
89
|
+
|
|
90
|
+
------------------------------------------------------------
|
|
91
|
+
|
|
92
|
+
## Filtering and downloading data
|
|
93
|
+
|
|
94
|
+
To download data, build a dictionary of filters.
|
|
95
|
+
Keys correspond to SDMX dimensions; values are strings or lists (for multiple values):
|
|
96
|
+
|
|
97
|
+
filters = {
|
|
98
|
+
'territorial_level': ['tl2', 'tl3'],
|
|
99
|
+
'measure': 'gdp',
|
|
100
|
+
'prices': 'Q',
|
|
101
|
+
'unit_measure': 'USD_PPP_PS'
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
Fetch the filtered dataset:
|
|
105
|
+
|
|
106
|
+
df = notoecd.get_df(agencyID, dataflowID, filters)
|
|
107
|
+
df.head()
|
|
108
|
+
|
|
109
|
+
The returned object is a pandas DataFrame containing the requested subset of OECD SDMX data.
|
|
110
|
+
|
|
111
|
+
------------------------------------------------------------
|
|
112
|
+
|
|
113
|
+
## Examples
|
|
114
|
+
|
|
115
|
+
You can see this full example as a notebook called example.ipynb.
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
notoecd/__init__.py,sha256=gW8v_eB09ROOISMPTDD8pwSg0xtvNRZ-06lD2Q76Xb8,189
|
|
2
|
+
notoecd/calls.py,sha256=SFM4kerc-K43Yo6oDBCsnvCIpN2Bg0-sHKpRfAujS-o,1496
|
|
3
|
+
notoecd/datasets.py,sha256=c8iz2HzWyCGGQINNnzlHG-kJMqsDKFbDObvK11QZU0Y,3751
|
|
4
|
+
notoecd/structure.py,sha256=sq6HrjNLfK-UWr9Cuqxun_DhHLPdegX7j7pKYcEYcJM,7169
|
|
5
|
+
notoecd-0.1.3.dist-info/licenses/LICENSE,sha256=jb9xgeCKfW-VCXFQtYmiM_SZ9tC2zPGtOIVsE5G17W8,1076
|
|
6
|
+
notoecd-0.1.3.dist-info/METADATA,sha256=oHUyrIqCgzELJHZPrJ3cQKvpwWbR25d9YItSy5X8k6c,3123
|
|
7
|
+
notoecd-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
8
|
+
notoecd-0.1.3.dist-info/top_level.txt,sha256=GrcbH10OAguGh5dkpzst216N_C-NtZ-QF1nlXiUpeLs,8
|
|
9
|
+
notoecd-0.1.3.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Daniel Vegara Balsa
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
notoecd-0.1.1.dist-info/RECORD
DELETED
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
notoecd/__init__.py,sha256=134t-qMBmEQN_mtpGOSfQ5zr128wWT68J0e5RIBzN0g,189
|
|
2
|
-
notoecd/calls.py,sha256=CDx-1wJ4myXtoihIfTvjHoXBvIwylvv7AdN_UL5gnF4,1492
|
|
3
|
-
notoecd/datasets.py,sha256=nxLNP0L28mXTKYpdR6BQN5Tk6CKoQS7dygm8twnIVSg,3845
|
|
4
|
-
notoecd/structure.py,sha256=sq6HrjNLfK-UWr9Cuqxun_DhHLPdegX7j7pKYcEYcJM,7169
|
|
5
|
-
notoecd-0.1.1.dist-info/METADATA,sha256=YphZ-il5WvyxK0u1w3cI_vdH2uUdCLaWd3ms981SovI,3180
|
|
6
|
-
notoecd-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
7
|
-
notoecd-0.1.1.dist-info/top_level.txt,sha256=GrcbH10OAguGh5dkpzst216N_C-NtZ-QF1nlXiUpeLs,8
|
|
8
|
-
notoecd-0.1.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|