notoecd 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- notoecd/__init__.py +1 -1
- notoecd/calls.py +2 -2
- notoecd/datasets.py +15 -19
- {notoecd-0.1.2.dist-info → notoecd-0.1.3.dist-info}/METADATA +115 -113
- notoecd-0.1.3.dist-info/RECORD +9 -0
- notoecd-0.1.3.dist-info/licenses/LICENSE +21 -0
- notoecd-0.1.2.dist-info/RECORD +0 -8
- {notoecd-0.1.2.dist-info → notoecd-0.1.3.dist-info}/WHEEL +0 -0
- {notoecd-0.1.2.dist-info → notoecd-0.1.3.dist-info}/top_level.txt +0 -0
notoecd/__init__.py
CHANGED
notoecd/calls.py
CHANGED
|
@@ -3,7 +3,7 @@ from typing import Union
|
|
|
3
3
|
from functools import lru_cache
|
|
4
4
|
from .structure import get_structure
|
|
5
5
|
|
|
6
|
-
@lru_cache(maxsize=
|
|
6
|
+
@lru_cache(maxsize=64)
|
|
7
7
|
def _fetch_df(url: str) -> pd.DataFrame:
|
|
8
8
|
return pd.read_csv(url, storage_options={"User-Agent": "Mozilla/5.0"})
|
|
9
9
|
|
|
@@ -38,7 +38,7 @@ def _build_filter_expression(
|
|
|
38
38
|
def get_df(
|
|
39
39
|
agencyID: str,
|
|
40
40
|
dataflowID: str,
|
|
41
|
-
filters: Union[str, dict],
|
|
41
|
+
filters: Union[str, dict] = "",
|
|
42
42
|
version: str = "",
|
|
43
43
|
) -> pd.DataFrame:
|
|
44
44
|
|
notoecd/datasets.py
CHANGED
|
@@ -3,7 +3,6 @@ import html
|
|
|
3
3
|
import requests
|
|
4
4
|
import unicodedata
|
|
5
5
|
import pandas as pd
|
|
6
|
-
from typing import Union, List
|
|
7
6
|
import xml.etree.ElementTree as ET
|
|
8
7
|
|
|
9
8
|
url = "https://sdmx.oecd.org/public/rest/dataflow/all"
|
|
@@ -64,7 +63,7 @@ def _load_datasets() -> pd.DataFrame:
|
|
|
64
63
|
_datasets = pd.DataFrame(rows)
|
|
65
64
|
return _datasets
|
|
66
65
|
|
|
67
|
-
def search_keywords(keywords:
|
|
66
|
+
def search_keywords(*keywords: str) -> pd.DataFrame:
|
|
68
67
|
"""
|
|
69
68
|
Searches OECD datasets for a set of keywords.
|
|
70
69
|
|
|
@@ -76,47 +75,44 @@ def search_keywords(keywords: Union[str, List[str]]) -> pd.DataFrame:
|
|
|
76
75
|
"""
|
|
77
76
|
datasets = _load_datasets()
|
|
78
77
|
|
|
79
|
-
#
|
|
80
|
-
if isinstance(keywords, str): keywords = [keywords]
|
|
81
|
-
elif not isinstance(keywords, list): raise TypeError("keywords must be a string or list of strings")
|
|
82
|
-
|
|
83
|
-
# Clean and drop empty keywords
|
|
78
|
+
# Clean and validate keywords
|
|
84
79
|
keywords = [k for k in keywords if isinstance(k, str) and k.strip()]
|
|
85
|
-
if not keywords:
|
|
80
|
+
if not keywords:
|
|
81
|
+
raise ValueError("No valid keywords provided.")
|
|
86
82
|
|
|
87
83
|
def _normalize_series(s: pd.Series) -> pd.Series:
|
|
88
84
|
s = s.fillna("").astype(str).str.lower()
|
|
89
85
|
return s.map(
|
|
90
|
-
lambda x: "".join(
|
|
91
|
-
|
|
86
|
+
lambda x: "".join(
|
|
87
|
+
ch for ch in unicodedata.normalize("NFKD", x)
|
|
88
|
+
if not unicodedata.combining(ch)
|
|
89
|
+
)
|
|
92
90
|
)
|
|
93
91
|
|
|
94
92
|
# Combined normalized text for each row
|
|
95
|
-
text =
|
|
93
|
+
text = (
|
|
94
|
+
datasets["name"].fillna("").astype(str)
|
|
95
|
+
+ " "
|
|
96
|
+
+ datasets["description"].fillna("").astype(str)
|
|
97
|
+
)
|
|
96
98
|
text_norm = _normalize_series(text)
|
|
97
99
|
|
|
98
|
-
# Normalize keywords similarly
|
|
99
100
|
def _normalize_kw(kw: str) -> str:
|
|
100
|
-
kw = kw.lower()
|
|
101
|
-
kw = unicodedata.normalize("NFKD", kw)
|
|
101
|
+
kw = unicodedata.normalize("NFKD", kw.lower())
|
|
102
102
|
return "".join(ch for ch in kw if not unicodedata.combining(ch))
|
|
103
103
|
|
|
104
104
|
norm_keywords = [_normalize_kw(k) for k in keywords]
|
|
105
105
|
|
|
106
|
-
# Vectorized OR search + simple score = count of matching keywords
|
|
107
106
|
overall_mask = pd.Series(False, index=datasets.index)
|
|
108
107
|
score = pd.Series(0, index=datasets.index, dtype="int64")
|
|
109
108
|
|
|
110
109
|
for kw in norm_keywords:
|
|
111
|
-
if not kw: continue
|
|
112
|
-
# plain substring search, no regex
|
|
113
110
|
m = text_norm.str.contains(kw, na=False, regex=False)
|
|
114
111
|
overall_mask |= m
|
|
115
112
|
score = score.add(m.astype("int8"), fill_value=0)
|
|
116
113
|
|
|
117
|
-
# Filter and sort by relevance
|
|
118
114
|
result = datasets.loc[overall_mask].copy()
|
|
119
115
|
result["_match_score"] = score.loc[overall_mask]
|
|
120
116
|
result = result.sort_values("_match_score", ascending=False)
|
|
121
117
|
|
|
122
|
-
return result[[
|
|
118
|
+
return result[["agencyID", "dataflowID", "name", "description"]]
|
|
@@ -1,113 +1,115 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: notoecd
|
|
3
|
-
Version: 0.1.
|
|
4
|
-
Summary: Unofficial library for interacting with the OECD Data Explorer through Python.
|
|
5
|
-
Author-email: Daniel Vegara Balsa <daniel.vegarabalsa@oecd.org>
|
|
6
|
-
License-Expression: MIT
|
|
7
|
-
Project-URL: Homepage, https://github.com/dani-37/notoecd
|
|
8
|
-
Requires-Python: >=3.10
|
|
9
|
-
Description-Content-Type: text/markdown
|
|
10
|
-
|
|
11
|
-
Requires-Dist:
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
-
|
|
24
|
-
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
-
|
|
55
|
-
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
'
|
|
99
|
-
'
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: notoecd
|
|
3
|
+
Version: 0.1.3
|
|
4
|
+
Summary: Unofficial library for interacting with the OECD Data Explorer through Python.
|
|
5
|
+
Author-email: Daniel Vegara Balsa <daniel.vegarabalsa@oecd.org>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/dani-37/notoecd
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Dist: pandas>=2.0
|
|
12
|
+
Requires-Dist: requests>=2.31
|
|
13
|
+
Dynamic: license-file
|
|
14
|
+
|
|
15
|
+
# notoecd
|
|
16
|
+
|
|
17
|
+
⚠️ **Unofficial package, not endorsed by the OECD.**
|
|
18
|
+
|
|
19
|
+
A lightweight Python interface for exploring OECD SDMX structures and downloading OECD datasets.
|
|
20
|
+
|
|
21
|
+
The package provides utilities for:
|
|
22
|
+
|
|
23
|
+
- Discovering dataset metadata
|
|
24
|
+
- Searching for relevant datasets using keyword matching
|
|
25
|
+
- Exploring the structure and code lists of a dataset
|
|
26
|
+
- Fetching filtered SDMX data directly into a pandas DataFrame
|
|
27
|
+
|
|
28
|
+
------------------------------------------------------------
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
You can install the package by running:
|
|
33
|
+
|
|
34
|
+
pip install notoecd
|
|
35
|
+
|
|
36
|
+
------------------------------------------------------------
|
|
37
|
+
|
|
38
|
+
## Quick Start
|
|
39
|
+
|
|
40
|
+
import notoecd
|
|
41
|
+
|
|
42
|
+
The main functions in this module are:
|
|
43
|
+
|
|
44
|
+
search_keywords(*keywords) -> pd.DataFrame
|
|
45
|
+
get_structure(agencyID, dataflowID) -> Structure
|
|
46
|
+
get_df(agencyID, dataflowID, filters) -> pd.DataFrame
|
|
47
|
+
|
|
48
|
+
------------------------------------------------------------
|
|
49
|
+
|
|
50
|
+
## Searching for datasets
|
|
51
|
+
|
|
52
|
+
`search_keywords` performs:
|
|
53
|
+
|
|
54
|
+
- Normalized text matching
|
|
55
|
+
- Accent-insensitive search
|
|
56
|
+
- Multi-keyword OR matching
|
|
57
|
+
- Ranking by number of matched keywords
|
|
58
|
+
|
|
59
|
+
Example:
|
|
60
|
+
|
|
61
|
+
hits = notoecd.search_keywords('gross domestic product', 'tl2', 'tl3')
|
|
62
|
+
|
|
63
|
+
This returns datasets that mention any of the keywords (gross domestic product, TL2, TL3), ranked by how many of them they match. It gives their name, description, and identifiers (agencyID and dataflowID), which we will need for the next step.
|
|
64
|
+
|
|
65
|
+
------------------------------------------------------------
|
|
66
|
+
|
|
67
|
+
## Inspecting dataset structure
|
|
68
|
+
|
|
69
|
+
Once a dataset is identified, load its SDMX structure:
|
|
70
|
+
|
|
71
|
+
dataset = 'Gross domestic product - Regions'
|
|
72
|
+
agencyID = 'OECD.CFE.EDS'
|
|
73
|
+
dataflowID = 'DSD_REG_ECO@DF_GDP'
|
|
74
|
+
|
|
75
|
+
s = notoecd.get_structure(agencyID, dataflowID)
|
|
76
|
+
|
|
77
|
+
### Table of contents
|
|
78
|
+
|
|
79
|
+
s.toc
|
|
80
|
+
|
|
81
|
+
This shows all filters and their available values.
|
|
82
|
+
|
|
83
|
+
### Exploring code values
|
|
84
|
+
|
|
85
|
+
s.explain_vals('MEASURE')
|
|
86
|
+
s.explain_vals('UNIT_MEASURE')
|
|
87
|
+
|
|
88
|
+
This shows the available measures and units used in the dataset.
|
|
89
|
+
|
|
90
|
+
------------------------------------------------------------
|
|
91
|
+
|
|
92
|
+
## Filtering and downloading data
|
|
93
|
+
|
|
94
|
+
To download data, build a dictionary of filters.
|
|
95
|
+
Keys correspond to SDMX dimensions, values are strings or lists (for multiple values):
|
|
96
|
+
|
|
97
|
+
filters = {
|
|
98
|
+
'territorial_level': ['tl2', 'tl3'],
|
|
99
|
+
'measure': 'gdp',
|
|
100
|
+
'prices': 'Q',
|
|
101
|
+
'unit_measure': 'USD_PPP_PS'
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
Fetch the filtered dataset:
|
|
105
|
+
|
|
106
|
+
df = notoecd.get_df(agencyID, dataflowID, filters)
|
|
107
|
+
df.head()
|
|
108
|
+
|
|
109
|
+
The returned object is a pandas DataFrame containing the requested subset of OECD SDMX data.
|
|
110
|
+
|
|
111
|
+
------------------------------------------------------------
|
|
112
|
+
|
|
113
|
+
## Examples
|
|
114
|
+
|
|
115
|
+
You can see this full example as a notebook called example.ipynb.
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
notoecd/__init__.py,sha256=gW8v_eB09ROOISMPTDD8pwSg0xtvNRZ-06lD2Q76Xb8,189
|
|
2
|
+
notoecd/calls.py,sha256=SFM4kerc-K43Yo6oDBCsnvCIpN2Bg0-sHKpRfAujS-o,1496
|
|
3
|
+
notoecd/datasets.py,sha256=c8iz2HzWyCGGQINNnzlHG-kJMqsDKFbDObvK11QZU0Y,3751
|
|
4
|
+
notoecd/structure.py,sha256=sq6HrjNLfK-UWr9Cuqxun_DhHLPdegX7j7pKYcEYcJM,7169
|
|
5
|
+
notoecd-0.1.3.dist-info/licenses/LICENSE,sha256=jb9xgeCKfW-VCXFQtYmiM_SZ9tC2zPGtOIVsE5G17W8,1076
|
|
6
|
+
notoecd-0.1.3.dist-info/METADATA,sha256=oHUyrIqCgzELJHZPrJ3cQKvpwWbR25d9YItSy5X8k6c,3123
|
|
7
|
+
notoecd-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
8
|
+
notoecd-0.1.3.dist-info/top_level.txt,sha256=GrcbH10OAguGh5dkpzst216N_C-NtZ-QF1nlXiUpeLs,8
|
|
9
|
+
notoecd-0.1.3.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Daniel Vegara Balsa
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
notoecd-0.1.2.dist-info/RECORD
DELETED
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
notoecd/__init__.py,sha256=J3-DA__Z9hHzp6eOszRMFh4VE6tk1YtSNigW7el4qDM,189
|
|
2
|
-
notoecd/calls.py,sha256=CDx-1wJ4myXtoihIfTvjHoXBvIwylvv7AdN_UL5gnF4,1492
|
|
3
|
-
notoecd/datasets.py,sha256=a1L45vie6q4cjNXAued2gQ4oE4Fbpk25kdhDTexMuAI,4158
|
|
4
|
-
notoecd/structure.py,sha256=sq6HrjNLfK-UWr9Cuqxun_DhHLPdegX7j7pKYcEYcJM,7169
|
|
5
|
-
notoecd-0.1.2.dist-info/METADATA,sha256=kt6OjNXdlGpn5K-_WQm-z-GLKbbPLgF3mq0C7YSGNho,3192
|
|
6
|
-
notoecd-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
7
|
-
notoecd-0.1.2.dist-info/top_level.txt,sha256=GrcbH10OAguGh5dkpzst216N_C-NtZ-QF1nlXiUpeLs,8
|
|
8
|
-
notoecd-0.1.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|