notoecd 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
notoecd/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- __version__ = "0.1.2"
1
+ __version__ = "0.1.4"
2
2
 
3
3
  from .calls import get_df
4
4
  from .structure import get_structure
notoecd/calls.py CHANGED
@@ -3,7 +3,7 @@ from typing import Union
3
3
  from functools import lru_cache
4
4
  from .structure import get_structure
5
5
 
6
- @lru_cache(maxsize=256)
6
+ @lru_cache(maxsize=64)
7
7
  def _fetch_df(url: str) -> pd.DataFrame:
8
8
  return pd.read_csv(url, storage_options={"User-Agent": "Mozilla/5.0"})
9
9
 
@@ -38,7 +38,7 @@ def _build_filter_expression(
38
38
  def get_df(
39
39
  agencyID: str,
40
40
  dataflowID: str,
41
- filters: Union[str, dict],
41
+ filters: Union[str, dict] = "",
42
42
  version: str = "",
43
43
  ) -> pd.DataFrame:
44
44
 
notoecd/datasets.py CHANGED
@@ -3,7 +3,6 @@ import html
3
3
  import requests
4
4
  import unicodedata
5
5
  import pandas as pd
6
- from typing import Union, List
7
6
  import xml.etree.ElementTree as ET
8
7
 
9
8
  url = "https://sdmx.oecd.org/public/rest/dataflow/all"
@@ -64,7 +63,7 @@ def _load_datasets() -> pd.DataFrame:
64
63
  _datasets = pd.DataFrame(rows)
65
64
  return _datasets
66
65
 
67
- def search_keywords(keywords: Union[str, List[str]]) -> pd.DataFrame:
66
+ def search_keywords(*keywords: str) -> pd.DataFrame:
68
67
  """
69
68
  Searches OECD datasets for a set of keywords.
70
69
 
@@ -76,47 +75,44 @@ def search_keywords(keywords: Union[str, List[str]]) -> pd.DataFrame:
76
75
  """
77
76
  datasets = _load_datasets()
78
77
 
79
- # Normalize keywords input
80
- if isinstance(keywords, str): keywords = [keywords]
81
- elif not isinstance(keywords, list): raise TypeError("keywords must be a string or list of strings")
82
-
83
- # Clean and drop empty keywords
78
+ # Clean and validate keywords
84
79
  keywords = [k for k in keywords if isinstance(k, str) and k.strip()]
85
- if not keywords: raise ValueError("No valid keywords provided.")
80
+ if not keywords:
81
+ raise ValueError("No valid keywords provided.")
86
82
 
87
83
  def _normalize_series(s: pd.Series) -> pd.Series:
88
84
  s = s.fillna("").astype(str).str.lower()
89
85
  return s.map(
90
- lambda x: "".join(ch for ch in unicodedata.normalize("NFKD", x)
91
- if not unicodedata.combining(ch))
86
+ lambda x: "".join(
87
+ ch for ch in unicodedata.normalize("NFKD", x)
88
+ if not unicodedata.combining(ch)
89
+ )
92
90
  )
93
91
 
94
92
  # Combined normalized text for each row
95
- text = datasets["name"].fillna("").astype(str) + " " + datasets["description"].fillna("").astype(str)
93
+ text = (
94
+ datasets["name"].fillna("").astype(str)
95
+ + " "
96
+ + datasets["description"].fillna("").astype(str)
97
+ )
96
98
  text_norm = _normalize_series(text)
97
99
 
98
- # Normalize keywords similarly
99
100
  def _normalize_kw(kw: str) -> str:
100
- kw = kw.lower()
101
- kw = unicodedata.normalize("NFKD", kw)
101
+ kw = unicodedata.normalize("NFKD", kw.lower())
102
102
  return "".join(ch for ch in kw if not unicodedata.combining(ch))
103
103
 
104
104
  norm_keywords = [_normalize_kw(k) for k in keywords]
105
105
 
106
- # Vectorized OR search + simple score = count of matching keywords
107
106
  overall_mask = pd.Series(False, index=datasets.index)
108
107
  score = pd.Series(0, index=datasets.index, dtype="int64")
109
108
 
110
109
  for kw in norm_keywords:
111
- if not kw: continue
112
- # plain substring search, no regex
113
110
  m = text_norm.str.contains(kw, na=False, regex=False)
114
111
  overall_mask |= m
115
112
  score = score.add(m.astype("int8"), fill_value=0)
116
113
 
117
- # Filter and sort by relevance
118
114
  result = datasets.loc[overall_mask].copy()
119
115
  result["_match_score"] = score.loc[overall_mask]
120
116
  result = result.sort_values("_match_score", ascending=False)
121
117
 
122
- return result[['agencyID', 'dataflowID', 'name', 'description']]
118
+ return result[["agencyID", "dataflowID", "name", "description"]]
@@ -1,113 +1,115 @@
1
- Metadata-Version: 2.4
2
- Name: notoecd
3
- Version: 0.1.2
4
- Summary: Unofficial library for interacting with the OECD Data Explorer through Python.
5
- Author-email: Daniel Vegara Balsa <daniel.vegarabalsa@oecd.org>
6
- License-Expression: MIT
7
- Project-URL: Homepage, https://github.com/dani-37/notoecd
8
- Requires-Python: >=3.10
9
- Description-Content-Type: text/markdown
10
- Requires-Dist: pandas>=2.0
11
- Requires-Dist: requests>=2.31
12
-
13
- # notoecd
14
-
15
- ⚠️ **Unofficial package, not endorsed by the OECD.**
16
-
17
- A lightweight Python interface for exploring OECD SDMX structures and downloading OECD datasets.
18
-
19
- The package provides utilities for:
20
-
21
- - Discovering dataset metadata
22
- - Searching for relevant datasets using keyword matching
23
- - Exploring the structure and code lists of a dataset
24
- - Fetching filtered SDMX data directly into a pandas DataFrame
25
-
26
- ------------------------------------------------------------
27
-
28
- ## Installation
29
-
30
- You can install the package by running:
31
-
32
- pip install notoecd
33
-
34
- ------------------------------------------------------------
35
-
36
- ## Quick Start
37
-
38
- import notoecd
39
-
40
- The main functions in this module are:
41
-
42
- search_keywords(keywords) -> pd.DataFrame
43
- get_structure(agencyID, dataflowID) -> Structure
44
- get_df(agencyID, dataflowID, filters) -> pd.DataFrame
45
-
46
- ------------------------------------------------------------
47
-
48
- ## Searching for datasets
49
-
50
- `search_keywords` performs:
51
-
52
- - Normalized text matching
53
- - Accent-insensitive search
54
- - Multi-keyword OR matching
55
- - Ranking by number of matched keywords
56
-
57
- Example:
58
-
59
- hits = notoecd.search_keywords(['gross domestic product', 'tl2', 'tl3'])
60
-
61
- This returns datasets that mention GDP and regional levels (TL2/TL3). It gives their name, description, and identifiers (agencyID and dataflowID), which we will need for the next step.
62
-
63
- ------------------------------------------------------------
64
-
65
- ## Inspecting dataset structure
66
-
67
- Once a dataset is identified, load its SDMX structure:
68
-
69
- dataset = 'Gross domestic product - Regions'
70
- agencyID = 'OECD.CFE.EDS'
71
- dataflowID = 'DSD_REG_ECO@DF_GDP'
72
-
73
- s = notoecd.get_structure(agencyID, dataflowID)
74
-
75
- ### Table of contents
76
-
77
- s.toc
78
-
79
- This shows all filters and their available values.
80
-
81
- ### Exploring code values
82
-
83
- s.explain_vals('MEASURE')
84
- s.explain_vals('UNIT_MEASURE')
85
-
86
- This shows the available measures and units used in the dataset.
87
-
88
- ------------------------------------------------------------
89
-
90
- ## Filtering and downloading data
91
-
92
- To download data, build a dictionary of filters.
93
- Keys correspond to SDMX dimensions, values are strings or lists (for multiple values):
94
-
95
- filters = {
96
- 'territorial_level': ['tl2', 'tl3'],
97
- 'measure': 'gdp',
98
- 'prices': 'Q',
99
- 'unit_measure': 'USD_PPP_PS'
100
- }
101
-
102
- Fetch the filtered dataset:
103
-
104
- df = notoecd.get_df(agency, dataflow, filters)
105
- df.head()
106
-
107
- The returned object is a pandas DataFrame containing the requested subset of OECD SDMX data.
108
-
109
- ------------------------------------------------------------
110
-
111
- ## Examples
112
-
113
- You can see this full example as a notebook called example.ipynb.
1
+ Metadata-Version: 2.4
2
+ Name: notoecd
3
+ Version: 0.1.4
4
+ Summary: Unofficial library for interacting with the OECD Data Explorer through Python.
5
+ Author-email: Daniel Vegara Balsa <daniel.vegarabalsa@oecd.org>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/dani-37/notoecd
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: pandas>=2.0
12
+ Requires-Dist: requests>=2.31
13
+ Dynamic: license-file
14
+
15
+ # notoecd
16
+
17
+ ⚠️ **Unofficial package, not endorsed by the OECD.**
18
+
19
+ A lightweight Python interface for exploring OECD SDMX structures and downloading OECD datasets.
20
+
21
+ The package provides utilities for:
22
+
23
+ - Discovering dataset metadata
24
+ - Searching for relevant datasets using keyword matching
25
+ - Exploring the structure and code lists of a dataset
26
+ - Fetching filtered SDMX data directly into a pandas DataFrame
27
+
28
+ ---
29
+
30
+ ## Installation
31
+
32
+ You can install the package by running:
33
+
34
+ pip install notoecd
35
+
36
+ ---
37
+
38
+ ## Quick Start
39
+
40
+ import notoecd
41
+
42
+ The main functions in this module are:
43
+
44
+ search_keywords(*keywords) -> pd.DataFrame
45
+ get_structure(agencyID, dataflowID) -> Structure
46
+ get_df(agencyID, dataflowID, filters) -> pd.DataFrame
47
+
48
+ ---
49
+
50
+ ## Searching for datasets
51
+
52
+ `search_keywords` performs:
53
+
54
+ - Normalized text matching
55
+ - Accent-insensitive search
56
+ - Multi-keyword OR matching
57
+ - Ranking by number of matched keywords
58
+
59
+ Example:
60
+
61
+ hits = notoecd.search_keywords('gross domestic product', 'tl2', 'tl3')
62
+
63
+ This returns datasets that mention GDP or the regional levels TL2/TL3, ranked by how many of the keywords they match. Each result includes the dataset's name, description, and identifiers (agencyID and dataflowID), which are needed for the next step.
64
+
65
+ ---
66
+
67
+ ## Inspecting dataset structure
68
+
69
+ Once a dataset is identified, load its SDMX structure:
70
+
71
+ dataset = 'Gross domestic product - Regions'
72
+ agencyID = 'OECD.CFE.EDS'
73
+ dataflowID = 'DSD_REG_ECO@DF_GDP'
74
+
75
+ s = notoecd.get_structure(agencyID, dataflowID)
76
+
77
+ ### Table of contents
78
+
79
+ s.toc
80
+
81
+ This shows all filters and their available values.
82
+
83
+ ### Exploring code values
84
+
85
+ s.explain_vals('MEASURE')
86
+ s.explain_vals('UNIT_MEASURE')
87
+
88
+ This shows the available measures and units used in the dataset.
89
+
90
+ ---
91
+
92
+ ## Filtering and downloading data
93
+
94
+ To download data, build a dictionary of filters.
95
+ Keys correspond to SDMX dimensions, values are strings or lists (for multiple values):
96
+
97
+ filters = {
98
+ 'territorial_level': ['tl2', 'tl3'],
99
+ 'measure': 'gdp',
100
+ 'prices': 'Q',
101
+ 'unit_measure': 'USD_PPP_PS'
102
+ }
103
+
104
+ Fetch the filtered dataset:
105
+
106
+ df = notoecd.get_df(agencyID, dataflowID, filters)
107
+ df.head()
108
+
109
+ The returned object is a pandas DataFrame containing the requested subset of OECD SDMX data.
110
+
111
+ ---
112
+
113
+ ## Examples
114
+
115
+ You can see this full example as a notebook called example.ipynb.
@@ -0,0 +1,9 @@
1
+ notoecd/__init__.py,sha256=8_cXiMZN0gino3W2Wat06ncVcAR8XpCBgC3Q7vEbHjQ,189
2
+ notoecd/calls.py,sha256=SFM4kerc-K43Yo6oDBCsnvCIpN2Bg0-sHKpRfAujS-o,1496
3
+ notoecd/datasets.py,sha256=c8iz2HzWyCGGQINNnzlHG-kJMqsDKFbDObvK11QZU0Y,3751
4
+ notoecd/structure.py,sha256=sq6HrjNLfK-UWr9Cuqxun_DhHLPdegX7j7pKYcEYcJM,7169
5
+ notoecd-0.1.4.dist-info/licenses/LICENSE,sha256=jb9xgeCKfW-VCXFQtYmiM_SZ9tC2zPGtOIVsE5G17W8,1076
6
+ notoecd-0.1.4.dist-info/METADATA,sha256=hDuKZgO-urzF1ZHaqFIco36OFN435-b8zFf3lkwhs9c,2761
7
+ notoecd-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
+ notoecd-0.1.4.dist-info/top_level.txt,sha256=GrcbH10OAguGh5dkpzst216N_C-NtZ-QF1nlXiUpeLs,8
9
+ notoecd-0.1.4.dist-info/RECORD,,
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Daniel Vegara Balsa
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -1,8 +0,0 @@
1
- notoecd/__init__.py,sha256=J3-DA__Z9hHzp6eOszRMFh4VE6tk1YtSNigW7el4qDM,189
2
- notoecd/calls.py,sha256=CDx-1wJ4myXtoihIfTvjHoXBvIwylvv7AdN_UL5gnF4,1492
3
- notoecd/datasets.py,sha256=a1L45vie6q4cjNXAued2gQ4oE4Fbpk25kdhDTexMuAI,4158
4
- notoecd/structure.py,sha256=sq6HrjNLfK-UWr9Cuqxun_DhHLPdegX7j7pKYcEYcJM,7169
5
- notoecd-0.1.2.dist-info/METADATA,sha256=kt6OjNXdlGpn5K-_WQm-z-GLKbbPLgF3mq0C7YSGNho,3192
6
- notoecd-0.1.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
- notoecd-0.1.2.dist-info/top_level.txt,sha256=GrcbH10OAguGh5dkpzst216N_C-NtZ-QF1nlXiUpeLs,8
8
- notoecd-0.1.2.dist-info/RECORD,,