notoecd 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
notoecd/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- __version__ = "0.1.1"
1
+ __version__ = "0.1.3"
2
2
 
3
3
  from .calls import get_df
4
4
  from .structure import get_structure
notoecd/calls.py CHANGED
@@ -3,7 +3,7 @@ from typing import Union
3
3
  from functools import lru_cache
4
4
  from .structure import get_structure
5
5
 
6
- @lru_cache(maxsize=256)
6
+ @lru_cache(maxsize=64)
7
7
  def _fetch_df(url: str) -> pd.DataFrame:
8
8
  return pd.read_csv(url, storage_options={"User-Agent": "Mozilla/5.0"})
9
9
 
@@ -38,7 +38,7 @@ def _build_filter_expression(
38
38
  def get_df(
39
39
  agencyID: str,
40
40
  dataflowID: str,
41
- filters: Union[str, dict],
41
+ filters: Union[str, dict] = "",
42
42
  version: str = "",
43
43
  ) -> pd.DataFrame:
44
44
 
notoecd/datasets.py CHANGED
@@ -3,7 +3,6 @@ import html
3
3
  import requests
4
4
  import unicodedata
5
5
  import pandas as pd
6
- from typing import Union, List
7
6
  import xml.etree.ElementTree as ET
8
7
 
9
8
  url = "https://sdmx.oecd.org/public/rest/dataflow/all"
@@ -18,45 +17,53 @@ NS = {
18
17
  _ws_re = re.compile(r"\s+")
19
18
  _tag_re = re.compile(r"<[^>]+>")
20
19
 
21
- def _clean_text(s: str | None) -> str | None:
20
+ def _clean(s: str | None) -> str | None:
22
21
  if s is None: return None
23
22
  s = html.unescape(s)
24
23
  s = _tag_re.sub("", s)
25
24
  s = _ws_re.sub(" ", s).strip()
26
25
  return s or None
27
26
 
28
- headers = {
29
- "Accept": "application/vnd.sdmx.structure+xml;version=2.1"
30
- }
27
+ # Cache
28
+ _datasets: pd.DataFrame | None = None
31
29
 
32
- r = requests.get(url, headers=headers, timeout=30)
33
- r.raise_for_status()
34
- root = ET.fromstring(r.content)
35
-
36
- rows = []
37
- for df in root.findall(".//structure:Dataflow", NS):
38
- dataflow_id = df.attrib.get("id")
39
- agency_id = df.attrib.get("agencyID")
40
-
41
- name_elem = df.find("common:Name[@xml:lang='en']", NS)
42
- desc_elem = df.find("common:Description[@xml:lang='en']", NS)
43
-
44
- name = _clean_text("".join(name_elem.itertext())) if name_elem is not None else None
45
- desc_raw = "".join(desc_elem.itertext()) if desc_elem is not None else None
46
- desc = _clean_text(desc_raw)
47
-
48
- rows.append(
49
- {
50
- "dataflowID": dataflow_id,
51
- "agencyID": agency_id,
52
- "name": name,
53
- "description": desc,
54
- }
55
- )
56
-
57
- datasets = pd.DataFrame(rows)
30
+ def _load_datasets() -> pd.DataFrame:
31
+ """
32
+ Loads OECD datasets and keeps them in memory.
33
+ """
34
+ global _datasets
35
+ if _datasets is not None: return _datasets
36
+
37
+ headers = {"Accept": "application/vnd.sdmx.structure+xml;version=2.1"}
38
+ r = requests.get(url, headers=headers, timeout=30)
39
+ r.raise_for_status()
40
+ root = ET.fromstring(r.content)
41
+
42
+ rows = []
43
+ for df in root.findall(".//structure:Dataflow", NS):
44
+ dataflow_id = df.attrib.get("id")
45
+ agency_id = df.attrib.get("agencyID")
46
+
47
+ name_elem = df.find("common:Name[@xml:lang='en']", NS)
48
+ desc_elem = df.find("common:Description[@xml:lang='en']", NS)
49
+
50
+ name = _clean("".join(name_elem.itertext())) if name_elem is not None else None
51
+ desc_raw = "".join(desc_elem.itertext()) if desc_elem is not None else None
52
+ desc = _clean(desc_raw)
53
+
54
+ rows.append(
55
+ {
56
+ "dataflowID": dataflow_id,
57
+ "agencyID": agency_id,
58
+ "name": name,
59
+ "description": desc,
60
+ }
61
+ )
62
+
63
+ _datasets = pd.DataFrame(rows)
64
+ return _datasets
58
65
 
59
- def search_keywords(keywords: Union[str, List[str]]) -> pd.DataFrame:
66
+ def search_keywords(*keywords: str) -> pd.DataFrame:
60
67
  """
61
68
  Searches OECD datasets for a set of keywords.
62
69
 
@@ -66,14 +73,12 @@ def search_keywords(keywords: Union[str, List[str]]) -> pd.DataFrame:
66
73
  Returns:
67
74
  pd.DataFrame: Matching rows.
68
75
  """
76
+ datasets = _load_datasets()
69
77
 
70
- # Normalize keywords input
71
- if isinstance(keywords, str): keywords = [keywords]
72
- elif not isinstance(keywords, list): raise TypeError("keywords must be a string or list of strings")
73
-
74
- # Clean and drop empty keywords
78
+ # Clean and validate keywords
75
79
  keywords = [k for k in keywords if isinstance(k, str) and k.strip()]
76
- if not keywords: raise ValueError("No valid keywords provided.")
80
+ if not keywords:
81
+ raise ValueError("No valid keywords provided.")
77
82
 
78
83
  def _normalize_series(s: pd.Series) -> pd.Series:
79
84
  s = s.fillna("").astype(str).str.lower()
@@ -85,31 +90,29 @@ def search_keywords(keywords: Union[str, List[str]]) -> pd.DataFrame:
85
90
  )
86
91
 
87
92
  # Combined normalized text for each row
88
- text = datasets["name"].fillna("").astype(str) + " " + datasets["description"].fillna("").astype(str)
93
+ text = (
94
+ datasets["name"].fillna("").astype(str)
95
+ + " "
96
+ + datasets["description"].fillna("").astype(str)
97
+ )
89
98
  text_norm = _normalize_series(text)
90
99
 
91
- # Normalize keywords similarly
92
100
  def _normalize_kw(kw: str) -> str:
93
- kw = kw.lower()
94
- kw = unicodedata.normalize("NFKD", kw)
101
+ kw = unicodedata.normalize("NFKD", kw.lower())
95
102
  return "".join(ch for ch in kw if not unicodedata.combining(ch))
96
103
 
97
104
  norm_keywords = [_normalize_kw(k) for k in keywords]
98
105
 
99
- # Vectorized OR search + simple score = count of matching keywords
100
106
  overall_mask = pd.Series(False, index=datasets.index)
101
107
  score = pd.Series(0, index=datasets.index, dtype="int64")
102
108
 
103
109
  for kw in norm_keywords:
104
- if not kw: continue
105
- # plain substring search, no regex
106
110
  m = text_norm.str.contains(kw, na=False, regex=False)
107
111
  overall_mask |= m
108
112
  score = score.add(m.astype("int8"), fill_value=0)
109
113
 
110
- # Filter and sort by relevance
111
114
  result = datasets.loc[overall_mask].copy()
112
115
  result["_match_score"] = score.loc[overall_mask]
113
116
  result = result.sort_values("_match_score", ascending=False)
114
117
 
115
- return result[['agencyID', 'dataflowID', 'name', 'description']]
118
+ return result[["agencyID", "dataflowID", "name", "description"]]
@@ -1,113 +1,115 @@
1
- Metadata-Version: 2.4
2
- Name: notoecd
3
- Version: 0.1.1
4
- Summary: Library for interacting with the OECD Data Explorer through Python
5
- Author-email: Daniel Vegara Balsa <daniel.vegarabalsa@oecd.org>
6
- License-Expression: MIT
7
- Project-URL: Homepage, https://github.com/dani-37/notoecd
8
- Requires-Python: >=3.10
9
- Description-Content-Type: text/markdown
10
- Requires-Dist: pandas>=2.0
11
- Requires-Dist: requests>=2.31
12
-
13
- # notoecd
14
-
15
- ⚠️ **Unofficial package, not endorsed by the OECD.**
16
-
17
- A lightweight Python interface for exploring OECD SDMX structures and downloading OECD datasets.
18
- The package provides utilities for:
19
-
20
- - Discovering dataset metadata
21
- - Searching for relevant datasets using keyword matching
22
- - Exploring the structure and code lists of a dataset
23
- - Fetching filtered SDMX data directly into a pandas DataFrame
24
-
25
- ------------------------------------------------------------
26
-
27
- ## Installation
28
-
29
- You can install the package by running:
30
-
31
- pip install notoecd
32
-
33
- ------------------------------------------------------------
34
-
35
- ## Quick Start
36
-
37
- import notoecd
38
-
39
- The main functions in this module are:
40
-
41
- search_keywords(keywords) -> pd.DataFrame
42
- get_structure(agencyID, dataflowID) -> Structure
43
- get_df(agencyID, dataflowID, filters) -> pd.DataFrame
44
-
45
- ------------------------------------------------------------
46
-
47
- ## Searching for datasets
48
-
49
- `search_keywords` performs:
50
-
51
- - Normalized text matching
52
- - Accent-insensitive search
53
- - Multi-keyword OR matching
54
- - Ranking by number of matched keywords
55
-
56
- Example:
57
-
58
- hits = notoecd.search_keywords(['gross domestic product', 'tl2', 'tl3'])
59
-
60
- This returns datasets that mention GDP and regional levels (TL2/TL3). It gives their name, description, and identifiers (agencyID and dataflowID), which we will need for the next step.
61
-
62
- ------------------------------------------------------------
63
-
64
- ## Inspecting dataset structure
65
-
66
- Once a dataset is identified, load its SDMX structure:
67
-
68
- dataset = 'Gross domestic product - Regions'
69
- agencyID = 'OECD.CFE.EDS'
70
- dataflowID = 'DSD_REG_ECO@DF_GDP'
71
-
72
- s = notoecd.get_structure(agencyID, dataflowID)
73
-
74
- ### Table of contents
75
-
76
- s.toc
77
-
78
- This shows all filters and their available values.
79
-
80
- ### Exploring code values
81
-
82
- s.explain_vals('MEASURE')
83
- s.explain_vals('UNIT_MEASURE')
84
-
85
- This shows the available measures and units used in the dataset.
86
-
87
- ------------------------------------------------------------
88
-
89
- ## Filtering and downloading data
90
-
91
- To download data, build a dictionary of filters.
92
- Keys correspond to SDMX dimensions, values are strings or lists (for multiple values):
93
-
94
- filters = {
95
- 'territorial_level': ['tl2', 'tl3'],
96
- 'measure': 'gdp',
97
- 'prices': 'Q',
98
- 'unit_measure': 'USD_PPP_PS'
99
- }
100
-
101
- Fetch the filtered dataset:
102
-
103
- df = notoecd.get_df(agency, dataflow, filters)
104
- df.head()
105
-
106
- The returned object is a pandas DataFrame containing the requested subset of OECD SDMX data.
107
-
108
- ------------------------------------------------------------
109
-
110
- ## Examples
111
-
112
- You can see this full example as a notebook called example.ipynb.
113
-
1
+ Metadata-Version: 2.4
2
+ Name: notoecd
3
+ Version: 0.1.3
4
+ Summary: Unofficial library for interacting with the OECD Data Explorer through Python.
5
+ Author-email: Daniel Vegara Balsa <daniel.vegarabalsa@oecd.org>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/dani-37/notoecd
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: pandas>=2.0
12
+ Requires-Dist: requests>=2.31
13
+ Dynamic: license-file
14
+
15
+ # notoecd
16
+
17
+ ⚠️ **Unofficial package, not endorsed by the OECD.**
18
+
19
+ A lightweight Python interface for exploring OECD SDMX structures and downloading OECD datasets.
20
+
21
+ The package provides utilities for:
22
+
23
+ - Discovering dataset metadata
24
+ - Searching for relevant datasets using keyword matching
25
+ - Exploring the structure and code lists of a dataset
26
+ - Fetching filtered SDMX data directly into a pandas DataFrame
27
+
28
+ ------------------------------------------------------------
29
+
30
+ ## Installation
31
+
32
+ You can install the package by running:
33
+
34
+ pip install notoecd
35
+
36
+ ------------------------------------------------------------
37
+
38
+ ## Quick Start
39
+
40
+ import notoecd
41
+
42
+ The main functions in this module are:
43
+
44
+ search_keywords(*keywords) -> pd.DataFrame
45
+ get_structure(agencyID, dataflowID) -> Structure
46
+ get_df(agencyID, dataflowID, filters) -> pd.DataFrame
47
+
48
+ ------------------------------------------------------------
49
+
50
+ ## Searching for datasets
51
+
52
+ `search_keywords` performs:
53
+
54
+ - Normalized text matching
55
+ - Accent-insensitive search
56
+ - Multi-keyword OR matching
57
+ - Ranking by number of matched keywords
58
+
59
+ Example:
60
+
61
+ hits = notoecd.search_keywords('gross domestic product', 'tl2', 'tl3')
62
+
63
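+ This returns datasets whose name or description mentions any of the keywords (here GDP or the regional levels TL2/TL3), with the datasets matching the most keywords listed first. It gives their name, description, and identifiers (agencyID and dataflowID), which we will need for the next step.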
64
+
65
+ ------------------------------------------------------------
66
+
67
+ ## Inspecting dataset structure
68
+
69
+ Once a dataset is identified, load its SDMX structure:
70
+
71
+ dataset = 'Gross domestic product - Regions'
72
+ agencyID = 'OECD.CFE.EDS'
73
+ dataflowID = 'DSD_REG_ECO@DF_GDP'
74
+
75
+ s = notoecd.get_structure(agencyID, dataflowID)
76
+
77
+ ### Table of contents
78
+
79
+ s.toc
80
+
81
+ This shows all filters and their available values.
82
+
83
+ ### Exploring code values
84
+
85
+ s.explain_vals('MEASURE')
86
+ s.explain_vals('UNIT_MEASURE')
87
+
88
+ This shows the available measures and units used in the dataset.
89
+
90
+ ------------------------------------------------------------
91
+
92
+ ## Filtering and downloading data
93
+
94
+ To download data, build a dictionary of filters.
95
+ Keys correspond to SDMX dimensions, values are strings or lists (for multiple values):
96
+
97
+ filters = {
98
+ 'territorial_level': ['tl2', 'tl3'],
99
+ 'measure': 'gdp',
100
+ 'prices': 'Q',
101
+ 'unit_measure': 'USD_PPP_PS'
102
+ }
103
+
104
+ Fetch the filtered dataset:
105
+
106
+ df = notoecd.get_df(agencyID, dataflowID, filters)
107
+ df.head()
108
+
109
+ The returned object is a pandas DataFrame containing the requested subset of OECD SDMX data.
110
+
111
+ ------------------------------------------------------------
112
+
113
+ ## Examples
114
+
115
+ You can see this full example as a notebook called example.ipynb.
@@ -0,0 +1,9 @@
1
+ notoecd/__init__.py,sha256=gW8v_eB09ROOISMPTDD8pwSg0xtvNRZ-06lD2Q76Xb8,189
2
+ notoecd/calls.py,sha256=SFM4kerc-K43Yo6oDBCsnvCIpN2Bg0-sHKpRfAujS-o,1496
3
+ notoecd/datasets.py,sha256=c8iz2HzWyCGGQINNnzlHG-kJMqsDKFbDObvK11QZU0Y,3751
4
+ notoecd/structure.py,sha256=sq6HrjNLfK-UWr9Cuqxun_DhHLPdegX7j7pKYcEYcJM,7169
5
+ notoecd-0.1.3.dist-info/licenses/LICENSE,sha256=jb9xgeCKfW-VCXFQtYmiM_SZ9tC2zPGtOIVsE5G17W8,1076
6
+ notoecd-0.1.3.dist-info/METADATA,sha256=oHUyrIqCgzELJHZPrJ3cQKvpwWbR25d9YItSy5X8k6c,3123
7
+ notoecd-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
8
+ notoecd-0.1.3.dist-info/top_level.txt,sha256=GrcbH10OAguGh5dkpzst216N_C-NtZ-QF1nlXiUpeLs,8
9
+ notoecd-0.1.3.dist-info/RECORD,,
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Daniel Vegara Balsa
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -1,8 +0,0 @@
1
- notoecd/__init__.py,sha256=134t-qMBmEQN_mtpGOSfQ5zr128wWT68J0e5RIBzN0g,189
2
- notoecd/calls.py,sha256=CDx-1wJ4myXtoihIfTvjHoXBvIwylvv7AdN_UL5gnF4,1492
3
- notoecd/datasets.py,sha256=nxLNP0L28mXTKYpdR6BQN5Tk6CKoQS7dygm8twnIVSg,3845
4
- notoecd/structure.py,sha256=sq6HrjNLfK-UWr9Cuqxun_DhHLPdegX7j7pKYcEYcJM,7169
5
- notoecd-0.1.1.dist-info/METADATA,sha256=YphZ-il5WvyxK0u1w3cI_vdH2uUdCLaWd3ms981SovI,3180
6
- notoecd-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
- notoecd-0.1.1.dist-info/top_level.txt,sha256=GrcbH10OAguGh5dkpzst216N_C-NtZ-QF1nlXiUpeLs,8
8
- notoecd-0.1.1.dist-info/RECORD,,