PyPI - notoecd - Versions diffs - 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl - Mend

notoecd-0.1.1-py3-none-any.whl → notoecd-0.1.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

notoecd/__init__.py +1 -1
notoecd/calls.py +2 -2
notoecd/datasets.py +50 -47
{notoecd-0.1.1.dist-info → notoecd-0.1.3.dist-info}/METADATA +115 -113
notoecd-0.1.3.dist-info/RECORD +9 -0
notoecd-0.1.3.dist-info/licenses/LICENSE +21 -0
notoecd-0.1.1.dist-info/RECORD +0 -8
{notoecd-0.1.1.dist-info → notoecd-0.1.3.dist-info}/WHEEL +0 -0
{notoecd-0.1.1.dist-info → notoecd-0.1.3.dist-info}/top_level.txt +0 -0

notoecd/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-__version__ = "0.1.1"
+__version__ = "0.1.3"
 from .calls import get_df
 from .structure import get_structure

notoecd/calls.py CHANGED Viewed

@@ -3,7 +3,7 @@ from typing import Union
 from functools import lru_cache
 from .structure import get_structure
-@lru_cache(maxsize=256)
+@lru_cache(maxsize=64)
 def _fetch_df(url: str) -> pd.DataFrame:
     return pd.read_csv(url, storage_options={"User-Agent": "Mozilla/5.0"})
@@ -38,7 +38,7 @@ def _build_filter_expression(
 def get_df(
     agencyID: str,
     dataflowID: str,
-    filters: Union[str, dict],
+    filters: Union[str, dict] = "",
     version: str = "",
 ) -> pd.DataFrame:

notoecd/datasets.py CHANGED Viewed

@@ -3,7 +3,6 @@ import html
 import requests
 import unicodedata
 import pandas as pd
-from typing import Union, List
 import xml.etree.ElementTree as ET
 url = "https://sdmx.oecd.org/public/rest/dataflow/all"
@@ -18,45 +17,53 @@ NS = {
 _ws_re = re.compile(r"\s+")
 _tag_re = re.compile(r"<[^>]+>")
-def _clean_text(s: str | None) -> str | None:
+def _clean(s: str | None) -> str | None:
     if s is None: return None
     s = html.unescape(s)
     s = _tag_re.sub("", s)
     s = _ws_re.sub(" ", s).strip()
     return s or None
-headers = {
-    "Accept": "application/vnd.sdmx.structure+xml;version=2.1"
-}
+# Cache
+_datasets: pd.DataFrame | None = None
-r = requests.get(url, headers=headers, timeout=30)
-r.raise_for_status()
-root = ET.fromstring(r.content)
-rows = []
-for df in root.findall(".//structure:Dataflow", NS):
-    dataflow_id = df.attrib.get("id")
-    agency_id = df.attrib.get("agencyID")
-    name_elem = df.find("common:Name[@xml:lang='en']", NS)
-    desc_elem = df.find("common:Description[@xml:lang='en']", NS)
-    name = _clean_text("".join(name_elem.itertext())) if name_elem is not None else None
-    desc_raw = "".join(desc_elem.itertext()) if desc_elem is not None else None
-    desc = _clean_text(desc_raw)
-    rows.append(
-        {
-            "dataflowID": dataflow_id,
-            "agencyID": agency_id,
-            "name": name,
-            "description": desc,
-        }
-    )
-datasets = pd.DataFrame(rows)
+def _load_datasets() -> pd.DataFrame:
+    """
+    Loads OECD datasets and keeps them in memory.
+    """
+    global _datasets
+    if _datasets is not None: return _datasets
+    headers = {"Accept": "application/vnd.sdmx.structure+xml;version=2.1"}
+    r = requests.get(url, headers=headers, timeout=30)
+    r.raise_for_status()
+    root = ET.fromstring(r.content)
+    rows = []
+    for df in root.findall(".//structure:Dataflow", NS):
+        dataflow_id = df.attrib.get("id")
+        agency_id = df.attrib.get("agencyID")
+        name_elem = df.find("common:Name[@xml:lang='en']", NS)
+        desc_elem = df.find("common:Description[@xml:lang='en']", NS)
+        name = _clean("".join(name_elem.itertext())) if name_elem is not None else None
+        desc_raw = "".join(desc_elem.itertext()) if desc_elem is not None else None
+        desc = _clean(desc_raw)
+        rows.append(
+            {
+                "dataflowID": dataflow_id,
+                "agencyID": agency_id,
+                "name": name,
+                "description": desc,
+            }
+        )
+    _datasets = pd.DataFrame(rows)
+    return _datasets
-def search_keywords(keywords: Union[str, List[str]]) -> pd.DataFrame:
+def search_keywords(*keywords: str) -> pd.DataFrame:
     """
     Searches OECD datasets for a set of keywords.
@@ -66,14 +73,12 @@ def search_keywords(keywords: Union[str, List[str]]) -> pd.DataFrame:
     Returns:
         pd.DataFrame: Matching rows.
     """
+    datasets = _load_datasets()
-    # Normalize keywords input
-    if isinstance(keywords, str): keywords = [keywords]
-    elif not isinstance(keywords, list): raise TypeError("keywords must be a string or list of strings")
-    # Clean and drop empty keywords
+    # Clean and validate keywords
     keywords = [k for k in keywords if isinstance(k, str) and k.strip()]
-    if not keywords: raise ValueError("No valid keywords provided.")
+    if not keywords:
+        raise ValueError("No valid keywords provided.")
     def _normalize_series(s: pd.Series) -> pd.Series:
         s = s.fillna("").astype(str).str.lower()
@@ -85,31 +90,29 @@ def search_keywords(keywords: Union[str, List[str]]) -> pd.DataFrame:
         )
     # Combined normalized text for each row
-    text = datasets["name"].fillna("").astype(str) + " " + datasets["description"].fillna("").astype(str)
+    text = (
+        datasets["name"].fillna("").astype(str)
+        + " "
+        + datasets["description"].fillna("").astype(str)
+    )
     text_norm = _normalize_series(text)
-    # Normalize keywords similarly
     def _normalize_kw(kw: str) -> str:
-        kw = kw.lower()
-        kw = unicodedata.normalize("NFKD", kw)
+        kw = unicodedata.normalize("NFKD", kw.lower())
         return "".join(ch for ch in kw if not unicodedata.combining(ch))
     norm_keywords = [_normalize_kw(k) for k in keywords]
-    # Vectorized OR search + simple score = count of matching keywords
     overall_mask = pd.Series(False, index=datasets.index)
     score = pd.Series(0, index=datasets.index, dtype="int64")
     for kw in norm_keywords:
-        if not kw: continue
-        # plain substring search, no regex
         m = text_norm.str.contains(kw, na=False, regex=False)
         overall_mask |= m
         score = score.add(m.astype("int8"), fill_value=0)
-    # Filter and sort by relevance
     result = datasets.loc[overall_mask].copy()
     result["_match_score"] = score.loc[overall_mask]
     result = result.sort_values("_match_score", ascending=False)
-    return result[['agencyID', 'dataflowID', 'name', 'description']]
+    return result[["agencyID", "dataflowID", "name", "description"]]

{notoecd-0.1.1.dist-info → notoecd-0.1.3.dist-info}/METADATA RENAMED Viewed

@@ -1,113 +1,115 @@
-Metadata-Version: 2.4
-Name: notoecd
-Version: 0.1.1
-Summary: Library for interacting with the OECD Data Explorer through Python
-Author-email: Daniel Vegara Balsa <daniel.vegarabalsa@oecd.org>
-License-Expression: MIT
-Project-URL: Homepage, https://github.com/dani-37/notoecd
-Requires-Python: >=3.10
-Description-Content-Type: text/markdown
-Requires-Dist: pandas>=2.0
-Requires-Dist: requests>=2.31
-# notoecd
-⚠️ **Unofficial package, not endorsed by the OECD.**
-A lightweight Python interface for exploring OECD SDMX structures and downloading OECD datasets.
-The package provides utilities for:
-- Discovering dataset metadata
-- Searching for relevant datasets using keyword matching
-- Exploring the structure and code lists of a dataset
-- Fetching filtered SDMX data directly into a pandas DataFrame
-------------------------------------------------------------
-## Installation
-You can install the package by running:
-    pip install notoecd
-------------------------------------------------------------
-## Quick Start
-    import notoecd
-The main functions in this module are:
-    search_keywords(keywords) -> pd.DataFrame
-    get_structure(agencyID, dataflowID) -> Structure
-    get_df(agencyID, dataflowID, filters) -> pd.DataFrame
-------------------------------------------------------------
-## Searching for datasets
-`search_keywords` performs:
-- Normalized text matching
-- Accent-insensitive search
-- Multi-keyword OR matching
-- Ranking by number of matched keywords
-Example:
-    hits = notoecd.search_keywords(['gross domestic product', 'tl2', 'tl3'])
-This returns datasets that mention GDP and regional levels (TL2/TL3). It gives their name, description, and identifiers (agencyID and dataflowID), which we will need for the next step.
-------------------------------------------------------------
-## Inspecting dataset structure
-Once a dataset is identified, load its SDMX structure:
-    dataset = 'Gross domestic product - Regions'
-    agencyID = 'OECD.CFE.EDS'
-    dataflowID = 'DSD_REG_ECO@DF_GDP'
-    s = notoecd.get_structure(agencyID, dataflowID)
-### Table of contents
-    s.toc
-This shows all filters and their available values.
-### Exploring code values
-    s.explain_vals('MEASURE')
-    s.explain_vals('UNIT_MEASURE')
-This shows the available measures and units used in the dataset.
-------------------------------------------------------------
-## Filtering and downloading data
-To download data, build a dictionary of filters.
-Keys correspond to SDMX dimensions, values are strings or lists (for multiple values):
-    filters = {
-        'territorial_level': ['tl2', 'tl3'],
-        'measure': 'gdp',
-        'prices': 'Q',
-        'unit_measure': 'USD_PPP_PS'
-    }
-Fetch the filtered dataset:
-    df = notoecd.get_df(agency, dataflow, filters)
-    df.head()
-The returned object is a pandas DataFrame containing the requested subset of OECD SDMX data.
-------------------------------------------------------------
-## Examples
-You can see this full example as a notebook called example.ipynb.
+Metadata-Version: 2.4
+Name: notoecd
+Version: 0.1.3
+Summary: Unofficial library for interacting with the OECD Data Explorer through Python.
+Author-email: Daniel Vegara Balsa <daniel.vegarabalsa@oecd.org>
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/dani-37/notoecd
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pandas>=2.0
+Requires-Dist: requests>=2.31
+Dynamic: license-file
+# notoecd
+⚠️ **Unofficial package, not endorsed by the OECD.**
+A lightweight Python interface for exploring OECD SDMX structures and downloading OECD datasets.
+The package provides utilities for:
+- Discovering dataset metadata
+- Searching for relevant datasets using keyword matching
+- Exploring the structure and code lists of a dataset
+- Fetching filtered SDMX data directly into a pandas DataFrame
+------------------------------------------------------------
+## Installation
+You can install the package by running:
+    pip install notoecd
+------------------------------------------------------------
+## Quick Start
+    import notoecd
+The main functions in this module are:
+    search_keywords(keywords) -> pd.DataFrame
+    get_structure(agencyID, dataflowID) -> Structure
+    get_df(agencyID, dataflowID, filters) -> pd.DataFrame
+------------------------------------------------------------
+## Searching for datasets
+`search_keywords` performs:
+- Normalized text matching
+- Accent-insensitive search
+- Multi-keyword OR matching
+- Ranking by number of matched keywords
+Example:
+    hits = notoecd.search_keywords(['gross domestic product', 'tl2', 'tl3'])
+This returns datasets that mention GDP and regional levels (TL2/TL3). It gives their name, description, and identifiers (agencyID and dataflowID), which we will need for the next step.
+------------------------------------------------------------
+## Inspecting dataset structure
+Once a dataset is identified, load its SDMX structure:
+    dataset = 'Gross domestic product - Regions'
+    agencyID = 'OECD.CFE.EDS'
+    dataflowID = 'DSD_REG_ECO@DF_GDP'
+    s = notoecd.get_structure(agencyID, dataflowID)
+### Table of contents
+    s.toc
+This shows all filters and their available values.
+### Exploring code values
+    s.explain_vals('MEASURE')
+    s.explain_vals('UNIT_MEASURE')
+This shows the available measures and units used in the dataset.
+------------------------------------------------------------
+## Filtering and downloading data
+To download data, build a dictionary of filters.
+Keys correspond to SDMX dimensions, values are strings or lists (for multiple values):
+    filters = {
+        'territorial_level': ['tl2', 'tl3'],
+        'measure': 'gdp',
+        'prices': 'Q',
+        'unit_measure': 'USD_PPP_PS'
+    }
+Fetch the filtered dataset:
+    df = notoecd.get_df(agency, dataflow, filters)
+    df.head()
+The returned object is a pandas DataFrame containing the requested subset of OECD SDMX data.
+------------------------------------------------------------
+## Examples
+You can see this full example as a notebook called example.ipynb.

notoecd-0.1.3.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+notoecd/__init__.py,sha256=gW8v_eB09ROOISMPTDD8pwSg0xtvNRZ-06lD2Q76Xb8,189
+notoecd/calls.py,sha256=SFM4kerc-K43Yo6oDBCsnvCIpN2Bg0-sHKpRfAujS-o,1496
+notoecd/datasets.py,sha256=c8iz2HzWyCGGQINNnzlHG-kJMqsDKFbDObvK11QZU0Y,3751
+notoecd/structure.py,sha256=sq6HrjNLfK-UWr9Cuqxun_DhHLPdegX7j7pKYcEYcJM,7169
+notoecd-0.1.3.dist-info/licenses/LICENSE,sha256=jb9xgeCKfW-VCXFQtYmiM_SZ9tC2zPGtOIVsE5G17W8,1076
+notoecd-0.1.3.dist-info/METADATA,sha256=oHUyrIqCgzELJHZPrJ3cQKvpwWbR25d9YItSy5X8k6c,3123
+notoecd-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+notoecd-0.1.3.dist-info/top_level.txt,sha256=GrcbH10OAguGh5dkpzst216N_C-NtZ-QF1nlXiUpeLs,8
+notoecd-0.1.3.dist-info/RECORD,,

notoecd-0.1.3.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Daniel Vegara Balsa
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

notoecd-0.1.1.dist-info/RECORD DELETED Viewed

@@ -1,8 +0,0 @@
-notoecd/__init__.py,sha256=134t-qMBmEQN_mtpGOSfQ5zr128wWT68J0e5RIBzN0g,189
-notoecd/calls.py,sha256=CDx-1wJ4myXtoihIfTvjHoXBvIwylvv7AdN_UL5gnF4,1492
-notoecd/datasets.py,sha256=nxLNP0L28mXTKYpdR6BQN5Tk6CKoQS7dygm8twnIVSg,3845
-notoecd/structure.py,sha256=sq6HrjNLfK-UWr9Cuqxun_DhHLPdegX7j7pKYcEYcJM,7169
-notoecd-0.1.1.dist-info/METADATA,sha256=YphZ-il5WvyxK0u1w3cI_vdH2uUdCLaWd3ms981SovI,3180
-notoecd-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-notoecd-0.1.1.dist-info/top_level.txt,sha256=GrcbH10OAguGh5dkpzst216N_C-NtZ-QF1nlXiUpeLs,8
-notoecd-0.1.1.dist-info/RECORD,,

{notoecd-0.1.1.dist-info → notoecd-0.1.3.dist-info}/WHEEL RENAMED Viewed

File without changes

{notoecd-0.1.1.dist-info → notoecd-0.1.3.dist-info}/top_level.txt RENAMED Viewed

File without changes

notoecd 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

notoecd-0.1.1-py3-none-any.whl → notoecd-0.1.3-py3-none-any.whl