PyPI - notoecd - Versions diffs - 0.1.0__tar.gz - Mend

notoecd 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

notoecd-0.1.0/PKG-INFO +114 -0
notoecd-0.1.0/README.md +102 -0
notoecd-0.1.0/notoecd/notoecd.egg-info/PKG-INFO +114 -0
notoecd-0.1.0/notoecd/notoecd.egg-info/SOURCES.txt +11 -0
notoecd-0.1.0/notoecd/notoecd.egg-info/dependency_links.txt +1 -0
notoecd-0.1.0/notoecd/notoecd.egg-info/requires.txt +2 -0
notoecd-0.1.0/notoecd/notoecd.egg-info/top_level.txt +1 -0
notoecd-0.1.0/pyproject.toml +30 -0
notoecd-0.1.0/setup.cfg +4 -0
notoecd-0.1.0/tests/test_api.py +11 -0
notoecd-0.1.0/tests/test_calls.py +70 -0
notoecd-0.1.0/tests/test_datasets.py +100 -0
notoecd-0.1.0/tests/test_structure.py +98 -0

notoecd-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,114 @@
+Metadata-Version: 2.4
+Name: notoecd
+Version: 0.1.0
+Summary: Library for interacting with the OECD Data Explorer through Python
+Author-email: Daniel Vegara Balsa <daniel.vegarabalsa@oecd.org>
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/dani-37/notoecd
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+Requires-Dist: pandas>=2.0
+Requires-Dist: requests>=2.31
+# notoecd
+⚠️ **Unofficial package, not endorsed by the OECD.**
+A lightweight Python interface for exploring OECD SDMX structures and downloading OECD regional datasets.
+The package provides utilities for:
+- Discovering dataset metadata
+- Searching for relevant datasets using keyword matching
+- Exploring the structure and code lists of a dataset
+- Fetching filtered SDMX data directly into a pandas DataFrame
+------------------------------------------------------------
+## Installation
+You can install the package by running:
+    pip install notoecd
+------------------------------------------------------------
+## Quick Start
+    import notoecd
+The main functions in this module are:
+    search_keywords(keywords) -> pd.DataFrame
+    get_structure(agencyID, dataflowID) -> Structure
+    get_df(agencyID, dataflowID, filters) -> pd.DataFrame
+------------------------------------------------------------
+## Searching for datasets
+`search_keywords` performs:
+- Normalized text matching
+- Accent-insensitive search
+- Multi-keyword OR matching
+- Ranking by number of matched keywords
+Example:
+    hits = notoecd.search_keywords(['gross domestic product', 'tl2', 'tl3'])
+This returns datasets that mention GDP or the regional levels TL2/TL3, ranked by how many of the keywords they match. For each dataset it gives the name, description, and identifiers (agencyID and dataflowID), which we will need for the next step.
+------------------------------------------------------------
+## Inspecting dataset structure
+Once a dataset is identified, load its SDMX structure:
+    dataset = 'Gross domestic product - Regions'
+    agencyID = 'OECD.CFE.EDS'
+    dataflowID = 'DSD_REG_ECO@DF_GDP'
+    s = notoecd.get_structure(agencyID, dataflowID)
+### Table of contents
+    s.toc
+This shows all filters and their available values.
+### Exploring code values
+    s.explain_vals('MEASURE')
+    s.explain_vals('UNIT_MEASURE')
+This shows the available measures and units used in the dataset.
+------------------------------------------------------------
+## Filtering and downloading data
+To download data, build a dictionary of filters.
+Keys correspond to SDMX dimensions, values are strings or lists (for multiple values):
+    filters = {
+        'territorial_level': ['tl2', 'tl3'],
+        'measure': 'gdp',
+        'prices': 'Q',
+        'unit_measure': 'USD_PPP_PS'
+    }
+Fetch the filtered dataset:
+    df = notoecd.get_df(agencyID, dataflowID, filters)
+    df.head()
+The returned object is a pandas DataFrame containing the requested subset of OECD SDMX data.
+------------------------------------------------------------
+## Examples
+You can see this full example as a notebook called example.ipynb.

notoecd-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,102 @@
+# notoecd
+⚠️ **Unofficial package, not endorsed by the OECD.**
+A lightweight Python interface for exploring OECD SDMX structures and downloading OECD regional datasets.
+The package provides utilities for:
+- Discovering dataset metadata
+- Searching for relevant datasets using keyword matching
+- Exploring the structure and code lists of a dataset
+- Fetching filtered SDMX data directly into a pandas DataFrame
+------------------------------------------------------------
+## Installation
+You can install the package by running:
+    pip install notoecd
+------------------------------------------------------------
+## Quick Start
+    import notoecd
+The main functions in this module are:
+    search_keywords(keywords) -> pd.DataFrame
+    get_structure(agencyID, dataflowID) -> Structure
+    get_df(agencyID, dataflowID, filters) -> pd.DataFrame
+------------------------------------------------------------
+## Searching for datasets
+`search_keywords` performs:
+- Normalized text matching
+- Accent-insensitive search
+- Multi-keyword OR matching
+- Ranking by number of matched keywords
+Example:
+    hits = notoecd.search_keywords(['gross domestic product', 'tl2', 'tl3'])
+This returns datasets that mention GDP or the regional levels TL2/TL3, ranked by how many of the keywords they match. For each dataset it gives the name, description, and identifiers (agencyID and dataflowID), which we will need for the next step.
+------------------------------------------------------------
+## Inspecting dataset structure
+Once a dataset is identified, load its SDMX structure:
+    dataset = 'Gross domestic product - Regions'
+    agencyID = 'OECD.CFE.EDS'
+    dataflowID = 'DSD_REG_ECO@DF_GDP'
+    s = notoecd.get_structure(agencyID, dataflowID)
+### Table of contents
+    s.toc
+This shows all filters and their available values.
+### Exploring code values
+    s.explain_vals('MEASURE')
+    s.explain_vals('UNIT_MEASURE')
+This shows the available measures and units used in the dataset.
+------------------------------------------------------------
+## Filtering and downloading data
+To download data, build a dictionary of filters.
+Keys correspond to SDMX dimensions, values are strings or lists (for multiple values):
+    filters = {
+        'territorial_level': ['tl2', 'tl3'],
+        'measure': 'gdp',
+        'prices': 'Q',
+        'unit_measure': 'USD_PPP_PS'
+    }
+Fetch the filtered dataset:
+    df = notoecd.get_df(agencyID, dataflowID, filters)
+    df.head()
+The returned object is a pandas DataFrame containing the requested subset of OECD SDMX data.
+------------------------------------------------------------
+## Examples
+You can see this full example as a notebook called example.ipynb.

notoecd-0.1.0/notoecd/notoecd.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,114 @@
+Metadata-Version: 2.4
+Name: notoecd
+Version: 0.1.0
+Summary: Library for interacting with the OECD Data Explorer through Python
+Author-email: Daniel Vegara Balsa <daniel.vegarabalsa@oecd.org>
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/dani-37/notoecd
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+Requires-Dist: pandas>=2.0
+Requires-Dist: requests>=2.31
+# notoecd
+⚠️ **Unofficial package, not endorsed by the OECD.**
+A lightweight Python interface for exploring OECD SDMX structures and downloading OECD regional datasets.
+The package provides utilities for:
+- Discovering dataset metadata
+- Searching for relevant datasets using keyword matching
+- Exploring the structure and code lists of a dataset
+- Fetching filtered SDMX data directly into a pandas DataFrame
+------------------------------------------------------------
+## Installation
+You can install the package by running:
+    pip install notoecd
+------------------------------------------------------------
+## Quick Start
+    import notoecd
+The main functions in this module are:
+    search_keywords(keywords) -> pd.DataFrame
+    get_structure(agencyID, dataflowID) -> Structure
+    get_df(agencyID, dataflowID, filters) -> pd.DataFrame
+------------------------------------------------------------
+## Searching for datasets
+`search_keywords` performs:
+- Normalized text matching
+- Accent-insensitive search
+- Multi-keyword OR matching
+- Ranking by number of matched keywords
+Example:
+    hits = notoecd.search_keywords(['gross domestic product', 'tl2', 'tl3'])
+This returns datasets that mention GDP or the regional levels TL2/TL3, ranked by how many of the keywords they match. For each dataset it gives the name, description, and identifiers (agencyID and dataflowID), which we will need for the next step.
+------------------------------------------------------------
+## Inspecting dataset structure
+Once a dataset is identified, load its SDMX structure:
+    dataset = 'Gross domestic product - Regions'
+    agencyID = 'OECD.CFE.EDS'
+    dataflowID = 'DSD_REG_ECO@DF_GDP'
+    s = notoecd.get_structure(agencyID, dataflowID)
+### Table of contents
+    s.toc
+This shows all filters and their available values.
+### Exploring code values
+    s.explain_vals('MEASURE')
+    s.explain_vals('UNIT_MEASURE')
+This shows the available measures and units used in the dataset.
+------------------------------------------------------------
+## Filtering and downloading data
+To download data, build a dictionary of filters.
+Keys correspond to SDMX dimensions, values are strings or lists (for multiple values):
+    filters = {
+        'territorial_level': ['tl2', 'tl3'],
+        'measure': 'gdp',
+        'prices': 'Q',
+        'unit_measure': 'USD_PPP_PS'
+    }
+Fetch the filtered dataset:
+    df = notoecd.get_df(agencyID, dataflowID, filters)
+    df.head()
+The returned object is a pandas DataFrame containing the requested subset of OECD SDMX data.
+------------------------------------------------------------
+## Examples
+You can see this full example as a notebook called example.ipynb.

notoecd-0.1.0/notoecd/notoecd.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,11 @@
+README.md
+pyproject.toml
+notoecd/notoecd.egg-info/PKG-INFO
+notoecd/notoecd.egg-info/SOURCES.txt
+notoecd/notoecd.egg-info/dependency_links.txt
+notoecd/notoecd.egg-info/requires.txt
+notoecd/notoecd.egg-info/top_level.txt
+tests/test_api.py
+tests/test_calls.py
+tests/test_datasets.py
+tests/test_structure.py

notoecd-0.1.0/notoecd/notoecd.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

notoecd-0.1.0/notoecd/notoecd.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ pandas>=2.0
2	+ requests>=2.31

notoecd-0.1.0/notoecd/notoecd.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

notoecd-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,30 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "notoecd"
+version = "0.1.0"
+description = "Library for interacting with the OECD Data Explorer through Python"
+readme = "README.md"
+license = "MIT"
+requires-python = ">=3.10"
+authors = [
+    { name = "Daniel Vegara Balsa", email = "daniel.vegarabalsa@oecd.org" }
+]
+dependencies = [
+  "pandas>=2.0",
+  "requests>=2.31"
+]
+[project.urls]
+Homepage = "https://github.com/dani-37/notoecd"
+# The former package-dir = {"" = "notoecd"} mapping, combined with
+# where = ["notoecd"], made setuptools look for packages *inside* notoecd/.
+# The empty top_level.txt and the SOURCES.txt of this release (no package
+# modules listed) show that nothing was discovered, so no importable code shipped.
+# NOTE(review): assumes the import package lives at ./notoecd/__init__.py
+# (flat layout) - confirm against the repository before releasing.
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["notoecd*"]

notoecd-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

notoecd-0.1.0/tests/test_api.py ADDED Viewed

@@ -0,0 +1,11 @@
+import notoecd
+def test_public_api_exports():
+    """Each documented entry point is reachable on the package and is callable."""
+    for public_name in ("get_df", "get_structure", "search_keywords"):
+        assert callable(getattr(notoecd, public_name))
+def test_import_package():
+    """Importing the package by name through importlib yields a module object."""
+    from importlib import import_module
+    assert import_module("notoecd") is not None

notoecd-0.1.0/tests/test_calls.py ADDED Viewed

@@ -0,0 +1,70 @@
+import pandas as pd
+from types import SimpleNamespace
+from unittest.mock import patch
+import notoecd.calls as calls
+def _fake_structure_with_toc_titles(titles):
+    """Build a stand-in structure exposing only a ``toc`` frame with a ``title`` column."""
+    return SimpleNamespace(toc=pd.DataFrame({"title": titles}))
+def test_build_filter_expression_orders_by_toc_and_uppercases():
+    """Expression parts follow the TOC title order and are upper-cased.
+
+    The filters are deliberately supplied in a different order from the TOC
+    titles; otherwise an implementation that simply walked the dict would
+    pass as well and the ordering claim in the test name would go unchecked.
+    """
+    fake_s = _fake_structure_with_toc_titles(["PRICES", "UNIT_MEASURE", "MEASURE"])
+    filters = {"measure": "gdp", "unit_measure": ["USD_PPP_PS"], "prices": "q"}
+    with patch("notoecd.calls.get_structure", return_value=fake_s):
+        expr = calls._build_filter_expression("A", "B", filters)
+    assert expr == "Q.USD_PPP_PS.GDP"
+def test_build_filter_expression_missing_dims_are_empty_parts():
+    """A dimension with no filter leaves an empty slot between the dots."""
+    stub = _fake_structure_with_toc_titles(["A", "B", "C"])
+    only_middle = {"b": "x"}
+    with patch("notoecd.calls.get_structure", return_value=stub):
+        built = calls._build_filter_expression("A", "B", only_middle)
+    assert built == ".X."
+def test_build_filter_expression_multi_value_joins_plus():
+    """A list of values for one dimension is joined with ``+`` after upper-casing."""
+    stub = _fake_structure_with_toc_titles(["territorial_level"])
+    requested = {"territorial_level": ["tl2", "tl3"]}
+    with patch("notoecd.calls.get_structure", return_value=stub):
+        built = calls._build_filter_expression("A", "B", requested)
+    assert built == "TL2+TL3"
+def test_get_df_builds_url_and_returns_copy():
+    """``get_df`` hands ``pd.read_csv`` the expected URL and returns a copy of its frame."""
+    # Drop cached fetches so the patched read_csv is really invoked.
+    calls._fetch_df.cache_clear()
+    stub_structure = _fake_structure_with_toc_titles(["PRICES"])
+    canned = pd.DataFrame({"x": [1, 2]})
+    with patch("notoecd.calls.get_structure", return_value=stub_structure):
+        with patch("notoecd.calls.pd.read_csv", return_value=canned) as mock_read_csv:
+            out = calls.get_df("OECD.CFE.EDS", "DSD_REG_ECO@DF_GDP", {"prices": "q"})
+    assert out.equals(canned)
+    assert out is not canned  # must be a copy()
+    # Exactly one positional argument (the URL) is expected.
+    (url,), kwargs = mock_read_csv.call_args
+    assert url.startswith("https://sdmx.oecd.org/public/rest/data/")
+    expected_fragments = (
+        "OECD.CFE.EDS,DSD_REG_ECO@DF_GDP,",
+        "/Q",
+        "dimensionAtObservation=AllDimensions",
+        "format=csvfile",
+    )
+    for fragment in expected_fragments:
+        assert fragment in url
+    assert kwargs["storage_options"]["User-Agent"]
+def test_get_df_accepts_string_filter_expression_and_uppercases():
+    """A ready-made string filter is trimmed and upper-cased before it reaches the URL."""
+    # Drop cached fetches so the patched read_csv is really invoked.
+    calls._fetch_df.cache_clear()
+    canned = pd.DataFrame({"x": [1]})
+    with patch("notoecd.calls.pd.read_csv", return_value=canned) as mock_read_csv:
+        calls.get_df("A", "B", " tl2+tl3..gdp ")
+    (url,), _kwargs = mock_read_csv.call_args
+    assert "/TL2+TL3..GDP" in url

notoecd-0.1.0/tests/test_datasets.py ADDED Viewed

@@ -0,0 +1,100 @@
+import importlib
+import requests
+import pandas as pd
+def _fake_dataflow_all_xml() -> bytes:
+    """Return a UTF-8 encoded SDMX ``dataflow/all`` payload listing three dataflows.
+
+    One of the names ("Café prices") carries an accent, which is why the
+    document is written as text and encoded rather than as a bytes literal.
+    """
+    return """<?xml version="1.0" encoding="UTF-8"?>
+            <message:Structure
+            xmlns:message="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message"
+            xmlns:structure="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure"
+            xmlns:common="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common"
+            xmlns:xml="http://www.w3.org/XML/1998/namespace"
+            >
+            <message:Structures>
+                <structure:Dataflows>
+                <structure:Dataflow id="DSD_REG_ECO@DF_GDP" agencyID="OECD.CFE.EDS">
+                    <common:Name xml:lang="en">Gross domestic product - Regions</common:Name>
+                    <common:Description xml:lang="en">GDP by region</common:Description>
+                </structure:Dataflow>
+                <structure:Dataflow id="DF_CAFE" agencyID="OECD.TEST">
+                    <common:Name xml:lang="en">Café prices</common:Name>
+                    <common:Description xml:lang="en">Prices in cafes</common:Description>
+                </structure:Dataflow>
+                <structure:Dataflow id="DF_OTHER" agencyID="OECD">
+                    <common:Name xml:lang="en">Other dataset</common:Name>
+                    <common:Description xml:lang="en">Other description</common:Description>
+                </structure:Dataflow>
+                </structure:Dataflows>
+            </message:Structures>
+            </message:Structure>
+            """.encode("utf-8")
+class _Resp:
+    """Minimal stand-in for a ``requests`` response: a body plus a status code."""
+    def __init__(self, content: bytes, status_code: int = 200):
+        self.status_code = status_code
+        self.content = content
+    def raise_for_status(self):
+        """Raise ``requests.HTTPError`` when the status code is 400 or above."""
+        if self.status_code < 400:
+            return
+        raise requests.HTTPError(f"HTTP {self.status_code}")
+def test_datasets_dataframe_built_on_import(monkeypatch):
+    """Reloading the module against a stubbed ``requests.get`` yields a three-row frame."""
+    def fake_get(url, *args, **kwargs):
+        # Only the dataflow listing endpoint may be hit during import.
+        if not url.endswith("/public/rest/dataflow/all"):
+            raise AssertionError(f"Unexpected URL in test_datasets: {url}")
+        return _Resp(_fake_dataflow_all_xml())
+    monkeypatch.setattr(requests, "get", fake_get)
+    # Reload so the module-level frame is rebuilt from the stubbed response.
+    datasets_mod = importlib.reload(importlib.import_module("notoecd.datasets"))
+    required_columns = {"agencyID", "dataflowID", "name", "description"}
+    assert isinstance(datasets_mod.datasets, pd.DataFrame)
+    assert required_columns.issubset(datasets_mod.datasets.columns)
+    assert len(datasets_mod.datasets) == 3
+def test_search_keywords_or_and_normalization(monkeypatch):
+    """Keyword search ORs its terms and matches accent-insensitively."""
+    def fake_get(url, *args, **kwargs):
+        # Only the dataflow listing endpoint may be hit during import.
+        if url.endswith("/public/rest/dataflow/all"):
+            return _Resp(_fake_dataflow_all_xml())
+        raise AssertionError(f"Unexpected URL in test_datasets: {url}")
+    monkeypatch.setattr(requests, "get", fake_get)
+    datasets_mod = importlib.import_module("notoecd.datasets")
+    # Reload so the module-level frame is rebuilt from the stubbed response.
+    importlib.reload(datasets_mod)
+    # OR behaviour: one hit comes from "gross domestic product", the other from
+    # "cafe", which has to match the accented name "Café prices".
+    hits = datasets_mod.search_keywords(["gross domestic product", "cafe"])
+    assert len(hits) == 2
+    assert (hits["dataflowID"] == "DSD_REG_ECO@DF_GDP").any()
+    assert (hits["dataflowID"] == "DF_CAFE").any()
+    names = " ".join(hits["name"].fillna("").tolist()).lower()
+    # Both matched names must be present; the previous or-chain was satisfied
+    # by either one alone. The café/cafe alternative stays because the test
+    # does not pin whether returned names keep their accents.
+    assert "gross domestic product" in names
+    assert ("café" in names) or ("cafe" in names)
+def test_search_keywords_rejects_empty(monkeypatch):
+    """Keywords that are empty or whitespace-only must raise ``ValueError``."""
+    def fake_get(url, *args, **kwargs):
+        # Only the dataflow listing endpoint may be hit during import.
+        if not url.endswith("/public/rest/dataflow/all"):
+            raise AssertionError(f"Unexpected URL in test_datasets: {url}")
+        return _Resp(_fake_dataflow_all_xml())
+    monkeypatch.setattr(requests, "get", fake_get)
+    datasets_mod = importlib.reload(importlib.import_module("notoecd.datasets"))
+    try:
+        datasets_mod.search_keywords(["   ", ""])
+    except ValueError:
+        return
+    raise AssertionError("Expected ValueError for empty keywords")

notoecd-0.1.0/tests/test_structure.py ADDED Viewed

@@ -0,0 +1,98 @@
+import importlib
+import requests
+import pandas as pd
+def _fake_structure_xml() -> bytes:
+    """Return a small SDMX structure message describing a single PRICES dimension.
+
+    The document carries one concept, one two-code codelist (Q, V), a content
+    constraint listing both codes, and a data structure with PRICES at position 1.
+    """
+    payload = b"""<?xml version="1.0" encoding="UTF-8"?>
+                <message:Structure
+                xmlns:message="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message"
+                xmlns:structure="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure"
+                xmlns:common="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common"
+                xmlns:xml="http://www.w3.org/XML/1998/namespace"
+                >
+                <message:Structures>
+                    <structure:Concepts>
+                    <structure:ConceptScheme>
+                        <structure:Concept id="PRICES">
+                        <common:Name xml:lang="en">Prices</common:Name>
+                        <structure:CoreRepresentation>
+                            <structure:Enumeration>
+                            <Ref id="CL_PRICES"/>
+                            </structure:Enumeration>
+                        </structure:CoreRepresentation>
+                        </structure:Concept>
+                    </structure:ConceptScheme>
+                    </structure:Concepts>
+                    <structure:Codelists>
+                    <structure:Codelist id="CL_PRICES">
+                        <structure:Code id="Q">
+                        <common:Name xml:lang="en">Quarterly</common:Name>
+                        </structure:Code>
+                        <structure:Code id="V">
+                        <common:Name xml:lang="en">Volume</common:Name>
+                        </structure:Code>
+                    </structure:Codelist>
+                    </structure:Codelists>
+                    <structure:Constraints>
+                    <structure:ContentConstraint>
+                        <structure:CubeRegion>
+                        <common:KeyValue id="PRICES">
+                            <common:Value>Q</common:Value>
+                            <common:Value>V</common:Value>
+                        </common:KeyValue>
+                        </structure:CubeRegion>
+                    </structure:ContentConstraint>
+                    </structure:Constraints>
+                    <structure:DataStructures>
+                    <structure:DataStructure>
+                        <structure:DataStructureComponents>
+                        <structure:DimensionList>
+                            <structure:Dimension id="PRICES" position="1"/>
+                        </structure:DimensionList>
+                        </structure:DataStructureComponents>
+                    </structure:DataStructure>
+                    </structure:DataStructures>
+                </message:Structures>
+                </message:Structure>
+                """
+    return payload
+class _Resp:
+    """Minimal stand-in for a ``requests`` response: a body plus a status code.
+
+    It mirrors the fake response in tests/test_datasets.py, including
+    ``raise_for_status``, so the stub keeps working if the code under test
+    checks the HTTP status before parsing the body.
+    """
+    def __init__(self, content: bytes, status_code: int = 200):
+        self.content = content
+        self.status_code = status_code
+    def raise_for_status(self):
+        """Raise ``requests.HTTPError`` when the status code is 400 or above."""
+        if self.status_code >= 400:
+            raise requests.HTTPError(f"HTTP {self.status_code}")
+def test_get_structure_builds_toc_values_and_explain(monkeypatch):
+    """``get_structure`` parses the stub XML into a TOC, a concepts dict and code labels."""
+    def fake_get(url, *args, **kwargs):
+        # Anything other than a dataflow request with ?references=all is a test failure.
+        if "/public/rest/dataflow/" not in url or "?references=all" not in url:
+            raise AssertionError(f"Unexpected URL in test_structure: {url}")
+        return _Resp(_fake_structure_xml())
+    # Patch the requests used by notoecd.structure
+    monkeypatch.setattr(requests, "get", fake_get)
+    structure_mod = importlib.reload(importlib.import_module("notoecd.structure"))
+    # Clear cache so test is isolated
+    structure_mod.get_structure.cache_clear()
+    s = structure_mod.get_structure("OECD.CFE.EDS", "DSD_REG_ECO@DF_GDP")
+    # Table of contents: one row for PRICES with the two allowed codes.
+    assert isinstance(s.toc, pd.DataFrame)
+    assert list(s.toc["title"]) == ["PRICES"]
+    assert s.toc.loc[0, "values"] == ["Q", "V"]
+    # Concepts mapping holds the codelist bucket and the concept itself.
+    assert isinstance(s.concepts, dict)
+    for expected_key in ("CODELISTS", "PRICES"):
+        assert expected_key in s.concepts
+    code_labels = s.explain_vals("PRICES")
+    assert code_labels == {"Q": "Quarterly", "V": "Volume"}