notoecd 0.1.3__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: notoecd
3
- Version: 0.1.3
3
+ Version: 0.1.5
4
4
  Summary: Unofficial library for interacting with the OECD Data Explorer through Python.
5
5
  Author-email: Daniel Vegara Balsa <daniel.vegarabalsa@oecd.org>
6
6
  License-Expression: MIT
@@ -16,16 +16,16 @@ Dynamic: license-file
16
16
 
17
17
  ⚠️ **Unofficial package, not endorsed by the OECD.**
18
18
 
19
- A lightweight Python interface for exploring OECD SDMX structures and downloading OECD datasets.
19
+ A lightweight Python interface for exploring OECD SDMX structures and downloading OECD datasets.
20
20
 
21
21
  The package provides utilities for:
22
22
 
23
- - Discovering dataset metadata
24
- - Searching for relevant datasets using keyword matching
25
- - Exploring the structure and code lists of a dataset
26
- - Fetching filtered SDMX data directly into a pandas DataFrame
23
+ - Discovering dataset metadata
24
+ - Searching for relevant datasets using keyword matching
25
+ - Exploring the structure and code lists of a dataset
26
+ - Fetching filtered SDMX data directly into a pandas DataFrame
27
27
 
28
- ------------------------------------------------------------
28
+ ---
29
29
 
30
30
  ## Installation
31
31
 
@@ -33,7 +33,7 @@ You can install the package by running:
33
33
 
34
34
  pip install notoecd
35
35
 
36
- ------------------------------------------------------------
36
+ ---
37
37
 
38
38
  ## Quick Start
39
39
 
@@ -45,30 +45,30 @@ The main functions in this module are:
45
45
  get_structure(agencyID, dataflowID) -> Structure
46
46
  get_df(agencyID, dataflowID, filters) -> pd.DataFrame
47
47
 
48
- ------------------------------------------------------------
48
+ ---
49
49
 
50
50
  ## Searching for datasets
51
51
 
52
52
  `search_keywords` performs:
53
53
 
54
- - Normalized text matching
55
- - Accent-insensitive search
56
- - Multi-keyword OR matching
57
- - Ranking by number of matched keywords
54
+ - Normalized text matching
55
+ - Accent-insensitive search
56
+ - Multi-keyword OR matching
57
+ - Ranking by number of matched keywords, with extra weight for matches in the dataset name
58
58
 
59
59
  Example:
60
60
 
61
- hits = notoecd.search_keywords(['gross domestic product', 'tl2', 'tl3'])
61
+ hits = notoecd.search_keywords('gross domestic product', 'tl2', 'tl3')
62
62
 
63
63
  This returns datasets that mention GDP and regional levels (TL2/TL3). It gives their name, description, and identifiers (agencyID and dataflowID), which we will need for the next step.
64
64
 
65
- ------------------------------------------------------------
65
+ ---
66
66
 
67
67
  ## Inspecting dataset structure
68
68
 
69
69
  Once a dataset is identified, load its SDMX structure:
70
70
 
71
- dataset = 'Gross domestic product - Regions'
71
+ # Gross domestic product - Regions
72
72
  agencyID = 'OECD.CFE.EDS'
73
73
  dataflowID = 'DSD_REG_ECO@DF_GDP'
74
74
 
@@ -87,7 +87,7 @@ This shows all filters and their available values.
87
87
 
88
88
  This shows the available measures and units used in the dataset.
89
89
 
90
- ------------------------------------------------------------
90
+ ---
91
91
 
92
92
  ## Filtering and downloading data
93
93
 
@@ -103,12 +103,12 @@ Keys correspond to SDMX dimensions, values are strings or lists (for multiple va
103
103
 
104
104
  Fetch the filtered dataset:
105
105
 
106
- df = notoecd.get_df(agency, dataflow, filters)
106
+ df = notoecd.get_df(agencyID, dataflowID, filters)
107
107
  df.head()
108
108
 
109
109
  The returned object is a pandas DataFrame containing the requested subset of OECD SDMX data.
110
110
 
111
- ------------------------------------------------------------
111
+ ---
112
112
 
113
113
  ## Examples
114
114
 
@@ -2,16 +2,16 @@
2
2
 
3
3
  ⚠️ **Unofficial package, not endorsed by the OECD.**
4
4
 
5
- A lightweight Python interface for exploring OECD SDMX structures and downloading OECD datasets.
5
+ A lightweight Python interface for exploring OECD SDMX structures and downloading OECD datasets.
6
6
 
7
7
  The package provides utilities for:
8
8
 
9
- - Discovering dataset metadata
10
- - Searching for relevant datasets using keyword matching
11
- - Exploring the structure and code lists of a dataset
12
- - Fetching filtered SDMX data directly into a pandas DataFrame
9
+ - Discovering dataset metadata
10
+ - Searching for relevant datasets using keyword matching
11
+ - Exploring the structure and code lists of a dataset
12
+ - Fetching filtered SDMX data directly into a pandas DataFrame
13
13
 
14
- ------------------------------------------------------------
14
+ ---
15
15
 
16
16
  ## Installation
17
17
 
@@ -19,7 +19,7 @@ You can install the package by running:
19
19
 
20
20
  pip install notoecd
21
21
 
22
- ------------------------------------------------------------
22
+ ---
23
23
 
24
24
  ## Quick Start
25
25
 
@@ -31,30 +31,30 @@ The main functions in this module are:
31
31
  get_structure(agencyID, dataflowID) -> Structure
32
32
  get_df(agencyID, dataflowID, filters) -> pd.DataFrame
33
33
 
34
- ------------------------------------------------------------
34
+ ---
35
35
 
36
36
  ## Searching for datasets
37
37
 
38
38
  `search_keywords` performs:
39
39
 
40
- - Normalized text matching
41
- - Accent-insensitive search
42
- - Multi-keyword OR matching
43
- - Ranking by number of matched keywords
40
+ - Normalized text matching
41
+ - Accent-insensitive search
42
+ - Multi-keyword OR matching
43
+ - Ranking by number of matched keywords, with extra weight for matches in the dataset name
44
44
 
45
45
  Example:
46
46
 
47
- hits = notoecd.search_keywords(['gross domestic product', 'tl2', 'tl3'])
47
+ hits = notoecd.search_keywords('gross domestic product', 'tl2', 'tl3')
48
48
 
49
49
  This returns datasets that mention GDP and regional levels (TL2/TL3). It gives their name, description, and identifiers (agencyID and dataflowID), which we will need for the next step.
50
50
 
51
- ------------------------------------------------------------
51
+ ---
52
52
 
53
53
  ## Inspecting dataset structure
54
54
 
55
55
  Once a dataset is identified, load its SDMX structure:
56
56
 
57
- dataset = 'Gross domestic product - Regions'
57
+ # Gross domestic product - Regions
58
58
  agencyID = 'OECD.CFE.EDS'
59
59
  dataflowID = 'DSD_REG_ECO@DF_GDP'
60
60
 
@@ -73,7 +73,7 @@ This shows all filters and their available values.
73
73
 
74
74
  This shows the available measures and units used in the dataset.
75
75
 
76
- ------------------------------------------------------------
76
+ ---
77
77
 
78
78
  ## Filtering and downloading data
79
79
 
@@ -89,12 +89,12 @@ Keys correspond to SDMX dimensions, values are strings or lists (for multiple va
89
89
 
90
90
  Fetch the filtered dataset:
91
91
 
92
- df = notoecd.get_df(agency, dataflow, filters)
92
+ df = notoecd.get_df(agencyID, dataflowID, filters)
93
93
  df.head()
94
94
 
95
95
  The returned object is a pandas DataFrame containing the requested subset of OECD SDMX data.
96
96
 
97
- ------------------------------------------------------------
97
+ ---
98
98
 
99
99
  ## Examples
100
100
 
@@ -1,4 +1,4 @@
1
- __version__ = "0.1.3"
1
+ __version__ = "0.1.5"
2
2
 
3
3
  from .calls import get_df
4
4
  from .structure import get_structure
@@ -0,0 +1,110 @@
1
+ import pandas as pd
2
+ from functools import lru_cache
3
+ from typing import Union, Optional
4
+ from .structure import get_structure
5
+
6
@lru_cache(maxsize=64)
def _fetch_df(url: str) -> pd.DataFrame:
    """
    Read the CSV located at ``url`` into a pandas DataFrame.

    Results are memoised per URL by ``lru_cache`` (at most 64 entries), so a
    repeated call with the same URL returns the very same DataFrame object.
    Callers should therefore not mutate the returned frame in place.

    Args:
        url (str): Location of the CSV resource to read.

    Returns:
        pd.DataFrame: The parsed CSV content.
    """
    # Passed to pandas as storage_options so the request carries a
    # browser-like User-Agent.
    request_options = {"User-Agent": "Mozilla/5.0"}
    return pd.read_csv(url, storage_options=request_options)
9
+
10
+
11
+ def _clean(s: str) -> str:
12
+ return str(s).strip().lower()
13
+
14
+
15
def _clean_dict(d: dict) -> dict:
    """
    Build a normalised copy of ``d`` using ``_clean`` on keys and values.

    Args:
        d (dict): Mapping to normalise. A value that is a list, tuple or set
            is cleaned element by element; any other value is cleaned as a
            single item.

    Returns:
        dict: New dictionary with cleaned keys. Collection values become
            lists of cleaned items, all other values become cleaned strings.
    """
    def _clean_value(value):
        # Collections are normalised item by item and always returned as a list.
        if not isinstance(value, (list, tuple, set)):
            return _clean(value)
        return [_clean(item) for item in value]

    return {_clean(key): _clean_value(value) for key, value in d.items()}
24
+
25
+
26
def _build_filter_expression(
    agencyID: str,
    dataflowID: str,
    filters: dict,
) -> str:
    """
    Turn a filter dictionary into an OECD SDMX filter expression.

    One segment is produced for every entry of the structure's ``toc.title``,
    in that order. A dimension that does not appear in ``filters`` yields an
    empty segment; several values for one dimension are joined with "+", and
    the segments are joined with ".". Dimension names and values are compared
    ignoring case and surrounding whitespace, and the final expression is
    upper-cased. Keys of ``filters`` that match no dimension are ignored.

    Args:
        agencyID (str): The data provider agency identifier.
        dataflowID (str): The dataflow identifier within the agency.
        filters (dict): Dimension names mapped to a single value or to a
            list/tuple/set of values. A value may be either a key of the
            mapping returned by ``explain_vals`` for that dimension or one
            of that mapping's values, in which case it is replaced by the
            corresponding key.

    Returns:
        str: A valid OECD SDMX filter expression.

    Raises:
        ValueError: If a value is neither a key nor a value of the mapping
            returned by ``explain_vals`` for its dimension.
    """
    structure = get_structure(agencyID, dataflowID)
    wanted = _clean_dict(filters)

    segments = []
    for title in structure.toc.title:
        dim_key = _clean(title)

        if dim_key not in wanted:
            # No filter on this dimension: keep its slot empty.
            segments.append("")
            continue

        selection = wanted[dim_key]
        known = _clean_dict(structure.explain_vals(dim_key))
        # Reverse lookup so a mapping value can be translated back to its key.
        by_value = {value: key for key, value in known.items()}

        if isinstance(selection, str):
            selection = [selection]

        resolved = []
        for raw in selection:
            v = _clean(raw)
            if v in known:
                resolved.append(v)
            elif v in by_value:
                resolved.append(by_value[v])
            else:
                raise ValueError(f"Invalid value '{v}' for dimension '{dim_key}'. ")

        segments.append("+".join(resolved))

    expression = ".".join(segments)
    return expression.upper()
68
+
69
+
70
def get_df(
    agencyID: str,
    dataflowID: str,
    filters: Union[str, dict] = "",
    version: str = "",
    startYear: Optional[int] = None,
    endYear: Optional[int] = None,
) -> pd.DataFrame:
    """
    Download a dataset from the OECD SDMX API as a pandas DataFrame.

    Args:
        agencyID (str): The data provider agency identifier.
        dataflowID (str): The dataflow identifier within the agency.
        filters (Union[str, dict], optional): Either a preformatted SDMX
            filter string (stripped and upper-cased before use) or a
            dictionary of filters that is converted with
            ``_build_filter_expression``.
        version (str, optional): The dataflow version. Use an empty string
            for the latest version.
        startYear (int, optional): Start year (inclusive). Sent as
            ``startPeriod`` when not None.
        endYear (int, optional): End year (inclusive). Sent as ``endPeriod``
            when not None.

    Returns:
        pd.DataFrame: A copy of the fetched dataset, so the caller can
            modify it without touching the frame held by ``_fetch_df``.
    """
    filter_expression = (
        _build_filter_expression(agencyID, dataflowID, filters)
        if isinstance(filters, dict)
        else _clean(filters).upper()
    )

    base_url = (
        f"https://sdmx.oecd.org/public/rest/data/"
        f"{agencyID},{dataflowID},{version}/{filter_expression}"
        f"?dimensionAtObservation=AllDimensions&format=csvfile"
    )

    # Optional period bounds are appended only when explicitly given.
    period_suffixes = []
    if startYear is not None:
        period_suffixes.append(f"&startPeriod={startYear}")
    if endYear is not None:
        period_suffixes.append(f"&endPeriod={endYear}")

    url = base_url + "".join(period_suffixes)
    return _fetch_df(url).copy()
@@ -68,7 +68,7 @@ def search_keywords(*keywords: str) -> pd.DataFrame:
68
68
  Searches OECD datasets for a set of keywords.
69
69
 
70
70
  Args:
71
- keywords (str | list[str]): Single keyword or list of keywords. Acts as OR.
71
+ *keywords (str): One or more keywords. Acts as OR.
72
72
 
73
73
  Returns:
74
74
  pd.DataFrame: Matching rows.
@@ -89,13 +89,13 @@ def search_keywords(*keywords: str) -> pd.DataFrame:
89
89
  )
90
90
  )
91
91
 
92
- # Combined normalized text for each row
93
92
  text = (
94
93
  datasets["name"].fillna("").astype(str)
95
94
  + " "
96
95
  + datasets["description"].fillna("").astype(str)
97
96
  )
98
97
  text_norm = _normalize_series(text)
98
+ name_norm = _normalize_series(datasets["name"])
99
99
 
100
100
  def _normalize_kw(kw: str) -> str:
101
101
  kw = unicodedata.normalize("NFKD", kw.lower())
@@ -108,8 +108,9 @@ def search_keywords(*keywords: str) -> pd.DataFrame:
108
108
 
109
109
  for kw in norm_keywords:
110
110
  m = text_norm.str.contains(kw, na=False, regex=False)
111
+ mt = name_norm.str.contains(kw, na=False, regex=False)
111
112
  overall_mask |= m
112
- score = score.add(m.astype("int8"), fill_value=0)
113
+ score = score.add(m.astype("int8"), fill_value=0) + mt.astype("int8")
113
114
 
114
115
  result = datasets.loc[overall_mask].copy()
115
116
  result["_match_score"] = score.loc[overall_mask]
@@ -138,7 +138,11 @@ class Structure():
138
138
  print(f"Could not find explanation for {unclean_values}")
139
139
 
140
140
  return {i:explanation[i] for i in clean_values}
141
+
142
def explain_columns(self, df: pd.DataFrame) -> dict:
    """
    Look up the columns of ``df`` in ``self.concepts['DIMENSIONS']``.

    Args:
        df (pd.DataFrame): Frame whose column names are looked up.

    Returns:
        dict: For each column of ``df`` that is a key of
            ``self.concepts['DIMENSIONS']``, the column name mapped to the
            corresponding entry, in column order. Other columns are skipped.
    """
    dimensions = self.concepts['DIMENSIONS']
    described = {}
    for column in df.columns:
        if column in dimensions:
            described[column] = dimensions[column]
    return described
141
145
 
142
- @lru_cache(maxsize=128)
146
+ @lru_cache(maxsize=64)
143
147
  def get_structure(agencyID: str, dataflowID: str) -> Structure:
144
148
  return Structure(agencyID, dataflowID)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: notoecd
3
- Version: 0.1.3
3
+ Version: 0.1.5
4
4
  Summary: Unofficial library for interacting with the OECD Data Explorer through Python.
5
5
  Author-email: Daniel Vegara Balsa <daniel.vegarabalsa@oecd.org>
6
6
  License-Expression: MIT
@@ -16,16 +16,16 @@ Dynamic: license-file
16
16
 
17
17
  ⚠️ **Unofficial package, not endorsed by the OECD.**
18
18
 
19
- A lightweight Python interface for exploring OECD SDMX structures and downloading OECD datasets.
19
+ A lightweight Python interface for exploring OECD SDMX structures and downloading OECD datasets.
20
20
 
21
21
  The package provides utilities for:
22
22
 
23
- - Discovering dataset metadata
24
- - Searching for relevant datasets using keyword matching
25
- - Exploring the structure and code lists of a dataset
26
- - Fetching filtered SDMX data directly into a pandas DataFrame
23
+ - Discovering dataset metadata
24
+ - Searching for relevant datasets using keyword matching
25
+ - Exploring the structure and code lists of a dataset
26
+ - Fetching filtered SDMX data directly into a pandas DataFrame
27
27
 
28
- ------------------------------------------------------------
28
+ ---
29
29
 
30
30
  ## Installation
31
31
 
@@ -33,7 +33,7 @@ You can install the package by running:
33
33
 
34
34
  pip install notoecd
35
35
 
36
- ------------------------------------------------------------
36
+ ---
37
37
 
38
38
  ## Quick Start
39
39
 
@@ -45,30 +45,30 @@ The main functions in this module are:
45
45
  get_structure(agencyID, dataflowID) -> Structure
46
46
  get_df(agencyID, dataflowID, filters) -> pd.DataFrame
47
47
 
48
- ------------------------------------------------------------
48
+ ---
49
49
 
50
50
  ## Searching for datasets
51
51
 
52
52
  `search_keywords` performs:
53
53
 
54
- - Normalized text matching
55
- - Accent-insensitive search
56
- - Multi-keyword OR matching
57
- - Ranking by number of matched keywords
54
+ - Normalized text matching
55
+ - Accent-insensitive search
56
+ - Multi-keyword OR matching
57
+ - Ranking by number of matched keywords, with extra weight for matches in the dataset name
58
58
 
59
59
  Example:
60
60
 
61
- hits = notoecd.search_keywords(['gross domestic product', 'tl2', 'tl3'])
61
+ hits = notoecd.search_keywords('gross domestic product', 'tl2', 'tl3')
62
62
 
63
63
  This returns datasets that mention GDP and regional levels (TL2/TL3). It gives their name, description, and identifiers (agencyID and dataflowID), which we will need for the next step.
64
64
 
65
- ------------------------------------------------------------
65
+ ---
66
66
 
67
67
  ## Inspecting dataset structure
68
68
 
69
69
  Once a dataset is identified, load its SDMX structure:
70
70
 
71
- dataset = 'Gross domestic product - Regions'
71
+ # Gross domestic product - Regions
72
72
  agencyID = 'OECD.CFE.EDS'
73
73
  dataflowID = 'DSD_REG_ECO@DF_GDP'
74
74
 
@@ -87,7 +87,7 @@ This shows all filters and their available values.
87
87
 
88
88
  This shows the available measures and units used in the dataset.
89
89
 
90
- ------------------------------------------------------------
90
+ ---
91
91
 
92
92
  ## Filtering and downloading data
93
93
 
@@ -103,12 +103,12 @@ Keys correspond to SDMX dimensions, values are strings or lists (for multiple va
103
103
 
104
104
  Fetch the filtered dataset:
105
105
 
106
- df = notoecd.get_df(agency, dataflow, filters)
106
+ df = notoecd.get_df(agencyID, dataflowID, filters)
107
107
  df.head()
108
108
 
109
109
  The returned object is a pandas DataFrame containing the requested subset of OECD SDMX data.
110
110
 
111
- ------------------------------------------------------------
111
+ ---
112
112
 
113
113
  ## Examples
114
114
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "notoecd"
7
- version = "0.1.3"
7
+ version = "0.1.5"
8
8
  description = "Unofficial library for interacting with the OECD Data Explorer through Python."
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -1,57 +0,0 @@
1
- import pandas as pd
2
- from typing import Union
3
- from functools import lru_cache
4
- from .structure import get_structure
5
-
6
- @lru_cache(maxsize=64)
7
- def _fetch_df(url: str) -> pd.DataFrame:
8
- return pd.read_csv(url, storage_options={"User-Agent": "Mozilla/5.0"})
9
-
10
-
11
- def _clean(s: str) -> str:
12
- return str(s).strip().lower()
13
-
14
-
15
- def _build_filter_expression(
16
- agencyID: str,
17
- dataflowID: str,
18
- filters: dict,
19
- ) -> str:
20
-
21
- s = get_structure(agencyID, dataflowID)
22
- filters = {_clean(k): v for k, v in filters.items()}
23
-
24
- parts = []
25
- for dim in s.toc.title:
26
- dim_key = _clean(dim)
27
- if dim_key in filters:
28
- val = filters[dim_key]
29
- if isinstance(val, str):
30
- val = [val]
31
- parts.append("+".join(_clean(v) for v in val))
32
- else:
33
- parts.append("")
34
-
35
- return ".".join(parts).upper()
36
-
37
-
38
- def get_df(
39
- agencyID: str,
40
- dataflowID: str,
41
- filters: Union[str, dict] = "",
42
- version: str = "",
43
- ) -> pd.DataFrame:
44
-
45
- if isinstance(filters, dict):
46
- filter_expression = _build_filter_expression(agencyID, dataflowID, filters)
47
- else:
48
- filter_expression = _clean(filters).upper()
49
-
50
- url = (
51
- f"https://sdmx.oecd.org/public/rest/data/"
52
- f"{agencyID},{dataflowID},{version}/{filter_expression}"
53
- "?dimensionAtObservation=AllDimensions&format=csvfile"
54
- )
55
-
56
- base_df = _fetch_df(url)
57
- return base_df.copy()
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes