pyBDL 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pybdl/__init__.py +6 -0
- pybdl/access/__init__.py +23 -0
- pybdl/access/aggregates.py +110 -0
- pybdl/access/attributes.py +91 -0
- pybdl/access/base.py +304 -0
- pybdl/access/data.py +473 -0
- pybdl/access/enrichment.py +404 -0
- pybdl/access/levels.py +91 -0
- pybdl/access/measures.py +91 -0
- pybdl/access/subjects.py +165 -0
- pybdl/access/units.py +375 -0
- pybdl/access/variables.py +210 -0
- pybdl/access/years.py +91 -0
- pybdl/api/__init__.py +23 -0
- pybdl/api/aggregates.py +130 -0
- pybdl/api/attributes.py +130 -0
- pybdl/api/client.py +1157 -0
- pybdl/api/data.py +675 -0
- pybdl/api/exceptions.py +81 -0
- pybdl/api/levels.py +130 -0
- pybdl/api/measures.py +130 -0
- pybdl/api/subjects.py +211 -0
- pybdl/api/units.py +408 -0
- pybdl/api/variables.py +226 -0
- pybdl/api/version.py +41 -0
- pybdl/api/years.py +130 -0
- pybdl/client.py +117 -0
- pybdl/config.py +403 -0
- pybdl/utils/cache.py +77 -0
- pybdl/utils/http_cache/__init__.py +12 -0
- pybdl/utils/http_cache/client_factory.py +62 -0
- pybdl/utils/http_cache/paths.py +12 -0
- pybdl/utils/http_cache/response.py +8 -0
- pybdl/utils/rate_limiter/__init__.py +18 -0
- pybdl/utils/rate_limiter/_async.py +104 -0
- pybdl/utils/rate_limiter/_base.py +142 -0
- pybdl/utils/rate_limiter/_cache.py +157 -0
- pybdl/utils/rate_limiter/_decorators.py +46 -0
- pybdl/utils/rate_limiter/_sync.py +73 -0
- pybdl-1.0.0.dist-info/METADATA +142 -0
- pybdl-1.0.0.dist-info/RECORD +43 -0
- pybdl-1.0.0.dist-info/WHEEL +4 -0
- pybdl-1.0.0.dist-info/licenses/LICENSE +21 -0
pybdl/__init__.py
ADDED
pybdl/access/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Access layer for converting API responses to pandas DataFrames."""
|
|
2
|
+
|
|
3
|
+
from pybdl.access.aggregates import AggregatesAccess
|
|
4
|
+
from pybdl.access.attributes import AttributesAccess
|
|
5
|
+
from pybdl.access.data import DataAccess
|
|
6
|
+
from pybdl.access.levels import LevelsAccess
|
|
7
|
+
from pybdl.access.measures import MeasuresAccess
|
|
8
|
+
from pybdl.access.subjects import SubjectsAccess
|
|
9
|
+
from pybdl.access.units import UnitsAccess
|
|
10
|
+
from pybdl.access.variables import VariablesAccess
|
|
11
|
+
from pybdl.access.years import YearsAccess
|
|
12
|
+
|
|
13
|
+
# Public names exported by the access package: one access class per module
# imported above (aggregates, attributes, data, levels, measures, subjects,
# units, variables, years).
__all__ = [
    "AggregatesAccess",
    "AttributesAccess",
    "DataAccess",
    "LevelsAccess",
    "MeasuresAccess",
    "SubjectsAccess",
    "UnitsAccess",
    "VariablesAccess",
    "YearsAccess",
]
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Access layer for aggregates API endpoints."""
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from pybdl.access.base import BaseAccess
|
|
8
|
+
from pybdl.access.enrichment import LEVELS_SPEC, with_enrichment
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class AggregatesAccess(BaseAccess):
    """
    DataFrame-returning facade over the aggregates API client.

    Every method delegates to the same-named method of ``self.api_client`` and
    converts the result with ``_to_dataframe``. All methods are wrapped by
    ``with_enrichment(LEVELS_SPEC)`` (defined in ``pybdl.access.enrichment``).

    Column renames can be configured per method; sync and async variants share
    one entry, for example::

        _column_renames = {
            "list_aggregates": {
                "id": "aggregate_id",
                "name": "aggregate_name",
            },
            "get_aggregate": {
                "id": "aggregate_id",
            },
        }
    """

    @with_enrichment(LEVELS_SPEC)
    def list_aggregates(
        self,
        page_size: int | None = None,
        max_pages: int | None = None,
        **kwargs: Any,
    ) -> pd.DataFrame:
        """
        Fetch the list of aggregates and return it as a DataFrame.

        Args:
            page_size: Results per page; when None, the default from
                ``_get_default_page_size()`` is used (config.page_size or 100).
            max_pages: Upper bound on the number of pages fetched (None for all pages).
            **kwargs: Forwarded unchanged to the API layer (e.g., sort, lang, format, extra_query).

        Returns:
            DataFrame with aggregates data.
        """
        effective_page_size = self._get_default_page_size() if page_size is None else page_size
        records = self.api_client.list_aggregates(
            page_size=effective_page_size,
            max_pages=max_pages,
            **kwargs,
        )
        # Keep this call directly in the method body: the base class derives the
        # column-rename key from the name of the function calling _to_dataframe.
        return self._to_dataframe(records)

    @with_enrichment(LEVELS_SPEC)
    def get_aggregate(
        self,
        aggregate_id: str,
        **kwargs: Any,
    ) -> pd.DataFrame:
        """
        Fetch metadata of a single aggregate and return it as a DataFrame.

        Args:
            aggregate_id: Aggregate identifier.
            **kwargs: Forwarded unchanged to the API layer (e.g., lang, format, extra_query).

        Returns:
            DataFrame with aggregate metadata.
        """
        record = self.api_client.get_aggregate(aggregate_id, **kwargs)
        return self._to_dataframe(record)

    @with_enrichment(LEVELS_SPEC)
    async def alist_aggregates(
        self,
        page_size: int | None = None,
        max_pages: int | None = None,
        **kwargs: Any,
    ) -> pd.DataFrame:
        """
        Async counterpart of ``list_aggregates``.

        Args:
            page_size: Results per page; when None, the default from
                ``_get_default_page_size()`` is used (config.page_size or 100).
            max_pages: Upper bound on the number of pages fetched (None for all pages).
            **kwargs: Forwarded unchanged to the API layer (e.g., sort, lang, format, extra_query).

        Returns:
            DataFrame with aggregates data.
        """
        effective_page_size = self._get_default_page_size() if page_size is None else page_size
        records = await self.api_client.alist_aggregates(
            page_size=effective_page_size,
            max_pages=max_pages,
            **kwargs,
        )
        return self._to_dataframe(records)

    @with_enrichment(LEVELS_SPEC)
    async def aget_aggregate(
        self,
        aggregate_id: str,
        **kwargs: Any,
    ) -> pd.DataFrame:
        """
        Async counterpart of ``get_aggregate``.

        Args:
            aggregate_id: Aggregate identifier.
            **kwargs: Forwarded unchanged to the API layer (e.g., lang, format, extra_query).

        Returns:
            DataFrame with aggregate metadata.
        """
        record = await self.api_client.aget_aggregate(aggregate_id, **kwargs)
        return self._to_dataframe(record)
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""Access layer for attributes API endpoints."""
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from pybdl.access.base import BaseAccess
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AttributesAccess(BaseAccess):
    """DataFrame-returning facade over the attributes API client."""

    def list_attributes(
        self,
        page_size: int | None = None,
        max_pages: int | None = None,
        **kwargs: Any,
    ) -> pd.DataFrame:
        """
        Fetch the list of attributes and return it as a DataFrame.

        Args:
            page_size: Results per page; when None, the default from
                ``_get_default_page_size()`` is used (config.page_size or 100).
            max_pages: Upper bound on the number of pages fetched (None for all pages).
            **kwargs: Forwarded unchanged to the API layer (e.g., lang, format, extra_query).

        Returns:
            DataFrame with attributes data.
        """
        effective_page_size = self._get_default_page_size() if page_size is None else page_size
        records = self.api_client.list_attributes(
            page_size=effective_page_size,
            max_pages=max_pages,
            **kwargs,
        )
        # Keep this call directly in the method body: the base class derives the
        # column-rename key from the name of the function calling _to_dataframe.
        return self._to_dataframe(records)

    def get_attribute(
        self,
        attribute_id: str,
        **kwargs: Any,
    ) -> pd.DataFrame:
        """
        Fetch metadata of a single attribute and return it as a DataFrame.

        Args:
            attribute_id: Attribute identifier.
            **kwargs: Forwarded unchanged to the API layer (e.g., lang, format, extra_query).

        Returns:
            DataFrame with attribute metadata.
        """
        record = self.api_client.get_attribute(attribute_id, **kwargs)
        return self._to_dataframe(record)

    async def alist_attributes(
        self,
        page_size: int | None = None,
        max_pages: int | None = None,
        **kwargs: Any,
    ) -> pd.DataFrame:
        """
        Async counterpart of ``list_attributes``.

        Args:
            page_size: Results per page; when None, the default from
                ``_get_default_page_size()`` is used (config.page_size or 100).
            max_pages: Upper bound on the number of pages fetched (None for all pages).
            **kwargs: Forwarded unchanged to the API layer (e.g., lang, format, extra_query).

        Returns:
            DataFrame with attributes data.
        """
        effective_page_size = self._get_default_page_size() if page_size is None else page_size
        records = await self.api_client.alist_attributes(
            page_size=effective_page_size,
            max_pages=max_pages,
            **kwargs,
        )
        return self._to_dataframe(records)

    async def aget_attribute(
        self,
        attribute_id: str,
        **kwargs: Any,
    ) -> pd.DataFrame:
        """
        Async counterpart of ``get_attribute``.

        Args:
            attribute_id: Attribute identifier.
            **kwargs: Forwarded unchanged to the API layer (e.g., lang, format, extra_query).

        Returns:
            DataFrame with attribute metadata.
        """
        record = await self.api_client.aget_attribute(attribute_id, **kwargs)
        return self._to_dataframe(record)
|
pybdl/access/base.py
ADDED
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
"""Base access class for converting API responses to DataFrames."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import sys
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BaseAccess:
|
|
11
|
+
"""
|
|
12
|
+
Base class for access layer implementations.
|
|
13
|
+
|
|
14
|
+
Supports per-function column renaming through the `_column_renames` class attribute.
|
|
15
|
+
Child classes can define column rename mappings that apply to both sync and async methods.
|
|
16
|
+
|
|
17
|
+
Example::
|
|
18
|
+
|
|
19
|
+
class MyAccess(BaseAccess):
|
|
20
|
+
_column_renames = {
|
|
21
|
+
"list_items": {
|
|
22
|
+
"id": "item_id",
|
|
23
|
+
"name": "item_name",
|
|
24
|
+
},
|
|
25
|
+
"get_item": {
|
|
26
|
+
"id": "item_id",
|
|
27
|
+
},
|
|
28
|
+
}
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
# Column rename configuration: maps function_name -> {old_column: new_column}
|
|
32
|
+
# Both sync and async methods use the same mapping (async methods are normalized by removing 'a' prefix)
|
|
33
|
+
_column_renames: dict[str, dict[str, str]] = {}
|
|
34
|
+
|
|
35
|
+
def __init__(self, api_client: Any):
|
|
36
|
+
"""
|
|
37
|
+
Initialize base access class.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
api_client: API client instance (e.g., LevelsAPI, AttributesAPI).
|
|
41
|
+
"""
|
|
42
|
+
self.api_client = api_client
|
|
43
|
+
# Cache for enrichment lookups (e.g., levels, measures) to avoid refetching
|
|
44
|
+
self._enrichment_cache: dict[str, Any] = {}
|
|
45
|
+
|
|
46
|
+
def _get_default_page_size(self) -> int:
|
|
47
|
+
"""
|
|
48
|
+
Get default page size from config.
|
|
49
|
+
|
|
50
|
+
Returns:
|
|
51
|
+
Default page size from config, or 100 if not available.
|
|
52
|
+
"""
|
|
53
|
+
if hasattr(self.api_client, "config") and hasattr(self.api_client.config, "page_size"):
|
|
54
|
+
return self.api_client.config.page_size
|
|
55
|
+
return 100
|
|
56
|
+
|
|
57
|
+
def _resolve_api_params(
|
|
58
|
+
self,
|
|
59
|
+
explicit_params: dict[str, Any],
|
|
60
|
+
kwargs: dict[str, Any],
|
|
61
|
+
) -> dict[str, Any]:
|
|
62
|
+
"""
|
|
63
|
+
Resolve API parameters, giving precedence to kwargs over explicit parameters.
|
|
64
|
+
|
|
65
|
+
For any parameter present in both explicit_params and kwargs, kwargs takes precedence.
|
|
66
|
+
This prevents duplicate parameter errors when calling API client methods.
|
|
67
|
+
|
|
68
|
+
Args:
|
|
69
|
+
explicit_params: Dictionary of explicitly passed parameters.
|
|
70
|
+
kwargs: Keyword arguments dict (will be modified to remove resolved params).
|
|
71
|
+
|
|
72
|
+
Returns:
|
|
73
|
+
Dictionary of resolved parameters (kwargs values override explicit values).
|
|
74
|
+
|
|
75
|
+
Example:
|
|
76
|
+
explicit = {"max_pages": None, "page_size": 100}
|
|
77
|
+
kwargs = {"max_pages": 1, "extra": "value"}
|
|
78
|
+
resolved = self._resolve_api_params(explicit, kwargs)
|
|
79
|
+
# Returns: {"max_pages": 1, "page_size": 100}
|
|
80
|
+
# kwargs now: {"extra": "value"} (max_pages removed)
|
|
81
|
+
"""
|
|
82
|
+
resolved = explicit_params.copy()
|
|
83
|
+
|
|
84
|
+
# For each key in kwargs that matches an explicit parameter, use kwargs value
|
|
85
|
+
for key in list(kwargs.keys()):
|
|
86
|
+
if key in resolved:
|
|
87
|
+
# kwargs takes precedence - use its value and remove from kwargs
|
|
88
|
+
resolved[key] = kwargs.pop(key)
|
|
89
|
+
|
|
90
|
+
return resolved
|
|
91
|
+
|
|
92
|
+
def _to_dataframe(
|
|
93
|
+
self,
|
|
94
|
+
data: list[dict[str, Any]] | dict[str, Any],
|
|
95
|
+
*,
|
|
96
|
+
function_name: str | None = None,
|
|
97
|
+
) -> pd.DataFrame:
|
|
98
|
+
"""
|
|
99
|
+
Convert API response to DataFrame with proper column names and data types.
|
|
100
|
+
|
|
101
|
+
Applies per-method column renames defined in `_column_renames`. Both sync and
|
|
102
|
+
async methods are supported (async names are normalized by removing 'a' prefix).
|
|
103
|
+
|
|
104
|
+
Args:
|
|
105
|
+
data: List of dictionaries or single dictionary from API response.
|
|
106
|
+
function_name: Optional explicit function name for column rename lookup.
|
|
107
|
+
|
|
108
|
+
Returns:
|
|
109
|
+
DataFrame with normalized column names and proper data types.
|
|
110
|
+
"""
|
|
111
|
+
# Handle single dict by converting to list
|
|
112
|
+
if isinstance(data, dict):
|
|
113
|
+
data = [data]
|
|
114
|
+
|
|
115
|
+
if not data:
|
|
116
|
+
return pd.DataFrame()
|
|
117
|
+
|
|
118
|
+
# Convert to DataFrame
|
|
119
|
+
df = pd.DataFrame(data)
|
|
120
|
+
|
|
121
|
+
# Normalize column names (camelCase to snake_case)
|
|
122
|
+
df.columns = [self._camel_to_snake(col) for col in df.columns]
|
|
123
|
+
|
|
124
|
+
# Infer and convert data types
|
|
125
|
+
df = self._infer_dtypes(df)
|
|
126
|
+
|
|
127
|
+
if function_name is None:
|
|
128
|
+
function_name = self._get_calling_function_name()
|
|
129
|
+
if function_name:
|
|
130
|
+
df = self._apply_column_renames(df, function_name)
|
|
131
|
+
|
|
132
|
+
return df
|
|
133
|
+
|
|
134
|
+
def _get_calling_function_name(self) -> str | None:
|
|
135
|
+
"""
|
|
136
|
+
Get the name of the function that called `_to_dataframe`.
|
|
137
|
+
|
|
138
|
+
Inspects the call stack to find the calling function, skipping internal
|
|
139
|
+
methods like `_to_dataframe` itself. The function name is automatically
|
|
140
|
+
detected from the call stack.
|
|
141
|
+
|
|
142
|
+
Returns:
|
|
143
|
+
Function name if found, None otherwise.
|
|
144
|
+
"""
|
|
145
|
+
try:
|
|
146
|
+
func_name = sys._getframe(2).f_code.co_name
|
|
147
|
+
except ValueError:
|
|
148
|
+
return None
|
|
149
|
+
if func_name != "<module>" and not func_name.startswith("__"):
|
|
150
|
+
return func_name
|
|
151
|
+
return None
|
|
152
|
+
|
|
153
|
+
@staticmethod
|
|
154
|
+
def _normalize_function_name(function_name: str) -> str:
|
|
155
|
+
"""
|
|
156
|
+
Normalize function name for column rename lookup.
|
|
157
|
+
|
|
158
|
+
Removes 'a' prefix from async function names so that sync and async
|
|
159
|
+
methods can share the same column rename configuration.
|
|
160
|
+
|
|
161
|
+
Args:
|
|
162
|
+
function_name: Function name (e.g., 'list_aggregates' or 'alist_aggregates').
|
|
163
|
+
|
|
164
|
+
Returns:
|
|
165
|
+
Normalized function name (e.g., 'list_aggregates').
|
|
166
|
+
|
|
167
|
+
Examples:
|
|
168
|
+
>>> BaseAccess._normalize_function_name('alist_aggregates')
|
|
169
|
+
'list_aggregates'
|
|
170
|
+
>>> BaseAccess._normalize_function_name('list_aggregates')
|
|
171
|
+
'list_aggregates'
|
|
172
|
+
"""
|
|
173
|
+
if function_name.startswith("a") and len(function_name) > 1:
|
|
174
|
+
# Check if it's likely an async function (a + verb pattern)
|
|
175
|
+
# Common patterns: alist, aget, asearch, etc.
|
|
176
|
+
return function_name[1:]
|
|
177
|
+
return function_name
|
|
178
|
+
|
|
179
|
+
def _apply_column_renames(
|
|
180
|
+
self,
|
|
181
|
+
df: pd.DataFrame,
|
|
182
|
+
function_name: str,
|
|
183
|
+
) -> pd.DataFrame:
|
|
184
|
+
"""
|
|
185
|
+
Apply column renames for a specific function.
|
|
186
|
+
|
|
187
|
+
Looks up column rename mappings in `_column_renames` class attribute.
|
|
188
|
+
Both sync and async methods use the same mapping (async names are normalized).
|
|
189
|
+
|
|
190
|
+
Args:
|
|
191
|
+
df: DataFrame to rename columns in.
|
|
192
|
+
function_name: Function name (sync or async).
|
|
193
|
+
|
|
194
|
+
Returns:
|
|
195
|
+
DataFrame with renamed columns (if mappings exist).
|
|
196
|
+
"""
|
|
197
|
+
# Normalize function name (remove 'a' prefix for async methods)
|
|
198
|
+
normalized_name = self._normalize_function_name(function_name)
|
|
199
|
+
|
|
200
|
+
# Get column rename mapping for this function
|
|
201
|
+
rename_map = self._column_renames.get(normalized_name, {})
|
|
202
|
+
|
|
203
|
+
if not rename_map:
|
|
204
|
+
return df
|
|
205
|
+
|
|
206
|
+
# Apply renames
|
|
207
|
+
df = df.copy()
|
|
208
|
+
df = df.rename(columns=rename_map)
|
|
209
|
+
|
|
210
|
+
return df
|
|
211
|
+
|
|
212
|
+
@staticmethod
|
|
213
|
+
def _camel_to_snake(name: str) -> str:
|
|
214
|
+
"""
|
|
215
|
+
Convert camelCase to snake_case.
|
|
216
|
+
|
|
217
|
+
Args:
|
|
218
|
+
name: Column name in camelCase.
|
|
219
|
+
|
|
220
|
+
Returns:
|
|
221
|
+
Column name in snake_case.
|
|
222
|
+
"""
|
|
223
|
+
# Insert underscore before uppercase letters (except at the start)
|
|
224
|
+
s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
|
|
225
|
+
# Insert underscore before uppercase letters that follow lowercase
|
|
226
|
+
s2 = re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1)
|
|
227
|
+
return s2.lower()
|
|
228
|
+
|
|
229
|
+
@staticmethod
|
|
230
|
+
def _infer_dtypes(df: pd.DataFrame) -> pd.DataFrame:
|
|
231
|
+
"""
|
|
232
|
+
Infer and convert data types for DataFrame columns.
|
|
233
|
+
|
|
234
|
+
Args:
|
|
235
|
+
df: DataFrame with potentially incorrect types.
|
|
236
|
+
|
|
237
|
+
Returns:
|
|
238
|
+
DataFrame with proper data types.
|
|
239
|
+
"""
|
|
240
|
+
df = df.copy()
|
|
241
|
+
|
|
242
|
+
for col in df.columns:
|
|
243
|
+
series = df[col]
|
|
244
|
+
if not (pd.api.types.is_object_dtype(series.dtype) or pd.api.types.is_string_dtype(series.dtype)):
|
|
245
|
+
continue
|
|
246
|
+
|
|
247
|
+
try:
|
|
248
|
+
pd.to_numeric(series, errors="raise")
|
|
249
|
+
non_na = series.dropna()
|
|
250
|
+
if (
|
|
251
|
+
len(non_na) > 0
|
|
252
|
+
and non_na.apply(lambda x: isinstance(x, (int, float)) and float(x).is_integer()).all()
|
|
253
|
+
):
|
|
254
|
+
df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")
|
|
255
|
+
else:
|
|
256
|
+
df[col] = pd.to_numeric(df[col], errors="coerce")
|
|
257
|
+
except (ValueError, TypeError):
|
|
258
|
+
bool_values = {"true": True, "false": False, "True": True, "False": False}
|
|
259
|
+
non_na = df[col].dropna()
|
|
260
|
+
if len(non_na) > 0 and non_na.isin(bool_values.keys()).all():
|
|
261
|
+
df[col] = df[col].map(bool_values)
|
|
262
|
+
elif pd.api.types.is_string_dtype(df[col].dtype):
|
|
263
|
+
df[col] = df[col].astype(object)
|
|
264
|
+
|
|
265
|
+
return df
|
|
266
|
+
|
|
267
|
+
def _normalize_nested_data(
|
|
268
|
+
self,
|
|
269
|
+
data: list[dict[str, Any]],
|
|
270
|
+
nested_key: str = "values",
|
|
271
|
+
parent_keys: list[str] | None = None,
|
|
272
|
+
) -> list[dict[str, Any]]:
|
|
273
|
+
"""
|
|
274
|
+
Normalize nested data structures by flattening nested arrays.
|
|
275
|
+
|
|
276
|
+
Args:
|
|
277
|
+
data: List of dictionaries, each potentially containing nested arrays.
|
|
278
|
+
nested_key: Key name for the nested array (e.g., 'values').
|
|
279
|
+
parent_keys: List of keys from parent to include in each flattened row.
|
|
280
|
+
|
|
281
|
+
Returns:
|
|
282
|
+
List of flattened dictionaries.
|
|
283
|
+
"""
|
|
284
|
+
normalized = []
|
|
285
|
+
parent_keys = parent_keys or []
|
|
286
|
+
|
|
287
|
+
for item in data:
|
|
288
|
+
# Extract parent fields
|
|
289
|
+
parent_data = {k: v for k, v in item.items() if k != nested_key}
|
|
290
|
+
|
|
291
|
+
# Get nested array
|
|
292
|
+
nested_array = item.get(nested_key, [])
|
|
293
|
+
|
|
294
|
+
if not nested_array:
|
|
295
|
+
# If no nested data, include parent row as-is
|
|
296
|
+
normalized.append(parent_data)
|
|
297
|
+
else:
|
|
298
|
+
# Flatten: create one row per nested item
|
|
299
|
+
for nested_item in nested_array:
|
|
300
|
+
row = parent_data.copy()
|
|
301
|
+
row.update(nested_item)
|
|
302
|
+
normalized.append(row)
|
|
303
|
+
|
|
304
|
+
return normalized
|