interloper-pandas 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- interloper_pandas-0.2.0/PKG-INFO +14 -0
- interloper_pandas-0.2.0/README.md +3 -0
- interloper_pandas-0.2.0/pyproject.toml +43 -0
- interloper_pandas-0.2.0/src/interloper_pandas/__init__.py +6 -0
- interloper_pandas-0.2.0/src/interloper_pandas/adapter.py +44 -0
- interloper_pandas-0.2.0/src/interloper_pandas/normalizer.py +136 -0
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: interloper-pandas
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Interloper pandas integration: DataFrame normalizer and adapter
|
|
5
|
+
Author: Guillaume Onfroy
|
|
6
|
+
Author-email: Guillaume Onfroy <guillaume@digitlcloud.com>
|
|
7
|
+
Requires-Dist: pandas>=1.5
|
|
8
|
+
Requires-Dist: interloper-core
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
|
|
12
|
+
# interloper-pandas
|
|
13
|
+
|
|
14
|
+
Pandas DataFrame normalizer and adapter for Interloper.
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# ###############
|
|
2
|
+
# PROJECT / UV
|
|
3
|
+
# ###############
|
|
4
|
+
[project]
|
|
5
|
+
name = "interloper-pandas"
|
|
6
|
+
version = "0.2.0"
|
|
7
|
+
description = "Interloper pandas integration: DataFrame normalizer and adapter"
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
authors = [{ name = "Guillaume Onfroy", email = "guillaume@digitlcloud.com" }]
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"pandas>=1.5",
|
|
13
|
+
"interloper-core",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[build-system]
|
|
17
|
+
requires = ["uv_build>=0.9.5,<0.10.0"]
|
|
18
|
+
build-backend = "uv_build"
|
|
19
|
+
|
|
20
|
+
[tool.uv.sources]
|
|
21
|
+
interloper-core = { workspace = true }
|
|
22
|
+
|
|
23
|
+
# ###############
|
|
24
|
+
# RUFF
|
|
25
|
+
# ###############
|
|
26
|
+
[tool.ruff]
|
|
27
|
+
line-length = 120
|
|
28
|
+
|
|
29
|
+
[tool.ruff.lint]
|
|
30
|
+
extend-select = ["E", "I", "UP", "ANN001", "ANN201", "ANN202"]
|
|
31
|
+
|
|
32
|
+
[tool.ruff.lint.per-file-ignores]
|
|
33
|
+
"__init__.py" = ["F401", "F403"]
|
|
34
|
+
"tests/**" = ["ANN", "F811"]
|
|
35
|
+
|
|
36
|
+
# ###############
|
|
37
|
+
# PYRIGHT
|
|
38
|
+
# ###############
|
|
39
|
+
[tool.pyright]
|
|
40
|
+
include = ["src"]
|
|
41
|
+
typeCheckingMode = "basic"
|
|
42
|
+
reportMissingParameterType = true
|
|
43
|
+
ignore = ["libs/**", "tests/**", "scripts/**"]
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""DataFrame adapter for converting between pandas DataFrames and database rows."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from interloper.errors import AdapterError
|
|
9
|
+
from interloper.io.adapter import DataAdapter
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DataFrameAdapter(DataAdapter):
    """Bridge between pandas ``DataFrame`` objects and row dictionaries.

    Translates a ``DataFrame`` into the ``list[dict]`` row format used by
    :class:`~interloper.io.database.DatabaseIO`, and builds a ``DataFrame``
    back from such rows.
    """

    def to_rows(self, data: Any) -> list[dict[str, Any]]:
        """Turn a ``DataFrame`` into one dict per row.

        Args:
            data: The pandas ``DataFrame`` to convert

        Returns:
            The frame's records, one ``{column: value}`` dict per row

        Raises:
            AdapterError: If *data* is not a ``DataFrame``
        """
        # Happy path first; anything that is not a DataFrame falls through to the error.
        if isinstance(data, pd.DataFrame):
            return data.to_dict("records")
        raise AdapterError(f"DataFrameAdapter expects a pandas DataFrame, got {type(data).__name__}.")

    def from_rows(self, rows: list[dict[str, Any]]) -> Any:
        """Build a pandas ``DataFrame`` out of row dicts.

        Args:
            rows: Row dicts to load into the frame

        Returns:
            A pandas ``DataFrame`` constructed from *rows*
        """
        frame = pd.DataFrame(rows)
        return frame
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""DataFrame-native normalizer for pandas DataFrames."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
from interloper.normalizer.base import Normalizer
|
|
10
|
+
from interloper.schema import infer_schema, reconcile_schema, validate_schema
|
|
11
|
+
from pydantic import BaseModel
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class DataFrameNormalizer(Normalizer):
    """Type-native normalizer for pandas ``DataFrame`` asset data.

    Accepts a ``DataFrame`` and returns a ``DataFrame``. Column renaming is
    done with native pandas operations; flattening, schema inference,
    validation and reconciliation convert the frame to row dicts and delegate
    to the shared helpers.

    Usage::

        @asset(normalizer=DataFrameNormalizer())
        def my_asset(context):
            return pd.DataFrame({"UserName": ["alice"], "Address": ["NYC"]})

    Inherits all configuration fields from :class:`Normalizer`:
    ``normalize_columns``, ``flatten_max_level``, ``flatten_separator``,
    ``fill_missing``, ``infer``.
    """

    def normalize(self, data: Any) -> pd.DataFrame:
        """Normalize *data* to a ``DataFrame`` with configured transformations.

        If the input is already a ``DataFrame``, operates on it directly.
        Otherwise, coerces to ``list[dict]`` first, then converts to
        ``DataFrame``.

        A frame without rows is not flattened, but its column names are still
        normalized so that empty and non-empty results share the same labels.

        Args:
            data: Raw asset output (``DataFrame`` or any type supported by
                the base :class:`Normalizer`).

        Returns:
            Normalized ``DataFrame``.
        """
        if isinstance(data, pd.DataFrame):
            df = data
        else:
            # Coerce to list[dict] using base class, then convert to DataFrame
            df = pd.DataFrame(self._coerce(data))

        # Flattening round-trips through row dicts; for a frame with no rows
        # that would rebuild it from an empty list and drop its column labels,
        # so only flatten when there is actual data.
        flatten_enabled = self.flatten_max_level is None or self.flatten_max_level > 0
        if flatten_enabled and not df.empty:
            df = self._flatten_dataframe(df)

        # Rename whenever there are columns, including on a frame with zero
        # rows; a column-less frame is returned untouched.
        if self.normalize_columns and len(df.columns) > 0:
            df = df.rename(columns=self.column_name)

        # fill_missing is a no-op for DataFrames — pandas naturally fills
        # missing columns with NaN when constructed from heterogeneous data.

        return df

    def infer_schema(self, data: pd.DataFrame) -> type[BaseModel]:
        """Infer a Pydantic model from a ``DataFrame``.

        Converts the DataFrame to records and delegates to the shared
        schema inference logic.

        Args:
            data: Normalized ``DataFrame`` (output of :meth:`normalize`).

        Returns:
            A dynamically created Pydantic ``BaseModel`` subclass.
        """
        rows = data.to_dict("records")
        return infer_schema(rows)

    def validate_schema(
        self,
        data: pd.DataFrame,
        schema: type[BaseModel],
        *,
        strict: bool = False,
    ) -> None:
        """Validate a ``DataFrame`` against a Pydantic schema.

        Converts the DataFrame to records and passes them to the shared
        schema validation helper.

        Args:
            data: Normalized ``DataFrame``.
            schema: Pydantic model class to validate against.
            strict: When ``True``, reject extra and missing required fields.
        """
        rows = data.to_dict("records")
        validate_schema(rows, schema, strict=strict)

    def reconcile(
        self,
        data: pd.DataFrame,
        schema: type[BaseModel],
    ) -> pd.DataFrame:
        """Reconcile a ``DataFrame`` against a Pydantic schema.

        Aligns columns to the schema (drops extras, adds missing) and
        coerces values to the schema's types via Pydantic ``model_validate``.

        When *data* has no rows, the result is an empty frame whose columns
        are the schema's field names, rather than a frame with no columns.

        Args:
            data: Normalized ``DataFrame``.
            schema: Pydantic model class describing the target shape.

        Returns:
            Reconciled ``DataFrame``.
        """
        rows = data.to_dict("records")
        reconciled = reconcile_schema(rows, schema)
        if not rows:
            # With no records there is nothing for pandas to infer columns
            # from, so supply the schema's field names explicitly.
            # NOTE(review): assumes reconciled rows are keyed by field name
            # (not alias) — confirm against reconcile_schema.
            return pd.DataFrame(reconciled, columns=list(schema.model_fields))
        return pd.DataFrame(reconciled)

    def _flatten_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Flatten nested dicts in DataFrame cells using separator-joined keys.

        Converts the frame to row dicts, runs each row through the inherited
        ``_flatten_dict`` helper, and rebuilds a ``DataFrame`` from the
        results. The rebuilt frame gets a fresh default index.

        Args:
            df: Input ``DataFrame`` potentially containing dict-valued cells.

        Returns:
            Flattened ``DataFrame``.
        """
        rows = df.to_dict("records")
        flattened = [self._flatten_dict(row) for row in rows]
        return pd.DataFrame(flattened)
|