aptdata-0.0.2-py3-none-any.whl
- aptdata/__init__.py +3 -0
- aptdata/cli/__init__.py +5 -0
- aptdata/cli/app.py +247 -0
- aptdata/cli/commands/__init__.py +9 -0
- aptdata/cli/commands/config_cmd.py +128 -0
- aptdata/cli/commands/mesh_cmd.py +435 -0
- aptdata/cli/commands/plugin_cmd.py +107 -0
- aptdata/cli/commands/system_cmd.py +90 -0
- aptdata/cli/commands/telemetry_cmd.py +57 -0
- aptdata/cli/completions.py +56 -0
- aptdata/cli/interactive.py +269 -0
- aptdata/cli/rendering/__init__.py +31 -0
- aptdata/cli/rendering/console.py +119 -0
- aptdata/cli/rendering/logger.py +26 -0
- aptdata/cli/rendering/panels.py +87 -0
- aptdata/cli/rendering/tables.py +81 -0
- aptdata/cli/scaffold.py +1089 -0
- aptdata/config/__init__.py +13 -0
- aptdata/config/parser.py +136 -0
- aptdata/config/schema.py +27 -0
- aptdata/config/secrets.py +60 -0
- aptdata/core/__init__.py +46 -0
- aptdata/core/context.py +31 -0
- aptdata/core/dataset.py +39 -0
- aptdata/core/lineage.py +213 -0
- aptdata/core/state.py +27 -0
- aptdata/core/system.py +317 -0
- aptdata/core/workflow.py +372 -0
- aptdata/mcp/__init__.py +5 -0
- aptdata/mcp/server.py +198 -0
- aptdata/plugins/__init__.py +77 -0
- aptdata/plugins/ai/__init__.py +6 -0
- aptdata/plugins/ai/chunking.py +66 -0
- aptdata/plugins/ai/embeddings.py +56 -0
- aptdata/plugins/base.py +57 -0
- aptdata/plugins/dataset.py +62 -0
- aptdata/plugins/governance/__init__.py +32 -0
- aptdata/plugins/governance/catalog.py +115 -0
- aptdata/plugins/governance/classification.py +44 -0
- aptdata/plugins/governance/lineage_store.py +49 -0
- aptdata/plugins/governance/rules.py +180 -0
- aptdata/plugins/local_fs.py +241 -0
- aptdata/plugins/manager.py +142 -0
- aptdata/plugins/postgres.py +113 -0
- aptdata/plugins/quality/__init__.py +39 -0
- aptdata/plugins/quality/contract.py +128 -0
- aptdata/plugins/quality/expectations.py +310 -0
- aptdata/plugins/quality/report.py +94 -0
- aptdata/plugins/quality/validator.py +139 -0
- aptdata/plugins/rest.py +135 -0
- aptdata/plugins/transform/__init__.py +14 -0
- aptdata/plugins/transform/pandas.py +129 -0
- aptdata/plugins/transform/spark.py +134 -0
- aptdata/plugins/vector/__init__.py +6 -0
- aptdata/plugins/vector/base.py +19 -0
- aptdata/plugins/vector/qdrant.py +41 -0
- aptdata/telemetry/__init__.py +5 -0
- aptdata/telemetry/instrumentation.py +164 -0
- aptdata/tui/__init__.py +5 -0
- aptdata/tui/monitor.py +279 -0
- aptdata-0.0.2.dist-info/METADATA +330 -0
- aptdata-0.0.2.dist-info/RECORD +65 -0
- aptdata-0.0.2.dist-info/WHEEL +4 -0
- aptdata-0.0.2.dist-info/entry_points.txt +3 -0
- aptdata-0.0.2.dist-info/licenses/LICENSE +21 -0
aptdata/plugins/quality/validator.py
ADDED
@@ -0,0 +1,139 @@
"""Quality validator — wraps expectations into a workflow-compatible step."""

from __future__ import annotations

import logging
import warnings
from typing import Any

from aptdata.plugins.quality.contract import EnforcementMode
from aptdata.plugins.quality.expectations import BaseExpectation
from aptdata.plugins.quality.report import CheckStatus, QualityReport
from aptdata.telemetry.instrumentation import get_tracer

logger = logging.getLogger(__name__)


class QualityValidator:
    """Runs a suite of :class:`~.expectations.BaseExpectation` objects against data.

    Parameters
    ----------
    expectations:
        List of expectations to evaluate.
    enforcement:
        How to react when an expectation fails
        (see :class:`~.contract.EnforcementMode`).
    name:
        Human-readable identifier used in OTel span names.

    Examples
    --------
    ::

        from aptdata.plugins.quality import (
            QualityValidator, ExpectColumnToNotBeNull, EnforcementMode
        )

        validator = QualityValidator(
            expectations=[ExpectColumnToNotBeNull("age")],
            enforcement=EnforcementMode.ABORT,
        )
        clean_df = validator.validate(df)
    """

    def __init__(
        self,
        expectations: list[BaseExpectation],
        enforcement: EnforcementMode = EnforcementMode.ABORT,
        name: str = "QualityValidator",
    ) -> None:
        self.expectations = expectations
        self.enforcement = enforcement
        self.name = name

    def validate(self, data: Any) -> Any:
        """Validate *data* against all expectations and return *data*.

        This method is compatible with :meth:`~aptdata.core.workflow.Workflow.add_step`.
        On success the original *data* object is returned unchanged.

        Parameters
        ----------
        data:
            A ``pd.DataFrame``, PySpark ``DataFrame``,
            :class:`~aptdata.plugins.dataset.InMemoryDataset`, or
            ``list[dict]``.

        Returns
        -------
        Any
            The original *data* object (pass-through).

        Raises
        ------
        ValueError
            When ``enforcement == ABORT`` and at least one expectation fails.
        """
        import pandas as pd  # type: ignore[import]

        from aptdata.plugins.dataset import InMemoryDataset

        # Resolve to a DataFrame for expectations.
        resolved: Any
        if isinstance(data, InMemoryDataset):
            resolved = pd.DataFrame(data.read())
        elif isinstance(data, list):
            resolved = pd.DataFrame(data)
        else:
            resolved = data

        dataset_uri = data.uri if isinstance(data, InMemoryDataset) else "unknown"
        report = QualityReport(dataset_uri=dataset_uri)

        tracer = get_tracer("aptdata.quality")
        with tracer.start_as_current_span(self.name) as span:
            span.set_attribute("aptdata.quality.validator_name", self.name)
            span.set_attribute("aptdata.quality.enforcement", self.enforcement.value)
            span.set_attribute(
                "aptdata.quality.num_expectations", len(self.expectations)
            )

            for expectation in self.expectations:
                result = expectation.validate(resolved)
                report.checks.append(result)

            passed = report.passed
            span.set_attribute("aptdata.quality.passed", passed)
            span.set_attribute("aptdata.quality.num_checks", len(report.checks))
            failed_count = report.summary.get(CheckStatus.FAILED, 0)
            span.set_attribute("aptdata.quality.failed_checks", failed_count)

            if not passed:
                failed_checks = [c for c in report.checks if c.status == CheckStatus.FAILED]
                summary_msg = "; ".join(c.message for c in failed_checks)

                if self.enforcement == EnforcementMode.ABORT:
                    raise ValueError(
                        f"Data quality validation failed [{self.name}]: {summary_msg}"
                    )
                elif self.enforcement == EnforcementMode.WARN:
                    warnings.warn(
                        f"Data quality warning [{self.name}]: {summary_msg}",
                        stacklevel=2,
                    )
                    logger.warning("Quality validation warning: %s", summary_msg)
                else:
                    # TAG mode — attach quality metadata; pass-through
                    if hasattr(data, "schema_metadata") and isinstance(
                        data.schema_metadata, dict
                    ):
                        data.schema_metadata["quality_report"] = {
                            "passed": False,
                            "failed_checks": [c.expectation_name for c in failed_checks],
                        }

        return data


__all__ = ["QualityValidator"]
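Taken with the docstring example, a quick sketch of the non-aborting path: ``WARN`` emits a ``UserWarning`` plus a log record while still returning the data. The sample records here are made up for illustration::

    from aptdata.plugins.quality import (
        EnforcementMode,
        ExpectColumnToNotBeNull,
        QualityValidator,
    )

    raw_records = [{"age": 31}, {"age": None}]  # illustrative data only

    validator = QualityValidator(
        expectations=[ExpectColumnToNotBeNull("age")],
        enforcement=EnforcementMode.WARN,  # warn + log instead of raising
    )
    # validate() is pass-through, so the same records come back.
    clean = validator.validate(raw_records)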
aptdata/plugins/rest.py
ADDED
@@ -0,0 +1,135 @@
"""REST API reader plugin.

Provides :class:`APIReader` — a generic reader that fetches data from a
RESTful JSON endpoint using HTTP GET, with optional header injection
(e.g. for bearer tokens) and simple offset/page-based pagination.

Requires the optional ``httpx`` package. A friendly
:class:`~aptdata.plugins.manager.PluginDependencyError` is raised
when it is not installed.
"""

from __future__ import annotations

from typing import Any

from aptdata.plugins.base import BaseReader
from aptdata.plugins.dataset import InMemoryDataset
from aptdata.plugins.manager import PluginDependencyError


def _require_httpx() -> Any:
    """Import and return the ``httpx`` module, or raise a friendly error."""
    try:
        import httpx  # noqa: WPS433
    except ImportError:
        raise PluginDependencyError("APIReader", "httpx") from None
    return httpx


class APIReader(BaseReader):
    """Read JSON data from a REST API endpoint.

    Parameters
    ----------
    endpoint:
        The URL to send GET requests to.
    headers:
        Optional HTTP headers (e.g. ``{"Authorization": "Bearer xxx"}``).
    params:
        Optional query-string parameters merged into every request.
    pagination_key:
        When set, the reader fetches pages until the response JSON list is
        empty. The value is the query-string parameter name that carries the
        page number (e.g. ``"page"``). Pages start at ``1``.
    max_pages:
        Safety limit on the number of pages fetched (default ``100``).
    timeout:
        HTTP request timeout in seconds (default ``30``).
    records_path:
        Optional dot-separated path into the JSON response that contains
        the list of records (e.g. ``"data.items"``). When ``None`` the
        response itself must be a JSON array.
    """

    def __init__(
        self,
        endpoint: str,
        *,
        headers: dict[str, str] | None = None,
        params: dict[str, Any] | None = None,
        pagination_key: str | None = None,
        max_pages: int = 100,
        timeout: float = 30,
        records_path: str | None = None,
    ) -> None:
        self.endpoint = endpoint
        self.headers = headers or {}
        self.params = params or {}
        self.pagination_key = pagination_key
        self.max_pages = max_pages
        self.timeout = timeout
        self.records_path = records_path

    # -- helpers ------------------------------------------------------------

    @staticmethod
    def _extract_records(body: Any, path: str | None) -> list[dict[str, Any]]:
        """Navigate *path* inside *body* and return the records list."""
        if path is None:
            if isinstance(body, list):
                return body
            raise ValueError(
                "Expected a JSON array from the API response. "
                "Set 'records_path' if records are nested."
            )
        current: Any = body
        for key in path.split("."):
            if isinstance(current, dict):
                current = current[key]
            else:
                raise ValueError(f"Cannot traverse key '{key}' in a non-dict value.")
        if not isinstance(current, list):
            raise ValueError(
                f"Expected a list at path '{path}', got {type(current).__name__}."
            )
        return current

    # -- BaseReader ---------------------------------------------------------

    def read(self, **kwargs: Any) -> InMemoryDataset:
        httpx = _require_httpx()

        all_records: list[dict[str, Any]] = []

        with httpx.Client(timeout=self.timeout) as client:
            if self.pagination_key is None:
                response = client.get(
                    self.endpoint,
                    headers=self.headers,
                    params=self.params,
                )
                response.raise_for_status()
                all_records = self._extract_records(response.json(), self.records_path)
            else:
                for page in range(1, self.max_pages + 1):
                    params = {**self.params, self.pagination_key: page}
                    response = client.get(
                        self.endpoint,
                        headers=self.headers,
                        params=params,
                    )
                    response.raise_for_status()
                    page_records = self._extract_records(
                        response.json(), self.records_path
                    )
                    if not page_records:
                        break
                    all_records.extend(page_records)

        ds = InMemoryDataset(uri=self.endpoint)
        ds.write(all_records)
        return ds


__all__ = ["APIReader"]
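A minimal usage sketch for the paginated path; the endpoint, token, and ``records_path`` shape are placeholders, not part of the package::

    from aptdata.plugins.rest import APIReader

    # Fetches ?page=1, ?page=2, ... until a page comes back empty (or
    # max_pages is hit), then wraps the accumulated rows in an InMemoryDataset.
    reader = APIReader(
        "https://api.example.com/v1/items",           # placeholder endpoint
        headers={"Authorization": "Bearer <token>"},  # placeholder token
        pagination_key="page",
        records_path="data.items",  # assumes {"data": {"items": [...]}}
        max_pages=10,
    )
    dataset = reader.read()
    rows = dataset.read()  # list[dict]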
aptdata/plugins/transform/__init__.py
ADDED
@@ -0,0 +1,14 @@
"""Transform plugin package — engine-agnostic transformation wrappers.

Provides :class:`PandasTransformer` and :class:`PySparkTransformer` as
concrete :class:`~aptdata.plugins.base.BaseTransformer` implementations.
Both use lazy imports so the framework core works without pandas or pyspark
installed.
"""

from __future__ import annotations

from aptdata.plugins.transform.pandas import PandasTransformer
from aptdata.plugins.transform.spark import PySparkTransformer

__all__ = ["PandasTransformer", "PySparkTransformer"]
aptdata/plugins/transform/pandas.py
ADDED
@@ -0,0 +1,129 @@
"""Pandas-based data transformer.

Uses lazy imports so that the framework core works even when pandas is not
installed. A :class:`~aptdata.plugins.manager.PluginDependencyError` is
raised at instantiation time if pandas is not available.
"""

from __future__ import annotations

import time
from collections.abc import Callable
from typing import TYPE_CHECKING, Any

from aptdata.plugins.base import BaseTransformer
from aptdata.plugins.manager import PluginDependencyError
from aptdata.telemetry.instrumentation import get_tracer

if TYPE_CHECKING:
    import pandas as pd


class PandasTransformer(BaseTransformer):
    """Engine-agnostic transformer backed by a pandas callable.

    Parameters
    ----------
    name:
        Human-readable identifier for this transformer.
    transform_func:
        A callable ``(pd.DataFrame) -> pd.DataFrame`` that applies the
        desired transformation.

    Examples
    --------
    ::

        from aptdata.plugins.transform import PandasTransformer

        def double_values(df):
            return df.assign(value=df["value"] * 2)

        transformer = PandasTransformer("double_values", double_values)
        result = transformer.transform(my_dataset)
    """

    def __init__(
        self,
        name: str,
        transform_func: Callable[[Any], Any],
    ) -> None:
        try:
            import pandas  # noqa: F401
        except ImportError as exc:
            raise PluginDependencyError("PandasTransformer", "pandas") from exc
        self._name = name
        self._transform_func = transform_func

    @property
    def name(self) -> str:
        """Human-readable name of this transformer."""
        return self._name

    def transform(self, data: Any) -> Any:
        """Apply the transformation to *data*.

        Parameters
        ----------
        data:
            Accepts a :class:`~aptdata.plugins.dataset.InMemoryDataset`,
            a ``pd.DataFrame``, or a ``list[dict]``.

        Returns
        -------
        Any
            The transformed result. If the input was an
            :class:`~aptdata.plugins.dataset.InMemoryDataset` the output
            will also be one; otherwise a ``pd.DataFrame`` is returned.
        """
        import pandas as pd

        from aptdata.plugins.dataset import InMemoryDataset

        return_dataset = False
        original_uri = "memory://transformed"
        original_schema: dict[str, Any] = {}

        if isinstance(data, InMemoryDataset):
            return_dataset = True
            original_uri = data.uri
            original_schema = data.schema_metadata
            df: pd.DataFrame = pd.DataFrame(data.read())
        elif isinstance(data, pd.DataFrame):
            df = data
        elif isinstance(data, list):
            df = pd.DataFrame(data)
        else:
            df = pd.DataFrame([data] if data is not None else [])

        rows_in = len(df)

        tracer = get_tracer("aptdata.transform")
        with tracer.start_as_current_span(self._name) as span:
            span.set_attribute("aptdata.transformer.name", self._name)
            span.set_attribute("aptdata.transformer.engine", "pandas")
            span.set_attribute("aptdata.transformer.rows_in", rows_in)

            t0 = time.perf_counter()
            result_df: pd.DataFrame = self._transform_func(df)
            compute_time_ms = (time.perf_counter() - t0) * 1000.0

            rows_out = len(result_df)
            columns_out = list(result_df.columns)
            span.set_attribute("aptdata.transformer.rows_out", rows_out)
            span.set_attribute("aptdata.transformer.columns_out", str(columns_out))
            span.set_attribute(
                "aptdata.transformer.compute_time_ms", round(compute_time_ms, 3)
            )

        if return_dataset:
            out_dataset = InMemoryDataset(
                uri=original_uri, schema_metadata=original_schema
            )
            out_dataset.write(result_df.to_dict(orient="records"))
            return out_dataset

        return result_df


__all__ = ["PandasTransformer"]
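Beyond the docstring example, a sketch of the ``InMemoryDataset`` round trip described in ``transform``: a dataset in yields a dataset out, with the original URI and schema metadata carried over (assumes pandas is installed; the URI is illustrative)::

    from aptdata.plugins.dataset import InMemoryDataset
    from aptdata.plugins.transform import PandasTransformer

    ds = InMemoryDataset(uri="memory://orders")  # illustrative URI
    ds.write([{"value": 1}, {"value": 2}])

    transformer = PandasTransformer(
        "double_values", lambda df: df.assign(value=df["value"] * 2)
    )
    out = transformer.transform(ds)  # InMemoryDataset in -> InMemoryDataset out
    assert out.uri == "memory://orders"  # URI preserved, per the code above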
aptdata/plugins/transform/spark.py
ADDED
@@ -0,0 +1,134 @@
"""PySpark-based data transformer.

Uses lazy imports so that the framework core works even when pyspark is not
installed. A :class:`~aptdata.plugins.manager.PluginDependencyError` is
raised at instantiation time if pyspark is not available.
"""

from __future__ import annotations

import time
from collections.abc import Callable
from typing import Any

from aptdata.plugins.base import BaseTransformer
from aptdata.plugins.manager import PluginDependencyError
from aptdata.telemetry.instrumentation import get_tracer


class PySparkTransformer(BaseTransformer):
    """Engine-agnostic transformer backed by a PySpark callable.

    Parameters
    ----------
    name:
        Human-readable identifier for this transformer.
    transform_func:
        A callable ``(SparkSession, DataFrame) -> DataFrame`` that applies
        the desired transformation.
    app_name:
        Spark application name passed to ``SparkSession.builder``.

    Examples
    --------
    ::

        from aptdata.plugins.transform import PySparkTransformer

        def double_values(spark, df):
            from pyspark.sql import functions as F
            return df.withColumn("value", F.col("value") * 2)

        transformer = PySparkTransformer("double_values", double_values)
        result = transformer.transform(my_df)
    """

    def __init__(
        self,
        name: str,
        transform_func: Callable[[Any, Any], Any],
        app_name: str = "SmartData",
    ) -> None:
        try:
            import pyspark  # noqa: F401
        except ImportError as exc:
            raise PluginDependencyError("PySparkTransformer", "pyspark") from exc
        self._name = name
        self._transform_func = transform_func
        self._app_name = app_name

    @property
    def name(self) -> str:
        """Human-readable name of this transformer."""
        return self._name

    def transform(self, data: Any) -> Any:
        """Apply the transformation to *data*.

        Parameters
        ----------
        data:
            Accepts a PySpark ``DataFrame``, a ``list[dict]``, or an
            :class:`~aptdata.plugins.dataset.InMemoryDataset`.

        Returns
        -------
        Any
            The transformed PySpark ``DataFrame``.
        """
        from pyspark.sql import SparkSession
        from pyspark.sql.types import StructType

        from aptdata.plugins.dataset import InMemoryDataset

        spark: Any = SparkSession.builder.appName(self._app_name).getOrCreate()

        # Convert input to a PySpark DataFrame if needed.
        if isinstance(data, InMemoryDataset):
            records = data.read()
            df = (
                spark.createDataFrame(records)
                if records
                else spark.createDataFrame([], StructType([]))
            )
        elif isinstance(data, list):
            df = (
                spark.createDataFrame(data)
                if data
                else spark.createDataFrame([], StructType([]))
            )
        else:
            # Assume it's already a PySpark DataFrame.
            df = data

        rows_in = df.count()

        tracer = get_tracer("aptdata.transform")
        with tracer.start_as_current_span(self._name) as span:
            span.set_attribute("aptdata.transformer.name", self._name)
            span.set_attribute("aptdata.transformer.engine", "pyspark")
            span.set_attribute("aptdata.transformer.rows_in", rows_in)
            span.set_attribute("aptdata.spark.app_name", self._app_name)

            # Try to capture Spark UI URL.
            try:
                ui_url = spark.sparkContext.uiWebUrl
                if ui_url:
                    span.set_attribute("aptdata.spark.ui_url", ui_url)
            except Exception:  # noqa: BLE001
                pass

            t0 = time.perf_counter()
            result_df = self._transform_func(spark, df)
            compute_time_ms = (time.perf_counter() - t0) * 1000.0

            rows_out = result_df.count()
            span.set_attribute("aptdata.transformer.rows_out", rows_out)
            span.set_attribute(
                "aptdata.transformer.compute_time_ms", round(compute_time_ms, 3)
            )

        return result_df


__all__ = ["PySparkTransformer"]
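A variant of the docstring example exercising the ``list[dict]`` input path, where the transformer builds the Spark DataFrame itself via ``spark.createDataFrame``; assumes a local pyspark installation::

    from aptdata.plugins.transform import PySparkTransformer

    def add_flag(spark, df):
        from pyspark.sql import functions as F
        return df.withColumn("big", F.col("value") > 10)

    transformer = PySparkTransformer("add_flag", add_flag)
    # Plain records are converted internally before add_flag runs.
    result_df = transformer.transform([{"value": 5}, {"value": 42}])
    result_df.show()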
aptdata/plugins/vector/base.py
ADDED
@@ -0,0 +1,19 @@
"""Base abstractions for vector DB writers."""

from __future__ import annotations

from abc import abstractmethod
from typing import Any

from aptdata.core.dataset import BaseDataset
from aptdata.plugins.base import BaseWriter


class VectorWriter(BaseWriter):
    """Base writer for vector databases."""

    @abstractmethod
    def write(
        self, dataset: BaseDataset, **kwargs: Any
    ) -> None:  # pragma: no cover - interface only
        """Persist vectors from *dataset* into the destination vector store."""
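Since ``VectorWriter`` only pins down the ``write`` signature, a concrete backend can be sketched in a few lines. ``PrintingVectorWriter`` is hypothetical and assumes ``BaseWriter`` imposes no further abstract methods; ``QdrantWriter`` below is the real in-package implementation::

    from typing import Any

    from aptdata.core.dataset import BaseDataset
    from aptdata.plugins.vector.base import VectorWriter

    class PrintingVectorWriter(VectorWriter):
        """Illustrative only; reports the row count instead of persisting."""

        def write(self, dataset: BaseDataset, **kwargs: Any) -> None:
            rows = dataset.read()
            print(f"would persist {len(rows)} vectors")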
aptdata/plugins/vector/qdrant.py
ADDED
@@ -0,0 +1,41 @@
"""Qdrant vector writer plugin."""

from __future__ import annotations

from typing import Any

from opentelemetry import trace

from aptdata.core.dataset import BaseDataset
from aptdata.plugins.vector.base import VectorWriter


class QdrantWriter(VectorWriter):
    """Write embeddings to an in-memory Qdrant-like collection buffer."""

    def __init__(
        self, *, collection: str, vector_column: str = "artigo_chunk_embedding"
    ) -> None:
        self.collection = collection
        self.vector_column = vector_column
        self.points: list[dict[str, Any]] = []

    def write(self, dataset: BaseDataset, **kwargs: Any) -> None:
        rows: list[dict[str, Any]] = dataset.read()
        with trace.get_tracer("aptdata.plugins.vector").start_as_current_span(
            "QdrantWriter.write"
        ) as span:
            for index, row in enumerate(rows):
                vector = row.get(self.vector_column)
                if vector is None:
                    continue
                point = {
                    "id": row.get("document_id")
                    or row.get("id")
                    or f"{self.collection}-{index}",
                    "vector": vector,
                    "payload": row,
                }
                self.points.append(point)
            span.set_attribute("aptdata.vector.collection", self.collection)
            span.set_attribute("aptdata.vector.points", len(self.points))
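A usage sketch for the buffer-backed writer: points accumulate in ``self.points`` rather than being sent to a Qdrant server, so inspection is plain attribute access (the sample rows and collection name are made up)::

    from aptdata.plugins.dataset import InMemoryDataset
    from aptdata.plugins.vector.qdrant import QdrantWriter

    ds = InMemoryDataset(uri="memory://chunks")
    ds.write([
        {"document_id": "doc-1", "artigo_chunk_embedding": [0.1, 0.2, 0.3]},
        {"document_id": "doc-2"},  # no vector -> skipped by the writer
    ])

    writer = QdrantWriter(collection="articles")
    writer.write(ds)
    assert len(writer.points) == 1  # only rows carrying a vector are buffered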