aptdata 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. aptdata/__init__.py +3 -0
  2. aptdata/cli/__init__.py +5 -0
  3. aptdata/cli/app.py +247 -0
  4. aptdata/cli/commands/__init__.py +9 -0
  5. aptdata/cli/commands/config_cmd.py +128 -0
  6. aptdata/cli/commands/mesh_cmd.py +435 -0
  7. aptdata/cli/commands/plugin_cmd.py +107 -0
  8. aptdata/cli/commands/system_cmd.py +90 -0
  9. aptdata/cli/commands/telemetry_cmd.py +57 -0
  10. aptdata/cli/completions.py +56 -0
  11. aptdata/cli/interactive.py +269 -0
  12. aptdata/cli/rendering/__init__.py +31 -0
  13. aptdata/cli/rendering/console.py +119 -0
  14. aptdata/cli/rendering/logger.py +26 -0
  15. aptdata/cli/rendering/panels.py +87 -0
  16. aptdata/cli/rendering/tables.py +81 -0
  17. aptdata/cli/scaffold.py +1089 -0
  18. aptdata/config/__init__.py +13 -0
  19. aptdata/config/parser.py +136 -0
  20. aptdata/config/schema.py +27 -0
  21. aptdata/config/secrets.py +60 -0
  22. aptdata/core/__init__.py +46 -0
  23. aptdata/core/context.py +31 -0
  24. aptdata/core/dataset.py +39 -0
  25. aptdata/core/lineage.py +213 -0
  26. aptdata/core/state.py +27 -0
  27. aptdata/core/system.py +317 -0
  28. aptdata/core/workflow.py +372 -0
  29. aptdata/mcp/__init__.py +5 -0
  30. aptdata/mcp/server.py +198 -0
  31. aptdata/plugins/__init__.py +77 -0
  32. aptdata/plugins/ai/__init__.py +6 -0
  33. aptdata/plugins/ai/chunking.py +66 -0
  34. aptdata/plugins/ai/embeddings.py +56 -0
  35. aptdata/plugins/base.py +57 -0
  36. aptdata/plugins/dataset.py +62 -0
  37. aptdata/plugins/governance/__init__.py +32 -0
  38. aptdata/plugins/governance/catalog.py +115 -0
  39. aptdata/plugins/governance/classification.py +44 -0
  40. aptdata/plugins/governance/lineage_store.py +49 -0
  41. aptdata/plugins/governance/rules.py +180 -0
  42. aptdata/plugins/local_fs.py +241 -0
  43. aptdata/plugins/manager.py +142 -0
  44. aptdata/plugins/postgres.py +113 -0
  45. aptdata/plugins/quality/__init__.py +39 -0
  46. aptdata/plugins/quality/contract.py +128 -0
  47. aptdata/plugins/quality/expectations.py +310 -0
  48. aptdata/plugins/quality/report.py +94 -0
  49. aptdata/plugins/quality/validator.py +139 -0
  50. aptdata/plugins/rest.py +135 -0
  51. aptdata/plugins/transform/__init__.py +14 -0
  52. aptdata/plugins/transform/pandas.py +129 -0
  53. aptdata/plugins/transform/spark.py +134 -0
  54. aptdata/plugins/vector/__init__.py +6 -0
  55. aptdata/plugins/vector/base.py +19 -0
  56. aptdata/plugins/vector/qdrant.py +41 -0
  57. aptdata/telemetry/__init__.py +5 -0
  58. aptdata/telemetry/instrumentation.py +164 -0
  59. aptdata/tui/__init__.py +5 -0
  60. aptdata/tui/monitor.py +279 -0
  61. aptdata-0.0.2.dist-info/METADATA +330 -0
  62. aptdata-0.0.2.dist-info/RECORD +65 -0
  63. aptdata-0.0.2.dist-info/WHEEL +4 -0
  64. aptdata-0.0.2.dist-info/entry_points.txt +3 -0
  65. aptdata-0.0.2.dist-info/licenses/LICENSE +21 -0
aptdata/plugins/quality/validator.py
@@ -0,0 +1,139 @@
+ """Quality validator — wraps expectations into a workflow-compatible step."""
+
+ from __future__ import annotations
+
+ import logging
+ import warnings
+ from typing import Any
+
+ from aptdata.plugins.quality.contract import EnforcementMode
+ from aptdata.plugins.quality.expectations import BaseExpectation
+ from aptdata.plugins.quality.report import CheckStatus, QualityReport
+ from aptdata.telemetry.instrumentation import get_tracer
+
+ logger = logging.getLogger(__name__)
+
+
+ class QualityValidator:
+     """Runs a suite of :class:`~.expectations.BaseExpectation` objects against data.
+
+     Parameters
+     ----------
+     expectations:
+         List of expectations to evaluate.
+     enforcement:
+         How to react when an expectation fails
+         (see :class:`~.contract.EnforcementMode`).
+     name:
+         Human-readable identifier used in OTel span names.
+
+     Examples
+     --------
+     ::
+
+         from aptdata.plugins.quality import (
+             QualityValidator, ExpectColumnToNotBeNull, EnforcementMode
+         )
+
+         validator = QualityValidator(
+             expectations=[ExpectColumnToNotBeNull("age")],
+             enforcement=EnforcementMode.ABORT,
+         )
+         clean_df = validator.validate(df)
+     """
+
+     def __init__(
+         self,
+         expectations: list[BaseExpectation],
+         enforcement: EnforcementMode = EnforcementMode.ABORT,
+         name: str = "QualityValidator",
+     ) -> None:
+         self.expectations = expectations
+         self.enforcement = enforcement
+         self.name = name
+
+     def validate(self, data: Any) -> Any:
+         """Validate *data* against all expectations and return *data*.
+
+         This method is compatible with :meth:`~aptdata.core.workflow.Workflow.add_step`.
+         On success the original *data* object is returned unchanged.
+
+         Parameters
+         ----------
+         data:
+             A ``pd.DataFrame``, PySpark ``DataFrame``,
+             :class:`~aptdata.plugins.dataset.InMemoryDataset`, or
+             ``list[dict]``.
+
+         Returns
+         -------
+         Any
+             The original *data* object (pass-through).
+
+         Raises
+         ------
+         ValueError
+             When ``enforcement == ABORT`` and at least one expectation fails.
+         """
+         import pandas as pd  # type: ignore[import]
+
+         from aptdata.plugins.dataset import InMemoryDataset
+
+         # Resolve to a DataFrame for expectations.
+         resolved: Any
+         if isinstance(data, InMemoryDataset):
+             resolved = pd.DataFrame(data.read())
+         elif isinstance(data, list):
+             resolved = pd.DataFrame(data)
+         else:
+             resolved = data
+
+         dataset_uri = data.uri if isinstance(data, InMemoryDataset) else "unknown"
+         report = QualityReport(dataset_uri=dataset_uri)
+
+         tracer = get_tracer("aptdata.quality")
+         with tracer.start_as_current_span(self.name) as span:
+             span.set_attribute("aptdata.quality.validator_name", self.name)
+             span.set_attribute("aptdata.quality.enforcement", self.enforcement.value)
+             span.set_attribute(
+                 "aptdata.quality.num_expectations", len(self.expectations)
+             )
+
+             for expectation in self.expectations:
+                 result = expectation.validate(resolved)
+                 report.checks.append(result)
+
+             passed = report.passed
+             span.set_attribute("aptdata.quality.passed", passed)
+             span.set_attribute("aptdata.quality.num_checks", len(report.checks))
+             failed_count = report.summary.get(CheckStatus.FAILED, 0)
+             span.set_attribute("aptdata.quality.failed_checks", failed_count)
+
+             if not passed:
+                 failed_checks = [c for c in report.checks if c.status == CheckStatus.FAILED]
+                 summary_msg = "; ".join(c.message for c in failed_checks)
+
+                 if self.enforcement == EnforcementMode.ABORT:
+                     raise ValueError(
+                         f"Data quality validation failed [{self.name}]: {summary_msg}"
+                     )
+                 elif self.enforcement == EnforcementMode.WARN:
+                     warnings.warn(
+                         f"Data quality warning [{self.name}]: {summary_msg}",
+                         stacklevel=2,
+                     )
+                     logger.warning("Quality validation warning: %s", summary_msg)
+                 else:
+                     # TAG mode — attach quality metadata; pass-through
+                     if hasattr(data, "schema_metadata") and isinstance(
+                         data.schema_metadata, dict
+                     ):
+                         data.schema_metadata["quality_report"] = {
+                             "passed": False,
+                             "failed_checks": [c.expectation_name for c in failed_checks],
+                         }
+
+         return data
+
+
+ __all__ = ["QualityValidator"]
aptdata/plugins/rest.py
@@ -0,0 +1,135 @@
+ """REST API reader plugin.
+
+ Provides :class:`APIReader` — a generic reader that fetches data from a
+ RESTful JSON endpoint using HTTP GET, with optional header injection
+ (e.g. for bearer tokens) and simple page-number pagination.
+
+ Requires the optional ``httpx`` package. A friendly
+ :class:`~aptdata.plugins.manager.PluginDependencyError` is raised
+ when it is not installed.
+ """
+
+ from __future__ import annotations
+
+ from typing import Any
+
+ from aptdata.plugins.base import BaseReader
+ from aptdata.plugins.dataset import InMemoryDataset
+ from aptdata.plugins.manager import PluginDependencyError
+
+
+ def _require_httpx() -> Any:
+     """Import and return the ``httpx`` module, or raise a friendly error."""
+     try:
+         import httpx  # noqa: WPS433
+     except ImportError:
+         raise PluginDependencyError("APIReader", "httpx") from None
+     return httpx
+
+
+ class APIReader(BaseReader):
+     """Read JSON data from a REST API endpoint.
+
+     Parameters
+     ----------
+     endpoint:
+         The URL to send GET requests to.
+     headers:
+         Optional HTTP headers (e.g. ``{"Authorization": "Bearer xxx"}``).
+     params:
+         Optional query-string parameters merged into every request.
+     pagination_key:
+         When set, the reader fetches pages until the response JSON list is
+         empty. The value is the query-string parameter name that carries the
+         page number (e.g. ``"page"``). Pages start at ``1``.
+     max_pages:
+         Safety limit on the number of pages fetched (default ``100``).
+     timeout:
+         HTTP request timeout in seconds (default ``30``).
+     records_path:
+         Optional dot-separated path into the JSON response that contains
+         the list of records (e.g. ``"data.items"``). When ``None`` the
+         response itself must be a JSON array.
+     """
+
+     def __init__(
+         self,
+         endpoint: str,
+         *,
+         headers: dict[str, str] | None = None,
+         params: dict[str, Any] | None = None,
+         pagination_key: str | None = None,
+         max_pages: int = 100,
+         timeout: float = 30,
+         records_path: str | None = None,
+     ) -> None:
+         self.endpoint = endpoint
+         self.headers = headers or {}
+         self.params = params or {}
+         self.pagination_key = pagination_key
+         self.max_pages = max_pages
+         self.timeout = timeout
+         self.records_path = records_path
+
+     # -- helpers ------------------------------------------------------------
+
+     @staticmethod
+     def _extract_records(body: Any, path: str | None) -> list[dict[str, Any]]:
+         """Navigate *path* inside *body* and return the records list."""
+         if path is None:
+             if isinstance(body, list):
+                 return body
+             raise ValueError(
+                 "Expected a JSON array from the API response. "
+                 "Set 'records_path' if records are nested."
+             )
+         current: Any = body
+         for key in path.split("."):
+             if isinstance(current, dict):
+                 current = current[key]
+             else:
+                 raise ValueError(f"Cannot traverse key '{key}' in a non-dict value.")
+         if not isinstance(current, list):
+             raise ValueError(
+                 f"Expected a list at path '{path}', got {type(current).__name__}."
+             )
+         return current
+
+     # -- BaseReader ---------------------------------------------------------
+
+     def read(self, **kwargs: Any) -> InMemoryDataset:
+         httpx = _require_httpx()
+
+         all_records: list[dict[str, Any]] = []
+
+         with httpx.Client(timeout=self.timeout) as client:
+             if self.pagination_key is None:
+                 response = client.get(
+                     self.endpoint,
+                     headers=self.headers,
+                     params=self.params,
+                 )
+                 response.raise_for_status()
+                 all_records = self._extract_records(response.json(), self.records_path)
+             else:
+                 for page in range(1, self.max_pages + 1):
+                     params = {**self.params, self.pagination_key: page}
+                     response = client.get(
+                         self.endpoint,
+                         headers=self.headers,
+                         params=params,
+                     )
+                     response.raise_for_status()
+                     page_records = self._extract_records(
+                         response.json(), self.records_path
+                     )
+                     if not page_records:
+                         break
+                     all_records.extend(page_records)
+
+         ds = InMemoryDataset(uri=self.endpoint)
+         ds.write(all_records)
+         return ds
+
+
+ __all__ = ["APIReader"]
aptdata/plugins/transform/__init__.py
@@ -0,0 +1,14 @@
+ """Transform plugin package — engine-agnostic transformation wrappers.
+
+ Provides :class:`PandasTransformer` and :class:`PySparkTransformer` as
+ concrete :class:`~aptdata.plugins.base.BaseTransformer` implementations.
+ Both use lazy imports so the framework core works without pandas or pyspark
+ installed.
+ """
+
+ from __future__ import annotations
+
+ from aptdata.plugins.transform.pandas import PandasTransformer
+ from aptdata.plugins.transform.spark import PySparkTransformer
+
+ __all__ = ["PandasTransformer", "PySparkTransformer"]
aptdata/plugins/transform/pandas.py
@@ -0,0 +1,129 @@
+ """Pandas-based data transformer.
+
+ Uses lazy imports so that the framework core works even when pandas is not
+ installed. A :class:`~aptdata.plugins.manager.PluginDependencyError` is
+ raised at instantiation time if pandas is not available.
+ """
+
+ from __future__ import annotations
+
+ import time
+ from collections.abc import Callable
+ from typing import TYPE_CHECKING, Any
+
+ from aptdata.plugins.base import BaseTransformer
+ from aptdata.plugins.manager import PluginDependencyError
+ from aptdata.telemetry.instrumentation import get_tracer
+
+ if TYPE_CHECKING:
+     import pandas as pd
+
+
+ class PandasTransformer(BaseTransformer):
+     """Engine-agnostic transformer backed by a pandas callable.
+
+     Parameters
+     ----------
+     name:
+         Human-readable identifier for this transformer.
+     transform_func:
+         A callable ``(pd.DataFrame) -> pd.DataFrame`` that applies the
+         desired transformation.
+
+     Examples
+     --------
+     ::
+
+         from aptdata.plugins.transform import PandasTransformer
+
+         def double_values(df):
+             return df.assign(value=df["value"] * 2)
+
+         transformer = PandasTransformer("double_values", double_values)
+         result = transformer.transform(my_dataset)
+     """
+
+     def __init__(
+         self,
+         name: str,
+         transform_func: Callable[[Any], Any],
+     ) -> None:
+         try:
+             import pandas  # noqa: F401
+         except ImportError as exc:
+             raise PluginDependencyError("PandasTransformer", "pandas") from exc
+         self._name = name
+         self._transform_func = transform_func
+
+     @property
+     def name(self) -> str:
+         """Human-readable name of this transformer."""
+         return self._name
+
+     def transform(self, data: Any) -> Any:
+         """Apply the transformation to *data*.
+
+         Parameters
+         ----------
+         data:
+             Accepts a :class:`~aptdata.plugins.dataset.InMemoryDataset`,
+             a ``pd.DataFrame``, or a ``list[dict]``.
+
+         Returns
+         -------
+         Any
+             The transformed result. If the input was an
+             :class:`~aptdata.plugins.dataset.InMemoryDataset` the output
+             will also be one; otherwise a ``pd.DataFrame`` is returned.
+         """
+         import pandas as pd
+
+         from aptdata.plugins.dataset import InMemoryDataset
+
+         return_dataset = False
+         original_uri = "memory://transformed"
+         original_schema: dict[str, Any] = {}
+
+         if isinstance(data, InMemoryDataset):
+             return_dataset = True
+             original_uri = data.uri
+             original_schema = data.schema_metadata
+             df: pd.DataFrame = pd.DataFrame(data.read())
+         elif isinstance(data, pd.DataFrame):
+             df = data
+         elif isinstance(data, list):
+             df = pd.DataFrame(data)
+         else:
+             df = pd.DataFrame([data] if data is not None else [])
+
+         rows_in = len(df)
+
+         tracer = get_tracer("aptdata.transform")
+         with tracer.start_as_current_span(self._name) as span:
+             span.set_attribute("aptdata.transformer.name", self._name)
+             span.set_attribute("aptdata.transformer.engine", "pandas")
+             span.set_attribute("aptdata.transformer.rows_in", rows_in)
+
+             t0 = time.perf_counter()
+             result_df: pd.DataFrame = self._transform_func(df)
+             compute_time_ms = (time.perf_counter() - t0) * 1000.0
+
+             rows_out = len(result_df)
+             columns_out = list(result_df.columns)
+             span.set_attribute("aptdata.transformer.rows_out", rows_out)
+             span.set_attribute("aptdata.transformer.columns_out", str(columns_out))
+             span.set_attribute(
+                 "aptdata.transformer.compute_time_ms", round(compute_time_ms, 3)
+             )
+
+         if return_dataset:
+             out_dataset = InMemoryDataset(
+                 uri=original_uri, schema_metadata=original_schema
+             )
+             out_dataset.write(result_df.to_dict(orient="records"))
+             return out_dataset
+
+         return result_df
+
+
+ __all__ = ["PandasTransformer"]
aptdata/plugins/transform/spark.py
@@ -0,0 +1,134 @@
+ """PySpark-based data transformer.
+
+ Uses lazy imports so that the framework core works even when pyspark is not
+ installed. A :class:`~aptdata.plugins.manager.PluginDependencyError` is
+ raised at instantiation time if pyspark is not available.
+ """
+
+ from __future__ import annotations
+
+ import time
+ from collections.abc import Callable
+ from typing import Any
+
+ from aptdata.plugins.base import BaseTransformer
+ from aptdata.plugins.manager import PluginDependencyError
+ from aptdata.telemetry.instrumentation import get_tracer
+
+
+ class PySparkTransformer(BaseTransformer):
+     """Engine-agnostic transformer backed by a PySpark callable.
+
+     Parameters
+     ----------
+     name:
+         Human-readable identifier for this transformer.
+     transform_func:
+         A callable ``(SparkSession, DataFrame) -> DataFrame`` that applies
+         the desired transformation.
+     app_name:
+         Spark application name passed to ``SparkSession.builder``.
+
+     Examples
+     --------
+     ::
+
+         from aptdata.plugins.transform import PySparkTransformer
+
+         def double_values(spark, df):
+             from pyspark.sql import functions as F
+             return df.withColumn("value", F.col("value") * 2)
+
+         transformer = PySparkTransformer("double_values", double_values)
+         result = transformer.transform(my_df)
+     """
+
+     def __init__(
+         self,
+         name: str,
+         transform_func: Callable[[Any, Any], Any],
+         app_name: str = "SmartData",
+     ) -> None:
+         try:
+             import pyspark  # noqa: F401
+         except ImportError as exc:
+             raise PluginDependencyError("PySparkTransformer", "pyspark") from exc
+         self._name = name
+         self._transform_func = transform_func
+         self._app_name = app_name
+
+     @property
+     def name(self) -> str:
+         """Human-readable name of this transformer."""
+         return self._name
+
+     def transform(self, data: Any) -> Any:
+         """Apply the transformation to *data*.
+
+         Parameters
+         ----------
+         data:
+             Accepts a PySpark ``DataFrame``, a ``list[dict]``, or an
+             :class:`~aptdata.plugins.dataset.InMemoryDataset`.
+
+         Returns
+         -------
+         Any
+             The transformed PySpark ``DataFrame``.
+         """
+         from pyspark.sql import SparkSession
+         from pyspark.sql.types import StructType
+
+         from aptdata.plugins.dataset import InMemoryDataset
+
+         spark: Any = SparkSession.builder.appName(self._app_name).getOrCreate()
+
+         # Convert input to a PySpark DataFrame if needed.
+         if isinstance(data, InMemoryDataset):
+             records = data.read()
+             df = (
+                 spark.createDataFrame(records)
+                 if records
+                 else spark.createDataFrame([], StructType([]))
+             )
+         elif isinstance(data, list):
+             df = (
+                 spark.createDataFrame(data)
+                 if data
+                 else spark.createDataFrame([], StructType([]))
+             )
+         else:
+             # Assume it's already a PySpark DataFrame.
+             df = data
+
+         rows_in = df.count()
+
+         tracer = get_tracer("aptdata.transform")
+         with tracer.start_as_current_span(self._name) as span:
+             span.set_attribute("aptdata.transformer.name", self._name)
+             span.set_attribute("aptdata.transformer.engine", "pyspark")
+             span.set_attribute("aptdata.transformer.rows_in", rows_in)
+             span.set_attribute("aptdata.spark.app_name", self._app_name)
+
+             # Try to capture Spark UI URL.
+             try:
+                 ui_url = spark.sparkContext.uiWebUrl
+                 if ui_url:
+                     span.set_attribute("aptdata.spark.ui_url", ui_url)
+             except Exception:  # noqa: BLE001
+                 pass
+
+             t0 = time.perf_counter()
+             result_df = self._transform_func(spark, df)
+             compute_time_ms = (time.perf_counter() - t0) * 1000.0
+
+             rows_out = result_df.count()
+             span.set_attribute("aptdata.transformer.rows_out", rows_out)
+             span.set_attribute(
+                 "aptdata.transformer.compute_time_ms", round(compute_time_ms, 3)
+             )
+
+         return result_df
+
+
+ __all__ = ["PySparkTransformer"]
aptdata/plugins/vector/__init__.py
@@ -0,0 +1,6 @@
+ """Vector database writers."""
+
+ from aptdata.plugins.vector.base import VectorWriter
+ from aptdata.plugins.vector.qdrant import QdrantWriter
+
+ __all__ = ["VectorWriter", "QdrantWriter"]
aptdata/plugins/vector/base.py
@@ -0,0 +1,19 @@
+ """Base abstractions for vector DB writers."""
+
+ from __future__ import annotations
+
+ from abc import abstractmethod
+ from typing import Any
+
+ from aptdata.core.dataset import BaseDataset
+ from aptdata.plugins.base import BaseWriter
+
+
+ class VectorWriter(BaseWriter):
+     """Base writer for vector databases."""
+
+     @abstractmethod
+     def write(
+         self, dataset: BaseDataset, **kwargs: Any
+     ) -> None:  # pragma: no cover - interface only
+         """Persist vectors from *dataset* into the destination vector store."""
aptdata/plugins/vector/qdrant.py
@@ -0,0 +1,41 @@
+ """Qdrant vector writer plugin."""
+
+ from __future__ import annotations
+
+ from typing import Any
+
+ from opentelemetry import trace
+
+ from aptdata.core.dataset import BaseDataset
+ from aptdata.plugins.vector.base import VectorWriter
+
+
+ class QdrantWriter(VectorWriter):
+     """Write embeddings to an in-memory Qdrant-like collection buffer."""
+
+     def __init__(
+         self, *, collection: str, vector_column: str = "artigo_chunk_embedding"
+     ) -> None:
+         self.collection = collection
+         self.vector_column = vector_column
+         self.points: list[dict[str, Any]] = []
+
+     def write(self, dataset: BaseDataset, **kwargs: Any) -> None:
+         rows: list[dict[str, Any]] = dataset.read()
+         with trace.get_tracer("aptdata.plugins.vector").start_as_current_span(
+             "QdrantWriter.write"
+         ) as span:
+             for index, row in enumerate(rows):
+                 vector = row.get(self.vector_column)
+                 if vector is None:
+                     continue
+                 point = {
+                     "id": row.get("document_id")
+                     or row.get("id")
+                     or f"{self.collection}-{index}",
+                     "vector": vector,
+                     "payload": row,
+                 }
+                 self.points.append(point)
+             span.set_attribute("aptdata.vector.collection", self.collection)
+             span.set_attribute("aptdata.vector.points", len(self.points))
aptdata/telemetry/__init__.py
@@ -0,0 +1,5 @@
+ """Telemetry helpers for aptdata."""
+
+ from aptdata.telemetry.instrumentation import configure_telemetry, get_meter, get_tracer
+
+ __all__ = ["configure_telemetry", "get_meter", "get_tracer"]