msaas-data-export 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
1
+ node_modules/
2
+ dist/
3
+ .next/
4
+ .turbo/
5
+ *.pyc
6
+ __pycache__/
7
+ .venv/
8
+ *.egg-info/
9
+ .pytest_cache/
10
+ .ruff_cache/
11
+ .env
12
+ .env.*
13
+ !.env.example
14
+ !.env.*.example
15
+ !.env.*.template
16
+ .DS_Store
17
+ coverage/
18
+
19
+ # Runtime artifacts
20
+ logs_llm/
21
+ vectors.db
22
+ vectors.db-shm
23
+ vectors.db-wal
@@ -0,0 +1,20 @@
1
+ Metadata-Version: 2.4
2
+ Name: msaas-data-export
3
+ Version: 0.1.0
4
+ Summary: Data export and GDPR compliance library for the Willian SaaS platform
5
+ License: MIT
6
+ Requires-Python: >=3.12
7
+ Requires-Dist: fastapi>=0.115.0
8
+ Requires-Dist: msaas-api-core
9
+ Requires-Dist: msaas-errors
10
+ Requires-Dist: pydantic>=2.0
11
+ Provides-Extra: dev
12
+ Requires-Dist: httpx>=0.27.0; extra == 'dev'
13
+ Requires-Dist: openpyxl>=3.1.0; extra == 'dev'
14
+ Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
15
+ Requires-Dist: pytest>=8.0; extra == 'dev'
16
+ Requires-Dist: reportlab>=4.0; extra == 'dev'
17
+ Provides-Extra: excel
18
+ Requires-Dist: openpyxl>=3.1.0; extra == 'excel'
19
+ Provides-Extra: pdf
20
+ Requires-Dist: reportlab>=4.0; extra == 'pdf'
@@ -0,0 +1,42 @@
1
+ [project]
2
+ name = "msaas-data-export"
3
+ version = "0.1.0"
4
+ description = "Data export and GDPR compliance library for the Willian SaaS platform"
5
+ requires-python = ">=3.12"
6
+ license = { text = "MIT" }
7
+ dependencies = [
8
+ "msaas-api-core",
9
+ "msaas-errors",
10
+ "fastapi>=0.115.0",
11
+ "pydantic>=2.0",
12
+ ]
13
+
14
+ [project.optional-dependencies]
15
+ excel = ["openpyxl>=3.1.0"]
16
+ pdf = ["reportlab>=4.0"]
17
+ dev = [
18
+ "pytest>=8.0",
19
+ "pytest-asyncio>=0.24.0",
20
+ "httpx>=0.27.0",
21
+ "openpyxl>=3.1.0",
22
+ "reportlab>=4.0",
23
+ ]
24
+
25
+ [build-system]
26
+ requires = ["hatchling"]
27
+ build-backend = "hatchling.build"
28
+
29
+ [tool.hatch.build.targets.wheel]
30
+ packages = ["src/data_export"]
31
+
32
+ [tool.pytest.ini_options]
33
+ asyncio_mode = "auto"
34
+ testpaths = ["tests"]
35
+
36
+ [tool.ruff]
37
+ target-version = "py312"
38
+ line-length = 100
39
+
40
+ [tool.uv.sources]
41
+ msaas-api-core = { workspace = true }
42
+ msaas-errors = { workspace = true }
@@ -0,0 +1,40 @@
1
+ """Willian Data Export -- Data export and GDPR compliance library."""
2
+
3
+ from data_export.bulk import BulkImporter
4
+ from data_export.config import ExportConfig, get_export_service, init_export
5
+ from data_export.formats.base import ExportOptions
6
+ from data_export.gdpr import DataSource, GDPRService
7
+ from data_export.models import (
8
+ ColumnConfig,
9
+ ExportFormat,
10
+ ExportRequest,
11
+ ExportResult,
12
+ GDPRRequest,
13
+ GDPRRequestStatus,
14
+ GDPRRequestType,
15
+ ImportError_,
16
+ ImportResult,
17
+ )
18
+ from data_export.router import ExportRouter
19
+ from data_export.service import ExportService
20
+
21
# Public API surface of the data_export package (alphabetically ordered).
__all__ = [
    "BulkImporter",
    "ColumnConfig",
    "DataSource",
    "ExportConfig",
    "ExportFormat",
    "ExportOptions",
    "ExportRequest",
    "ExportResult",
    "ExportRouter",
    "ExportService",
    "GDPRRequest",
    "GDPRRequestStatus",
    "GDPRRequestType",
    "GDPRService",
    "ImportError_",
    "ImportResult",
    "get_export_service",
    "init_export",
]
@@ -0,0 +1,145 @@
1
+ """Bulk import operations with validation and error reporting."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ import io
7
+ import json
8
+ import logging
9
+ from collections.abc import Callable, Coroutine
10
+ from typing import Any
11
+
12
+ from pydantic import BaseModel, ValidationError
13
+
14
+ from data_export.models import ImportError_, ImportResult
15
+
16
# Module-scoped logger used by BulkImporter's import/dry-run summaries.
logger = logging.getLogger(__name__)
17
+
18
+
19
+ class BulkImporter:
20
+ """Import CSV or JSON data with per-row validation and error reporting."""
21
+
22
+ def validate_row(self, row: dict[str, Any], schema: type[BaseModel]) -> list[str]:
23
+ """Validate a single row against a Pydantic schema.
24
+
25
+ Args:
26
+ row: The data row to validate.
27
+ schema: Pydantic model class for validation.
28
+
29
+ Returns:
30
+ List of error messages. Empty list means the row is valid.
31
+ """
32
+ try:
33
+ schema.model_validate(row)
34
+ return []
35
+ except ValidationError as exc:
36
+ return [
37
+ f"{err['loc'][0] if err['loc'] else 'unknown'}: {err['msg']}"
38
+ for err in exc.errors()
39
+ ]
40
+
41
+ async def import_with_report(
42
+ self,
43
+ file_data: bytes | str,
44
+ schema: type[BaseModel],
45
+ process_fn: Callable[[dict[str, Any]], Coroutine[Any, Any, None]],
46
+ *,
47
+ file_format: str = "csv",
48
+ ) -> ImportResult:
49
+ """Import data with validation and processing, returning a detailed report.
50
+
51
+ Args:
52
+ file_data: Raw file content as bytes or string.
53
+ schema: Pydantic model class for row validation.
54
+ process_fn: Async callable to process each valid row.
55
+ file_format: Input format, either 'csv' or 'json'.
56
+
57
+ Returns:
58
+ ImportResult with success/error counts and error details.
59
+ """
60
+ rows = self._parse_file(file_data, file_format)
61
+ total = len(rows)
62
+ success = 0
63
+ errors: list[ImportError_] = []
64
+
65
+ for row_num, row in enumerate(rows, start=1):
66
+ validation_errors = self.validate_row(row, schema)
67
+
68
+ if validation_errors:
69
+ for msg in validation_errors:
70
+ field = msg.split(":")[0] if ":" in msg else None
71
+ errors.append(ImportError_(row=row_num, field=field, message=msg))
72
+ continue
73
+
74
+ try:
75
+ await process_fn(row)
76
+ success += 1
77
+ except Exception as exc:
78
+ errors.append(ImportError_(row=row_num, message=str(exc)))
79
+
80
+ logger.info(
81
+ "Bulk import completed: total=%d success=%d errors=%d", total, success, len(errors)
82
+ )
83
+ return ImportResult(total=total, success=success, errors=errors)
84
+
85
+ async def dry_run(
86
+ self,
87
+ file_data: bytes | str,
88
+ schema: type[BaseModel],
89
+ *,
90
+ file_format: str = "csv",
91
+ ) -> ImportResult:
92
+ """Validate all rows without importing, returning a validation report.
93
+
94
+ Args:
95
+ file_data: Raw file content.
96
+ schema: Pydantic model class for row validation.
97
+ file_format: Input format, either 'csv' or 'json'.
98
+
99
+ Returns:
100
+ ImportResult with validation results (no data is modified).
101
+ """
102
+ rows = self._parse_file(file_data, file_format)
103
+ total = len(rows)
104
+ success = 0
105
+ errors: list[ImportError_] = []
106
+
107
+ for row_num, row in enumerate(rows, start=1):
108
+ validation_errors = self.validate_row(row, schema)
109
+ if validation_errors:
110
+ for msg in validation_errors:
111
+ field = msg.split(":")[0] if ":" in msg else None
112
+ errors.append(ImportError_(row=row_num, field=field, message=msg))
113
+ else:
114
+ success += 1
115
+
116
+ logger.info("Dry run completed: total=%d valid=%d invalid=%d", total, success, len(errors))
117
+ return ImportResult(total=total, success=success, errors=errors)
118
+
119
+ def _parse_file(self, file_data: bytes | str, file_format: str) -> list[dict[str, Any]]:
120
+ """Parse file content into a list of row dictionaries.
121
+
122
+ Args:
123
+ file_data: Raw file content.
124
+ file_format: Either 'csv' or 'json'.
125
+
126
+ Returns:
127
+ List of row dictionaries.
128
+
129
+ Raises:
130
+ ValueError: If the format is unsupported or parsing fails.
131
+ """
132
+ if isinstance(file_data, bytes):
133
+ file_data = file_data.decode("utf-8")
134
+
135
+ match file_format.lower():
136
+ case "csv":
137
+ reader = csv.DictReader(io.StringIO(file_data))
138
+ return list(reader)
139
+ case "json":
140
+ parsed = json.loads(file_data)
141
+ if not isinstance(parsed, list):
142
+ raise ValueError("JSON import data must be an array of objects")
143
+ return parsed
144
+ case _:
145
+ raise ValueError(f"Unsupported import format: {file_format}")
@@ -0,0 +1,87 @@
1
+ """Export module configuration and initialization."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from tempfile import gettempdir
7
+
8
+ from pydantic import BaseModel
9
+
10
+ from data_export.models import ExportFormat
11
+
12
+
13
# Module-level singleton state; mutated only by init_export(), set_config(),
# get_export_service(), and reset().
_config: ExportConfig | None = None
_service: object | None = None  # Lazy ref to avoid circular import
15
+
16
+
17
class ExportConfig(BaseModel):
    """Configuration for the data export module.

    Attributes:
        max_rows: Maximum rows allowed in a single export.
        default_format: Default export format when none is specified.
        temp_dir: Directory for temporary export files.
        gdpr_retention_days: How long (in days) to keep GDPR request records.
        streaming_chunk_size: Number of rows per chunk in streaming exports.
    """

    max_rows: int = 100_000
    default_format: ExportFormat = ExportFormat.CSV
    # NOTE(review): gettempdir() is resolved once at class-definition time, so
    # all instances share that default path — confirm this is intended.
    temp_dir: Path = Path(gettempdir())
    gdpr_retention_days: int = 90
    streaming_chunk_size: int = 1000
33
+
34
+
35
def init_export(config: ExportConfig | None = None) -> ExportConfig:
    """Initialize the export module with the given configuration.

    Any previously cached service instance is discarded so it will be
    rebuilt lazily against the new configuration.

    Args:
        config: Export configuration. Uses defaults if None.

    Returns:
        The active ExportConfig.
    """
    global _config, _service
    if config is None:
        config = ExportConfig()
    _config = config
    _service = None  # Drop the cached service; it is recreated on next access.
    return _config
48
+
49
+
50
def get_config() -> ExportConfig:
    """Return the current module configuration.

    Raises:
        RuntimeError: If init_export() has not been called.
    """
    config = _config
    if config is None:
        raise RuntimeError("Export module not initialized. Call init_export() first.")
    return config
59
+
60
+
61
def get_export_service():
    """Return or create the singleton ExportService.

    The service is created lazily on first access and cached; init_export()
    clears the cache so a replacement picks up fresh configuration.

    Raises:
        RuntimeError: If init_export() has not been called.
    """
    global _service
    # Delegate the initialization check so the error message stays in one place.
    config = get_config()
    if _service is None:
        # Imported here to avoid a circular import at module load time.
        from data_export.service import ExportService

        _service = ExportService(config)
    return _service
75
+
76
+
77
def set_config(config: ExportConfig) -> None:
    """Override the active configuration (useful for testing).

    Unlike init_export(), the cached service is left untouched.
    """
    global _config
    _config = config
81
+
82
+
83
def reset() -> None:
    """Clear all module state so tests start from a clean slate."""
    global _config, _service
    _service = None
    _config = None
@@ -0,0 +1,62 @@
1
+ """Export format implementations."""
2
+
3
+ from data_export.formats.base import BaseExportFormat, ExportOptions
4
+ from data_export.formats.csv_format import CSVExportFormat
5
+ from data_export.formats.json_format import JSONExportFormat
6
+
7
# Re-exported public names; get_format() is the factory entry point.
__all__ = [
    "BaseExportFormat",
    "CSVExportFormat",
    "ExportOptions",
    "JSONExportFormat",
    "get_format",
]
14
+
15
+
16
def get_format(format_name: str) -> BaseExportFormat:
    """Return the appropriate format handler for the given format name.

    Args:
        format_name: One of 'csv', 'json', 'excel', 'pdf'.

    Returns:
        An instance of the corresponding format handler.

    Raises:
        ValueError: If the format is not supported or its optional dependency is missing.
    """
    if format_name == "csv":
        return CSVExportFormat()
    if format_name == "json":
        return JSONExportFormat()
    if format_name == "excel":
        # Deferred construction: raises ValueError when openpyxl is absent.
        return _get_excel_format()
    if format_name == "pdf":
        # Deferred construction: raises ValueError when reportlab is absent.
        return _get_pdf_format()
    raise ValueError(f"Unsupported export format: {format_name}")
39
+
40
+
41
+ def _get_excel_format() -> BaseExportFormat:
42
+ """Load Excel format, raising a clear error if openpyxl is missing."""
43
+ try:
44
+ from data_export.formats.excel import ExcelExportFormat
45
+
46
+ return ExcelExportFormat()
47
+ except ImportError:
48
+ raise ValueError(
49
+ "Excel export requires openpyxl. Install with: pip install willian-data-export[excel]"
50
+ ) from None
51
+
52
+
53
+ def _get_pdf_format() -> BaseExportFormat:
54
+ """Load PDF format, raising a clear error if reportlab is missing."""
55
+ try:
56
+ from data_export.formats.pdf_format import PDFExportFormat
57
+
58
+ return PDFExportFormat()
59
+ except ImportError:
60
+ raise ValueError(
61
+ "PDF export requires reportlab. Install with: pip install willian-data-export[pdf]"
62
+ ) from None
@@ -0,0 +1,84 @@
1
+ """Abstract base class for export formats."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from typing import Any
7
+
8
+ from pydantic import BaseModel
9
+
10
+ from data_export.models import ColumnConfig
11
+
12
+
13
class ExportOptions(BaseModel):
    """Options that control export behavior.

    Attributes:
        columns: Column configuration for field selection and renaming;
            None means "export every key of the first row".
        pretty: Enable human-readable formatting where applicable (e.g., JSON indent).
        delimiter: Field delimiter for CSV exports.
        encoding: Character encoding for text-based exports.
        bom: Include a byte-order mark for Excel CSV compatibility.
        sheet_name: Worksheet name for Excel exports.
    """

    columns: list[ColumnConfig] | None = None
    pretty: bool = False
    delimiter: str = ","
    encoding: str = "utf-8"
    bom: bool = False
    sheet_name: str = "Sheet1"
31
+
32
+
33
class BaseExportFormat(ABC):
    """Abstract base for all export format implementations.

    Concrete formats implement `export()` and expose `content_type` and
    `extension`; `apply_columns()` provides the shared column projection.
    """

    @property
    @abstractmethod
    def content_type(self) -> str:
        """MIME type for this format (e.g. 'text/csv')."""

    @property
    @abstractmethod
    def extension(self) -> str:
        """File extension without dot (e.g. 'csv')."""

    @abstractmethod
    def export(self, data: list[dict[str, Any]], options: ExportOptions | None = None) -> bytes:
        """Export data rows to bytes in this format.

        Args:
            data: List of dictionaries, each representing a row.
            options: Export options controlling columns, formatting, etc.

        Returns:
            The exported file content as bytes.
        """

    def apply_columns(
        self, data: list[dict[str, Any]], options: ExportOptions | None
    ) -> tuple[list[dict[str, Any]], list[str]]:
        """Apply column selection and renaming to the data.

        Args:
            data: Raw data rows.
            options: Export options with optional column config.

        Returns:
            Tuple of (transformed rows, ordered header names).
        """
        if not data:
            return [], []

        column_config = options.columns if options else None
        if not column_config:
            # No column config: the first row's keys define the headers.
            return data, list(data[0].keys())

        header_field_pairs = [(col.display_header, col.field) for col in column_config]
        headers = [header for header, _field in header_field_pairs]
        projected = [
            {header: record.get(field) for header, field in header_field_pairs}
            for record in data
        ]
        return projected, headers
@@ -0,0 +1,56 @@
1
+ """CSV export format implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ import io
7
+ from typing import Any
8
+
9
+ from data_export.formats.base import BaseExportFormat, ExportOptions
10
+
11
+
12
class CSVExportFormat(BaseExportFormat):
    """CSV exporter supporting custom delimiters, encodings, and an optional BOM."""

    @property
    def content_type(self) -> str:
        return "text/csv"

    @property
    def extension(self) -> str:
        return "csv"

    def export(self, data: list[dict[str, Any]], options: ExportOptions | None = None) -> bytes:
        """Serialize data rows to CSV bytes.

        Args:
            data: List of row dictionaries.
            options: Controls delimiter, encoding, BOM, and column mapping.

        Returns:
            CSV file content as bytes.
        """
        opts = options or ExportOptions()
        rows, headers = self.apply_columns(data, opts)

        if not headers:
            return b""

        buffer = io.StringIO()
        writer = csv.DictWriter(
            buffer,
            fieldnames=headers,
            delimiter=opts.delimiter,
            extrasaction="ignore",
        )
        writer.writeheader()
        writer.writerows(rows)

        payload = buffer.getvalue().encode(opts.encoding)

        # Prepend the UTF-8 BOM only when the encoding really is UTF-8
        # (covers spellings like "utf-8", "UTF8"); Excel uses it to detect
        # UTF-8 CSV files.
        is_utf8 = opts.encoding.lower().replace("-", "") == "utf8"
        return b"\xef\xbb\xbf" + payload if (opts.bom and is_utf8) else payload
@@ -0,0 +1,97 @@
1
+ """Excel export format implementation using openpyxl."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ from typing import Any
7
+
8
+ from data_export.formats.base import BaseExportFormat, ExportOptions
9
+
10
# Optional dependency: Excel support needs openpyxl (the "excel" extra).
# HAS_OPENPYXL gates ExcelExportFormat.__init__ so a missing install fails
# with a helpful message instead of an unguarded ImportError.
try:
    import openpyxl
    from openpyxl.utils import get_column_letter

    HAS_OPENPYXL = True
except ImportError:
    HAS_OPENPYXL = False
17
+
18
+
19
class ExcelExportFormat(BaseExportFormat):
    """Export data as an Excel .xlsx file with header styling and auto-width columns."""

    def __init__(self) -> None:
        if not HAS_OPENPYXL:
            # The distribution is named msaas-data-export (see pyproject.toml),
            # so the install hint must reference that name.
            raise ImportError(
                "openpyxl is required for Excel export. "
                "Install with: pip install msaas-data-export[excel]"
            )

    @property
    def content_type(self) -> str:
        return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

    @property
    def extension(self) -> str:
        return "xlsx"

    def export(self, data: list[dict[str, Any]], options: ExportOptions | None = None) -> bytes:
        """Export data rows to Excel bytes.

        Args:
            data: List of row dictionaries.
            options: Controls column mapping, sheet name, and column widths.

        Returns:
            Excel file content as bytes.
        """
        # Imported lazily so this module stays importable without openpyxl;
        # the constructor guarantees openpyxl is present by the time we get here.
        from openpyxl.styles import Font

        opts = options or ExportOptions()
        transformed, headers = self.apply_columns(data, opts)

        wb = openpyxl.Workbook()
        ws = wb.active
        ws.title = opts.sheet_name

        # Header row in bold so readers can tell headers from data.
        bold_font = Font(bold=True)
        for col_idx, header in enumerate(headers, start=1):
            cell = ws.cell(row=1, column=col_idx, value=header)
            cell.font = bold_font

        # Data rows start at row 2, directly below the header row.
        for row_idx, row in enumerate(transformed, start=2):
            for col_idx, header in enumerate(headers, start=1):
                ws.cell(row=row_idx, column=col_idx, value=row.get(header))

        # Size columns from explicit config or content length.
        self._auto_width(ws, headers, transformed, opts)

        buffer = io.BytesIO()
        wb.save(buffer)
        return buffer.getvalue()

    def _auto_width(
        self,
        ws: Any,
        headers: list[str],
        data: list[dict[str, Any]],
        opts: ExportOptions,
    ) -> None:
        """Set column widths from explicit ColumnConfig.width or content length.

        Explicit widths win; otherwise the width is the longest stringified
        value (sampled from the first 100 rows for performance), padded by 2
        and capped at 50 characters.
        """
        col_configs = {col.display_header: col for col in (opts.columns or [])}

        for col_idx, header in enumerate(headers, start=1):
            config = col_configs.get(header)
            if config and config.width:
                ws.column_dimensions[get_column_letter(col_idx)].width = config.width
                continue

            # Fall back to content-derived width.
            max_len = len(str(header))
            for row in data[:100]:  # Sample first 100 rows for performance
                val = row.get(header)
                if val is not None:
                    max_len = max(max_len, len(str(val)))

            ws.column_dimensions[get_column_letter(col_idx)].width = min(max_len + 2, 50)