aiecs 1.1.0__py3-none-any.whl → 1.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of aiecs might be problematic.
Files changed (81)
  1. aiecs/__init__.py +1 -1
  2. aiecs/aiecs_client.py +1 -1
  3. aiecs/config/config.py +38 -0
  4. aiecs/domain/__init__.py +95 -0
  5. aiecs/domain/community/__init__.py +159 -0
  6. aiecs/domain/community/agent_adapter.py +516 -0
  7. aiecs/domain/community/analytics.py +465 -0
  8. aiecs/domain/community/collaborative_workflow.py +99 -7
  9. aiecs/domain/community/communication_hub.py +649 -0
  10. aiecs/domain/community/community_builder.py +322 -0
  11. aiecs/domain/community/community_integration.py +365 -12
  12. aiecs/domain/community/community_manager.py +481 -5
  13. aiecs/domain/community/decision_engine.py +459 -13
  14. aiecs/domain/community/exceptions.py +238 -0
  15. aiecs/domain/community/models/__init__.py +36 -0
  16. aiecs/domain/community/resource_manager.py +1 -1
  17. aiecs/domain/community/shared_context_manager.py +621 -0
  18. aiecs/domain/context/context_engine.py +37 -33
  19. aiecs/infrastructure/monitoring/__init__.py +22 -0
  20. aiecs/infrastructure/monitoring/global_metrics_manager.py +207 -0
  21. aiecs/infrastructure/persistence/file_storage.py +41 -28
  22. aiecs/llm/__init__.py +44 -7
  23. aiecs/llm/callbacks/__init__.py +12 -0
  24. aiecs/llm/{custom_callbacks.py → callbacks/custom_callbacks.py} +1 -1
  25. aiecs/llm/client_factory.py +23 -6
  26. aiecs/llm/clients/__init__.py +35 -0
  27. aiecs/llm/{base_client.py → clients/base_client.py} +73 -1
  28. aiecs/llm/{googleai_client.py → clients/googleai_client.py} +19 -15
  29. aiecs/llm/{openai_client.py → clients/openai_client.py} +9 -14
  30. aiecs/llm/{vertex_client.py → clients/vertex_client.py} +15 -15
  31. aiecs/llm/{xai_client.py → clients/xai_client.py} +36 -50
  32. aiecs/llm/config/__init__.py +54 -0
  33. aiecs/llm/config/config_loader.py +275 -0
  34. aiecs/llm/config/config_validator.py +237 -0
  35. aiecs/llm/config/model_config.py +132 -0
  36. aiecs/llm/utils/__init__.py +11 -0
  37. aiecs/llm/utils/validate_config.py +91 -0
  38. aiecs/main.py +32 -2
  39. aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
  40. aiecs/scripts/aid/__init__.py +15 -0
  41. aiecs/scripts/aid/version_manager.py +224 -0
  42. aiecs/scripts/dependance_check/download_nlp_data.py +1 -0
  43. aiecs/tools/__init__.py +23 -23
  44. aiecs/tools/docs/__init__.py +5 -2
  45. aiecs/tools/docs/ai_document_orchestrator.py +39 -26
  46. aiecs/tools/docs/ai_document_writer_orchestrator.py +61 -38
  47. aiecs/tools/docs/content_insertion_tool.py +48 -28
  48. aiecs/tools/docs/document_creator_tool.py +47 -29
  49. aiecs/tools/docs/document_layout_tool.py +35 -20
  50. aiecs/tools/docs/document_parser_tool.py +56 -36
  51. aiecs/tools/docs/document_writer_tool.py +115 -62
  52. aiecs/tools/schema_generator.py +56 -56
  53. aiecs/tools/statistics/__init__.py +82 -0
  54. aiecs/tools/statistics/ai_data_analysis_orchestrator.py +581 -0
  55. aiecs/tools/statistics/ai_insight_generator_tool.py +473 -0
  56. aiecs/tools/statistics/ai_report_orchestrator_tool.py +629 -0
  57. aiecs/tools/statistics/data_loader_tool.py +518 -0
  58. aiecs/tools/statistics/data_profiler_tool.py +599 -0
  59. aiecs/tools/statistics/data_transformer_tool.py +531 -0
  60. aiecs/tools/statistics/data_visualizer_tool.py +460 -0
  61. aiecs/tools/statistics/model_trainer_tool.py +470 -0
  62. aiecs/tools/statistics/statistical_analyzer_tool.py +426 -0
  63. aiecs/tools/task_tools/chart_tool.py +2 -1
  64. aiecs/tools/task_tools/image_tool.py +43 -43
  65. aiecs/tools/task_tools/office_tool.py +39 -36
  66. aiecs/tools/task_tools/pandas_tool.py +37 -33
  67. aiecs/tools/task_tools/report_tool.py +67 -56
  68. aiecs/tools/task_tools/research_tool.py +32 -31
  69. aiecs/tools/task_tools/scraper_tool.py +53 -46
  70. aiecs/tools/task_tools/search_tool.py +1123 -0
  71. aiecs/tools/task_tools/stats_tool.py +20 -15
  72. aiecs/tools/tool_executor/__init__.py +2 -2
  73. aiecs/tools/tool_executor/tool_executor.py +3 -3
  74. {aiecs-1.1.0.dist-info → aiecs-1.2.1.dist-info}/METADATA +5 -1
  75. aiecs-1.2.1.dist-info/RECORD +144 -0
  76. {aiecs-1.1.0.dist-info → aiecs-1.2.1.dist-info}/entry_points.txt +1 -0
  77. aiecs/tools/task_tools/search_api.py +0 -7
  78. aiecs-1.1.0.dist-info/RECORD +0 -114
  79. {aiecs-1.1.0.dist-info → aiecs-1.2.1.dist-info}/WHEEL +0 -0
  80. {aiecs-1.1.0.dist-info → aiecs-1.2.1.dist-info}/licenses/LICENSE +0 -0
  81. {aiecs-1.1.0.dist-info → aiecs-1.2.1.dist-info}/top_level.txt +0 -0
aiecs/tools/statistics/data_loader_tool.py
@@ -0,0 +1,518 @@
+"""
+Data Loader Tool - Universal data loading from multiple file formats
+
+This tool provides comprehensive data loading capabilities with:
+- Auto-detection of file formats
+- Multiple loading strategies (full, streaming, chunked, lazy)
+- Data quality validation on load
+- Schema inference and validation
+- Support for CSV, Excel, JSON, Parquet, and other formats
+"""
+
+import os
+import logging
+from typing import Dict, Any, List, Optional, Union, Iterator
+from enum import Enum
+from pathlib import Path
+
+import pandas as pd
+import numpy as np
+from pydantic import BaseModel, Field, ValidationError, ConfigDict
+
+from aiecs.tools.base_tool import BaseTool
+from aiecs.tools import register_tool
+
+
+class DataSourceType(str, Enum):
+    """Supported data source types"""
+    CSV = "csv"
+    EXCEL = "excel"
+    JSON = "json"
+    PARQUET = "parquet"
+    FEATHER = "feather"
+    HDF5 = "hdf5"
+    STATA = "stata"
+    SAS = "sas"
+    SPSS = "spss"
+    AUTO = "auto"
+
+
+class LoadStrategy(str, Enum):
+    """Data loading strategies"""
+    FULL_LOAD = "full_load"
+    STREAMING = "streaming"
+    CHUNKED = "chunked"
+    LAZY = "lazy"
+    INCREMENTAL = "incremental"
+
+
+
+
+class DataLoaderError(Exception):
+    """Base exception for DataLoader errors"""
+    pass
+
+
+class FileFormatError(DataLoaderError):
+    """Raised when file format is unsupported or invalid"""
+    pass
+
+
+class SchemaValidationError(DataLoaderError):
+    """Raised when schema validation fails"""
+    pass
+
+
+class DataQualityError(DataLoaderError):
+    """Raised when data quality issues are detected"""
+    pass
+
+
+@register_tool('data_loader')
+class DataLoaderTool(BaseTool):
+    """
+    Universal data loading tool that can:
+    1. Load data from multiple file formats
+    2. Auto-detect data formats and schemas
+    3. Handle large datasets with streaming
+    4. Validate data quality on load
+
+    Integrates with pandas_tool for core data operations.
+    """
+
+    # Configuration schema
+    class Config(BaseModel):
+        """Configuration for the data loader tool"""
+        model_config = ConfigDict(env_prefix="DATA_LOADER_")
+
+        max_file_size_mb: int = Field(
+            default=500,
+            description="Maximum file size in megabytes"
+        )
+        default_chunk_size: int = Field(
+            default=10000,
+            description="Default chunk size for chunked loading"
+        )
+        max_memory_usage_mb: int = Field(
+            default=2000,
+            description="Maximum memory usage in megabytes"
+        )
+        enable_schema_inference: bool = Field(
+            default=True,
+            description="Whether to enable automatic schema inference"
+        )
+        enable_quality_validation: bool = Field(
+            default=True,
+            description="Whether to enable data quality validation"
+        )
+        default_encoding: str = Field(
+            default="utf-8",
+            description="Default text encoding for file operations"
+        )
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """
+        Initialize DataLoaderTool with settings.
+
+        Args:
+            config: Optional configuration overrides
+        """
+        super().__init__(config)
+
+        # Parse configuration
+        self.config = self.Config(**(config or {}))
+
+        self.logger = logging.getLogger(__name__)
+        if not self.logger.handlers:
+            handler = logging.StreamHandler()
+            handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
+            self.logger.addHandler(handler)
+        self.logger.setLevel(logging.INFO)
+
+        # Initialize external tools
+        self._init_external_tools()
+
+    def _init_external_tools(self):
+        """Initialize external task tools"""
+        self.external_tools = {}
+
+        # Initialize PandasTool for data operations
+        try:
+            from aiecs.tools.task_tools.pandas_tool import PandasTool
+            self.external_tools['pandas'] = PandasTool()
+            self.logger.info("PandasTool initialized successfully")
+        except ImportError:
+            self.logger.warning("PandasTool not available")
+            self.external_tools['pandas'] = None
+
+    # Schema definitions
+    class LoadDataSchema(BaseModel):
+        """Schema for load_data operation"""
+        source: str = Field(description="Path to data source file")
+        source_type: Optional[DataSourceType] = Field(default=DataSourceType.AUTO, description="Data source type")
+        strategy: LoadStrategy = Field(default=LoadStrategy.FULL_LOAD, description="Loading strategy")
+        schema: Optional[Dict[str, Any]] = Field(default=None, description="Expected schema for validation")
+        validation_rules: Optional[Dict[str, Any]] = Field(default=None, description="Data quality validation rules")
+        nrows: Optional[int] = Field(default=None, description="Number of rows to load")
+        chunk_size: Optional[int] = Field(default=None, description="Chunk size for chunked loading")
+        encoding: Optional[str] = Field(default=None, description="File encoding")
+
+    class DetectFormatSchema(BaseModel):
+        """Schema for detect_format operation"""
+        source: str = Field(description="Path to data source file")
+
+    class ValidateSchemaSchema(BaseModel):
+        """Schema for validate_schema operation"""
+        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data to validate")
+        schema: Dict[str, Any] = Field(description="Expected schema")
+
+    class StreamDataSchema(BaseModel):
+        """Schema for stream_data operation"""
+        source: str = Field(description="Path to data source file")
+        chunk_size: int = Field(default=10000, description="Chunk size for streaming")
+        source_type: Optional[DataSourceType] = Field(default=DataSourceType.AUTO, description="Data source type")
+
+    def load_data(
+        self,
+        source: str,
+        source_type: DataSourceType = DataSourceType.AUTO,
+        strategy: LoadStrategy = LoadStrategy.FULL_LOAD,
+        schema: Optional[Dict[str, Any]] = None,
+        validation_rules: Optional[Dict[str, Any]] = None,
+        nrows: Optional[int] = None,
+        chunk_size: Optional[int] = None,
+        encoding: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """
+        Load data from source with automatic format detection.
+
+        Args:
+            source: Path to data source file
+            source_type: Type of data source (auto-detected if not specified)
+            strategy: Loading strategy to use
+            schema: Expected schema for validation
+            validation_rules: Data quality validation rules
+            nrows: Number of rows to load (None for all)
+            chunk_size: Chunk size for chunked loading
+            encoding: File encoding
+
+        Returns:
+            Dict containing:
+            - data: Loaded DataFrame or data structure
+            - metadata: Metadata about loaded data
+            - quality_report: Quality assessment results
+
+        Raises:
+            DataLoaderError: If loading fails
+            FileFormatError: If format is unsupported
+        """
+        try:
+            # Validate source exists
+            if not os.path.exists(source):
+                raise DataLoaderError(f"Source file not found: {source}")
+
+            # Detect format if auto
+            if source_type == DataSourceType.AUTO:
+                source_type = self._detect_format(source)
+
+            # Check file size
+            file_size_mb = os.path.getsize(source) / (1024 * 1024)
+            if file_size_mb > self.config.max_file_size_mb:
+                self.logger.warning(f"File size {file_size_mb:.2f}MB exceeds recommended limit")
+
+            # Load data based on strategy
+            if strategy == LoadStrategy.FULL_LOAD:
+                data = self._load_full(source, source_type, nrows, encoding)
+            elif strategy == LoadStrategy.CHUNKED:
+                data = self._load_chunked(source, source_type, chunk_size or self.config.default_chunk_size, encoding)
+            elif strategy == LoadStrategy.STREAMING:
+                data = self._load_streaming(source, source_type, chunk_size or self.config.default_chunk_size, encoding)
+            elif strategy == LoadStrategy.LAZY:
+                data = self._load_lazy(source, source_type, encoding)
+            else:
+                raise DataLoaderError(f"Unsupported loading strategy: {strategy}")
+
+            # Generate metadata
+            metadata = self._generate_metadata(data, source, source_type)
+
+            # Validate schema if provided
+            if schema and self.config.enable_schema_inference:
+                schema_valid = self._validate_schema_internal(data, schema)
+                metadata['schema_valid'] = schema_valid
+
+            # Validate quality if enabled
+            quality_report = {}
+            if self.config.enable_quality_validation and isinstance(data, pd.DataFrame):
+                quality_report = self._validate_quality(data, validation_rules)
+
+            self.logger.info(f"Successfully loaded data from {source}")
+
+            return {
+                'data': data,
+                'metadata': metadata,
+                'quality_report': quality_report,
+                'source': source,
+                'source_type': source_type.value,
+                'strategy': strategy.value
+            }
+
+        except Exception as e:
+            self.logger.error(f"Error loading data from {source}: {e}")
+            raise DataLoaderError(f"Failed to load data: {e}")
+
+    def detect_format(self, source: str) -> Dict[str, Any]:
+        """
+        Detect file format from source.
+
+        Args:
+            source: Path to data source file
+
+        Returns:
+            Dict containing detected format information
+        """
+        try:
+            detected_type = self._detect_format(source)
+
+            return {
+                'source': source,
+                'detected_type': detected_type.value,
+                'file_extension': Path(source).suffix.lower(),
+                'confidence': 'high'
+            }
+        except Exception as e:
+            self.logger.error(f"Error detecting format: {e}")
+            raise FileFormatError(f"Failed to detect format: {e}")
+
+    def validate_schema(self, data: Union[Dict[str, Any], List[Dict[str, Any]]], schema: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Validate data against expected schema.
+
+        Args:
+            data: Data to validate
+            schema: Expected schema definition
+
+        Returns:
+            Dict containing validation results
+        """
+        try:
+            # Convert to DataFrame if needed
+            if isinstance(data, list):
+                df = pd.DataFrame(data)
+            elif isinstance(data, dict):
+                df = pd.DataFrame([data])
+            else:
+                df = data
+
+            is_valid = self._validate_schema_internal(df, schema)
+
+            issues = []
+            if not is_valid:
+                # Check column presence
+                expected_columns = set(schema.get('columns', {}).keys())
+                actual_columns = set(df.columns)
+                missing = expected_columns - actual_columns
+                extra = actual_columns - expected_columns
+
+                if missing:
+                    issues.append(f"Missing columns: {missing}")
+                if extra:
+                    issues.append(f"Extra columns: {extra}")
+
+            return {
+                'valid': is_valid,
+                'issues': issues,
+                'expected_columns': list(schema.get('columns', {}).keys()),
+                'actual_columns': list(df.columns)
+            }
+
+        except Exception as e:
+            self.logger.error(f"Error validating schema: {e}")
+            raise SchemaValidationError(f"Schema validation failed: {e}")
+
+    def stream_data(
+        self,
+        source: str,
+        chunk_size: int = 10000,
+        source_type: DataSourceType = DataSourceType.AUTO
+    ) -> Dict[str, Any]:
+        """
+        Stream data in chunks for large files.
+
+        Args:
+            source: Path to data source file
+            chunk_size: Size of each chunk
+            source_type: Type of data source
+
+        Returns:
+            Dict containing streaming iterator information
+        """
+        try:
+            if source_type == DataSourceType.AUTO:
+                source_type = self._detect_format(source)
+
+            # Create iterator based on format
+            if source_type == DataSourceType.CSV:
+                iterator = pd.read_csv(source, chunksize=chunk_size)
+            elif source_type == DataSourceType.JSON:
+                iterator = pd.read_json(source, lines=True, chunksize=chunk_size)
+            else:
+                raise FileFormatError(f"Streaming not supported for format: {source_type}")
+
+            return {
+                'iterator': iterator,
+                'chunk_size': chunk_size,
+                'source_type': source_type.value,
+                'message': 'Streaming iterator created successfully'
+            }
+
+        except Exception as e:
+            self.logger.error(f"Error creating stream: {e}")
+            raise DataLoaderError(f"Failed to create stream: {e}")
+
+    # Internal helper methods
+
+    def _detect_format(self, source: str) -> DataSourceType:
+        """Detect file format from extension"""
+        ext = Path(source).suffix.lower()
+
+        format_map = {
+            '.csv': DataSourceType.CSV,
+            '.xlsx': DataSourceType.EXCEL,
+            '.xls': DataSourceType.EXCEL,
+            '.json': DataSourceType.JSON,
+            '.parquet': DataSourceType.PARQUET,
+            '.feather': DataSourceType.FEATHER,
+            '.h5': DataSourceType.HDF5,
+            '.hdf': DataSourceType.HDF5,
+            '.dta': DataSourceType.STATA,
+            '.sas7bdat': DataSourceType.SAS,
+            '.sav': DataSourceType.SPSS
+        }
+
+        detected = format_map.get(ext)
+        if not detected:
+            raise FileFormatError(f"Unsupported file format: {ext}")
+
+        return detected
+
+    def _load_full(self, source: str, source_type: DataSourceType, nrows: Optional[int], encoding: Optional[str]) -> pd.DataFrame:
+        """Load entire dataset into memory"""
+        encoding = encoding or self.config.default_encoding
+
+        if source_type == DataSourceType.CSV:
+            return pd.read_csv(source, nrows=nrows, encoding=encoding)
+        elif source_type == DataSourceType.EXCEL:
+            return pd.read_excel(source, nrows=nrows)
+        elif source_type == DataSourceType.JSON:
+            return pd.read_json(source, nrows=nrows, encoding=encoding)
+        elif source_type == DataSourceType.PARQUET:
+            return pd.read_parquet(source)
+        elif source_type == DataSourceType.FEATHER:
+            return pd.read_feather(source)
+        elif source_type == DataSourceType.HDF5:
+            return pd.read_hdf(source)
+        elif source_type == DataSourceType.STATA:
+            df = pd.read_stata(source)
+            if nrows:
+                return df.head(nrows)
+            return df
+        elif source_type == DataSourceType.SAS:
+            return pd.read_sas(source)
+        elif source_type == DataSourceType.SPSS:
+            try:
+                import pyreadstat
+                df, meta = pyreadstat.read_sav(source)
+                return df
+            except ImportError:
+                raise DataLoaderError("pyreadstat required for SPSS files")
+        else:
+            raise FileFormatError(f"Unsupported format for full load: {source_type}")
+
+    def _load_chunked(self, source: str, source_type: DataSourceType, chunk_size: int, encoding: Optional[str]) -> pd.DataFrame:
+        """Load data in chunks and combine"""
+        encoding = encoding or self.config.default_encoding
+        chunks = []
+
+        if source_type == DataSourceType.CSV:
+            for chunk in pd.read_csv(source, chunksize=chunk_size, encoding=encoding):
+                chunks.append(chunk)
+        elif source_type == DataSourceType.JSON:
+            for chunk in pd.read_json(source, lines=True, chunksize=chunk_size, encoding=encoding):
+                chunks.append(chunk)
+        else:
+            raise FileFormatError(f"Chunked loading not supported for: {source_type}")
+
+        return pd.concat(chunks, ignore_index=True)
+
+    def _load_streaming(self, source: str, source_type: DataSourceType, chunk_size: int, encoding: Optional[str]) -> Iterator[pd.DataFrame]:
+        """Create streaming iterator"""
+        encoding = encoding or self.config.default_encoding
+
+        if source_type == DataSourceType.CSV:
+            return pd.read_csv(source, chunksize=chunk_size, encoding=encoding)
+        elif source_type == DataSourceType.JSON:
+            return pd.read_json(source, lines=True, chunksize=chunk_size, encoding=encoding)
+        else:
+            raise FileFormatError(f"Streaming not supported for: {source_type}")
+
+    def _load_lazy(self, source: str, source_type: DataSourceType, encoding: Optional[str]) -> Any:
+        """Create lazy loading wrapper"""
+        # For now, return full load with warning
+        self.logger.warning("Lazy loading not fully implemented, using full load")
+        return self._load_full(source, source_type, None, encoding)
+
+    def _generate_metadata(self, data: Any, source: str, source_type: DataSourceType) -> Dict[str, Any]:
+        """Generate metadata about loaded data"""
+        if isinstance(data, pd.DataFrame):
+            return {
+                'rows': len(data),
+                'columns': len(data.columns),
+                'column_names': list(data.columns),
+                'dtypes': {col: str(dtype) for col, dtype in data.dtypes.items()},
+                'memory_usage_mb': data.memory_usage(deep=True).sum() / (1024 * 1024),
+                'file_size_mb': os.path.getsize(source) / (1024 * 1024)
+            }
+        else:
+            return {
+                'type': str(type(data)),
+                'file_size_mb': os.path.getsize(source) / (1024 * 1024)
+            }
+
+    def _validate_schema_internal(self, data: pd.DataFrame, schema: Dict[str, Any]) -> bool:
+        """Internal schema validation"""
+        if 'columns' not in schema:
+            return True
+
+        expected_columns = set(schema['columns'].keys())
+        actual_columns = set(data.columns)
+
+        return expected_columns.issubset(actual_columns)
+
+    def _validate_quality(self, data: pd.DataFrame, validation_rules: Optional[Dict[str, Any]]) -> Dict[str, Any]:
+        """Validate data quality"""
+        quality_report = {
+            'total_rows': len(data),
+            'total_columns': len(data.columns),
+            'missing_values': data.isnull().sum().to_dict(),
+            'duplicate_rows': data.duplicated().sum(),
+            'quality_score': 1.0
+        }
+
+        # Calculate quality score
+        missing_ratio = data.isnull().sum().sum() / (len(data) * len(data.columns)) if len(data) > 0 else 0
+        duplicate_ratio = quality_report['duplicate_rows'] / len(data) if len(data) > 0 else 0
+
+        quality_score = 1.0 - (missing_ratio * 0.5 + duplicate_ratio * 0.5)
+        quality_report['quality_score'] = max(0.0, min(1.0, quality_score))
+
+        # Add issues list
+        issues = []
+        if missing_ratio > 0.1:
+            issues.append(f"High missing value ratio: {missing_ratio:.2%}")
+        if duplicate_ratio > 0.05:
+            issues.append(f"High duplicate ratio: {duplicate_ratio:.2%}")
+
+        quality_report['issues'] = issues
+
+        return quality_report
+
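For orientation, the DataLoaderTool added above (registered as 'data_loader') exposes load_data, detect_format, validate_schema, and stream_data. The following is a minimal usage sketch, not taken from the package's documentation: the sales.csv path and its column names are hypothetical, and the import path simply mirrors the file location shown in this diff.

from aiecs.tools.statistics.data_loader_tool import (
    DataLoaderTool,
    DataSourceType,
    LoadStrategy,
)

# Instantiate with a config override; options not passed fall back to the
# Config defaults in the diff (max_file_size_mb=500, default_chunk_size=10000, ...).
loader = DataLoaderTool(config={"default_encoding": "utf-8"})

# Full load with auto-detected format and a column-presence schema check.
result = loader.load_data(
    source="sales.csv",  # hypothetical input file
    source_type=DataSourceType.AUTO,
    strategy=LoadStrategy.FULL_LOAD,
    schema={"columns": {"date": "str", "amount": "float"}},  # hypothetical columns
)
df = result["data"]
print(result["metadata"]["rows"], result["quality_report"].get("quality_score"))

# Chunked reading for large files; stream_data returns a pandas reader
# under the 'iterator' key.
stream = loader.stream_data(source="sales.csv", chunk_size=5000)
rows = 0
for chunk in stream["iterator"]:
    rows += len(chunk)

Note that, per the code above, stream_data only handles CSV and line-delimited JSON; other formats raise FileFormatError.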