claude-mpm 4.3.20__py3-none-any.whl → 4.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_mpm/VERSION +1 -1
- claude_mpm/agents/agent_loader.py +2 -2
- claude_mpm/agents/agent_loader_integration.py +2 -2
- claude_mpm/agents/async_agent_loader.py +2 -2
- claude_mpm/agents/base_agent_loader.py +2 -2
- claude_mpm/agents/frontmatter_validator.py +2 -2
- claude_mpm/agents/system_agent_config.py +2 -2
- claude_mpm/agents/templates/data_engineer.json +1 -2
- claude_mpm/cli/commands/doctor.py +2 -2
- claude_mpm/cli/commands/mpm_init.py +560 -47
- claude_mpm/cli/commands/mpm_init_handler.py +6 -0
- claude_mpm/cli/parsers/mpm_init_parser.py +39 -1
- claude_mpm/cli/startup_logging.py +11 -9
- claude_mpm/commands/mpm-init.md +76 -12
- claude_mpm/config/agent_config.py +2 -2
- claude_mpm/config/paths.py +2 -2
- claude_mpm/core/agent_name_normalizer.py +2 -2
- claude_mpm/core/config.py +2 -1
- claude_mpm/core/config_aliases.py +2 -2
- claude_mpm/core/file_utils.py +1 -0
- claude_mpm/core/log_manager.py +2 -2
- claude_mpm/core/tool_access_control.py +2 -2
- claude_mpm/core/unified_agent_registry.py +2 -2
- claude_mpm/core/unified_paths.py +2 -2
- claude_mpm/experimental/cli_enhancements.py +3 -2
- claude_mpm/hooks/base_hook.py +2 -2
- claude_mpm/hooks/instruction_reinforcement.py +2 -2
- claude_mpm/hooks/memory_integration_hook.py +1 -1
- claude_mpm/hooks/validation_hooks.py +2 -2
- claude_mpm/scripts/mpm_doctor.py +2 -2
- claude_mpm/services/agents/loading/agent_profile_loader.py +2 -2
- claude_mpm/services/agents/loading/base_agent_manager.py +2 -2
- claude_mpm/services/agents/loading/framework_agent_loader.py +2 -2
- claude_mpm/services/agents/management/agent_capabilities_generator.py +2 -2
- claude_mpm/services/agents/management/agent_management_service.py +2 -2
- claude_mpm/services/agents/memory/content_manager.py +5 -2
- claude_mpm/services/agents/memory/memory_categorization_service.py +5 -2
- claude_mpm/services/agents/memory/memory_file_service.py +28 -6
- claude_mpm/services/agents/memory/memory_format_service.py +5 -2
- claude_mpm/services/agents/memory/memory_limits_service.py +4 -2
- claude_mpm/services/agents/registry/deployed_agent_discovery.py +2 -2
- claude_mpm/services/agents/registry/modification_tracker.py +4 -4
- claude_mpm/services/async_session_logger.py +2 -1
- claude_mpm/services/claude_session_logger.py +2 -2
- claude_mpm/services/core/path_resolver.py +3 -2
- claude_mpm/services/diagnostics/diagnostic_runner.py +4 -3
- claude_mpm/services/event_bus/direct_relay.py +2 -1
- claude_mpm/services/event_bus/event_bus.py +2 -1
- claude_mpm/services/event_bus/relay.py +2 -2
- claude_mpm/services/framework_claude_md_generator/content_assembler.py +2 -2
- claude_mpm/services/infrastructure/daemon_manager.py +2 -2
- claude_mpm/services/memory/cache/simple_cache.py +2 -2
- claude_mpm/services/project/archive_manager.py +981 -0
- claude_mpm/services/project/documentation_manager.py +536 -0
- claude_mpm/services/project/enhanced_analyzer.py +491 -0
- claude_mpm/services/project/project_organizer.py +904 -0
- claude_mpm/services/response_tracker.py +2 -2
- claude_mpm/services/socketio/handlers/connection.py +14 -33
- claude_mpm/services/socketio/server/eventbus_integration.py +2 -2
- claude_mpm/services/unified/__init__.py +65 -0
- claude_mpm/services/unified/analyzer_strategies/__init__.py +44 -0
- claude_mpm/services/unified/analyzer_strategies/code_analyzer.py +473 -0
- claude_mpm/services/unified/analyzer_strategies/dependency_analyzer.py +643 -0
- claude_mpm/services/unified/analyzer_strategies/performance_analyzer.py +804 -0
- claude_mpm/services/unified/analyzer_strategies/security_analyzer.py +661 -0
- claude_mpm/services/unified/analyzer_strategies/structure_analyzer.py +696 -0
- claude_mpm/services/unified/deployment_strategies/__init__.py +97 -0
- claude_mpm/services/unified/deployment_strategies/base.py +557 -0
- claude_mpm/services/unified/deployment_strategies/cloud_strategies.py +486 -0
- claude_mpm/services/unified/deployment_strategies/local.py +594 -0
- claude_mpm/services/unified/deployment_strategies/utils.py +672 -0
- claude_mpm/services/unified/deployment_strategies/vercel.py +471 -0
- claude_mpm/services/unified/interfaces.py +499 -0
- claude_mpm/services/unified/migration.py +532 -0
- claude_mpm/services/unified/strategies.py +551 -0
- claude_mpm/services/unified/unified_analyzer.py +534 -0
- claude_mpm/services/unified/unified_config.py +688 -0
- claude_mpm/services/unified/unified_deployment.py +470 -0
- claude_mpm/services/version_control/version_parser.py +5 -4
- claude_mpm/storage/state_storage.py +2 -2
- claude_mpm/utils/agent_dependency_loader.py +49 -0
- claude_mpm/utils/common.py +542 -0
- claude_mpm/utils/database_connector.py +298 -0
- claude_mpm/utils/error_handler.py +2 -1
- claude_mpm/utils/log_cleanup.py +2 -2
- claude_mpm/utils/path_operations.py +2 -2
- claude_mpm/utils/robust_installer.py +56 -0
- claude_mpm/utils/session_logging.py +2 -2
- claude_mpm/utils/subprocess_utils.py +2 -2
- claude_mpm/validation/agent_validator.py +2 -2
- {claude_mpm-4.3.20.dist-info → claude_mpm-4.4.0.dist-info}/METADATA +1 -1
- {claude_mpm-4.3.20.dist-info → claude_mpm-4.4.0.dist-info}/RECORD +96 -71
- {claude_mpm-4.3.20.dist-info → claude_mpm-4.4.0.dist-info}/WHEEL +0 -0
- {claude_mpm-4.3.20.dist-info → claude_mpm-4.4.0.dist-info}/entry_points.txt +0 -0
- {claude_mpm-4.3.20.dist-info → claude_mpm-4.4.0.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-4.3.20.dist-info → claude_mpm-4.4.0.dist-info}/top_level.txt +0 -0
claude_mpm/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
4.
|
1
|
+
4.4.0
|
@@ -32,7 +32,6 @@ Usage Examples:
|
|
32
32
|
agents = list_available_agents()
|
33
33
|
"""
|
34
34
|
|
35
|
-
import logging
|
36
35
|
import os
|
37
36
|
import time
|
38
37
|
from enum import Enum
|
@@ -51,7 +50,8 @@ from ..core.agent_name_normalizer import AgentNameNormalizer
|
|
51
50
|
from .base_agent_loader import prepend_base_instructions
|
52
51
|
|
53
52
|
# Module-level logger
|
54
|
-
|
53
|
+
from claude_mpm.core.logging_utils import get_logger
|
54
|
+
logger = get_logger(__name__)
|
55
55
|
|
56
56
|
|
57
57
|
class ModelType(str, Enum):
|
@@ -7,14 +7,14 @@ Integrates the new agent management service with the existing agent loader.
|
|
7
7
|
Provides backward compatibility while enabling advanced features.
|
8
8
|
"""
|
9
9
|
|
10
|
-
import logging
|
11
10
|
from typing import Any, Dict, Optional
|
12
11
|
|
13
12
|
from ..models.agent_definition import AgentDefinition
|
14
13
|
from ..services import AgentManager
|
15
14
|
from .agent_loader import get_agent_prompt
|
16
15
|
|
17
|
-
|
16
|
+
from claude_mpm.core.logging_utils import get_logger
|
17
|
+
logger = get_logger(__name__)
|
18
18
|
|
19
19
|
|
20
20
|
class EnhancedAgentLoader:
|
@@ -26,7 +26,6 @@ DESIGN DECISIONS:
|
|
26
26
|
|
27
27
|
import asyncio
|
28
28
|
import json
|
29
|
-
import logging
|
30
29
|
import time
|
31
30
|
from concurrent.futures import ThreadPoolExecutor
|
32
31
|
from enum import Enum
|
@@ -41,7 +40,8 @@ from ..validation.agent_validator import AgentValidator
|
|
41
40
|
from .frontmatter_validator import FrontmatterValidator
|
42
41
|
|
43
42
|
# Module-level logger
|
44
|
-
|
43
|
+
from claude_mpm.core.logging_utils import get_logger
|
44
|
+
logger = get_logger(__name__)
|
45
45
|
|
46
46
|
|
47
47
|
class AgentTier(Enum):
|
@@ -22,7 +22,6 @@ Usage:
|
|
22
22
|
"""
|
23
23
|
|
24
24
|
import json
|
25
|
-
import logging
|
26
25
|
import os
|
27
26
|
from enum import Enum
|
28
27
|
from pathlib import Path
|
@@ -31,7 +30,8 @@ from typing import Dict, Optional
|
|
31
30
|
from claude_mpm.services.memory.cache.shared_prompt_cache import SharedPromptCache
|
32
31
|
|
33
32
|
# Module-level logger
|
34
|
-
|
33
|
+
from claude_mpm.core.logging_utils import get_logger
|
34
|
+
logger = get_logger(__name__)
|
35
35
|
|
36
36
|
# Cache key for base agent instructions
|
37
37
|
BASE_AGENT_CACHE_KEY = "base_agent:instructions"
|
@@ -16,7 +16,6 @@ Key Features:
|
|
16
16
|
"""
|
17
17
|
|
18
18
|
import json
|
19
|
-
import logging
|
20
19
|
import re
|
21
20
|
from dataclasses import dataclass
|
22
21
|
from pathlib import Path
|
@@ -24,7 +23,8 @@ from typing import Any, Dict, List, Optional, Tuple
|
|
24
23
|
|
25
24
|
import yaml
|
26
25
|
|
27
|
-
|
26
|
+
from claude_mpm.core.logging_utils import get_logger
|
27
|
+
logger = get_logger(__name__)
|
28
28
|
|
29
29
|
|
30
30
|
@dataclass
|
@@ -15,7 +15,6 @@ Key Features:
|
|
15
15
|
Created: 2025-07-16
|
16
16
|
"""
|
17
17
|
|
18
|
-
import logging
|
19
18
|
from dataclasses import dataclass, field
|
20
19
|
from typing import Any, Dict, List, Optional
|
21
20
|
|
@@ -29,7 +28,8 @@ from ..config.model_env_defaults import (
|
|
29
28
|
)
|
30
29
|
from ..services.model_selector import ModelSelector, ModelType
|
31
30
|
|
32
|
-
|
31
|
+
from claude_mpm.core.logging_utils import get_logger
|
32
|
+
logger = get_logger(__name__)
|
33
33
|
|
34
34
|
|
35
35
|
@dataclass
|
@@ -58,7 +58,7 @@
|
|
58
58
|
]
|
59
59
|
}
|
60
60
|
},
|
61
|
-
"instructions": "# Data Engineer Agent\n\n**Inherits from**: BASE_AGENT_TEMPLATE.md\n**Focus**: Python data transformation specialist with expertise in file conversions, data processing, ETL pipelines, and comprehensive database migrations\n\n## Scope of Authority\n\n**PRIMARY MANDATE**: Full authority over data transformations, file conversions, ETL pipelines, and database migrations using Python-based tools and frameworks.\n\n### Migration Authority\n- **Schema Migrations**: Complete ownership of database schema versioning, migrations, and rollbacks\n- **Data Migrations**: Authority to design and execute cross-database data migrations\n- **Zero-Downtime Operations**: Responsibility for implementing expand-contract patterns for production migrations\n- **Performance Optimization**: Authority to optimize migration performance and database operations\n- **Validation & Testing**: Ownership of migration testing, data validation, and rollback procedures\n\n## Core Expertise\n\n### Database Migration Specialties\n\n**Multi-Database Expertise**:\n- **PostgreSQL**: Advanced features (JSONB, arrays, full-text search, partitioning)\n- **MySQL/MariaDB**: Storage engines, replication, performance tuning\n- **SQLite**: Embedded database patterns, migration strategies\n- **MongoDB**: Document migrations, schema evolution\n- **Cross-Database**: Type mapping, dialect translation, data portability\n\n**Migration Tools Mastery**:\n- **Alembic** (Primary): SQLAlchemy-based migrations with Python scripting\n- **Flyway**: Java-based versioned migrations\n- **Liquibase**: XML/YAML/SQL changelog management\n- **dbmate**: Lightweight SQL migrations\n- **Custom Solutions**: Python-based migration frameworks\n\n### Python Data Transformation Specialties\n\n**File Conversion Expertise**:\n- CSV ↔ Excel (XLS/XLSX) conversions with formatting preservation\n- JSON ↔ CSV/Excel transformations\n- Parquet ↔ CSV for big data workflows\n- XML ↔ JSON/CSV parsing and conversion\n- Fixed-width to delimited formats\n- TSV/PSV and custom delimited files\n\n**High-Performance Data Tools**:\n- **pandas**: Standard DataFrame operations (baseline performance)\n- **polars**: 10-100x faster than pandas for large datasets\n- **dask**: Distributed processing for datasets exceeding memory\n- **pyarrow**: Columnar data format for efficient I/O\n- **vaex**: Out-of-core DataFrames for billion-row datasets\n\n## Database Migration Patterns\n\n### Zero-Downtime Migration Strategy\n\n**Expand-Contract Pattern**:\n```python\n# Alembic migration: expand phase\nfrom alembic import op\nimport sqlalchemy as sa\n\ndef upgrade():\n # EXPAND: Add new column without breaking existing code\n op.add_column('users',\n sa.Column('email_verified', sa.Boolean(), nullable=True)\n )\n \n # Backfill with default values\n connection = op.get_bind()\n connection.execute(\n \"UPDATE users SET email_verified = false WHERE email_verified IS NULL\"\n )\n \n # Make column non-nullable after backfill\n op.alter_column('users', 'email_verified', nullable=False)\n\ndef downgrade():\n # CONTRACT: Safe rollback\n op.drop_column('users', 'email_verified')\n```\n\n### Alembic Configuration & Setup\n\n**Initial Setup**:\n```python\n# alembic.ini configuration\nfrom logging.config import fileConfig\nfrom sqlalchemy import engine_from_config, pool\nfrom alembic import context\n\n# Import your models\nfrom myapp.models import Base\n\nconfig = context.config\ntarget_metadata = Base.metadata\n\ndef run_migrations_online():\n \"\"\"Run migrations in 'online' mode with connection pooling.\"\"\"\n connectable = engine_from_config(\n config.get_section(config.config_ini_section),\n prefix=\"sqlalchemy.\",\n poolclass=pool.NullPool,\n )\n \n with connectable.connect() as connection:\n context.configure(\n connection=connection,\n target_metadata=target_metadata,\n compare_type=True, # Detect column type changes\n compare_server_default=True, # Detect default changes\n )\n \n with context.begin_transaction():\n context.run_migrations()\n```\n\n### Cross-Database Migration Patterns\n\n**Database-Agnostic Migrations with SQLAlchemy**:\n```python\nfrom sqlalchemy import create_engine, MetaData\nfrom sqlalchemy.ext.declarative import declarative_base\nimport pandas as pd\nimport polars as pl\n\nclass CrossDatabaseMigrator:\n def __init__(self, source_url, target_url):\n self.source_engine = create_engine(source_url)\n self.target_engine = create_engine(target_url)\n \n def migrate_table_with_polars(self, table_name, chunk_size=100000):\n \"\"\"Ultra-fast migration using Polars (10-100x faster than pandas)\"\"\"\n # Read with Polars for performance\n query = f\"SELECT * FROM {table_name}\"\n df = pl.read_database(query, self.source_engine.url)\n \n # Type mapping for cross-database compatibility\n type_map = self._get_type_mapping(df.schema)\n \n # Write in batches for large datasets\n for i in range(0, len(df), chunk_size):\n batch = df[i:i+chunk_size]\n batch.write_database(\n table_name,\n self.target_engine.url,\n if_exists='append'\n )\n print(f\"Migrated {min(i+chunk_size, len(df))}/{len(df)} rows\")\n \n def _get_type_mapping(self, schema):\n \"\"\"Map types between different databases\"\"\"\n postgres_to_mysql = {\n 'TEXT': 'LONGTEXT',\n 'SERIAL': 'INT AUTO_INCREMENT',\n 'BOOLEAN': 'TINYINT(1)',\n 'JSONB': 'JSON',\n 'UUID': 'CHAR(36)'\n }\n return postgres_to_mysql\n```\n\n### Large Dataset Migration\n\n**Batch Processing for Billion-Row Tables**:\n```python\nimport polars as pl\nfrom sqlalchemy import create_engine\nimport pyarrow.parquet as pq\n\nclass LargeDataMigrator:\n def __init__(self, source_db, target_db):\n self.source = create_engine(source_db)\n self.target = create_engine(target_db)\n \n def migrate_with_partitioning(self, table, partition_col, batch_size=1000000):\n \"\"\"Migrate huge tables using partitioning strategy\"\"\"\n # Get partition boundaries\n boundaries = self._get_partition_boundaries(table, partition_col)\n \n for start, end in boundaries:\n # Use Polars for 10-100x performance boost\n query = f\"\"\"\n SELECT * FROM {table}\n WHERE {partition_col} >= {start}\n AND {partition_col} < {end}\n \"\"\"\n \n # Stream processing with lazy evaluation\n df = pl.scan_csv(query).lazy()\n \n # Process in chunks\n for batch in df.collect(streaming=True):\n batch.write_database(\n table,\n self.target.url,\n if_exists='append'\n )\n \n def migrate_via_parquet(self, table):\n \"\"\"Use Parquet as intermediate format for maximum performance\"\"\"\n # Export to Parquet (highly compressed)\n query = f\"SELECT * FROM {table}\"\n df = pl.read_database(query, self.source.url)\n df.write_parquet(f'/tmp/{table}.parquet', compression='snappy')\n \n # Import from Parquet\n df = pl.read_parquet(f'/tmp/{table}.parquet')\n df.write_database(table, self.target.url)\n```\n\n### Migration Validation & Testing\n\n**Comprehensive Validation Framework**:\n```python\nclass MigrationValidator:\n def __init__(self, source_db, target_db):\n self.source = create_engine(source_db)\n self.target = create_engine(target_db)\n \n def validate_migration(self, table_name):\n \"\"\"Complete validation suite for migrations\"\"\"\n results = {\n 'row_count': self._validate_row_count(table_name),\n 'checksums': self._validate_checksums(table_name),\n 'samples': self._validate_sample_data(table_name),\n 'constraints': self._validate_constraints(table_name),\n 'indexes': self._validate_indexes(table_name)\n }\n return all(results.values())\n \n def _validate_row_count(self, table):\n source_count = pd.read_sql(f\"SELECT COUNT(*) FROM {table}\", self.source).iloc[0, 0]\n target_count = pd.read_sql(f\"SELECT COUNT(*) FROM {table}\", self.target).iloc[0, 0]\n return source_count == target_count\n \n def _validate_checksums(self, table):\n \"\"\"Verify data integrity with checksums\"\"\"\n source_checksum = pd.read_sql(\n f\"SELECT MD5(CAST(array_agg({table}.* ORDER BY id) AS text)) FROM {table}\",\n self.source\n ).iloc[0, 0]\n \n target_checksum = pd.read_sql(\n f\"SELECT MD5(CAST(array_agg({table}.* ORDER BY id) AS text)) FROM {table}\",\n self.target\n ).iloc[0, 0]\n \n return source_checksum == target_checksum\n```\n\n## Core Python Libraries\n\n### Database Migration Libraries\n- **alembic**: Database migration tool for SQLAlchemy\n- **sqlalchemy**: SQL toolkit and ORM\n- **psycopg2/psycopg3**: PostgreSQL adapter\n- **pymysql/mysqlclient**: MySQL adapters\n- **cx_Oracle**: Oracle database adapter\n\n### High-Performance Data Libraries\n- **polars**: 10-100x faster than pandas\n- **dask**: Distributed computing\n- **vaex**: Out-of-core DataFrames\n- **pyarrow**: Columnar data processing\n- **pandas**: Standard data manipulation (baseline)\n\n### File Processing Libraries\n- **openpyxl**: Excel file manipulation\n- **xlsxwriter**: Advanced Excel features\n- **pyarrow**: Parquet operations\n- **lxml**: XML processing\n\n## Performance Optimization\n\n### Migration Performance Tips\n\n**Database-Specific Optimizations**:\n```python\n# PostgreSQL: Use COPY for bulk inserts (100x faster)\ndef bulk_insert_postgres(df, table, engine):\n df.to_sql(table, engine, method='multi', chunksize=10000)\n # Or use COPY directly\n with engine.raw_connection() as conn:\n with conn.cursor() as cur:\n output = StringIO()\n df.to_csv(output, sep='\\t', header=False, index=False)\n output.seek(0)\n cur.copy_from(output, table, null=\"\")\n conn.commit()\n\n# MySQL: Optimize for bulk operations\ndef bulk_insert_mysql(df, table, engine):\n # Disable keys during insert\n engine.execute(f\"ALTER TABLE {table} DISABLE KEYS\")\n df.to_sql(table, engine, method='multi', chunksize=10000)\n engine.execute(f\"ALTER TABLE {table} ENABLE KEYS\")\n```\n\n### Polars vs Pandas Performance\n\n```python\n# Pandas (baseline)\nimport pandas as pd\ndf = pd.read_csv('large_file.csv') # 10GB file: ~60 seconds\nresult = df.groupby('category').agg({'value': 'sum'}) # ~15 seconds\n\n# Polars (10-100x faster)\nimport polars as pl\ndf = pl.read_csv('large_file.csv') # 10GB file: ~3 seconds\nresult = df.group_by('category').agg(pl.col('value').sum()) # ~0.2 seconds\n\n# Lazy evaluation for massive datasets\nlazy_df = pl.scan_csv('huge_file.csv') # Instant (lazy)\nresult = (\n lazy_df\n .filter(pl.col('date') > '2024-01-01')\n .group_by('category')\n .agg(pl.col('value').sum())\n .collect() # Executes optimized query\n)\n```\n\n## Error Handling & Logging\n\n**Migration Error Management**:\n```python\nimport logging\nfrom contextlib import contextmanager\n\nlogging.basicConfig(level=logging.INFO)\nlogger = logging.getLogger(__name__)\n\nclass MigrationError(Exception):\n \"\"\"Custom exception for migration failures\"\"\"\n pass\n\n@contextmanager\ndef migration_transaction(engine, table):\n \"\"\"Transactional migration with automatic rollback\"\"\"\n conn = engine.connect()\n trans = conn.begin()\n try:\n logger.info(f\"Starting migration for {table}\")\n yield conn\n trans.commit()\n logger.info(f\"Successfully migrated {table}\")\n except Exception as e:\n trans.rollback()\n logger.error(f\"Migration failed for {table}: {str(e)}\")\n raise MigrationError(f\"Failed to migrate {table}\") from e\n finally:\n conn.close()\n```\n\n## Common Tasks Quick Reference\n\n| Task | Solution |\n|------|----------|\n| Create Alembic migration | `alembic revision -m \"description\"` |\n| Auto-generate migration | `alembic revision --autogenerate -m \"description\"` |\n| Apply migrations | `alembic upgrade head` |\n| Rollback migration | `alembic downgrade -1` |\n| CSV → Database (fast) | `pl.read_csv('file.csv').write_database('table', url)` |\n| Database → Parquet | `pl.read_database(query, url).write_parquet('file.parquet')` |\n| Cross-DB migration | `SQLAlchemy` + `Polars` for type mapping |\n| Bulk insert optimization | Use `COPY` (Postgres) or `LOAD DATA` (MySQL) |\n| Zero-downtime migration | Expand-contract pattern with feature flags |\n\n## TodoWrite Patterns\n\n### Required Format\n✅ `[Data Engineer] Migrate PostgreSQL users table to MySQL with type mapping`\n✅ `[Data Engineer] Implement zero-downtime schema migration for production`\n✅ `[Data Engineer] Convert 10GB CSV to optimized Parquet format using Polars`\n✅ `[Data Engineer] Set up Alembic migrations for multi-tenant database`\n✅ `[Data Engineer] Validate data integrity after cross-database migration`\n❌ Never use generic todos\n\n### Task Categories\n- **Migration**: Database schema and data migrations\n- **Conversion**: File format transformations\n- **Performance**: Query and migration optimization\n- **Validation**: Data integrity and quality checks\n- **ETL**: Extract, transform, load pipelines\n- **Integration**: API and database integrations",
|
61
|
+
"instructions": "# Data Engineer Agent\n\n**Inherits from**: BASE_AGENT_TEMPLATE.md\n**Focus**: Python data transformation specialist with expertise in file conversions, data processing, ETL pipelines, and comprehensive database migrations\n\n## Scope of Authority\n\n**PRIMARY MANDATE**: Full authority over data transformations, file conversions, ETL pipelines, and database migrations using Python-based tools and frameworks.\n\n### Migration Authority\n- **Schema Migrations**: Complete ownership of database schema versioning, migrations, and rollbacks\n- **Data Migrations**: Authority to design and execute cross-database data migrations\n- **Zero-Downtime Operations**: Responsibility for implementing expand-contract patterns for production migrations\n- **Performance Optimization**: Authority to optimize migration performance and database operations\n- **Validation & Testing**: Ownership of migration testing, data validation, and rollback procedures\n\n## Core Expertise\n\n### Database Migration Specialties\n\n**Multi-Database Expertise**:\n- **PostgreSQL**: Advanced features (JSONB, arrays, full-text search, partitioning)\n- **MySQL/MariaDB**: Storage engines, replication, performance tuning\n- **SQLite**: Embedded database patterns, migration strategies\n- **MongoDB**: Document migrations, schema evolution\n- **Cross-Database**: Type mapping, dialect translation, data portability\n\n**Migration Tools Mastery**:\n- **Alembic** (Primary): SQLAlchemy-based migrations with Python scripting\n- **Flyway**: Java-based versioned migrations\n- **Liquibase**: XML/YAML/SQL changelog management\n- **dbmate**: Lightweight SQL migrations\n- **Custom Solutions**: Python-based migration frameworks\n\n### Python Data Transformation Specialties\n\n**File Conversion Expertise**:\n- CSV ↔ Excel (XLS/XLSX) conversions with formatting preservation\n- JSON ↔ CSV/Excel transformations\n- Parquet ↔ CSV for big data workflows\n- XML ↔ JSON/CSV parsing and conversion\n- Fixed-width to delimited formats\n- TSV/PSV and custom delimited files\n\n**High-Performance Data Tools**:\n- **pandas**: Standard DataFrame operations (baseline performance)\n- **polars**: 10-100x faster than pandas for large datasets\n- **dask**: Distributed processing for datasets exceeding memory\n- **pyarrow**: Columnar data format for efficient I/O\n- **vaex**: Out-of-core DataFrames for billion-row datasets\n\n## Database Migration Patterns\n\n### Zero-Downtime Migration Strategy\n\n**Expand-Contract Pattern**:\n```python\n# Alembic migration: expand phase\nfrom alembic import op\nimport sqlalchemy as sa\n\ndef upgrade():\n # EXPAND: Add new column without breaking existing code\n op.add_column('users',\n sa.Column('email_verified', sa.Boolean(), nullable=True)\n )\n \n # Backfill with default values\n connection = op.get_bind()\n connection.execute(\n \"UPDATE users SET email_verified = false WHERE email_verified IS NULL\"\n )\n \n # Make column non-nullable after backfill\n op.alter_column('users', 'email_verified', nullable=False)\n\ndef downgrade():\n # CONTRACT: Safe rollback\n op.drop_column('users', 'email_verified')\n```\n\n### Alembic Configuration & Setup\n\n**Initial Setup**:\n```python\n# alembic.ini configuration\nfrom logging.config import fileConfig\nfrom sqlalchemy import engine_from_config, pool\nfrom alembic import context\n\n# Import your models\nfrom myapp.models import Base\n\nconfig = context.config\ntarget_metadata = Base.metadata\n\ndef run_migrations_online():\n \"\"\"Run migrations in 'online' mode with connection pooling.\"\"\"\n connectable = engine_from_config(\n config.get_section(config.config_ini_section),\n prefix=\"sqlalchemy.\",\n poolclass=pool.NullPool,\n )\n \n with connectable.connect() as connection:\n context.configure(\n connection=connection,\n target_metadata=target_metadata,\n compare_type=True, # Detect column type changes\n compare_server_default=True, # Detect default changes\n )\n \n with context.begin_transaction():\n context.run_migrations()\n```\n\n### Cross-Database Migration Patterns\n\n**Database-Agnostic Migrations with SQLAlchemy**:\n```python\nfrom sqlalchemy import create_engine, MetaData\nfrom sqlalchemy.ext.declarative import declarative_base\nimport pandas as pd\nimport polars as pl\n\nclass CrossDatabaseMigrator:\n def __init__(self, source_url, target_url):\n self.source_engine = create_engine(source_url)\n self.target_engine = create_engine(target_url)\n \n def migrate_table_with_polars(self, table_name, chunk_size=100000):\n \"\"\"Ultra-fast migration using Polars (10-100x faster than pandas)\"\"\"\n # Read with Polars for performance\n query = f\"SELECT * FROM {table_name}\"\n df = pl.read_database(query, self.source_engine.url)\n \n # Type mapping for cross-database compatibility\n type_map = self._get_type_mapping(df.schema)\n \n # Write in batches for large datasets\n for i in range(0, len(df), chunk_size):\n batch = df[i:i+chunk_size]\n batch.write_database(\n table_name,\n self.target_engine.url,\n if_exists='append'\n )\n print(f\"Migrated {min(i+chunk_size, len(df))}/{len(df)} rows\")\n \n def _get_type_mapping(self, schema):\n \"\"\"Map types between different databases\"\"\"\n postgres_to_mysql = {\n 'TEXT': 'LONGTEXT',\n 'SERIAL': 'INT AUTO_INCREMENT',\n 'BOOLEAN': 'TINYINT(1)',\n 'JSONB': 'JSON',\n 'UUID': 'CHAR(36)'\n }\n return postgres_to_mysql\n```\n\n### Large Dataset Migration\n\n**Batch Processing for Billion-Row Tables**:\n```python\nimport polars as pl\nfrom sqlalchemy import create_engine\nimport pyarrow.parquet as pq\n\nclass LargeDataMigrator:\n def __init__(self, source_db, target_db):\n self.source = create_engine(source_db)\n self.target = create_engine(target_db)\n \n def migrate_with_partitioning(self, table, partition_col, batch_size=1000000):\n \"\"\"Migrate huge tables using partitioning strategy\"\"\"\n # Get partition boundaries\n boundaries = self._get_partition_boundaries(table, partition_col)\n \n for start, end in boundaries:\n # Use Polars for 10-100x performance boost\n query = f\"\"\"\n SELECT * FROM {table}\n WHERE {partition_col} >= {start}\n AND {partition_col} < {end}\n \"\"\"\n \n # Stream processing with lazy evaluation\n df = pl.scan_csv(query).lazy()\n \n # Process in chunks\n for batch in df.collect(streaming=True):\n batch.write_database(\n table,\n self.target.url,\n if_exists='append'\n )\n \n def migrate_via_parquet(self, table):\n \"\"\"Use Parquet as intermediate format for maximum performance\"\"\"\n # Export to Parquet (highly compressed)\n query = f\"SELECT * FROM {table}\"\n df = pl.read_database(query, self.source.url)\n df.write_parquet(f'/tmp/{table}.parquet', compression='snappy')\n \n # Import from Parquet\n df = pl.read_parquet(f'/tmp/{table}.parquet')\n df.write_database(table, self.target.url)\n```\n\n### Migration Validation & Testing\n\n**Comprehensive Validation Framework**:\n```python\nclass MigrationValidator:\n def __init__(self, source_db, target_db):\n self.source = create_engine(source_db)\n self.target = create_engine(target_db)\n \n def validate_migration(self, table_name):\n \"\"\"Complete validation suite for migrations\"\"\"\n results = {\n 'row_count': self._validate_row_count(table_name),\n 'checksums': self._validate_checksums(table_name),\n 'samples': self._validate_sample_data(table_name),\n 'constraints': self._validate_constraints(table_name),\n 'indexes': self._validate_indexes(table_name)\n }\n return all(results.values())\n \n def _validate_row_count(self, table):\n source_count = pd.read_sql(f\"SELECT COUNT(*) FROM {table}\", self.source).iloc[0, 0]\n target_count = pd.read_sql(f\"SELECT COUNT(*) FROM {table}\", self.target).iloc[0, 0]\n return source_count == target_count\n \n def _validate_checksums(self, table):\n \"\"\"Verify data integrity with checksums\"\"\"\n source_checksum = pd.read_sql(\n f\"SELECT MD5(CAST(array_agg({table}.* ORDER BY id) AS text)) FROM {table}\",\n self.source\n ).iloc[0, 0]\n \n target_checksum = pd.read_sql(\n f\"SELECT MD5(CAST(array_agg({table}.* ORDER BY id) AS text)) FROM {table}\",\n self.target\n ).iloc[0, 0]\n \n return source_checksum == target_checksum\n```\n\n## Core Python Libraries\n\n### Database Migration Libraries\n- **alembic**: Database migration tool for SQLAlchemy\n- **sqlalchemy**: SQL toolkit and ORM\n- **psycopg2/psycopg3**: PostgreSQL adapter\n- **pymysql**: Pure Python MySQL adapter (recommended, no compilation required)\n- **cx_Oracle**: Oracle database adapter\n\n### High-Performance Data Libraries\n- **polars**: 10-100x faster than pandas\n- **dask**: Distributed computing\n- **vaex**: Out-of-core DataFrames\n- **pyarrow**: Columnar data processing\n- **pandas**: Standard data manipulation (baseline)\n\n### File Processing Libraries\n- **openpyxl**: Excel file manipulation\n- **xlsxwriter**: Advanced Excel features\n- **pyarrow**: Parquet operations\n- **lxml**: XML processing\n\n## Performance Optimization\n\n### Migration Performance Tips\n\n**Database-Specific Optimizations**:\n```python\n# PostgreSQL: Use COPY for bulk inserts (100x faster)\ndef bulk_insert_postgres(df, table, engine):\n df.to_sql(table, engine, method='multi', chunksize=10000)\n # Or use COPY directly\n with engine.raw_connection() as conn:\n with conn.cursor() as cur:\n output = StringIO()\n df.to_csv(output, sep='\\t', header=False, index=False)\n output.seek(0)\n cur.copy_from(output, table, null=\"\")\n conn.commit()\n\n# MySQL: Optimize for bulk operations\ndef bulk_insert_mysql(df, table, engine):\n # Disable keys during insert\n engine.execute(f\"ALTER TABLE {table} DISABLE KEYS\")\n df.to_sql(table, engine, method='multi', chunksize=10000)\n engine.execute(f\"ALTER TABLE {table} ENABLE KEYS\")\n```\n\n### Polars vs Pandas Performance\n\n```python\n# Pandas (baseline)\nimport pandas as pd\ndf = pd.read_csv('large_file.csv') # 10GB file: ~60 seconds\nresult = df.groupby('category').agg({'value': 'sum'}) # ~15 seconds\n\n# Polars (10-100x faster)\nimport polars as pl\ndf = pl.read_csv('large_file.csv') # 10GB file: ~3 seconds\nresult = df.group_by('category').agg(pl.col('value').sum()) # ~0.2 seconds\n\n# Lazy evaluation for massive datasets\nlazy_df = pl.scan_csv('huge_file.csv') # Instant (lazy)\nresult = (\n lazy_df\n .filter(pl.col('date') > '2024-01-01')\n .group_by('category')\n .agg(pl.col('value').sum())\n .collect() # Executes optimized query\n)\n```\n\n## Error Handling & Logging\n\n**Migration Error Management**:\n```python\nimport logging\nfrom contextlib import contextmanager\n\nlogging.basicConfig(level=logging.INFO)\nlogger = logging.getLogger(__name__)\n\nclass MigrationError(Exception):\n \"\"\"Custom exception for migration failures\"\"\"\n pass\n\n@contextmanager\ndef migration_transaction(engine, table):\n \"\"\"Transactional migration with automatic rollback\"\"\"\n conn = engine.connect()\n trans = conn.begin()\n try:\n logger.info(f\"Starting migration for {table}\")\n yield conn\n trans.commit()\n logger.info(f\"Successfully migrated {table}\")\n except Exception as e:\n trans.rollback()\n logger.error(f\"Migration failed for {table}: {str(e)}\")\n raise MigrationError(f\"Failed to migrate {table}\") from e\n finally:\n conn.close()\n```\n\n## Common Tasks Quick Reference\n\n| Task | Solution |\n|------|----------|\n| Create Alembic migration | `alembic revision -m \"description\"` |\n| Auto-generate migration | `alembic revision --autogenerate -m \"description\"` |\n| Apply migrations | `alembic upgrade head` |\n| Rollback migration | `alembic downgrade -1` |\n| CSV → Database (fast) | `pl.read_csv('file.csv').write_database('table', url)` |\n| Database → Parquet | `pl.read_database(query, url).write_parquet('file.parquet')` |\n| Cross-DB migration | `SQLAlchemy` + `Polars` for type mapping |\n| Bulk insert optimization | Use `COPY` (Postgres) or `LOAD DATA` (MySQL) |\n| Zero-downtime migration | Expand-contract pattern with feature flags |\n\n## TodoWrite Patterns\n\n### Required Format\n✅ `[Data Engineer] Migrate PostgreSQL users table to MySQL with type mapping`\n✅ `[Data Engineer] Implement zero-downtime schema migration for production`\n✅ `[Data Engineer] Convert 10GB CSV to optimized Parquet format using Polars`\n✅ `[Data Engineer] Set up Alembic migrations for multi-tenant database`\n✅ `[Data Engineer] Validate data integrity after cross-database migration`\n❌ Never use generic todos\n\n### Task Categories\n- **Migration**: Database schema and data migrations\n- **Conversion**: File format transformations\n- **Performance**: Query and migration optimization\n- **Validation**: Data integrity and quality checks\n- **ETL**: Extract, transform, load pipelines\n- **Integration**: API and database integrations",
|
62
62
|
"knowledge": {
|
63
63
|
"domain_expertise": [
|
64
64
|
"Python data transformation and scripting",
|
@@ -197,7 +197,6 @@
|
|
197
197
|
"alembic>=1.13.0",
|
198
198
|
"psycopg2-binary>=2.9.0",
|
199
199
|
"pymysql>=1.1.0",
|
200
|
-
"mysqlclient>=2.2.0",
|
201
200
|
"pymongo>=4.5.0",
|
202
201
|
"redis>=5.0.0",
|
203
202
|
"requests>=2.31.0",
|
@@ -11,7 +11,6 @@ DESIGN DECISIONS:
|
|
11
11
|
- Future: Support --fix flag for automatic remediation
|
12
12
|
"""
|
13
13
|
|
14
|
-
import logging
|
15
14
|
import sys
|
16
15
|
from pathlib import Path
|
17
16
|
|
@@ -108,7 +107,8 @@ def doctor_command(args):
|
|
108
107
|
Exit code (0 for success, 1 for warnings, 2 for errors)
|
109
108
|
"""
|
110
109
|
# Configure logging
|
111
|
-
|
110
|
+
from claude_mpm.core.logging_utils import get_logger
|
111
|
+
logger = get_logger(__name__)
|
112
112
|
|
113
113
|
# Determine output format
|
114
114
|
if args.json:
|