kailash 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their public registries.
- kailash/__init__.py +1 -1
- kailash/api/custom_nodes_secure.py +2 -2
- kailash/api/studio_secure.py +1 -1
- kailash/mcp/client_new.py +1 -1
- kailash/nodes/ai/a2a.py +1 -1
- kailash/nodes/api/__init__.py +26 -0
- kailash/nodes/api/monitoring.py +463 -0
- kailash/nodes/api/security.py +822 -0
- kailash/nodes/base.py +3 -3
- kailash/nodes/code/python.py +6 -0
- kailash/nodes/data/__init__.py +9 -0
- kailash/nodes/data/directory.py +278 -0
- kailash/nodes/data/event_generation.py +297 -0
- kailash/nodes/data/file_discovery.py +601 -0
- kailash/nodes/data/sql.py +2 -2
- kailash/nodes/transform/processors.py +32 -1
- kailash/runtime/async_local.py +1 -1
- kailash/runtime/docker.py +4 -4
- kailash/runtime/local.py +41 -4
- kailash/runtime/parallel.py +2 -2
- kailash/runtime/parallel_cyclic.py +2 -2
- kailash/runtime/testing.py +2 -2
- kailash/utils/templates.py +6 -6
- kailash/visualization/performance.py +16 -3
- kailash/visualization/reports.py +5 -1
- kailash/workflow/convergence.py +1 -1
- kailash/workflow/cycle_analyzer.py +8 -1
- kailash/workflow/cyclic_runner.py +1 -1
- kailash/workflow/graph.py +33 -6
- kailash/workflow/visualization.py +10 -2
- kailash-0.3.0.dist-info/METADATA +428 -0
- {kailash-0.2.1.dist-info → kailash-0.3.0.dist-info}/RECORD +36 -31
- kailash-0.2.1.dist-info/METADATA +0 -1617
- {kailash-0.2.1.dist-info → kailash-0.3.0.dist-info}/WHEEL +0 -0
- {kailash-0.2.1.dist-info → kailash-0.3.0.dist-info}/entry_points.txt +0 -0
- {kailash-0.2.1.dist-info → kailash-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.2.1.dist-info → kailash-0.3.0.dist-info}/top_level.txt +0 -0
kailash/nodes/base.py
CHANGED
@@ -407,9 +407,9 @@ class Node(ABC):
         for param_name, param_def in params.items():
             if param_name not in self.config:
                 if param_def.required and param_def.default is None:
-
-
-
+                    # During node construction, we may not have all parameters yet
+                    # Skip validation for required parameters - they will be validated at execution time
+                    continue
                 elif param_def.default is not None:
                     self.config[param_name] = param_def.default
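
The effect is that a node can now be constructed before all of its required parameters are available; per the new comments, those parameters are checked when the node executes instead. A minimal sketch of the behaviour, using a hypothetical GreeterNode built on the Node/NodeParameter/register_node API that appears elsewhere in this diff (the class, its "greeting" parameter, and the execute() keyword-forwarding are illustrative assumptions, not taken from the package):

from kailash.nodes.base import Node, NodeParameter, register_node

@register_node()
class GreeterNode(Node):
    # Hypothetical node used only to illustrate the deferred check.
    def get_parameters(self):
        return {
            "greeting": NodeParameter(
                name="greeting",
                type=str,
                required=True,
                description="Required, but no longer enforced at construction",
            )
        }

    def run(self, **kwargs):
        return {"message": kwargs["greeting"]}

node = GreeterNode()                      # 0.3.0: constructing without "greeting" succeeds
result = node.execute(greeting="hello")   # assumed: runtime inputs are validated here and passed to run()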
kailash/nodes/code/python.py
CHANGED
@@ -93,6 +93,12 @@ ALLOWED_MODULES = {
     "matplotlib",
     "seaborn",
     "plotly",
+    # File processing modules
+    "csv",  # For CSV file processing
+    "mimetypes",  # For MIME type detection
+    "pathlib",  # For modern path operations
+    "glob",  # For file pattern matching
+    "xml",  # For XML processing
 }

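
ALLOWED_MODULES appears to be the import whitelist for code run through the sandboxed Python node in kailash/nodes/code/python.py, so sandboxed snippets can now use these standard-library file-processing modules directly. A hedged sketch of the kind of snippet this enables (the data/*.csv path, the field names, and the convention of surfacing output via a result variable are illustrative assumptions):

# Illustrative sandboxed snippet exercising the newly whitelisted modules.
import csv
import glob
import mimetypes
from pathlib import Path

rows = []
for path in glob.glob("data/*.csv"):                 # glob: file pattern matching
    mime, _ = mimetypes.guess_type(path)             # mimetypes: MIME type detection
    if mime == "text/csv":
        with Path(path).open(newline="") as handle:  # pathlib: modern path handling
            rows.extend(csv.DictReader(handle))      # csv: CSV processing

result = {"row_count": len(rows)}                    # assumed output convention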
kailash/nodes/data/__init__.py
CHANGED
@@ -80,6 +80,9 @@ Example Workflows:
     workflow.connect('process', 'publish')
 """

+from kailash.nodes.data.directory import DirectoryReaderNode
+from kailash.nodes.data.event_generation import EventGeneratorNode
+from kailash.nodes.data.file_discovery import FileDiscoveryNode
 from kailash.nodes.data.readers import CSVReaderNode, JSONReaderNode, TextReaderNode
 from kailash.nodes.data.retrieval import RelevanceScorerNode
 from kailash.nodes.data.sharepoint_graph import (
@@ -102,6 +105,12 @@ from kailash.nodes.data.vector_db import (
 from kailash.nodes.data.writers import CSVWriterNode, JSONWriterNode, TextWriterNode

 __all__ = [
+    # Directory
+    "DirectoryReaderNode",
+    # Event Generation
+    "EventGeneratorNode",
+    # File Discovery
+    "FileDiscoveryNode",
     # Readers
     "CSVReaderNode",
     "JSONReaderNode",
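
With the new imports and __all__ entries in place, the three nodes are importable from the package namespace alongside the existing readers and writers:

from kailash.nodes.data import (
    CSVReaderNode,
    DirectoryReaderNode,
    EventGeneratorNode,
    FileDiscoveryNode,
)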
kailash/nodes/data/directory.py
ADDED
@@ -0,0 +1,278 @@
"""Directory processing nodes for file discovery and batch operations."""

import mimetypes
import os
from datetime import datetime
from typing import Any, Dict, List, Optional

from kailash.nodes.base import Node, NodeParameter, register_node
from kailash.security import validate_file_path


@register_node()
class DirectoryReaderNode(Node):
    """
    Discovers and catalogs files in a directory with metadata extraction.

    This node provides comprehensive directory scanning capabilities, handling
    file discovery, metadata extraction, and filtering. It's designed for
    batch file processing workflows and dynamic data source discovery.

    Design Philosophy:
        The DirectoryReaderNode embodies the principle of "dynamic data discovery."
        Instead of hardcoding file paths, workflows can dynamically discover
        available data sources at runtime. This makes workflows more flexible
        and adaptable to changing data environments.

    Features:
        - Recursive directory scanning
        - File type detection and filtering
        - Metadata extraction (size, timestamps, MIME types)
        - Pattern-based filtering
        - Security-validated path operations

    Use Cases:
        - Batch file processing workflows
        - Dynamic data pipeline creation
        - File monitoring and cataloging
        - Multi-format document processing
        - Data lake exploration

    Output Format:
        Returns a structured catalog of discovered files with:
        - File paths and names
        - File types and MIME types
        - File sizes and timestamps
        - Directory structure information
    """

    def get_parameters(self) -> Dict[str, NodeParameter]:
        """Define input parameters for directory scanning."""
        return {
            "directory_path": NodeParameter(
                name="directory_path",
                type=str,
                required=True,
                description="Path to the directory to scan",
            ),
            "recursive": NodeParameter(
                name="recursive",
                type=bool,
                required=False,
                default=False,
                description="Whether to scan subdirectories recursively",
            ),
            "file_patterns": NodeParameter(
                name="file_patterns",
                type=list,
                required=False,
                default=[],
                description="List of file patterns to include (e.g., ['*.csv', '*.json'])",
            ),
            "exclude_patterns": NodeParameter(
                name="exclude_patterns",
                type=list,
                required=False,
                default=[],
                description="List of file patterns to exclude",
            ),
            "include_hidden": NodeParameter(
                name="include_hidden",
                type=bool,
                required=False,
                default=False,
                description="Whether to include hidden files (starting with .)",
            ),
        }

    def run(self, **kwargs) -> Dict[str, Any]:
        """Execute directory scanning operation.

        Returns:
            Dictionary containing:
            - discovered_files: List of file information dictionaries
            - files_by_type: Files grouped by type
            - directory_stats: Summary statistics
        """
        directory_path = kwargs.get("directory_path")
        recursive = kwargs.get("recursive", False)
        file_patterns = kwargs.get("file_patterns", [])
        exclude_patterns = kwargs.get("exclude_patterns", [])
        include_hidden = kwargs.get("include_hidden", False)

        # Validate directory path for security
        validated_path = validate_file_path(directory_path, operation="directory scan")

        if not os.path.isdir(validated_path):
            raise FileNotFoundError(f"Directory not found: {directory_path}")

        discovered_files = []

        try:
            if recursive:
                # Recursive scan
                for root, dirs, files in os.walk(validated_path):
                    for filename in files:
                        file_path = os.path.join(root, filename)
                        file_info = self._extract_file_info(
                            file_path,
                            filename,
                            include_hidden,
                            file_patterns,
                            exclude_patterns,
                        )
                        if file_info:
                            discovered_files.append(file_info)
            else:
                # Single directory scan
                for filename in os.listdir(validated_path):
                    file_path = os.path.join(validated_path, filename)

                    # Skip directories in non-recursive mode
                    if os.path.isdir(file_path):
                        continue

                    file_info = self._extract_file_info(
                        file_path,
                        filename,
                        include_hidden,
                        file_patterns,
                        exclude_patterns,
                    )
                    if file_info:
                        discovered_files.append(file_info)

        except PermissionError as e:
            raise PermissionError(f"Permission denied accessing directory: {e}")
        except Exception as e:
            raise RuntimeError(f"Error scanning directory: {e}")

        # Group files by type
        files_by_type = {}
        for file_info in discovered_files:
            file_type = file_info["file_type"]
            if file_type not in files_by_type:
                files_by_type[file_type] = []
            files_by_type[file_type].append(file_info)

        # Generate directory statistics
        directory_stats = {
            "total_files": len(discovered_files),
            "file_types": list(files_by_type.keys()),
            "files_by_type_count": {
                file_type: len(files) for file_type, files in files_by_type.items()
            },
            "total_size": sum(f["file_size"] for f in discovered_files),
            "scan_time": datetime.now().isoformat(),
            "directory_path": directory_path,
            "recursive": recursive,
        }

        return {
            "discovered_files": discovered_files,
            "files_by_type": files_by_type,
            "directory_stats": directory_stats,
        }

    def _extract_file_info(
        self,
        file_path: str,
        filename: str,
        include_hidden: bool,
        file_patterns: List[str],
        exclude_patterns: List[str],
    ) -> Optional[Dict[str, Any]]:
        """Extract metadata from a single file.

        Args:
            file_path: Full path to the file
            filename: Name of the file
            include_hidden: Whether to include hidden files
            file_patterns: Patterns to include
            exclude_patterns: Patterns to exclude

        Returns:
            File information dictionary or None if file should be excluded
        """
        # Skip hidden files if not included
        if not include_hidden and filename.startswith("."):
            return None

        # Check exclude patterns
        for pattern in exclude_patterns:
            if self._matches_pattern(filename, pattern):
                return None

        # Check include patterns (if specified)
        if file_patterns:
            included = any(
                self._matches_pattern(filename, pattern) for pattern in file_patterns
            )
            if not included:
                return None

        try:
            # Get file statistics
            file_stat = os.stat(file_path)
            file_ext = os.path.splitext(filename)[1].lower()

            # Map extensions to types
            ext_to_type = {
                ".csv": "csv",
                ".json": "json",
                ".txt": "txt",
                ".xml": "xml",
                ".md": "markdown",
                ".py": "python",
                ".js": "javascript",
                ".html": "html",
                ".css": "css",
                ".pdf": "pdf",
                ".doc": "word",
                ".docx": "word",
                ".xls": "excel",
                ".xlsx": "excel",
                ".png": "image",
                ".jpg": "image",
                ".jpeg": "image",
                ".gif": "image",
                ".svg": "image",
            }

            file_type = ext_to_type.get(file_ext, "unknown")

            # Get MIME type
            mime_type, _ = mimetypes.guess_type(file_path)
            if not mime_type:
                mime_type = "application/octet-stream"

            return {
                "file_path": file_path,
                "file_name": filename,
                "file_type": file_type,
                "file_extension": file_ext,
                "file_size": file_stat.st_size,
                "mime_type": mime_type,
                "created_time": datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
                "modified_time": datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
                "discovered_at": datetime.now().isoformat(),
            }

        except (OSError, PermissionError) as e:
            # Log error but continue with other files
            self.logger.warning(f"Could not process file {file_path}: {e}")
            return None

    def _matches_pattern(self, filename: str, pattern: str) -> bool:
        """Check if filename matches a glob-style pattern.

        Args:
            filename: Name of the file to check
            pattern: Glob pattern (e.g., '*.csv', 'data*', 'file?.txt')

        Returns:
            True if filename matches pattern
        """
        import fnmatch

        return fnmatch.fnmatch(filename, pattern)
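
A usage sketch based on the parameters and return keys defined above; the directory path and patterns are illustrative, and the construct-with-config-then-execute() call pattern is assumed from the EventGeneratorNode docstring examples in the next file:

# Sketch: catalog CSV and JSON files under an illustrative inbound directory.
reader = DirectoryReaderNode(
    directory_path="./data/inbound",      # illustrative path
    recursive=True,
    file_patterns=["*.csv", "*.json"],
    exclude_patterns=["*.tmp"],
)
result = reader.execute()

stats = result["directory_stats"]
print(f"{stats['total_files']} files, {stats['total_size']} bytes total")
for info in result["files_by_type"].get("csv", []):
    print(info["file_name"], info["mime_type"], info["modified_time"])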
kailash/nodes/data/event_generation.py
ADDED
@@ -0,0 +1,297 @@
"""Event generation nodes for event-driven architectures."""

import random
import uuid
from datetime import datetime, timezone
from typing import Any, Dict

from kailash.nodes.base import Node, NodeParameter, register_node


@register_node()
class EventGeneratorNode(Node):
    """
    Generates events for event sourcing and event-driven architecture patterns.

    This node creates realistic event streams for testing, development, and
    demonstration of event-driven systems. It supports various event types
    and can generate events with proper sequencing, timestamps, and metadata.

    Design Philosophy:
        Event sourcing requires consistent, well-structured events with proper
        metadata. This node eliminates the need for DataTransformer with embedded
        Python code by providing a dedicated, configurable event generation
        capability.

    Upstream Dependencies:
        - Optional configuration nodes
        - Timer/scheduler nodes for periodic generation
        - Template nodes for event schemas

    Downstream Consumers:
        - Event processing nodes
        - Stream aggregation nodes
        - Event store writers
        - Message queue publishers
        - Analytics and monitoring nodes

    Configuration:
        - Event types and schemas
        - Generation patterns (burst, continuous, scheduled)
        - Data ranges and distributions
        - Metadata templates

    Implementation Details:
        - Generates proper event IDs and timestamps
        - Maintains event ordering and sequencing
        - Supports custom event schemas
        - Realistic data generation with configurable patterns
        - Proper metadata structure

    Error Handling:
        - Validates event schemas
        - Handles invalid configurations gracefully
        - Ensures timestamp consistency
        - Validates required fields

    Side Effects:
        - No external side effects
        - Deterministic with seed parameter
        - Generates new events on each execution

    Examples:
        >>> # Generate order events
        >>> generator = EventGeneratorNode(
        ...     event_types=['OrderCreated', 'PaymentProcessed', 'OrderShipped'],
        ...     event_count=10,
        ...     aggregate_prefix='ORDER-2024'
        ... )
        >>> result = generator.execute()
        >>> assert len(result['events']) == 10
        >>> assert result['events'][0]['event_type'] in ['OrderCreated', 'PaymentProcessed', 'OrderShipped']
        >>>
        >>> # Generate user events with custom data
        >>> generator = EventGeneratorNode(
        ...     event_types=['UserRegistered', 'UserLoggedIn'],
        ...     event_count=5,
        ...     custom_data_templates={
        ...         'UserRegistered': {'username': 'user_{id}', 'email': '{username}@example.com'},
        ...         'UserLoggedIn': {'ip_address': '192.168.1.{random_ip}', 'device': 'Chrome/Windows'}
        ...     }
        ... )
        >>> result = generator.execute()
        >>> assert 'events' in result
        >>> assert result['metadata']['total_events'] == 5
    """

    def get_parameters(self) -> Dict[str, NodeParameter]:
        return {
            "event_types": NodeParameter(
                name="event_types",
                type=list,
                required=True,
                description="List of event types to generate",
            ),
            "event_count": NodeParameter(
                name="event_count",
                type=int,
                required=False,
                default=10,
                description="Number of events to generate",
            ),
            "aggregate_prefix": NodeParameter(
                name="aggregate_prefix",
                type=str,
                required=False,
                default="AGG",
                description="Prefix for aggregate IDs",
            ),
            "custom_data_templates": NodeParameter(
                name="custom_data_templates",
                type=dict,
                required=False,
                default={},
                description="Custom data templates for each event type",
            ),
            "source_service": NodeParameter(
                name="source_service",
                type=str,
                required=False,
                default="event-generator",
                description="Source service name for metadata",
            ),
            "time_range_hours": NodeParameter(
                name="time_range_hours",
                type=int,
                required=False,
                default=24,
                description="Time range in hours for event timestamps",
            ),
            "seed": NodeParameter(
                name="seed",
                type=int,
                required=False,
                description="Random seed for reproducible generation",
            ),
        }

    def run(self, **kwargs) -> Dict[str, Any]:
        event_types = kwargs["event_types"]
        event_count = kwargs.get("event_count", 10)
        aggregate_prefix = kwargs.get("aggregate_prefix", "AGG")
        custom_data_templates = kwargs.get("custom_data_templates", {})
        source_service = kwargs.get("source_service", "event-generator")
        time_range_hours = kwargs.get("time_range_hours", 24)
        seed = kwargs.get("seed")

        if seed is not None:
            random.seed(seed)

        # Generate events
        events = []
        now = datetime.now(timezone.utc)

        # Create a set of aggregate IDs for realistic event grouping
        num_aggregates = max(1, event_count // 3)  # Roughly 3 events per aggregate
        aggregate_ids = [
            f"{aggregate_prefix}-{i:04d}" for i in range(1, num_aggregates + 1)
        ]

        for i in range(event_count):
            # Select event type and aggregate
            event_type = random.choice(event_types)
            aggregate_id = random.choice(aggregate_ids)

            # Generate timestamp within range
            hours_offset = random.uniform(-time_range_hours, 0)
            event_timestamp = now.timestamp() + hours_offset * 3600
            event_time = datetime.fromtimestamp(event_timestamp, tz=timezone.utc)

            # Generate event data
            event_data = self._generate_event_data(
                event_type, aggregate_id, custom_data_templates.get(event_type, {})
            )

            # Create event
            event = {
                "event_id": f"evt-{uuid.uuid4().hex[:8]}",
                "event_type": event_type,
                "aggregate_id": aggregate_id,
                "timestamp": event_time.isoformat() + "Z",
                "data": event_data,
                "metadata": {
                    "source": source_service,
                    "version": 1,
                    "correlation_id": f"corr-{uuid.uuid4().hex[:8]}",
                    "generated": True,
                },
            }
            events.append(event)

        # Sort events by timestamp for realistic ordering
        events.sort(key=lambda x: x["timestamp"])

        # Generate metadata
        metadata = {
            "total_events": len(events),
            "event_types": list(set(e["event_type"] for e in events)),
            "aggregate_count": len(set(e["aggregate_id"] for e in events)),
            "time_range": {
                "start": events[0]["timestamp"] if events else None,
                "end": events[-1]["timestamp"] if events else None,
            },
            "generated_at": now.isoformat() + "Z",
            "source": source_service,
        }

        return {
            "events": events,
            "metadata": metadata,
            "event_count": len(events),
            "event_types": metadata["event_types"],
            "aggregate_count": metadata["aggregate_count"],
        }

    def _generate_event_data(
        self, event_type: str, aggregate_id: str, template: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Generate event-specific data based on type and template."""

        # Default data generators by event type
        default_generators = {
            "OrderCreated": lambda: {
                "customer_id": f"CUST-{random.randint(100, 999)}",
                "total_amount": round(random.uniform(10.0, 1000.0), 2),
                "item_count": random.randint(1, 5),
                "status": "pending",
                "payment_method": random.choice(
                    ["credit_card", "debit_card", "paypal"]
                ),
            },
            "PaymentProcessed": lambda: {
                "payment_id": f"PAY-{random.randint(10000, 99999)}",
                "amount": round(random.uniform(10.0, 1000.0), 2),
                "method": random.choice(["credit_card", "debit_card", "paypal"]),
                "status": random.choice(["success", "failed", "pending"]),
                "transaction_id": f"txn-{uuid.uuid4().hex[:12]}",
            },
            "OrderShipped": lambda: {
                "tracking_number": f"TRACK-{random.randint(100000, 999999)}",
                "carrier": random.choice(["UPS", "FedEx", "DHL", "USPS"]),
                "status": "shipped",
                "estimated_delivery": datetime.now(timezone.utc)
                .replace(day=datetime.now().day + random.randint(1, 7))
                .isoformat()
                + "Z",
            },
            "UserRegistered": lambda: {
                "username": f"user_{random.randint(1000, 9999)}",
                "email": f"user_{random.randint(1000, 9999)}@example.com",
                "plan": random.choice(["free", "premium", "enterprise"]),
                "registration_source": random.choice(["web", "mobile", "api"]),
            },
            "UserLoggedIn": lambda: {
                "ip_address": f"192.168.1.{random.randint(1, 254)}",
                "device": random.choice(
                    [
                        "Chrome/Windows",
                        "Safari/macOS",
                        "Firefox/Linux",
                        "Mobile/iOS",
                        "Mobile/Android",
                    ]
                ),
                "session_id": f"sess-{uuid.uuid4().hex[:16]}",
            },
            "SubscriptionCreated": lambda: {
                "plan": random.choice(["basic", "premium", "enterprise"]),
                "price": random.choice([9.99, 29.99, 99.99, 199.99]),
                "billing_cycle": random.choice(["monthly", "yearly"]),
                "trial_days": random.choice([0, 7, 14, 30]),
            },
        }

        # Use template if provided, otherwise use default generator
        if template:
            data = {}
            for key, value_template in template.items():
                if isinstance(value_template, str):
                    # Simple string templating
                    data[key] = value_template.format(
                        id=random.randint(1, 999),
                        random_ip=random.randint(1, 254),
                        username=f"user_{random.randint(1000, 9999)}",
                        aggregate_id=aggregate_id,
                    )
                else:
                    data[key] = value_template
            return data
        elif event_type in default_generators:
            return default_generators[event_type]()
        else:
            # Generic event data
            return {
                "event_data": f"Generated data for {event_type}",
                "aggregate_id": aggregate_id,
                "timestamp": datetime.now(timezone.utc).isoformat() + "Z",
            }
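
One detail worth noting from _generate_event_data: string values in custom_data_templates are filled in with str.format(), which supplies the id, random_ip, username, and aggregate_id placeholders, while non-string values pass through unchanged. A sketch following the docstring's construct-then-execute pattern (the event type and field names are invented for illustration):

# Sketch: seeded, template-driven generation for an event type with no default generator.
generator = EventGeneratorNode(
    event_types=["InvoiceIssued"],
    event_count=3,
    aggregate_prefix="INV",
    seed=42,  # reproducible output
    custom_data_templates={
        "InvoiceIssued": {
            "invoice_ref": "{aggregate_id}-{id}",   # filled via str.format
            "issued_by": "{username}",
            "client_ip": "10.0.0.{random_ip}",
            "amount_due": 125.0,                    # non-string: passed through unchanged
        }
    },
)
result = generator.execute()
assert result["metadata"]["total_events"] == 3
print(result["events"][0]["data"]["invoice_ref"])   # e.g. "INV-0001-482"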