graflo-1.3.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of graflo might be problematic.
- graflo/README.md +18 -0
- graflo/__init__.py +70 -0
- graflo/architecture/__init__.py +38 -0
- graflo/architecture/actor.py +1276 -0
- graflo/architecture/actor_util.py +450 -0
- graflo/architecture/edge.py +418 -0
- graflo/architecture/onto.py +376 -0
- graflo/architecture/onto_sql.py +54 -0
- graflo/architecture/resource.py +163 -0
- graflo/architecture/schema.py +135 -0
- graflo/architecture/transform.py +292 -0
- graflo/architecture/util.py +89 -0
- graflo/architecture/vertex.py +562 -0
- graflo/caster.py +736 -0
- graflo/cli/__init__.py +14 -0
- graflo/cli/ingest.py +203 -0
- graflo/cli/manage_dbs.py +197 -0
- graflo/cli/plot_schema.py +132 -0
- graflo/cli/xml2json.py +93 -0
- graflo/data_source/__init__.py +48 -0
- graflo/data_source/api.py +339 -0
- graflo/data_source/base.py +95 -0
- graflo/data_source/factory.py +304 -0
- graflo/data_source/file.py +148 -0
- graflo/data_source/memory.py +70 -0
- graflo/data_source/registry.py +82 -0
- graflo/data_source/sql.py +183 -0
- graflo/db/__init__.py +44 -0
- graflo/db/arango/__init__.py +22 -0
- graflo/db/arango/conn.py +1025 -0
- graflo/db/arango/query.py +180 -0
- graflo/db/arango/util.py +88 -0
- graflo/db/conn.py +377 -0
- graflo/db/connection/__init__.py +6 -0
- graflo/db/connection/config_mapping.py +18 -0
- graflo/db/connection/onto.py +717 -0
- graflo/db/connection/wsgi.py +29 -0
- graflo/db/manager.py +119 -0
- graflo/db/neo4j/__init__.py +16 -0
- graflo/db/neo4j/conn.py +639 -0
- graflo/db/postgres/__init__.py +37 -0
- graflo/db/postgres/conn.py +948 -0
- graflo/db/postgres/fuzzy_matcher.py +281 -0
- graflo/db/postgres/heuristics.py +133 -0
- graflo/db/postgres/inference_utils.py +428 -0
- graflo/db/postgres/resource_mapping.py +273 -0
- graflo/db/postgres/schema_inference.py +372 -0
- graflo/db/postgres/types.py +148 -0
- graflo/db/postgres/util.py +87 -0
- graflo/db/tigergraph/__init__.py +9 -0
- graflo/db/tigergraph/conn.py +2365 -0
- graflo/db/tigergraph/onto.py +26 -0
- graflo/db/util.py +49 -0
- graflo/filter/__init__.py +21 -0
- graflo/filter/onto.py +525 -0
- graflo/logging.conf +22 -0
- graflo/onto.py +312 -0
- graflo/plot/__init__.py +17 -0
- graflo/plot/plotter.py +616 -0
- graflo/util/__init__.py +23 -0
- graflo/util/chunker.py +807 -0
- graflo/util/merge.py +150 -0
- graflo/util/misc.py +37 -0
- graflo/util/onto.py +422 -0
- graflo/util/transform.py +454 -0
- graflo-1.3.7.dist-info/METADATA +243 -0
- graflo-1.3.7.dist-info/RECORD +70 -0
- graflo-1.3.7.dist-info/WHEEL +4 -0
- graflo-1.3.7.dist-info/entry_points.txt +5 -0
- graflo-1.3.7.dist-info/licenses/LICENSE +126 -0
graflo/util/merge.py
ADDED
@@ -0,0 +1,150 @@
"""Document merging utilities.

This module provides functions for merging documents based on common index keys,
preserving order and handling both dict and VertexRep objects.

Key Functions:
    - merge_doc_basis: Merge documents based on common index keys, preserving order

"""

from typing import cast, overload

from graflo.architecture.onto import VertexRep


@overload
def merge_doc_basis(
    docs: list[dict],
    index_keys: tuple[str, ...],
) -> list[dict]: ...


@overload
def merge_doc_basis(
    docs: list[VertexRep],
    index_keys: tuple[str, ...],
) -> list[VertexRep]: ...


def merge_doc_basis(
    docs: list[dict] | list[VertexRep],
    index_keys: tuple[str, ...],
) -> list[dict] | list[VertexRep]:
    """Merge documents based on common index keys, preserving order.

    This function merges documents that share common index key-value combinations,
    preserving the order of documents based on the first occurrence of each index
    key combination. Documents without index keys are merged into the closest
    preceding document with index keys. If no documents have index keys, all
    documents are merged into a single document.

    For VertexRep objects, the merge is performed on the `vertex` attribute, and
    `ctx` dicts are merged among merged VertexReps.

    Args:
        docs: Homogeneous list of documents (all dict or all VertexRep) to merge
        index_keys: Tuple of key names to use for merging

    Returns:
        Merged documents in order of first occurrence (same type as input)
    """
    if not docs:
        return docs

    # Check if we're working with VertexRep objects
    is_vertexrep = isinstance(docs[0], VertexRep)

    # Track merged documents in order of first occurrence
    # Type: list[dict] if not is_vertexrep, list[VertexRep] if is_vertexrep
    merged_docs: list[dict | VertexRep] = []
    # Map from index tuple to position in merged_docs
    index_to_position: dict[tuple, int] = {}
    # Accumulate documents without index keys
    # Type: list[dict] if not is_vertexrep, list[VertexRep] if is_vertexrep
    pending_non_ids: list[dict | VertexRep] = []

    def get_index_tuple(doc: dict | VertexRep) -> tuple:
        """Extract index tuple from a document."""
        if is_vertexrep:
            assert isinstance(doc, VertexRep)
            data = doc.vertex
        else:
            assert isinstance(doc, dict)
            data = doc
        return tuple(sorted((k, v) for k, v in data.items() if k in index_keys))

    def has_index_keys(doc: dict | VertexRep) -> bool:
        """Check if document has any index keys."""
        if is_vertexrep:
            assert isinstance(doc, VertexRep)
            return any(k in doc.vertex for k in index_keys)
        else:
            assert isinstance(doc, dict)
            return any(k in doc for k in index_keys)

    def merge_doc(target: dict | VertexRep, source: dict | VertexRep) -> None:
        """Merge source into target."""
        if is_vertexrep:
            assert isinstance(target, VertexRep) and isinstance(source, VertexRep)
            target.vertex.update(source.vertex)
            target.ctx.update(source.ctx)
        else:
            assert isinstance(target, dict) and isinstance(source, dict)
            target.update(source)

    def copy_doc(doc: dict | VertexRep) -> dict | VertexRep:
        """Create a copy of a document."""
        if is_vertexrep:
            assert isinstance(doc, VertexRep)
            return VertexRep(vertex=doc.vertex.copy(), ctx=doc.ctx.copy())
        else:
            assert isinstance(doc, dict)
            return doc.copy()

    for doc in docs:
        if has_index_keys(doc):
            # This is a document with index keys
            index_tuple = get_index_tuple(doc)

            # First, handle any accumulated non-ID documents
            if pending_non_ids:
                if merged_docs:
                    # Merge accumulated non-IDs into the last ID doc
                    for pending in pending_non_ids:
                        merge_doc(merged_docs[-1], pending)
                else:
                    # No previous ID doc, merge pending non-IDs into the current ID doc
                    for pending in pending_non_ids:
                        merge_doc(doc, pending)
                pending_non_ids.clear()

            # Handle the current document with index keys
            if index_tuple in index_to_position:
                # Merge into existing document at that position
                merge_doc(merged_docs[index_to_position[index_tuple]], doc)
            else:
                # First occurrence of this index tuple, add new document
                merged_docs.append(copy_doc(doc))
                index_to_position[index_tuple] = len(merged_docs) - 1
        else:
            # This is a document without index keys, accumulate it
            pending_non_ids.append(doc)

    # Handle any remaining non-ID documents at the end
    if pending_non_ids and merged_docs:
        # Merge into last ID doc
        for pending in pending_non_ids:
            merge_doc(merged_docs[-1], pending)
    elif pending_non_ids:
        # No documents with index keys: merge all into a single document
        if is_vertexrep:
            merged_doc = VertexRep(vertex={}, ctx={})
        else:
            merged_doc = {}
        for pending in pending_non_ids:
            merge_doc(merged_doc, pending)
        merged_docs.append(merged_doc)

    # Type narrowing: return type matches input type due to homogeneous list requirement
    return cast(list[dict] | list[VertexRep], merged_docs)
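A minimal usage sketch of merge_doc_basis on hypothetical data (plain dicts; the VertexRep path behaves the same, operating on the `vertex` and `ctx` attributes):

# Hypothetical data illustrating the merge semantics documented above.
from graflo.util.merge import merge_doc_basis

docs = [
    {"id": 1, "name": "alice"},  # first occurrence of index tuple id=1
    {"email": "a@x.io"},         # no index key: folded into the nearest keyed doc
    {"id": 2, "name": "bob"},    # first occurrence of id=2
    {"id": 1, "age": 30},        # same index tuple as the first doc: merged into it
]

merged = merge_doc_basis(docs, index_keys=("id",))
# [{'id': 1, 'name': 'alice', 'email': 'a@x.io', 'age': 30},
#  {'id': 2, 'name': 'bob'}]

Output order follows the first occurrence of each index tuple, and the bare email dict lands on the id=1 record because that is the keyed document immediately preceding it.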
graflo/util/misc.py
ADDED
@@ -0,0 +1,37 @@
"""Miscellaneous utility functions.

This module provides various utility functions for data manipulation and processing.

Key Functions:
    - sorted_dicts: Recursively sort dictionaries and lists for consistent ordering
"""


def sorted_dicts(d):
    """Recursively sort dictionaries and lists for consistent ordering.

    This function recursively sorts dictionaries and lists to ensure consistent
    ordering of data structures. It handles nested structures and preserves
    non-collection values.

    Args:
        d: Data structure to sort (dict, list, tuple, or other)

    Returns:
        The sorted data structure with consistent ordering

    Example:
        >>> data = {"b": 2, "a": 1, "c": [3, 1, 2]}
        >>> sorted_dicts(data)
        {"a": 1, "b": 2, "c": [1, 2, 3]}
    """
    if isinstance(d, (tuple, list)):
        if d and all([not isinstance(dd, (list, tuple, dict)) for dd in d[0].values()]):
            return sorted(d, key=lambda x: tuple(x.items()))
    elif isinstance(d, dict):
        return {
            k: v if not isinstance(v, (list, tuple, dict)) else sorted_dicts(v)
            for k, v in d.items()
        }

    return d
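A short sketch of sorted_dicts on hypothetical data. As written, the list branch inspects `d[0].values()`, so it sorts lists whose elements are flat dicts by their key/value pairs, while dicts are rebuilt with collection values processed recursively:

# Hypothetical data; demonstrates the behavior of sorted_dicts as written.
from graflo.util.misc import sorted_dicts

rows = [{"k": 2, "v": "b"}, {"k": 1, "v": "a"}]
print(sorted_dicts(rows))
# [{'k': 1, 'v': 'a'}, {'k': 2, 'v': 'b'}]

nested = {"name": "t", "rows": rows}
print(sorted_dicts(nested))
# {'name': 't', 'rows': [{'k': 1, 'v': 'a'}, {'k': 2, 'v': 'b'}]}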
graflo/util/onto.py
ADDED
@@ -0,0 +1,422 @@
"""Utility ontology classes for resource patterns and configurations.

This module provides data classes for managing resource patterns (files and database tables)
and configurations used throughout the system. These classes support resource discovery,
pattern matching, and configuration management.

Key Components:
    - ResourcePattern: Abstract base class for resource patterns
    - FilePattern: Configuration for file pattern matching
    - TablePattern: Configuration for database table pattern matching
    - Patterns: Collection of named resource patterns with connection management
"""

import abc
import dataclasses
import pathlib
import re
from typing import TYPE_CHECKING, Any

from graflo.onto import BaseDataclass, BaseEnum

if TYPE_CHECKING:
    from graflo.db.connection.onto import PostgresConfig
else:
    # Import at runtime for type evaluation
    try:
        from graflo.db.connection.onto import PostgresConfig
    except ImportError:
        PostgresConfig = Any  # type: ignore


class ResourceType(BaseEnum):
    """Resource types for data sources.

    Resource types distinguish between different data source categories.
    File type detection (CSV, JSON, JSONL, Parquet, etc.) is handled
    automatically by the loader based on file extensions.

    Attributes:
        FILE: File-based data source (any format: CSV, JSON, JSONL, Parquet, etc.)
        SQL_TABLE: SQL database table (e.g., PostgreSQL table)
    """

    FILE = "file"
    SQL_TABLE = "sql_table"


@dataclasses.dataclass
class ResourcePattern(BaseDataclass, abc.ABC):
    """Abstract base class for resource patterns (files or tables).

    Provides common API for pattern matching and resource identification.
    All concrete pattern types inherit from this class.

    Attributes:
        resource_name: Name of the resource this pattern matches
    """

    resource_name: str | None = None

    @abc.abstractmethod
    def matches(self, resource_identifier: str) -> bool:
        """Check if pattern matches a resource identifier.

        Args:
            resource_identifier: Identifier to match (filename or table name)

        Returns:
            bool: True if pattern matches
        """
        pass

    @abc.abstractmethod
    def get_resource_type(self) -> ResourceType:
        """Get the type of resource this pattern matches.

        Returns:
            ResourceType: Resource type enum value
        """
        pass


@dataclasses.dataclass
class FilePattern(ResourcePattern):
    """Pattern for matching files.

    Attributes:
        regex: Regular expression pattern for matching filenames
        sub_path: Path to search for matching files (default: "./")
        date_field: Name of the date field to filter on (for date-based filtering)
        date_filter: SQL-style date filter condition (e.g., "> '2020-10-10'")
        date_range_start: Start date for range filtering (e.g., "2015-11-11")
        date_range_days: Number of days after start date (used with date_range_start)
    """

    class _(BaseDataclass.Meta):
        tag = "file"

    regex: str | None = None
    sub_path: None | pathlib.Path = dataclasses.field(
        default_factory=lambda: pathlib.Path("./")
    )
    date_field: str | None = None
    date_filter: str | None = None
    date_range_start: str | None = None
    date_range_days: int | None = None

    def __post_init__(self):
        """Initialize and validate the file pattern.

        Ensures that sub_path is a Path object and is not None.
        """
        if not isinstance(self.sub_path, pathlib.Path):
            self.sub_path = pathlib.Path(self.sub_path)
        assert self.sub_path is not None
        # Validate date filtering parameters (note: date filtering for files is not yet implemented)
        if (self.date_filter or self.date_range_start) and not self.date_field:
            raise ValueError(
                "date_field is required when using date_filter or date_range_start"
            )
        if self.date_range_days is not None and not self.date_range_start:
            raise ValueError("date_range_start is required when using date_range_days")

    def matches(self, filename: str) -> bool:
        """Check if pattern matches a filename.

        Args:
            filename: Filename to match

        Returns:
            bool: True if pattern matches
        """
        if self.regex is None:
            return False
        return bool(re.match(self.regex, filename))

    def get_resource_type(self) -> ResourceType:
        """Get resource type.

        FilePattern always represents a FILE resource type.
        The specific file format (CSV, JSON, JSONL, Parquet, etc.) is
        automatically detected by the loader based on file extensions.
        """
        return ResourceType.FILE


@dataclasses.dataclass
class TablePattern(ResourcePattern):
    """Pattern for matching database tables.

    Attributes:
        table_name: Exact table name or regex pattern
        schema_name: Schema name (optional, defaults to public)
        database: Database name (optional)
        date_field: Name of the date field to filter on (for date-based filtering)
        date_filter: SQL-style date filter condition (e.g., "> '2020-10-10'")
        date_range_start: Start date for range filtering (e.g., "2015-11-11")
        date_range_days: Number of days after start date (used with date_range_start)
    """

    class _(BaseDataclass.Meta):
        tag = "table"

    table_name: str = ""
    schema_name: str | None = None
    database: str | None = None
    date_field: str | None = None
    date_filter: str | None = None
    date_range_start: str | None = None
    date_range_days: int | None = None

    def __post_init__(self):
        """Validate table pattern after initialization."""
        if not self.table_name:
            raise ValueError("table_name is required for TablePattern")
        # Validate date filtering parameters
        if (self.date_filter or self.date_range_start) and not self.date_field:
            raise ValueError(
                "date_field is required when using date_filter or date_range_start"
            )
        if self.date_range_days is not None and not self.date_range_start:
            raise ValueError("date_range_start is required when using date_range_days")

    def matches(self, table_identifier: str) -> bool:
        """Check if pattern matches a table name.

        Args:
            table_identifier: Table name to match (format: schema.table or just table)

        Returns:
            bool: True if pattern matches
        """
        if not self.table_name:
            return False

        # Compile regex pattern
        if self.table_name.startswith("^") or self.table_name.endswith("$"):
            # Already a regex pattern
            pattern = re.compile(self.table_name)
        else:
            # Exact match pattern
            pattern = re.compile(f"^{re.escape(self.table_name)}$")

        # Check if table_identifier matches
        if pattern.match(table_identifier):
            return True

        # If schema_name is specified, also check schema.table format
        if self.schema_name:
            full_name = f"{self.schema_name}.{table_identifier}"
            if pattern.match(full_name):
                return True

        return False

    def get_resource_type(self) -> ResourceType:
        """Get resource type."""
        return ResourceType.SQL_TABLE

    def build_where_clause(self) -> str:
        """Build SQL WHERE clause from date filtering parameters.

        Returns:
            WHERE clause string (without the WHERE keyword) or empty string if no filters
        """
        conditions = []

        if self.date_field:
            if self.date_range_start and self.date_range_days is not None:
                # Range filtering: dt >= start_date AND dt < start_date + interval
                # Example: Ingest for k days after 2015-11-11
                conditions.append(
                    f"\"{self.date_field}\" >= '{self.date_range_start}'::date"
                )
                conditions.append(
                    f"\"{self.date_field}\" < '{self.date_range_start}'::date + INTERVAL '{self.date_range_days} days'"
                )
            elif self.date_filter:
                # Direct filter: dt > 2020-10-10 or dt > '2020-10-10'
                # The date_filter should include the operator and value
                # If value doesn't have quotes, add them
                filter_parts = self.date_filter.strip().split(None, 1)
                if len(filter_parts) == 2:
                    operator, value = filter_parts
                    # Add quotes if not already present and value looks like a date
                    if not (value.startswith("'") and value.endswith("'")):
                        # Check if it's a date-like string (YYYY-MM-DD format)
                        if len(value) == 10 and value.count("-") == 2:
                            value = f"'{value}'"
                    conditions.append(f'"{self.date_field}" {operator} {value}')
                else:
                    # If format is unexpected, use as-is
                    conditions.append(f'"{self.date_field}" {self.date_filter}')

        if conditions:
            return " AND ".join(conditions)
        return ""


@dataclasses.dataclass
class Patterns(BaseDataclass):
    """Collection of named resource patterns with connection management.

    This class manages a collection of resource patterns (files or tables),
    each associated with a name. It efficiently handles PostgreSQL connections
    by grouping tables that share the same connection configuration.

    The constructor accepts:
    - resource_mapping: dict mapping resource_name -> (file_path or table_name)
    - postgres_connections: dict mapping config_key -> PostgresConfig
      where config_key identifies a connection configuration
    - postgres_tables: dict mapping table_name -> (config_key, schema_name, table_name)

    Attributes:
        patterns: Dictionary mapping resource names to ResourcePattern instances
        postgres_configs: Dictionary mapping (config_key, schema_name) to PostgresConfig
        postgres_table_configs: Dictionary mapping resource_name to (config_key, schema_name, table_name)
    """

    patterns: dict[str, TablePattern | FilePattern] = dataclasses.field(
        default_factory=dict
    )
    postgres_configs: dict[tuple[str, str | None], Any] = dataclasses.field(
        default_factory=dict, metadata={"exclude": True}
    )
    postgres_table_configs: dict[str, tuple[str, str | None, str]] = dataclasses.field(
        default_factory=dict, metadata={"exclude": True}
    )
    # Initialization parameters (not stored as fields, excluded from serialization)
    # Use Any for _postgres_connections to avoid type evaluation issues with dataclass_wizard
    _resource_mapping: dict[str, str | tuple[str, str]] | None = dataclasses.field(
        default=None, repr=False, compare=False, metadata={"exclude": True}
    )
    _postgres_connections: dict[str, Any] | None = dataclasses.field(
        default=None, repr=False, compare=False, metadata={"exclude": True}
    )
    _postgres_tables: dict[str, tuple[str, str | None, str]] | None = dataclasses.field(
        default=None, repr=False, compare=False, metadata={"exclude": True}
    )

    def __post_init__(self):
        """Initialize Patterns from resource mappings and PostgreSQL configurations."""
        # Store PostgreSQL connection configs
        if self._postgres_connections:
            for config_key, config in self._postgres_connections.items():
                if config is not None:
                    schema_name = config.schema_name
                    self.postgres_configs[(config_key, schema_name)] = config

        # Process resource mappings
        if self._resource_mapping:
            for resource_name, resource_spec in self._resource_mapping.items():
                if isinstance(resource_spec, str):
                    # File path - create FilePattern
                    file_path = pathlib.Path(resource_spec)
                    pattern = FilePattern(
                        regex=f"^{re.escape(file_path.name)}$",
                        sub_path=file_path.parent,
                        resource_name=resource_name,
                    )
                    self.patterns[resource_name] = pattern
                elif isinstance(resource_spec, tuple) and len(resource_spec) == 2:
                    # (config_key, table_name) tuple - create TablePattern
                    config_key, table_name = resource_spec
                    # Find the schema_name from the config
                    config = (
                        self._postgres_connections.get(config_key)
                        if self._postgres_connections
                        else None
                    )
                    schema_name = config.schema_name if config else None

                    pattern = TablePattern(
                        table_name=table_name,
                        schema_name=schema_name,
                        resource_name=resource_name,
                    )
                    self.patterns[resource_name] = pattern
                    # Store the config mapping
                    self.postgres_table_configs[resource_name] = (
                        config_key,
                        schema_name,
                        table_name,
                    )

        # Process explicit postgres_tables mapping
        if self._postgres_tables:
            for table_name, (
                config_key,
                schema_name,
                actual_table_name,
            ) in self._postgres_tables.items():
                pattern = TablePattern(
                    table_name=actual_table_name,
                    schema_name=schema_name,
                    resource_name=table_name,
                )
                self.patterns[table_name] = pattern
                self.postgres_table_configs[table_name] = (
                    config_key,
                    schema_name,
                    actual_table_name,
                )

    def add_file_pattern(self, name: str, file_pattern: FilePattern):
        """Add a file pattern to the collection.

        Args:
            name: Name of the pattern
            file_pattern: FilePattern instance
        """
        self.patterns[name] = file_pattern

    def add_table_pattern(self, name: str, table_pattern: TablePattern):
        """Add a table pattern to the collection.

        Args:
            name: Name of the pattern
            table_pattern: TablePattern instance
        """
        self.patterns[name] = table_pattern

    def get_postgres_config(self, resource_name: str) -> Any:
        """Get PostgreSQL connection config for a resource.

        Args:
            resource_name: Name of the resource

        Returns:
            PostgresConfig if resource is a PostgreSQL table, None otherwise
        """
        if resource_name in self.postgres_table_configs:
            config_key, schema_name, _ = self.postgres_table_configs[resource_name]
            return self.postgres_configs.get((config_key, schema_name))
        return None

    def get_resource_type(self, resource_name: str) -> ResourceType | None:
        """Get the resource type for a resource name.

        Args:
            resource_name: Name of the resource

        Returns:
            ResourceType enum value or None if not found
        """
        if resource_name in self.patterns:
            return self.patterns[resource_name].get_resource_type()
        return None

    def get_table_info(self, resource_name: str) -> tuple[str, str | None] | None:
        """Get table name and schema for a PostgreSQL table resource.

        Args:
            resource_name: Name of the resource

        Returns:
            Tuple of (table_name, schema_name) or None if not a table resource
        """
        if resource_name in self.postgres_table_configs:
            _, schema_name, table_name = self.postgres_table_configs[resource_name]
            return (table_name, schema_name)
        return None
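A sketch tying the pattern classes together, using hypothetical names, paths, and dates (and assuming BaseDataclass preserves the standard dataclass constructor):

# Hypothetical resources; demonstrates matching, WHERE-clause building,
# and Patterns construction from a resource mapping.
from graflo.util.onto import FilePattern, Patterns, ResourceType, TablePattern

fp = FilePattern(regex=r"^events_\d{4}\.csv$", resource_name="events")
assert fp.matches("events_2021.csv")
assert not fp.matches("events.csv")

tp = TablePattern(
    table_name="orders",
    schema_name="public",
    date_field="dt",
    date_range_start="2015-11-11",
    date_range_days=7,
    resource_name="orders",
)
assert tp.matches("orders")
print(tp.build_where_clause())
# "dt" >= '2015-11-11'::date AND "dt" < '2015-11-11'::date + INTERVAL '7 days'

pats = Patterns(_resource_mapping={"events": "data/events_2021.csv"})
assert pats.get_resource_type("events") == ResourceType.FILE

Note that build_where_clause interpolates date_field and the date values directly into the SQL string, so these should come from trusted configuration rather than end-user input.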