PyPI - graflo - Versions diffs - 1.3.7__py3-none-any.whl - Mend

graflo 1.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of graflo might be problematic. Click here for more details.

Files changed (70) hide show

graflo/README.md +18 -0
graflo/__init__.py +70 -0
graflo/architecture/__init__.py +38 -0
graflo/architecture/actor.py +1276 -0
graflo/architecture/actor_util.py +450 -0
graflo/architecture/edge.py +418 -0
graflo/architecture/onto.py +376 -0
graflo/architecture/onto_sql.py +54 -0
graflo/architecture/resource.py +163 -0
graflo/architecture/schema.py +135 -0
graflo/architecture/transform.py +292 -0
graflo/architecture/util.py +89 -0
graflo/architecture/vertex.py +562 -0
graflo/caster.py +736 -0
graflo/cli/__init__.py +14 -0
graflo/cli/ingest.py +203 -0
graflo/cli/manage_dbs.py +197 -0
graflo/cli/plot_schema.py +132 -0
graflo/cli/xml2json.py +93 -0
graflo/data_source/__init__.py +48 -0
graflo/data_source/api.py +339 -0
graflo/data_source/base.py +95 -0
graflo/data_source/factory.py +304 -0
graflo/data_source/file.py +148 -0
graflo/data_source/memory.py +70 -0
graflo/data_source/registry.py +82 -0
graflo/data_source/sql.py +183 -0
graflo/db/__init__.py +44 -0
graflo/db/arango/__init__.py +22 -0
graflo/db/arango/conn.py +1025 -0
graflo/db/arango/query.py +180 -0
graflo/db/arango/util.py +88 -0
graflo/db/conn.py +377 -0
graflo/db/connection/__init__.py +6 -0
graflo/db/connection/config_mapping.py +18 -0
graflo/db/connection/onto.py +717 -0
graflo/db/connection/wsgi.py +29 -0
graflo/db/manager.py +119 -0
graflo/db/neo4j/__init__.py +16 -0
graflo/db/neo4j/conn.py +639 -0
graflo/db/postgres/__init__.py +37 -0
graflo/db/postgres/conn.py +948 -0
graflo/db/postgres/fuzzy_matcher.py +281 -0
graflo/db/postgres/heuristics.py +133 -0
graflo/db/postgres/inference_utils.py +428 -0
graflo/db/postgres/resource_mapping.py +273 -0
graflo/db/postgres/schema_inference.py +372 -0
graflo/db/postgres/types.py +148 -0
graflo/db/postgres/util.py +87 -0
graflo/db/tigergraph/__init__.py +9 -0
graflo/db/tigergraph/conn.py +2365 -0
graflo/db/tigergraph/onto.py +26 -0
graflo/db/util.py +49 -0
graflo/filter/__init__.py +21 -0
graflo/filter/onto.py +525 -0
graflo/logging.conf +22 -0
graflo/onto.py +312 -0
graflo/plot/__init__.py +17 -0
graflo/plot/plotter.py +616 -0
graflo/util/__init__.py +23 -0
graflo/util/chunker.py +807 -0
graflo/util/merge.py +150 -0
graflo/util/misc.py +37 -0
graflo/util/onto.py +422 -0
graflo/util/transform.py +454 -0
graflo-1.3.7.dist-info/METADATA +243 -0
graflo-1.3.7.dist-info/RECORD +70 -0
graflo-1.3.7.dist-info/WHEEL +4 -0
graflo-1.3.7.dist-info/entry_points.txt +5 -0
graflo-1.3.7.dist-info/licenses/LICENSE +126 -0

graflo/architecture/onto.py ADDED Viewed

@@ -0,0 +1,376 @@
+"""Core ontology and data structures for graph database operations.
+This module defines the fundamental data structures and types used throughout the graflo
+package for working with graph databases. It provides:
+- Core data types for vertices and edges
+- Database index configurations
+- Graph container implementations
+- Edge mapping and casting utilities
+- Action context for graph transformations
+The module is designed to be database-agnostic, supporting both ArangoDB and Neo4j through
+the DBFlavor enum. It provides a unified interface for working with graph data structures
+while allowing for database-specific optimizations and features.
+Key Components:
+    - EdgeMapping: Defines how edges are mapped between vertices
+    - IndexType: Supported database index types
+    - EdgeType: Types of edge handling in the graph database
+    - GraphContainer: Main container for graph data
+    - ActionContext: Context for graph transformation operations
+Example:
+    >>> container = GraphContainer(vertices={}, edges={}, linear=[])
+    >>> index = Index(fields=["name", "age"], type=IndexType.PERSISTENT)
+    >>> context = ActionContext()
+"""
+from __future__ import annotations
+import dataclasses
+import logging
+from abc import ABCMeta
+from collections import defaultdict
+from typing import Any, TypeAlias
+from dataclass_wizard import JSONWizard, YAMLWizard
+from graflo.onto import BaseDataclass, BaseEnum, DBFlavor
+from graflo.util.transform import pick_unique_dict
+# Key types used to index graph data.
+# EdgeId is a 3-tuple (source vertex name, target vertex name, optional third
+# component that may be None).
+EdgeId: TypeAlias = tuple[str, str, str | None]
+# A graph entity is addressed either by a vertex name (str) or by an EdgeId.
+GraphEntity: TypeAlias = str | EdgeId
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+class EdgeMapping(BaseEnum):
+    """Defines how edges are mapped between vertices.
+    Members:
+        ALL: Maps all vertices to all vertices (value ``"all"``)
+        ONE_N: Maps one vertex to many vertices (value ``"1-n"``)
+    """
+    ALL = "all"
+    ONE_N = "1-n"
+class EncodingType(BaseEnum):
+    """Supported character encodings for data input/output.
+    Members:
+        ISO_8859: value ``"ISO-8859-1"``
+        UTF_8: value ``"utf-8"``
+    """
+    ISO_8859 = "ISO-8859-1"
+    UTF_8 = "utf-8"
+class IndexType(BaseEnum):
+    """Types of database indexes supported.
+    Members:
+        PERSISTENT: Standard persistent index (value ``"persistent"``)
+        HASH: Hash-based index for fast lookups (value ``"hash"``)
+        SKIPLIST: Sorted index using skip list data structure (value ``"skiplist"``)
+        FULLTEXT: Index optimized for text search (value ``"fulltext"``)
+    """
+    PERSISTENT = "persistent"
+    HASH = "hash"
+    SKIPLIST = "skiplist"
+    FULLTEXT = "fulltext"
+class EdgeType(BaseEnum):
+    """Defines how edges are handled in the graph database.
+    Members:
+        INDIRECT: Defined as a collection with indexes, may be used after data
+            ingestion (value ``"indirect"``)
+        DIRECT: In addition to indexes, these edges are generated during
+            ingestion (value ``"direct"``)
+    """
+    INDIRECT = "indirect"
+    DIRECT = "direct"
+@dataclasses.dataclass
+class ABCFields(BaseDataclass, metaclass=ABCMeta):
+    """Abstract base class for entities that have fields.
+    Attributes:
+        name: Optional name of the entity
+        fields: List of field names
+    """
+    name: str | None = None
+    fields: list[str] = dataclasses.field(default_factory=list)
+    keep_vertex_name: bool = True
+    def cfield(self, x: str) -> str:
+        """Creates a composite field name by combining the entity name with a field name.
+        Args:
+            x: Field name to combine with entity name
+        Returns:
+            Composite field name in format "entity@field"
+        """
+        return f"{self.name}@{x}" if self.keep_vertex_name else x
+@dataclasses.dataclass
+class Weight(ABCFields):
+    """Defines weight configuration for edges.
+    Inherits ``name``, ``fields`` and ``keep_vertex_name`` from ``ABCFields``.
+    Attributes:
+        map: Dictionary mapping field values to weights (empty by default)
+        filter: Dictionary of filter conditions for weights (empty by default)
+    """
+    map: dict = dataclasses.field(default_factory=dict)
+    filter: dict = dataclasses.field(default_factory=dict)
+@dataclasses.dataclass
+class Index(BaseDataclass):
+    """Configuration for database indexes.
+    Attributes:
+        name: Optional name of the index
+        fields: List of fields to index
+        unique: Whether the index enforces uniqueness
+        type: Type of index to create
+        deduplicate: Whether to deduplicate index entries
+        sparse: Whether to create a sparse index
+        exclude_edge_endpoints: Whether to exclude edge endpoints from index
+    """
+    name: str | None = None
+    fields: list[str] = dataclasses.field(default_factory=list)
+    unique: bool = True
+    type: IndexType = IndexType.PERSISTENT
+    deduplicate: bool = True
+    sparse: bool = False
+    exclude_edge_endpoints: bool = False
+    def __iter__(self):
+        """Iterate over the indexed fields."""
+        return iter(self.fields)
+    def db_form(self, db_type: DBFlavor) -> dict:
+        """Convert index configuration to database-specific format.
+        Args:
+            db_type: Type of database (ARANGO or NEO4J)
+        Returns:
+            Dictionary of index configuration in database-specific format
+        Raises:
+            ValueError: If db_type is not supported
+        """
+        r = self.to_dict()
+        if db_type == DBFlavor.ARANGO:
+            _ = r.pop("name")
+            _ = r.pop("exclude_edge_endpoints")
+        elif db_type == DBFlavor.NEO4J:
+            pass
+        else:
+            raise ValueError(f"Unknown db_type {db_type}")
+        return r
+class ItemsView:
+    """View class for iterating over vertices and edges in a GraphContainer."""
+    def __init__(self, gc: GraphContainer):
+        self._dictlike = gc
+    def __iter__(self):
+        """Iterate over vertices and edges in the container."""
+        for key in self._dictlike.vertices:
+            yield key, self._dictlike.vertices[key]
+        for key in self._dictlike.edges:
+            yield key, self._dictlike.edges[key]
+@dataclasses.dataclass
+class GraphContainer(BaseDataclass):
+    """Container for graph data including vertices and edges.
+    Attributes:
+        vertices: Dictionary mapping vertex names to lists of vertex data
+        edges: Dictionary mapping edge IDs to lists of edge data
+        linear: List of default dictionaries containing linear data
+    """
+    vertices: dict[str, list]
+    edges: dict[tuple[str, str, str | None], list]
+    linear: list[defaultdict[str | tuple[str, str, str | None], list[Any]]]
+    def __post_init__(self):
+        pass
+    def items(self):
+        """Get an ItemsView of the container's contents."""
+        return ItemsView(self)
+    def pick_unique(self):
+        """Remove duplicate entries from vertices and edges."""
+        for k, v in self.vertices.items():
+            self.vertices[k] = pick_unique_dict(v)
+        for k, v in self.edges.items():
+            self.edges[k] = pick_unique_dict(v)
+    def loop_over_relations(self, edge_def: tuple[str, str, str | None]):
+        """Iterate over edges matching the given edge definition.
+        Args:
+            edge_def: Tuple of (source, target, optional_purpose)
+        Returns:
+            Generator yielding matching edge IDs
+        """
+        source, target, _ = edge_def
+        return (ed for ed in self.edges if source == ed[0] and target == ed[1])
+    @classmethod
+    def from_docs_list(
+        cls, list_default_dicts: list[defaultdict[GraphEntity, list]]
+    ) -> GraphContainer:
+        """Create a GraphContainer from a list of default dictionaries.
+        Args:
+            list_default_dicts: List of default dictionaries containing vertex and edge data
+        Returns:
+            New GraphContainer instance
+        Raises:
+            AssertionError: If edge IDs are not properly formatted
+        """
+        vdict: defaultdict[str, list] = defaultdict(list)
+        edict: defaultdict[tuple[str, str, str | None], list] = defaultdict(list)
+        for d in list_default_dicts:
+            for k, v in d.items():
+                if isinstance(k, str):
+                    vdict[k].extend(v)
+                elif isinstance(k, tuple):
+                    assert (
+                        len(k) == 3
+                        and all(isinstance(item, str) for item in k[:-1])
+                        and isinstance(k[-1], (str, type(None)))
+                    )
+                    edict[k].extend(v)
+        return GraphContainer(
+            vertices=dict(vdict.items()),
+            edges=dict(edict.items()),
+            linear=list_default_dicts,
+        )
+class EdgeCastingType(BaseEnum):
+    """Types of edge casting supported.
+    Members:
+        PAIR: Edges are cast as pairs of vertices (value ``"pair"``)
+        PRODUCT: Edges are cast as combinations of vertex sets (value ``"product"``)
+        COMBINATIONS: value ``"combinations"``; NOTE(review): semantics are not
+            documented here -- confirm against the code that consumes this enum
+    """
+    PAIR = "pair"
+    PRODUCT = "product"
+    COMBINATIONS = "combinations"
+def inner_factory_vertex() -> defaultdict[LocationIndex, list]:
+    """Create a default dictionary for vertex data."""
+    return defaultdict(list)
+def outer_factory() -> defaultdict[str, defaultdict[LocationIndex, list]]:
+    """Create a nested default dictionary for vertex data."""
+    return defaultdict(inner_factory_vertex)
+def dd_factory() -> defaultdict[GraphEntity, list]:
+    """Create a default dictionary for graph entity data."""
+    return defaultdict(list)
+@dataclasses.dataclass(kw_only=True)
+class VertexRep(BaseDataclass):
+    """A vertex document paired with its context.
+    Both fields are required and keyword-only.
+    Attributes:
+        vertex: doc representing a vertex
+        ctx: context (for edge definition upstream)
+    """
+    vertex: dict
+    ctx: dict
+@dataclasses.dataclass(frozen=True, eq=True)
+class LocationIndex(JSONWizard, YAMLWizard):
+    path: tuple[str | int | None, ...] = dataclasses.field(default_factory=tuple)
+    def extend(self, extension: tuple[str | int | None, ...]) -> LocationIndex:
+        return LocationIndex((*self.path, *extension))
+    def depth(self):
+        return len(self.path)
+    def congruence_measure(self, other: LocationIndex):
+        neq_position = 0
+        for step_a, step_b in zip(self.path, other.path):
+            if step_a != step_b:
+                break
+            neq_position += 1
+        return neq_position
+    def filter(self, lindex_list: list[LocationIndex]) -> list[LocationIndex]:
+        return [
+            t
+            for t in lindex_list
+            if t.depth() >= self.depth() and t.path[: self.depth()] == self.path
+        ]
+    def __lt__(self, other: LocationIndex):
+        return len(self.path) < len(other.path)
+    def __contains__(self, item):
+        return item in self.path
+    def __len__(self):
+        return len(self.path)
+    def __iter__(self):
+        return iter(self.path)
+    def __getitem__(self, item):
+        return self.path[item]
+@dataclasses.dataclass(kw_only=True)
+class ActionContext(BaseDataclass):
+    """Context for graph transformation actions.
+    All fields are keyword-only and default to fresh, empty containers, so
+    ``ActionContext()`` yields an empty context.
+    Attributes:
+        acc_vertex: Local accumulation of vertices (two-level default dict)
+        acc_global: Global accumulation of graph entities
+        buffer_vertex: Buffer for vertex data
+        buffer_transforms: Buffer for transforms data
+        target_vertices: Set of target vertex names indicating user intention
+    """
+    acc_vertex: defaultdict[str, defaultdict[LocationIndex, list]] = dataclasses.field(
+        default_factory=outer_factory
+    )
+    acc_global: defaultdict[GraphEntity, list] = dataclasses.field(
+        default_factory=dd_factory
+    )
+    buffer_vertex: defaultdict[GraphEntity, list] = dataclasses.field(
+        default_factory=lambda: defaultdict(list)
+    )
+    buffer_transforms: defaultdict[LocationIndex, list[dict]] = dataclasses.field(
+        default_factory=lambda: defaultdict(list)
+    )
+    target_vertices: set[str] = dataclasses.field(default_factory=set)

graflo/architecture/onto_sql.py ADDED Viewed

@@ -0,0 +1,54 @@
+from pydantic import BaseModel
+class ColumnInfo(BaseModel):
+    """Column information from PostgreSQL table.
+    Attributes:
+        name: Column name (required)
+        type: Column type as a string (required)
+        description: Description text; defaults to the empty string
+        is_nullable: String flag, defaults to "YES" (presumably mirrors the
+            PostgreSQL information_schema value -- TODO confirm)
+        column_default: Column default as a string, or None
+        is_pk: Defaults to False (presumably marks primary-key columns --
+            TODO confirm)
+    """
+    name: str
+    type: str
+    description: str = ""
+    is_nullable: str = "YES"
+    column_default: str | None = None
+    is_pk: bool = False
+class ForeignKeyInfo(BaseModel):
+    """Foreign key relationship information.
+    Attributes:
+        column: Name of the referencing column (required)
+        references_table: Name of the referenced table (required)
+        references_column: Name of the referenced column, or None
+        constraint_name: Name of the constraint, or None
+    """
+    column: str
+    references_table: str
+    references_column: str | None = None
+    constraint_name: str | None = None
+class VertexTableInfo(BaseModel):
+    """Vertex table information from schema introspection.
+    All fields are required.
+    Attributes:
+        name: Table name
+        schema_name: Name of the schema
+        columns: Column descriptions as ``ColumnInfo`` items
+        primary_key: Primary key as a list of strings (presumably column
+            names -- TODO confirm)
+        foreign_keys: Foreign keys as ``ForeignKeyInfo`` items
+    """
+    name: str
+    schema_name: str
+    columns: list[ColumnInfo]
+    primary_key: list[str]
+    foreign_keys: list[ForeignKeyInfo]
+class EdgeTableInfo(BaseModel):
+    """Edge table information from schema introspection.
+    Every field except ``relation`` is required.
+    Attributes:
+        name: Table name
+        schema_name: Name of the schema
+        columns: Column descriptions as ``ColumnInfo`` items
+        primary_key: Primary key as a list of strings (presumably column
+            names -- TODO confirm)
+        foreign_keys: Foreign keys as ``ForeignKeyInfo`` items
+        source_table: Name of the source table
+        target_table: Name of the target table
+        source_column: Name of the source column
+        target_column: Name of the target column
+        relation: Optional relation string; defaults to None
+    """
+    name: str
+    schema_name: str
+    columns: list[ColumnInfo]
+    primary_key: list[str]
+    foreign_keys: list[ForeignKeyInfo]
+    source_table: str
+    target_table: str
+    source_column: str
+    target_column: str
+    relation: str | None = None
+class SchemaIntrospectionResult(BaseModel):
+    """Result of PostgreSQL schema introspection.
+    All fields are required.
+    Attributes:
+        vertex_tables: Vertex tables as ``VertexTableInfo`` items
+        edge_tables: Edge tables as ``EdgeTableInfo`` items
+        schema_name: Name of the schema
+    """
+    vertex_tables: list[VertexTableInfo]
+    edge_tables: list[EdgeTableInfo]
+    schema_name: str

graflo/architecture/resource.py ADDED Viewed

@@ -0,0 +1,163 @@
+"""Resource management and processing for graph databases.
+This module provides the core resource handling functionality for graph databases.
+It defines how data resources are processed, transformed, and mapped to graph
+structures through a system of actors and transformations.
+Key Components:
+    - Resource: Main class for resource processing and transformation
+    - ActorWrapper: Wrapper for processing actors
+    - ActionContext: Context for processing actions
+The resource system allows for:
+    - Data encoding and transformation
+    - Vertex and edge creation
+    - Weight management
+    - Collection merging
+    - Type casting and validation
+Example:
+    >>> resource = Resource(
+    ...     resource_name="users",
+    ...     apply=[VertexActor("user"), EdgeActor("follows")],
+    ...     encoding=EncodingType.UTF_8
+    ... )
+    >>> result = resource(doc)
+"""
+import dataclasses
+import logging
+from collections import defaultdict
+from typing import Callable
+from dataclass_wizard import JSONWizard
+from graflo.architecture.actor import (
+    ActorWrapper,
+)
+from graflo.architecture.edge import Edge, EdgeConfig
+from graflo.architecture.onto import (
+    ActionContext,
+    EncodingType,
+    GraphEntity,
+)
+from graflo.architecture.transform import ProtoTransform
+from graflo.architecture.vertex import (
+    VertexConfig,
+)
+from graflo.onto import BaseDataclass
+logger = logging.getLogger(__name__)
+@dataclasses.dataclass(kw_only=True)
+class Resource(BaseDataclass, JSONWizard):
+    """Resource configuration and processing.
+    This class represents a data resource that can be processed and transformed
+    into graph structures. It manages the processing pipeline through actors
+    and handles data encoding, transformation, and mapping.
+    Attributes:
+        resource_name: Name of the resource
+        apply: List of actors to apply in sequence
+        encoding: Data encoding type (default: UTF_8)
+        merge_collections: List of collections to merge
+        extra_weights: List of additional edge weights
+        types: Dictionary of field type mappings
+        root: Root actor wrapper for processing
+        vertex_config: Configuration for vertices
+        edge_config: Configuration for edges
+    """
+    resource_name: str
+    apply: list
+    encoding: EncodingType = EncodingType.UTF_8
+    merge_collections: list[str] = dataclasses.field(default_factory=list)
+    extra_weights: list[Edge] = dataclasses.field(default_factory=list)
+    types: dict[str, str] = dataclasses.field(default_factory=dict)
+    edge_greedy: bool = True
+    def __post_init__(self):
+        """Initialize the resource after dataclass initialization.
+        Sets up the actor wrapper and type mappings. Evaluates type expressions
+        for field type casting.
+        Raises:
+            Exception: If type evaluation fails for any field
+        """
+        self.root = ActorWrapper(*self.apply)
+        self._types: dict[str, Callable] = dict()
+        self.vertex_config: VertexConfig
+        self.edge_config: EdgeConfig
+        for k, v in self.types.items():
+            try:
+                self._types[k] = eval(v)
+            except Exception as ex:
+                logger.error(
+                    f"For resource {self.name} for field {k} failed to cast type {v} : {ex}"
+                )
+    @property
+    def name(self):
+        """Get the resource name.
+        Returns:
+            str: Name of the resource
+        """
+        return self.resource_name
+    def finish_init(
+        self,
+        vertex_config: VertexConfig,
+        edge_config: EdgeConfig,
+        transforms: dict[str, ProtoTransform],
+    ):
+        """Complete resource initialization.
+        Initializes the resource with vertex and edge configurations,
+        and sets up the processing pipeline.
+        Args:
+            vertex_config: Configuration for vertices
+            edge_config: Configuration for edges
+            transforms: Dictionary of available transforms
+        """
+        self.vertex_config = vertex_config
+        self.edge_config = edge_config
+        logger.debug(f"total resource actor count : {self.root.count()}")
+        self.root.finish_init(
+            vertex_config=vertex_config,
+            transforms=transforms,
+            edge_config=edge_config,
+            edge_greedy=self.edge_greedy,
+        )
+        logger.debug(f"total resource actor count (after 2 finit): {self.root.count()}")
+        for e in self.extra_weights:
+            e.finish_init(vertex_config)
+    def __call__(self, doc: dict) -> defaultdict[GraphEntity, list]:
+        """Process a document through the resource pipeline.
+        Args:
+            doc: Document to process
+        Returns:
+            defaultdict[GraphEntity, list]: Processed graph entities
+        """
+        ctx = ActionContext()
+        ctx = self.root(ctx, doc=doc)
+        acc = self.root.normalize_ctx(ctx)
+        return acc
+    def count(self):
+        """Get the total number of actors in the resource.
+        Returns:
+            int: Number of actors
+        """
+        return self.root.count()