graflo-1.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of graflo might be problematic.
- graflo/README.md +18 -0
- graflo/__init__.py +39 -0
- graflo/architecture/__init__.py +37 -0
- graflo/architecture/actor.py +974 -0
- graflo/architecture/actor_util.py +425 -0
- graflo/architecture/edge.py +295 -0
- graflo/architecture/onto.py +374 -0
- graflo/architecture/resource.py +161 -0
- graflo/architecture/schema.py +136 -0
- graflo/architecture/transform.py +292 -0
- graflo/architecture/util.py +93 -0
- graflo/architecture/vertex.py +277 -0
- graflo/caster.py +409 -0
- graflo/cli/__init__.py +14 -0
- graflo/cli/ingest.py +144 -0
- graflo/cli/manage_dbs.py +193 -0
- graflo/cli/plot_schema.py +132 -0
- graflo/cli/xml2json.py +93 -0
- graflo/db/__init__.py +32 -0
- graflo/db/arango/__init__.py +16 -0
- graflo/db/arango/conn.py +734 -0
- graflo/db/arango/query.py +180 -0
- graflo/db/arango/util.py +88 -0
- graflo/db/connection.py +304 -0
- graflo/db/manager.py +104 -0
- graflo/db/neo4j/__init__.py +16 -0
- graflo/db/neo4j/conn.py +432 -0
- graflo/db/util.py +49 -0
- graflo/filter/__init__.py +21 -0
- graflo/filter/onto.py +400 -0
- graflo/logging.conf +22 -0
- graflo/onto.py +186 -0
- graflo/plot/__init__.py +17 -0
- graflo/plot/plotter.py +556 -0
- graflo/util/__init__.py +23 -0
- graflo/util/chunker.py +739 -0
- graflo/util/merge.py +148 -0
- graflo/util/misc.py +37 -0
- graflo/util/onto.py +63 -0
- graflo/util/transform.py +406 -0
- graflo-1.1.0.dist-info/METADATA +157 -0
- graflo-1.1.0.dist-info/RECORD +45 -0
- graflo-1.1.0.dist-info/WHEEL +4 -0
- graflo-1.1.0.dist-info/entry_points.txt +5 -0
- graflo-1.1.0.dist-info/licenses/LICENSE +126 -0
graflo/architecture/vertex.py
ADDED
@@ -0,0 +1,277 @@
"""Vertex configuration and management for graph databases.

This module provides classes and utilities for managing vertices in graph databases.
It handles vertex configuration, field management, indexing, and filtering operations.
The module supports both ArangoDB and Neo4j through the DBFlavor enum.

Key Components:
    - Vertex: Represents a vertex with its fields and indexes
    - VertexConfig: Manages collections of vertices and their configurations

Example:
    >>> vertex = Vertex(name="user", fields=["id", "name"])
    >>> config = VertexConfig(vertices=[vertex])
    >>> fields = config.fields("user", with_aux=True)
"""

import dataclasses
import logging
from typing import Optional

from graflo.architecture.onto import Index
from graflo.filter.onto import Expression
from graflo.onto import BaseDataclass, DBFlavor

logger = logging.getLogger(__name__)


@dataclasses.dataclass
class Vertex(BaseDataclass):
    """Represents a vertex in the graph database.

    A vertex is a fundamental unit in the graph that can have fields, indexes,
    and filters

    Attributes:
        name: Name of the vertex
        fields: List of field names
        fields_aux: List of auxiliary field names for weight passing
        indexes: List of indexes for the vertex
        filters: List of filter expressions
        dbname: Optional database name (defaults to vertex name)
    """

    name: str
    fields: list[str]
    fields_aux: list[str] = dataclasses.field(
        default_factory=list
    )  # temporary field necessary to pass weights to edges
    indexes: list[Index] = dataclasses.field(default_factory=list)
    filters: list[Expression] = dataclasses.field(default_factory=list)
    dbname: Optional[str] = None

    @property
    def fields_all(self):
        """Get all fields including auxiliary fields.

        Returns:
            list[str]: Combined list of regular and auxiliary fields
        """
        return self.fields + self.fields_aux

    def __post_init__(self):
        """Initialize the vertex after dataclass initialization.

        Sets the database name if not provided and updates fields based on indexes.
        """
        if self.dbname is None:
            self.dbname = self.name
        union_fields = set(self.fields)
        if not self.indexes:
            self.indexes = [Index(fields=self.fields)]
        for ei in self.indexes:
            union_fields |= set(ei.fields)
        self.fields = list(union_fields)

    def update_aux_fields(self, fields_aux: list):
        """Update auxiliary fields.

        Args:
            fields_aux: List of new auxiliary fields to add

        Returns:
            Vertex: Self for method chaining
        """
        self.fields_aux = list(set(self.fields_aux) | set(fields_aux))
        return self


@dataclasses.dataclass
class VertexConfig(BaseDataclass):
    """Configuration for managing collections of vertices.

    This class manages a collection of vertices, providing methods for accessing
    and manipulating vertex configurations.

    Attributes:
        vertices: List of vertex configurations
        blank_vertices: List of blank vertex names
        force_types: Dictionary mapping vertex names to type lists
        db_flavor: Database flavor (ARANGO or NEO4J)
    """

    vertices: list[Vertex]
    blank_vertices: list[str] = dataclasses.field(default_factory=list)
    force_types: dict[str, list] = dataclasses.field(default_factory=dict)
    db_flavor: DBFlavor = DBFlavor.ARANGO

    def __post_init__(self):
        """Initialize the vertex configuration.

        Creates internal mappings and validates blank vertices.

        Raises:
            ValueError: If blank vertices are not defined in the configuration
        """
        self._vertices_map: dict[str, Vertex] = {
            item.name: item for item in self.vertices
        }

        # TODO replace by types
        # vertex_collection_name -> [numeric fields]
        self._vcollection_numeric_fields_map = {}

        if set(self.blank_vertices) - set(self.vertex_set):
            raise ValueError(
                f" Blank collections {self.blank_vertices} are not defined"
                " as vertex collections"
            )

    @property
    def vertex_set(self):
        """Get set of vertex names.

        Returns:
            set[str]: Set of vertex names
        """
        return set(self._vertices_map.keys())

    @property
    def vertex_list(self):
        """Get list of vertex configurations.

        Returns:
            list[Vertex]: List of vertex configurations
        """
        return list(self._vertices_map.values())

    def vertex_dbname(self, vertex_name):
        """Get database name for a vertex.

        Args:
            vertex_name: Name of the vertex

        Returns:
            str: Database name for the vertex

        Raises:
            KeyError: If vertex is not found
        """
        try:
            value = self._vertices_map[vertex_name].dbname
        except KeyError as e:
            logger.error(
                "Available vertex collections :"
                f" {self._vertices_map.keys()}; vertex collection"
                f" requested : {vertex_name}"
            )
            raise e
        return value

    def index(self, vertex_name) -> Index:
        """Get primary index for a vertex.

        Args:
            vertex_name: Name of the vertex

        Returns:
            Index: Primary index for the vertex
        """
        return self._vertices_map[vertex_name].indexes[0]

    def indexes(self, vertex_name) -> list[Index]:
        """Get all indexes for a vertex.

        Args:
            vertex_name: Name of the vertex

        Returns:
            list[Index]: List of indexes for the vertex
        """
        return self._vertices_map[vertex_name].indexes

    def fields(self, vertex_name: str, with_aux=False):
        """Get fields for a vertex.

        Args:
            vertex_name: Name of the vertex
            with_aux: Whether to include auxiliary fields

        Returns:
            list[str]: List of fields
        """
        if with_aux:
            return self._vertices_map[vertex_name].fields_all
        else:
            return self._vertices_map[vertex_name].fields

    def numeric_fields_list(self, vertex_name):
        """Get list of numeric fields for a vertex.

        Args:
            vertex_name: Name of the vertex

        Returns:
            tuple: Tuple of numeric field names

        Raises:
            ValueError: If vertex is not defined in config
        """
        if vertex_name in self.vertex_set:
            if vertex_name in self._vcollection_numeric_fields_map:
                return self._vcollection_numeric_fields_map[vertex_name]
            else:
                return ()
        else:
            raise ValueError(
                " Accessing vertex collection numeric fields: vertex"
                f" collection {vertex_name} was not defined in config"
            )

    def filters(self, vertex_name) -> list[Expression]:
        """Get filter expressions for a vertex.

        Args:
            vertex_name: Name of the vertex

        Returns:
            list[Expression]: List of filter expressions
        """
        if vertex_name in self._vertices_map:
            return self._vertices_map[vertex_name].filters
        else:
            return []

    def update_vertex(self, v: Vertex):
        """Update vertex configuration.

        Args:
            v: Vertex configuration to update
        """
        self._vertices_map[v.name] = v

    def __getitem__(self, key: str):
        """Get vertex configuration by name.

        Args:
            key: Vertex name

        Returns:
            Vertex: Vertex configuration

        Raises:
            KeyError: If vertex is not found
        """
        if key in self._vertices_map:
            return self._vertices_map[key]
        else:
            raise KeyError(f"Vertex {key} absent")

    def __setitem__(self, key: str, value: Vertex):
        """Set vertex configuration by name.

        Args:
            key: Vertex name
            value: Vertex configuration
        """
        self._vertices_map[key] = value
graflo/caster.py
ADDED
@@ -0,0 +1,409 @@
"""Data casting and ingestion system for graph databases.

This module provides functionality for casting and ingesting data into graph databases.
It handles batch processing, file discovery, and database operations for both ArangoDB
and Neo4j.

Key Components:
    - Caster: Main class for data casting and ingestion
    - FilePattern: Pattern matching for file discovery
    - Patterns: Collection of file patterns for different resources

Example:
    >>> caster = Caster(schema=schema)
    >>> caster.ingest_files(path="data/", conn_conf=db_config)
"""

import logging
import multiprocessing as mp
import queue
import re
import sys
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from pathlib import Path
from typing import cast

import pandas as pd
from suthing import ConnectionKind, DBConnectionConfig, Timer

from graflo.architecture.onto import GraphContainer
from graflo.architecture.schema import Schema
from graflo.db import ConnectionManager
from graflo.util.chunker import ChunkerFactory
from graflo.util.onto import FilePattern, Patterns

logger = logging.getLogger(__name__)


class Caster:
    """Main class for data casting and ingestion.

    This class handles the process of casting data into graph structures and
    ingesting them into the database. It supports batch processing, parallel
    execution, and various data formats.

    Attributes:
        clean_start: Whether to clean the database before ingestion
        n_cores: Number of CPU cores to use for parallel processing
        max_items: Maximum number of items to process
        batch_size: Size of batches for processing
        n_threads: Number of threads for parallel processing
        dry: Whether to perform a dry run (no database changes)
        schema: Schema configuration for the graph
    """

    def __init__(self, schema: Schema, **kwargs):
        """Initialize the caster with schema and configuration.

        Args:
            schema: Schema configuration for the graph
            **kwargs: Additional configuration options:
                - clean_start: Whether to clean the database before ingestion
                - n_cores: Number of CPU cores to use
                - max_items: Maximum number of items to process
                - batch_size: Size of batches for processing
                - n_threads: Number of threads for parallel processing
                - dry: Whether to perform a dry run
        """
        self.clean_start: bool = False
        self.n_cores = kwargs.pop("n_cores", 1)
        self.max_items = kwargs.pop("max_items", None)
        self.batch_size = kwargs.pop("batch_size", 10000)
        self.n_threads = kwargs.pop("n_threads", 1)
        self.dry = kwargs.pop("dry", False)
        self.schema = schema

    @staticmethod
    def discover_files(
        fpath: Path | str, pattern: FilePattern, limit_files=None
    ) -> list[Path]:
        """Discover files matching a pattern in a directory.

        Args:
            fpath: Path to search in
            pattern: Pattern to match files against
            limit_files: Optional limit on number of files to return

        Returns:
            list[Path]: List of matching file paths

        Raises:
            AssertionError: If pattern.sub_path is None
        """
        assert pattern.sub_path is not None
        if isinstance(fpath, str):
            fpath_pathlib = Path(fpath)
        else:
            fpath_pathlib = fpath

        files = [
            f
            for f in (fpath_pathlib / pattern.sub_path).iterdir()
            if f.is_file()
            and (
                True
                if pattern.regex is None
                else re.search(pattern.regex, f.name) is not None
            )
        ]

        if limit_files is not None:
            files = files[:limit_files]

        return files

    def cast_normal_resource(
        self, data, resource_name: str | None = None
    ) -> GraphContainer:
        """Cast data into a graph container using a resource.

        Args:
            data: Data to cast
            resource_name: Optional name of the resource to use

        Returns:
            GraphContainer: Container with cast graph data
        """
        rr = self.schema.fetch_resource(resource_name)

        with ThreadPoolExecutor(max_workers=self.n_threads) as executor:
            docs = list(
                executor.map(
                    lambda doc: rr(doc),
                    data,
                )
            )

        graph = GraphContainer.from_docs_list(docs)
        return graph

    def process_batch(
        self,
        batch,
        resource_name: str | None,
        conn_conf: None | DBConnectionConfig = None,
    ):
        """Process a batch of data.

        Args:
            batch: Batch of data to process
            resource_name: Optional name of the resource to use
            conn_conf: Optional database connection configuration
        """
        gc = self.cast_normal_resource(batch, resource_name=resource_name)

        if conn_conf is not None:
            self.push_db(gc, conn_conf, resource_name=resource_name)

    def process_resource(
        self,
        resource_instance: Path,
        resource_name: str | None,
        conn_conf: None | DBConnectionConfig = None,
        **kwargs,
    ):
        """Process a resource instance.

        Args:
            resource_instance: Path to the resource file
            resource_name: Optional name of the resource
            conn_conf: Optional database connection configuration
        """
        chunker = ChunkerFactory.create_chunker(
            resource=resource_instance,
            batch_size=self.batch_size,
            limit=self.max_items,
            **kwargs,
        )
        for batch in chunker:
            self.process_batch(batch, resource_name=resource_name, conn_conf=conn_conf)

    def push_db(
        self,
        gc: GraphContainer,
        conn_conf: DBConnectionConfig,
        resource_name: str | None,
    ):
        """Push graph container data to the database.

        Args:
            gc: Graph container with data to push
            conn_conf: Database connection configuration
            resource_name: Optional name of the resource
        """
        vc = self.schema.vertex_config
        resource = self.schema.fetch_resource(resource_name)
        with ConnectionManager(connection_config=conn_conf) as db_client:
            for vcol, data in gc.vertices.items():
                # blank nodes: push and get back their keys {"_key": ...}
                if vcol in vc.blank_vertices:
                    query0 = db_client.insert_return_batch(data, vc.vertex_dbname(vcol))
                    cursor = db_client.execute(query0)
                    gc.vertices[vcol] = [item for item in cursor]
                else:
                    db_client.upsert_docs_batch(
                        data,
                        vc.vertex_dbname(vcol),
                        vc.index(vcol),
                        update_keys="doc",
                        filter_uniques=True,
                        dry=self.dry,
                    )

        # update edge misc with blank node edges
        for vcol in vc.blank_vertices:
            for edge_id, edge in self.schema.edge_config.edges_items():
                vfrom, vto, relation = edge_id
                if vcol == vfrom or vcol == vto:
                    if edge_id not in gc.edges:
                        gc.edges[edge_id] = []
                    gc.edges[edge_id].extend(
                        [
                            (x, y, {})
                            for x, y in zip(gc.vertices[vfrom], gc.vertices[vto])
                        ]
                    )

        with ConnectionManager(connection_config=conn_conf) as db_client:
            # currently works only on item level
            for edge in resource.extra_weights:
                if edge.weights is None:
                    continue
                for weight in edge.weights.vertices:
                    if weight.name in vc.vertex_set:
                        index_fields = vc.index(weight.name)

                        if not self.dry and weight.name in gc.vertices:
                            weights_per_item = db_client.fetch_present_documents(
                                class_name=vc.vertex_dbname(weight.name),
                                batch=gc.vertices[weight.name],
                                match_keys=index_fields.fields,
                                keep_keys=weight.fields,
                            )

                            for j, item in enumerate(gc.linear):
                                weights = weights_per_item[j]

                                for ee in item[edge.edge_id]:
                                    weight_collection_attached = {
                                        weight.cfield(k): v
                                        for k, v in weights[0].items()
                                    }
                                    ee.update(weight_collection_attached)
                    else:
                        logger.error(f"{weight.name} not a valid vertex")

        with ConnectionManager(connection_config=conn_conf) as db_client:
            for edge_id, edge in self.schema.edge_config.edges_items():
                for ee in gc.loop_over_relations(edge_id):
                    _, _, relation = ee
                    if not self.dry:
                        data = gc.edges[ee]
                        db_client.insert_edges_batch(
                            docs_edges=data,
                            source_class=vc.vertex_dbname(edge.source),
                            target_class=vc.vertex_dbname(edge.target),
                            relation_name=relation,
                            collection_name=edge.collection_name,
                            match_keys_source=vc.index(edge.source).fields,
                            match_keys_target=vc.index(edge.target).fields,
                            filter_uniques=False,
                            dry=self.dry,
                        )

    def process_with_queue(self, tasks: mp.Queue, **kwargs):
        """Process tasks from a queue.

        Args:
            tasks: Queue of tasks to process
            **kwargs: Additional keyword arguments
        """
        while True:
            try:
                task = tasks.get_nowait()
                filepath, resource_name = task
            except queue.Empty:
                break
            else:
                self.process_resource(
                    resource_instance=filepath, resource_name=resource_name, **kwargs
                )

    @staticmethod
    def normalize_resource(
        data: pd.DataFrame | list[list] | list[dict], columns: list[str] | None = None
    ) -> list[dict]:
        """Normalize resource data into a list of dictionaries.

        Args:
            data: Data to normalize (DataFrame, list of lists, or list of dicts)
            columns: Optional column names for list data

        Returns:
            list[dict]: Normalized data as list of dictionaries

        Raises:
            ValueError: If columns is not provided for list data
        """
        if isinstance(data, pd.DataFrame):
            columns = data.columns.tolist()
            _data = data.values.tolist()
        elif data and isinstance(data[0], list):
            _data = cast(list[list], data)  # Tell mypy this is list[list]
            if columns is None:
                raise ValueError("columns should be set")
        else:
            return cast(list[dict], data)  # Tell mypy this is list[dict]
        rows_dressed = [{k: v for k, v in zip(columns, item)} for item in _data]
        return rows_dressed

    def ingest_files(self, path: Path | str, **kwargs):
        """Ingest files from a directory.

        Args:
            path: Path to directory containing files
            **kwargs: Additional keyword arguments:
                - conn_conf: Database connection configuration
                - clean_start: Whether to clean the database before ingestion
                - n_cores: Number of CPU cores to use
                - max_items: Maximum number of items to process
                - batch_size: Size of batches for processing
                - dry: Whether to perform a dry run
                - init_only: Whether to only initialize the database
                - limit_files: Optional limit on number of files to process
                - patterns: Optional file patterns to match
        """

        path = Path(path).expanduser()
        conn_conf: DBConnectionConfig = kwargs.get("conn_conf")
        self.clean_start = kwargs.pop("clean_start", self.clean_start)
        self.n_cores = kwargs.pop("n_cores", self.n_cores)
        self.max_items = kwargs.pop("max_items", self.max_items)
        self.batch_size = kwargs.pop("batch_size", self.batch_size)
        self.dry = kwargs.pop("dry", self.dry)
        init_only = kwargs.pop("init_only", False)
        limit_files = kwargs.pop("limit_files", None)
        patterns = kwargs.pop("patterns", Patterns())

        if (
            conn_conf.connection_type == ConnectionKind.ARANGO
            and conn_conf.database == "_system"
        ):
            db_name = self.schema.general.name
            try:
                with ConnectionManager(connection_config=conn_conf) as db_client:
                    db_client.create_database(db_name)
            except Exception as exc:
                logger.error(exc)

            conn_conf.database = db_name

        with ConnectionManager(connection_config=conn_conf) as db_client:
            db_client.init_db(self.schema, self.clean_start)

        if init_only:
            logger.info("ingest execution bound to init")
            sys.exit(0)

        tasks: list[tuple[Path, str]] = []
        for r in self.schema.resources:
            pattern = (
                FilePattern(regex=r.name)
                if r.name not in patterns.patterns
                else patterns.patterns[r.name]
            )
            files = Caster.discover_files(
                path, limit_files=limit_files, pattern=pattern
            )
            logger.info(f"For resource name {r.name} {len(files)} were found")
            tasks += [(f, r.name) for f in files]

        with Timer() as klepsidra:
            if self.n_cores > 1:
                queue_tasks: mp.Queue = mp.Queue()
                for item in tasks:
                    queue_tasks.put(item)

                func = partial(
                    self.process_with_queue,
                    **kwargs,
                )
                assert mp.get_start_method() == "fork", (
                    "Requires 'forking' operating system"
                )

                processes = []

                for w in range(self.n_cores):
                    p = mp.Process(target=func, args=(queue_tasks,), kwargs=kwargs)
                    processes.append(p)
                    p.start()
                for p in processes:
                    p.join()
            else:
                for f, resource_name in tasks:
                    self.process_resource(
                        resource_instance=f, resource_name=resource_name, **kwargs
                    )
        logger.info(f"Processing took {klepsidra.elapsed:.1f} sec")
graflo/cli/__init__.py
ADDED
@@ -0,0 +1,14 @@
"""Command-line interface for graflo.

This module provides command-line tools for working with graflo, including
utilities for data ingestion, schema management, and graph operations.

Key Components:
    - Command-line tools for data processing
    - Schema management utilities
    - Graph database operations

Example:
    >>> uv run ingest --config config.json --data data.json
    >>> uv run plot_schema --config schema.yaml --output figs
"""