graflo-1.1.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in the public registries.

Files changed (45)
  1. graflo/README.md +18 -0
  2. graflo/__init__.py +39 -0
  3. graflo/architecture/__init__.py +37 -0
  4. graflo/architecture/actor.py +974 -0
  5. graflo/architecture/actor_util.py +425 -0
  6. graflo/architecture/edge.py +295 -0
  7. graflo/architecture/onto.py +374 -0
  8. graflo/architecture/resource.py +161 -0
  9. graflo/architecture/schema.py +136 -0
  10. graflo/architecture/transform.py +292 -0
  11. graflo/architecture/util.py +93 -0
  12. graflo/architecture/vertex.py +277 -0
  13. graflo/caster.py +409 -0
  14. graflo/cli/__init__.py +14 -0
  15. graflo/cli/ingest.py +144 -0
  16. graflo/cli/manage_dbs.py +193 -0
  17. graflo/cli/plot_schema.py +132 -0
  18. graflo/cli/xml2json.py +93 -0
  19. graflo/db/__init__.py +32 -0
  20. graflo/db/arango/__init__.py +16 -0
  21. graflo/db/arango/conn.py +734 -0
  22. graflo/db/arango/query.py +180 -0
  23. graflo/db/arango/util.py +88 -0
  24. graflo/db/connection.py +304 -0
  25. graflo/db/manager.py +104 -0
  26. graflo/db/neo4j/__init__.py +16 -0
  27. graflo/db/neo4j/conn.py +432 -0
  28. graflo/db/util.py +49 -0
  29. graflo/filter/__init__.py +21 -0
  30. graflo/filter/onto.py +400 -0
  31. graflo/logging.conf +22 -0
  32. graflo/onto.py +186 -0
  33. graflo/plot/__init__.py +17 -0
  34. graflo/plot/plotter.py +556 -0
  35. graflo/util/__init__.py +23 -0
  36. graflo/util/chunker.py +739 -0
  37. graflo/util/merge.py +148 -0
  38. graflo/util/misc.py +37 -0
  39. graflo/util/onto.py +63 -0
  40. graflo/util/transform.py +406 -0
  41. graflo-1.1.0.dist-info/METADATA +157 -0
  42. graflo-1.1.0.dist-info/RECORD +45 -0
  43. graflo-1.1.0.dist-info/WHEEL +4 -0
  44. graflo-1.1.0.dist-info/entry_points.txt +5 -0
  45. graflo-1.1.0.dist-info/licenses/LICENSE +126 -0
graflo/cli/ingest.py ADDED
@@ -0,0 +1,144 @@
+ """Data ingestion command-line interface for graph databases.
+
+ This module provides a CLI tool for ingesting data into graph databases. It supports
+ batch processing, parallel execution, and various data formats. The tool can handle
+ both initial database setup and incremental data ingestion.
+
+ Key Features:
+     - Configurable batch processing
+     - Multi-core and multi-threaded execution
+     - Support for custom resource patterns
+     - Database initialization and cleanup options
+     - Flexible file discovery and processing
+
+ Example:
+     $ uv run ingest \\
+         --db-config-path config/db.yaml \\
+         --schema-path config/schema.yaml \\
+         --source-path data/ \\
+         --batch-size 5000 \\
+         --n-cores 4
+ """
+
+ import logging.config
+ import pathlib
+ from os.path import dirname, join, realpath
+
+ import click
+ from suthing import ConfigFactory, FileHandle
+
+ from graflo import Caster, Patterns, Schema
+
+ logger = logging.getLogger(__name__)
+
+
+ @click.command()
+ @click.option(
+     "--db-config-path",
+     type=click.Path(exists=True, path_type=pathlib.Path),
+     required=True,
+ )
+ @click.option(
+     "--schema-path",
+     type=click.Path(exists=True, path_type=pathlib.Path),
+     required=True,
+ )
+ @click.option(
+     "--source-path",
+     type=click.Path(exists=True, path_type=pathlib.Path),
+     required=True,
+ )
+ @click.option(
+     "--resource-pattern-config-path",
+     type=click.Path(exists=True, path_type=pathlib.Path),
+     default=None,
+ )
+ @click.option("--limit-files", type=int, default=None)
+ @click.option("--batch-size", type=int, default=5000)
+ @click.option("--n-cores", type=int, default=1)
+ @click.option(
+     "--n-threads",
+     type=int,
+     default=1,
+ )
+ @click.option("--fresh-start", default=False, is_flag=True, help="wipe existing database")
+ @click.option(
+     "--init-only", default=False, is_flag=True, help="skip ingestion; only init the db"
+ )
+ def ingest(
+     db_config_path,
+     schema_path,
+     source_path,
+     limit_files,
+     batch_size,
+     n_cores,
+     n_threads,
+     fresh_start,
+     init_only,
+     resource_pattern_config_path,
+ ):
+     """Ingest data into a graph database.
+
+     This command processes data files and ingests them into a graph database according
+     to the provided schema. It supports various configuration options for controlling
+     the ingestion process.
+
+     Args:
+         db_config_path: Path to database configuration file
+         schema_path: Path to schema configuration file
+         source_path: Path to source data directory
+         limit_files: Optional limit on number of files to process
+         batch_size: Number of items to process in each batch (default: 5000)
+         n_cores: Number of CPU cores to use for parallel processing (default: 1)
+         n_threads: Number of threads per core for parallel processing (default: 1)
+         fresh_start: Whether to wipe existing database before ingestion
+         init_only: Whether to only initialize the database without ingestion
+         resource_pattern_config_path: Optional path to resource pattern configuration
+
+     Example:
+         $ uv run ingest \\
+             --db-config-path config/db.yaml \\
+             --schema-path config/schema.yaml \\
+             --source-path data/ \\
+             --batch-size 5000 \\
+             --n-cores 4 \\
+             --fresh-start
+     """
+     cdir = dirname(realpath(__file__))
+
+     logging.config.fileConfig(
+         join(cdir, "../logging.conf"), disable_existing_loggers=False
+     )
+
+     logging.basicConfig(level=logging.INFO)
+
+     schema = Schema.from_dict(FileHandle.load(schema_path))
+
+     conn_conf = ConfigFactory.create_config(db_config_path)
+
+     if resource_pattern_config_path is not None:
+         patterns = Patterns.from_dict(FileHandle.load(resource_pattern_config_path))
+     else:
+         patterns = Patterns()
+
+     schema.fetch_resource()
+
+     caster = Caster(
+         schema,
+         n_cores=n_cores,
+         n_threads=n_threads,
+     )
+
+     caster.ingest_files(
+         path=source_path,
+         limit_files=limit_files,
+         clean_start=fresh_start,
+         batch_size=batch_size,
+         conn_conf=conn_conf,
+         patterns=patterns,
+         init_only=init_only,
+     )
+
+
+ if __name__ == "__main__":
+     ingest()
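
The CLI above is a thin wrapper over the library API, so the same run can be driven from Python directly. A minimal sketch, reusing only the calls visible in this module; the config paths mirror the docstring example and all values are placeholders:

    import pathlib

    from suthing import ConfigFactory, FileHandle
    from graflo import Caster, Patterns, Schema

    # load the schema and connection config the same way the CLI does
    schema = Schema.from_dict(FileHandle.load(pathlib.Path("config/schema.yaml")))
    conn_conf = ConfigFactory.create_config(pathlib.Path("config/db.yaml"))
    schema.fetch_resource()

    caster = Caster(schema, n_cores=4, n_threads=1)
    caster.ingest_files(
        path=pathlib.Path("data/"),
        limit_files=None,
        clean_start=True,  # equivalent of --fresh-start
        batch_size=5000,
        conn_conf=conn_conf,
        patterns=Patterns(),  # no resource-pattern config supplied
        init_only=False,
    )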
graflo/cli/manage_dbs.py ADDED
@@ -0,0 +1,193 @@
+ """Database management utilities for ArangoDB.
+
+ This module provides command-line tools for managing ArangoDB databases, including
+ backup and restore operations. It supports both local and Docker-based operations.
+
+ Key Features:
+     - Database backup and restore
+     - Docker and local execution modes
+     - Configurable connection settings
+     - Batch processing of multiple databases
+
+ Example:
+     $ uv run manage_dbs \\
+         --db-config-path config/db.yaml \\
+         --db mydb1 --db mydb2 \\
+         --store-directory-path /backups \\
+         --use-docker true
+ """
+
+ import logging
+ import pathlib
+ import subprocess
+ import sys
+ from datetime import date
+
+ import click
+ from suthing import ArangoConnectionConfig, ConfigFactory, FileHandle, Timer
+
+ logger = logging.getLogger(__name__)
+
+
+ def act_db(
+     conf: ArangoConnectionConfig,
+     db_name: str,
+     output_path: pathlib.Path,
+     restore: bool,
+     docker_version: str,
+     use_docker: bool,
+ ):
+     """Execute database backup or restore operation.
+
+     This function performs either a backup (arangodump) or restore (arangorestore)
+     operation on an ArangoDB database. It can use either the local arangodump/arangorestore
+     tools or run them in a Docker container.
+
+     Args:
+         conf: Database connection configuration
+         db_name: Name of the database to backup/restore
+         output_path: Path where backup will be stored or restored from
+         restore: Whether to restore (True) or backup (False)
+         docker_version: Version of ArangoDB Docker image to use
+         use_docker: Whether to use Docker for the operation
+
+     Returns:
+         None
+
+     Raises:
+         subprocess.CalledProcessError: If the backup/restore operation fails
+     """
+     host = f"tcp://{conf.hostname}:{conf.port}"
+     db_folder = output_path / db_name
+
+     cmd = "arangorestore" if restore else "arangodump"
+     if use_docker:
+         ru = (
+             f"docker run --rm --network=host -v {db_folder}:/dump"
+             f" arangodb/arangodb:{docker_version} {cmd}"
+         )
+         directory = "/dump"
+     else:
+         ru = f"{cmd}"
+         directory = f"{db_folder}"
+
+     dir_spec = "input" if restore else "output"
+
+     query = f"""{ru} --server.endpoint {host} --server.username {conf.cred_name} --server.password "{conf.cred_pass}" --{dir_spec}-directory {directory} --server.database "{db_name}" """
+
+     restore_suffix = "--create-database true --force-same-database true"
+     if restore:
+         query += restore_suffix
+     else:
+         query += "--overwrite true"
+
+     flag = subprocess.run(query, shell=True, check=True)
+     logger.info(f"returned {flag}")
+
+
+ @click.command()
+ @click.option(
+     "--db-config-path",
+     type=click.Path(exists=True, path_type=pathlib.Path),
+     required=False,
+     default=None,
+ )
+ @click.option("--db-host", type=str)
+ @click.option("--db-password", type=str)
+ @click.option("--db-user", type=str, default="root")
+ @click.option(
+     "--db",
+     type=str,
+     multiple=True,
+     required=True,
+     help="name of a database to process; repeat the flag for multiple databases",
+ )
+ @click.option(
+     "--store-directory-path",
+     type=click.Path(path_type=pathlib.Path),
+     required=True,
+     help="filesystem path where to dump db snapshot",
+ )
+ @click.option("--docker-version", type=str, default="3.12.1")
+ @click.option("--restore", type=bool, default=False, is_flag=True)
+ @click.option("--use-docker", type=bool, default=True)
+ def manage_dbs(
+     db_config_path,
+     db_host,
+     db_password,
+     db_user,
+     db,
+     store_directory_path,
+     restore,
+     docker_version,
+     use_docker,
+ ):
+     """Manage ArangoDB database backups and restores.
+
+     This command provides functionality to backup and restore ArangoDB databases.
+     It supports both local execution and Docker-based operations. The command can
+     process multiple databases in sequence and provides timing information for
+     each operation.
+
+     Args:
+         db_config_path: Path to database configuration file (optional)
+         db_host: Database host address (if not using config file)
+         db_password: Database password (if not using config file)
+         db_user: Database username (default: root)
+         db: List of database names to process
+         store_directory_path: Path where backups will be stored/restored
+         restore: Whether to restore (True) or backup (False)
+         docker_version: Version of ArangoDB Docker image (default: 3.12.1)
+         use_docker: Whether to use Docker for operations (default: True)
+
+     Example:
+         $ uv run manage_dbs \\
+             --db-config-path config/db.yaml \\
+             --db mydb1 --db mydb2 \\
+             --store-directory-path /backups \\
+             --use-docker true
+     """
+     if db_config_path is None:
+         db_conf: ArangoConnectionConfig = ArangoConnectionConfig(
+             cred_name=db_user, cred_pass=db_password, hosts=db_host
+         )
+     else:
+         conn_conf = FileHandle.load(fpath=db_config_path)
+         db_conf: ArangoConnectionConfig = ConfigFactory.create_config(
+             dict_like=conn_conf
+         )
+
+     action = "restoring" if restore else "dumping"
+     if restore:
+         out_path = store_directory_path
+     else:
+         out_path = (
+             store_directory_path.expanduser().resolve() / date.today().isoformat()
+         )
+
+     if not out_path.exists():
+         out_path.mkdir(parents=True, exist_ok=True)
+
+     with Timer() as t_all:
+         for dbname in db:
+             with Timer() as t_dump:
+                 try:
+                     act_db(
+                         db_conf,
+                         dbname,
+                         out_path,
+                         restore=restore,
+                         docker_version=docker_version,
+                         use_docker=use_docker,
+                     )
+                 except Exception as e:
+                     logger.error(e)
+             logger.info(
+                 f"{action} {dbname} took {t_dump.mins} mins {t_dump.secs:.2f} sec"
+             )
+     logger.info(f"all {action} took {t_all.mins} mins {t_all.secs:.2f} sec")
+
+
+ if __name__ == "__main__":
+     logging.basicConfig(level=logging.INFO, stream=sys.stdout)
+     manage_dbs()
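
For orientation, the shell line that act_db assembles for a Docker-based dump of a database mydb on localhost:8529 comes out roughly as follows (shown wrapped for readability; the credentials and the dated backup directory are illustrative placeholders):

    docker run --rm --network=host -v /backups/2025-01-01/mydb:/dump \
        arangodb/arangodb:3.12.1 arangodump \
        --server.endpoint tcp://localhost:8529 \
        --server.username root --server.password "..." \
        --output-directory /dump --server.database "mydb" --overwrite true

A restore swaps arangodump for arangorestore, switches to --input-directory, and appends --create-database true --force-same-database true in place of --overwrite true.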
graflo/cli/plot_schema.py ADDED
@@ -0,0 +1,132 @@
+ """Schema visualization tool for graph databases.
+
+ This module provides functionality for visualizing graph database schemas using Graphviz.
+ It includes tools for plotting vertex-to-vertex relationships, vertex fields, and resource
+ mappings. The module supports various visualization options and graph layout customization.
+
+ Key Components:
+     - SchemaPlotter: Main class for schema visualization
+     - knapsack: Utility for optimizing graph layout
+     - plot_schema: CLI command for schema visualization
+
+ Graphviz Attributes Reference:
+     - https://renenyffenegger.ch/notes/tools/Graphviz/attributes/index
+     - https://rsms.me/graphviz/
+     - https://graphviz.readthedocs.io/en/stable/examples.html
+     - https://graphviz.org/doc/info/attrs.html
+
+ Example:
+     $ uv run plot_schema -c schema.yaml -o schema.png
+ """
+
+ import logging
+ import sys
+
+ import click
+
+ from graflo.plot.plotter import SchemaPlotter
+
+ # graphviz node attribute usage:
+ #     color='red', style='filled', fillcolor='blue', shape='square'
+ #
+ # to keep nodes on the same rank:
+ #     level_one = [node1, node2]
+ #     sg_one = ag.add_subgraph(level_one, rank='same')
+
+
+ def knapsack(weights, ks_size=7):
+     """Split a set of weights into groups of at most threshold weight.
+
+     This function implements a greedy algorithm to partition weights into groups
+     where each group's total weight is at most ks_size. It's used for optimizing
+     graph layout by balancing node distribution.
+
+     Args:
+         weights: List of weights to partition
+         ks_size: Maximum total weight per group (default: 7)
+
+     Returns:
+         list[list[int]]: List of groups, where each group is a list of indices
+             from the original weights list
+
+     Raises:
+         ValueError: If any single weight exceeds ks_size
+
+     Example:
+         >>> weights = [3, 4, 2, 5, 1]
+         >>> knapsack(weights, ks_size=7)
+         [[3], [1, 0], [2, 4]]  # group weights: 5, 7, 3
+     """
+     pp = sorted(zip(range(len(weights)), weights), key=lambda x: x[1])
+     acc = []
+     if pp[-1][1] > ks_size:
+         raise ValueError("One of the items is larger than the knapsack")
+
+     while pp:
+         # start a group with the heaviest remaining item
+         w_item = [pp.pop()]
+         ww_item = sum(w for _, w in w_item)
+         while ww_item < ks_size:
+             cnt = 0
+             # greedily add the heaviest remaining items that still fit
+             for item in pp[::-1]:
+                 if ks_size - item[1] - ww_item >= 0:
+                     cnt += 1
+                     pp.remove(item)
+                     w_item.append(item)
+                     ww_item += item[1]
+                 else:
+                     break
+             if ww_item >= ks_size or cnt == 0:
+                 break
+         acc.append(w_item)
+     acc_ret = [[y for y, _ in subitem] for subitem in acc]
+     return acc_ret
+
+
+ @click.command()
+ @click.option("-c", "--schema-path", type=click.Path(), required=True)
+ @click.option("-o", "--figure-output-path", type=click.Path(), required=True)
+ @click.option("-p", "--prune-low-degree-nodes", type=bool, default=False)
+ def plot_schema(schema_path, figure_output_path, prune_low_degree_nodes):
+     """Generate visualizations of the graph database schema.
+
+     This command creates multiple visualizations of the schema:
+     1. Vertex-to-vertex relationships
+     2. Vertex fields and their relationships
+     3. Resource mappings
+
+     The visualizations are saved to the specified output path.
+
+     Args:
+         schema_path: Path to the schema configuration file
+         figure_output_path: Path where the visualization will be saved
+         prune_low_degree_nodes: Whether to remove nodes with low connectivity
+             from the visualization (default: False)
+
+     Example:
+         $ uv run plot_schema -c schema.yaml -o schema.png
+     """
+     logging.basicConfig(level=logging.INFO, stream=sys.stdout)
+
+     plotter = SchemaPlotter(schema_path, figure_output_path)
+     plotter.plot_vc2vc(prune_leaves=prune_low_degree_nodes)
+     plotter.plot_vc2fields()
+     plotter.plot_resources()
+     # plotter.plot_source2vc()
+     # plotter.plot_source2vc_detailed()
+
+
+ if __name__ == "__main__":
+     plot_schema()
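
A quick sanity check of the greedy grouping, run against the function as listed above (the second call illustrates that an item weighing exactly ks_size becomes its own group):

    >>> knapsack([3, 4, 2, 5, 1], ks_size=7)
    [[3], [1, 0], [2, 4]]  # group weights: 5, 7, 3
    >>> knapsack([7, 7, 1], ks_size=7)
    [[1], [0], [2]]  # group weights: 7, 7, 1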
graflo/cli/xml2json.py ADDED
@@ -0,0 +1,93 @@
+ """XML to JSON conversion tool for data preprocessing.
+
+ This module provides a command-line tool for converting XML files to JSON format,
+ with support for different data sources and chunking options. It's particularly
+ useful for preprocessing scientific literature data from sources like Web of Science
+ and PubMed.
+
+ Key Features:
+     - Support for Web of Science and PubMed XML formats
+     - Configurable chunking for large files
+     - Batch processing of multiple files
+     - Customizable output format
+
+ Example:
+     $ uv run xml2json \\
+         --source-path data/wos.xml \\
+         --chunk-size 1000 \\
+         --mode wos_csv
+ """
+
+ import logging
+ import pathlib
+ import sys
+
+ import click
+
+ from graflo.util.chunker import convert, force_list_wos, tag_wos
+
+ logger = logging.getLogger(__name__)
+
+
+ @click.command()
+ @click.option(
+     "-s",
+     "--source-path",
+     type=click.Path(path_type=pathlib.Path),
+     required=True,
+ )
+ @click.option("-c", "--chunk-size", type=int, default=1000)
+ @click.option("-m", "--max-chunks", type=int, default=None)
+ @click.option("--mode", type=str, required=True)
+ def do(source_path, chunk_size, max_chunks, mode):
+     """Convert XML files to JSON format.
+
+     This command processes XML files and converts them to JSON format, with support
+     for different data sources and chunking options.
+
+     Args:
+         source_path: Path to source XML file or directory
+         chunk_size: Number of records per output file (default: 1000)
+         max_chunks: Maximum number of chunks to process (default: None)
+         mode: Data source mode ('wos_csv' or 'pubmed')
+
+     Example:
+         $ uv run xml2json \\
+             --source-path data/wos.xml \\
+             --chunk-size 1000 \\
+             --mode wos_csv
+     """
+     if mode == "wos_csv":
+         pattern = r"xmlns=\".*[^\"]\"(?=>)"
+         force_list = force_list_wos
+         tag = tag_wos
+     elif mode == "pubmed":
+         pattern = None
+         force_list = None
+         tag = "PubmedArticle"
+     else:
+         raise ValueError(f"Unknown mode {mode}")
+
+     if source_path.is_dir():
+         files = [
+             fp for fp in source_path.iterdir() if not fp.is_dir() and "xml" in fp.name
+         ]
+     else:
+         files = [source_path] if "xml" in source_path.name else []
+     for fp in files:
+         target_root = str(fp.parent / fp.name.split(".")[0])
+
+         convert(
+             fp,
+             target_root=target_root,
+             chunk_size=chunk_size,
+             max_chunks=max_chunks,
+             pattern=pattern,
+             force_list=force_list,
+             root_tag=tag,
+         )
+
+
+ if __name__ == "__main__":
+     logging.basicConfig(level=logging.INFO, stream=sys.stdout)
+     do()
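
The wos_csv pattern is a namespace stripper for the root element. Assuming convert applies it with re.sub before parsing (the actual plumbing lives in graflo.util.chunker), its effect looks like this on a hypothetical record:

    import re

    # same raw-string pattern as in the wos_csv branch above
    pattern = r"xmlns=\".*[^\"]\"(?=>)"
    line = '<records xmlns="http://example.com/schema">'
    print(re.sub(pattern, "", line))  # -> <records >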
graflo/db/__init__.py ADDED
@@ -0,0 +1,32 @@
+ """Database connection and management components.
+
+ This package provides database connection implementations and management utilities
+ for different graph databases (ArangoDB, Neo4j). It includes connection interfaces,
+ query execution, and database operations.
+
+ Key Components:
+     - Connection: Abstract database connection interface
+     - ConnectionManager: Database connection management
+     - ArangoDB: ArangoDB-specific implementation
+     - Neo4j: Neo4j-specific implementation
+     - Query: Query generation and execution utilities
+
+ Example:
+     >>> from graflo.db import ConnectionManager
+     >>> from graflo.db.arango import ArangoConnection
+     >>> manager = ConnectionManager(
+     ...     connection_config={"url": "http://localhost:8529"},
+     ...     conn_class=ArangoConnection
+     ... )
+     >>> with manager as conn:
+     ...     conn.init_db(schema)
+ """
+
+ from .connection import Connection, ConnectionType
+ from .manager import ConnectionManager
+
+ __all__ = [
+     "Connection",
+     "ConnectionManager",
+     "ConnectionType",
+ ]
graflo/db/arango/__init__.py ADDED
@@ -0,0 +1,16 @@
+ """ArangoDB database implementation.
+
+ This package provides ArangoDB-specific implementations of the database interface,
+ including connection management, query execution, and utility functions.
+
+ Key Components:
+     - ArangoConnection: ArangoDB connection implementation
+     - Query: AQL query execution and profiling
+     - Util: ArangoDB-specific utility functions
+
+ Example:
+     >>> from graflo.db.arango import ArangoConnection
+     >>> conn = ArangoConnection(config)
+     >>> cursor = conn.execute("FOR doc IN users RETURN doc")
+     >>> results = cursor.batch()
+ """