structured2graph 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __init__.py +47 -0
- core/__init__.py +23 -0
- core/hygm/__init__.py +74 -0
- core/hygm/hygm.py +2351 -0
- core/hygm/models/__init__.py +82 -0
- core/hygm/models/graph_models.py +667 -0
- core/hygm/models/llm_models.py +229 -0
- core/hygm/models/operations.py +176 -0
- core/hygm/models/sources.py +68 -0
- core/hygm/models/user_operations.py +139 -0
- core/hygm/strategies/__init__.py +17 -0
- core/hygm/strategies/base.py +36 -0
- core/hygm/strategies/deterministic.py +262 -0
- core/hygm/strategies/llm.py +904 -0
- core/hygm/validation/__init__.py +38 -0
- core/hygm/validation/base.py +194 -0
- core/hygm/validation/graph_schema_validator.py +687 -0
- core/hygm/validation/memgraph_data_validator.py +991 -0
- core/migration_agent.py +1369 -0
- core/schema/spec.json +155 -0
- core/utils/meta_graph.py +108 -0
- database/__init__.py +36 -0
- database/adapters/__init__.py +11 -0
- database/adapters/memgraph.py +318 -0
- database/adapters/mysql.py +311 -0
- database/adapters/postgresql.py +335 -0
- database/analyzer.py +396 -0
- database/factory.py +219 -0
- database/models.py +209 -0
- main.py +518 -0
- query_generation/__init__.py +20 -0
- query_generation/cypher_generator.py +129 -0
- query_generation/schema_utilities.py +88 -0
- structured2graph-0.1.1.dist-info/METADATA +197 -0
- structured2graph-0.1.1.dist-info/RECORD +41 -0
- structured2graph-0.1.1.dist-info/WHEEL +4 -0
- structured2graph-0.1.1.dist-info/entry_points.txt +2 -0
- structured2graph-0.1.1.dist-info/licenses/LICENSE +21 -0
- utils/__init__.py +57 -0
- utils/config.py +235 -0
- utils/environment.py +404 -0
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
"""
|
|
2
|
+
MySQL-specific database analyzer implementation.
|
|
3
|
+
|
|
4
|
+
This module provides MySQL-specific implementation of the DatabaseAnalyzer interface.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import mysql.connector
|
|
8
|
+
from typing import Dict, List, Any, Optional
|
|
9
|
+
import logging
|
|
10
|
+
from ..analyzer import (
|
|
11
|
+
DatabaseAnalyzer,
|
|
12
|
+
ColumnInfo,
|
|
13
|
+
ForeignKeyInfo,
|
|
14
|
+
TableInfo,
|
|
15
|
+
TableType,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class MySQLAnalyzer(DatabaseAnalyzer):
    """MySQL-specific implementation of DatabaseAnalyzer.

    Uses ``mysql.connector`` for connectivity; schema metadata comes from
    ``SHOW``/``DESCRIBE`` statements and INFORMATION_SCHEMA queries.
    """

    def __init__(
        self, host: str, user: str, password: str, database: str, port: int = 3306
    ):
        """
        Initialize MySQL analyzer.

        Args:
            host: MySQL server hostname
            user: MySQL username
            password: MySQL password
            database: Database name
            port: MySQL port (default: 3306)
        """
        connection_config = {
            "host": host,
            "user": user,
            "password": password,
            "database": database,
            "port": port,
        }
        super().__init__(connection_config)

    def _get_database_type(self) -> str:
        """Return the database type."""
        return "mysql"

    @staticmethod
    def _quote_identifier(identifier: str) -> str:
        """Return *identifier* quoted as a MySQL backtick identifier.

        Identifiers (table names) cannot be bound as query parameters, so
        they must be interpolated into the SQL text.  Backtick quoting with
        embedded backticks doubled prevents SQL injection and supports
        table names containing special characters.
        """
        return "`" + identifier.replace("`", "``") + "`"

    def connect(self) -> bool:
        """Establish connection to MySQL database.

        Returns:
            True on success; False on failure (the error is logged).
        """
        try:
            self.connection = mysql.connector.connect(**self.connection_config)
            logger.info("Successfully connected to MySQL database")
            return True
        except mysql.connector.Error as e:
            logger.error("Error connecting to MySQL: %s", e)
            return False

    def disconnect(self) -> None:
        """Close MySQL connection (no-op if not connected)."""
        if self.connection and self.connection.is_connected():
            self.connection.close()
            logger.info("MySQL connection closed")

    def get_tables(self) -> List[str]:
        """Get list of all tables in the database.

        Raises:
            ConnectionError: If no connection has been established.
        """
        if not self.connection:
            raise ConnectionError("Not connected to database")

        cursor = self.connection.cursor()
        cursor.execute("SHOW TABLES")
        tables = [table[0] for table in cursor.fetchall()]
        cursor.close()
        return tables

    @staticmethod
    def _parse_type_params(data_type: str):
        """Parse a MySQL column type like ``varchar(255)`` or ``decimal(10,2)``.

        Args:
            data_type: Raw type string as reported by ``DESCRIBE``.

        Returns:
            Tuple ``(data_type, max_length, precision, scale)``.  For types
            whose parenthesized parameters are not plain integers (enum,
            set, ...), the original type definition is returned unchanged
            with all numeric metadata set to None.
        """
        max_length = None
        precision = None
        scale = None

        if "(" not in data_type:
            return data_type, max_length, precision, scale

        type_part = data_type.split("(")[0].lower()
        params_part = data_type.split("(")[1].rstrip(")")
        length_types = ("varchar", "char", "varbinary", "binary", "bit")
        numeric_types = ("decimal", "numeric", "float", "double")

        try:
            if "," in params_part:
                # Decimal-style type with precision and scale.
                precision, scale = map(int, params_part.split(","))
            elif type_part in length_types:
                # Types whose single parameter is a length.
                max_length = int(params_part)
            elif type_part in numeric_types:
                # Numeric types that might carry a bare precision.
                if params_part.isdigit():
                    precision = int(params_part)
        except (ValueError, TypeError) as e:
            # enum('a','b'), set(...), etc.: parameters are not integers.
            # Keep the full type definition (including parameters) intact
            # instead of dropping the column.
            logger.debug(
                "Could not parse type parameters for %s: %s", data_type, e
            )
            return data_type, None, None, None

        # Only strip the parameters when we actually understood them.
        if type_part in length_types or type_part in numeric_types:
            data_type = type_part
        return data_type, max_length, precision, scale

    def get_table_schema(self, table_name: str) -> List[ColumnInfo]:
        """Get schema information for a specific table.

        Args:
            table_name: Name of the table to describe.

        Returns:
            One ColumnInfo per column, with primary/foreign-key and
            auto-increment flags populated.

        Raises:
            ConnectionError: If no connection has been established.
        """
        if not self.connection:
            raise ConnectionError("Not connected to database")

        cursor = self.connection.cursor()
        cursor.execute(f"DESCRIBE {self._quote_identifier(table_name)}")

        columns = []
        for row in cursor.fetchall():
            # DESCRIBE yields: Field, Type, Null, Key, Default, Extra.
            field_name, raw_type, nullable, key_type, default_value, extra = row[:6]

            data_type, max_length, precision, scale = self._parse_type_params(
                raw_type
            )

            columns.append(
                ColumnInfo(
                    name=field_name,
                    data_type=data_type,
                    is_nullable=nullable == "YES",
                    is_primary_key=key_type == "PRI",
                    # Resolved below once the cursor is drained.
                    is_foreign_key=False,
                    default_value=default_value,
                    auto_increment="auto_increment" in extra.lower(),
                    max_length=max_length,
                    precision=precision,
                    scale=scale,
                )
            )

        cursor.close()

        # Mark foreign-key columns using INFORMATION_SCHEMA data.
        fk_column_names = {
            fk.column_name for fk in self.get_foreign_keys(table_name)
        }
        for column in columns:
            if column.name in fk_column_names:
                column.is_foreign_key = True

        return columns

    def get_foreign_keys(self, table_name: str) -> List[ForeignKeyInfo]:
        """Get foreign key relationships for a table.

        Raises:
            ConnectionError: If no connection has been established.
        """
        if not self.connection:
            raise ConnectionError("Not connected to database")

        cursor = self.connection.cursor()
        query = """
            SELECT
                COLUMN_NAME,
                REFERENCED_TABLE_NAME,
                REFERENCED_COLUMN_NAME,
                CONSTRAINT_NAME
            FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE
            WHERE TABLE_SCHEMA = %s
            AND TABLE_NAME = %s
            AND REFERENCED_TABLE_NAME IS NOT NULL
        """
        cursor.execute(query, (self.connection_config["database"], table_name))

        foreign_keys = []
        for row in cursor.fetchall():
            foreign_keys.append(
                ForeignKeyInfo(
                    column_name=row[0],
                    referenced_table=row[1],
                    referenced_column=row[2],
                    constraint_name=row[3],
                )
            )
        cursor.close()
        return foreign_keys

    def get_table_data(
        self, table_name: str, limit: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """Get data from a specific table.

        Args:
            table_name: Table to read.
            limit: Optional maximum number of rows (``0`` is honored).

        Returns:
            One dict per row keyed by column name.

        Raises:
            ConnectionError: If no connection has been established.
        """
        if not self.connection:
            raise ConnectionError("Not connected to database")

        cursor = self.connection.cursor(dictionary=True)
        query = f"SELECT * FROM {self._quote_identifier(table_name)}"
        # `is not None` so an explicit limit of 0 is respected.
        if limit is not None:
            query += f" LIMIT {int(limit)}"

        cursor.execute(query)
        data = cursor.fetchall()
        cursor.close()
        return data

    def get_table_row_count(self, table_name: str) -> int:
        """Get the number of rows in a table.

        Raises:
            ConnectionError: If no connection has been established.
        """
        if not self.connection:
            raise ConnectionError("Not connected to database")

        cursor = self.connection.cursor()
        cursor.execute(
            f"SELECT COUNT(*) FROM {self._quote_identifier(table_name)}"
        )
        count = cursor.fetchone()[0]
        cursor.close()
        return count

    def is_view(self, table_name: str) -> bool:
        """Check if a table is actually a view.

        Raises:
            ConnectionError: If no connection has been established.
        """
        if not self.connection:
            raise ConnectionError("Not connected to database")

        cursor = self.connection.cursor()
        query = """
            SELECT TABLE_TYPE
            FROM INFORMATION_SCHEMA.TABLES
            WHERE TABLE_SCHEMA = %s
            AND TABLE_NAME = %s
        """
        cursor.execute(query, (self.connection_config["database"], table_name))
        result = cursor.fetchone()
        cursor.close()

        if result:
            return result[0] == "VIEW"
        return False

    def get_tables_excluding_views(self) -> List[str]:
        """Get list of all tables in the database, excluding views.

        Raises:
            ConnectionError: If no connection has been established.
        """
        if not self.connection:
            raise ConnectionError("Not connected to database")

        cursor = self.connection.cursor()
        query = """
            SELECT TABLE_NAME
            FROM INFORMATION_SCHEMA.TABLES
            WHERE TABLE_SCHEMA = %s
            AND TABLE_TYPE = 'BASE TABLE'
        """
        cursor.execute(query, (self.connection_config["database"],))
        tables = [table[0] for table in cursor.fetchall()]
        cursor.close()
        return tables

    def get_indexes(self, table_name: str) -> List[Dict[str, Any]]:
        """
        Get index information for a table.

        Args:
            table_name: Name of the table

        Returns:
            List of index information dictionaries with keys
            ``name``, ``columns``, ``is_unique``, ``type``.

        Raises:
            ConnectionError: If no connection has been established.
        """
        if not self.connection:
            raise ConnectionError("Not connected to database")

        cursor = self.connection.cursor()
        query = """
            SELECT
                INDEX_NAME,
                COLUMN_NAME,
                NON_UNIQUE,
                INDEX_TYPE
            FROM INFORMATION_SCHEMA.STATISTICS
            WHERE TABLE_SCHEMA = %s
            AND TABLE_NAME = %s
            ORDER BY INDEX_NAME, SEQ_IN_INDEX
        """
        cursor.execute(query, (self.connection_config["database"], table_name))

        # Group the per-column rows into one entry per index, preserving
        # column order (rows are ordered by SEQ_IN_INDEX).
        indexes = {}
        for index_name, column_name, non_unique, index_type in cursor.fetchall():
            if index_name not in indexes:
                indexes[index_name] = {
                    "name": index_name,
                    "columns": [],
                    # NON_UNIQUE is 0 for unique indexes.
                    "is_unique": non_unique == 0,
                    "type": index_type,
                }
            indexes[index_name]["columns"].append(column_name)

        cursor.close()
        return list(indexes.values())
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
"""PostgreSQL-specific database analyzer implementation."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
5
|
+
|
|
6
|
+
try:
|
|
7
|
+
import psycopg2 # type: ignore[import-not-found]
|
|
8
|
+
import psycopg2.extras # type: ignore[import-not-found]
|
|
9
|
+
from psycopg2 import sql # type: ignore[import-not-found]
|
|
10
|
+
except ImportError as import_error: # pragma: no cover - optional dependency
|
|
11
|
+
psycopg2 = None # type: ignore[assignment]
|
|
12
|
+
sql = None # type: ignore[assignment]
|
|
13
|
+
_PSYCOPG2_IMPORT_ERROR = import_error
|
|
14
|
+
else:
|
|
15
|
+
_PSYCOPG2_IMPORT_ERROR = None
|
|
16
|
+
|
|
17
|
+
from ..analyzer import DatabaseAnalyzer
|
|
18
|
+
from ..models import ColumnInfo, ForeignKeyInfo
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class PostgreSQLAnalyzer(DatabaseAnalyzer):
    """PostgreSQL-specific implementation of DatabaseAnalyzer.

    Requires the optional ``psycopg2`` dependency (guarded at import time);
    metadata is read from information_schema and pg_indexes, scoped to a
    single schema (default ``public``).
    """

    def __init__(
        self,
        host: str,
        user: str,
        password: str,
        database: str,
        port: int = 5432,
        schema: str = "public",
    ):
        """Initialize the analyzer with connection settings.

        Args:
            host: PostgreSQL server hostname.
            user: PostgreSQL username.
            password: PostgreSQL password.
            database: Database name.
            port: Server port (default: 5432).
            schema: Schema all metadata queries are scoped to
                (default: "public").
        """
        connection_config = {
            "host": host,
            "user": user,
            "password": password,
            "database": database,
            "port": port,
            "schema": schema,
        }
        self._schema = schema
        super().__init__(connection_config)

    def _get_database_type(self) -> str:
        """Return the database type identifier."""
        return "postgresql"

    def connect(self) -> bool:
        """Establish a connection to the PostgreSQL database.

        Returns:
            True on success; False on a connection error (logged).

        Raises:
            ImportError: If psycopg2 is not installed (chained from the
                original import failure).
        """
        if psycopg2 is None:
            raise ImportError(
                "psycopg2 is required for PostgreSQL support"
            ) from _PSYCOPG2_IMPORT_ERROR

        try:
            # "schema" is our own bookkeeping key, not a psycopg2.connect
            # keyword argument, so it is stripped before connecting.
            connect_config = {
                key: value
                for key, value in self.connection_config.items()
                if key != "schema"
            }
            self.connection = psycopg2.connect(**connect_config)
            logger.info("Successfully connected to PostgreSQL database")
            return True
        except psycopg2.Error as exc:
            logger.error("Error connecting to PostgreSQL: %s", exc)
            self.connection = None
            return False

    def disconnect(self) -> None:
        """Close the connection if open and clear the handle."""
        if self.connection:
            self.connection.close()
            self.connection = None
            logger.info("PostgreSQL connection closed")

    def get_tables(self) -> List[str]:
        """Return all table AND view names in the configured schema."""
        connection = self._require_connection()
        schema = self._schema_name()
        query = """
            SELECT table_name
            FROM information_schema.tables
            WHERE table_schema = %s
            AND table_type IN ('BASE TABLE', 'VIEW')
            ORDER BY table_name
        """

        cursor = connection.cursor()
        cursor.execute(query, (schema,))
        tables = [row[0] for row in cursor.fetchall()]
        cursor.close()
        return tables

    def get_table_schema(self, table_name: str) -> List[ColumnInfo]:
        """Return ColumnInfo for every column of *table_name*.

        Primary/foreign key flags are resolved via separate
        information_schema lookups; auto-increment is inferred from a
        ``nextval(...)`` column default (serial/bigserial columns).
        """
        connection = self._require_connection()
        schema = self._schema_name()

        column_query = """
            SELECT
                column_name,
                data_type,
                is_nullable,
                column_default,
                character_maximum_length,
                numeric_precision,
                numeric_scale
            FROM information_schema.columns
            WHERE table_schema = %s
            AND table_name = %s
            ORDER BY ordinal_position
        """

        cursor = connection.cursor()
        cursor.execute(column_query, (schema, table_name))
        column_rows = cursor.fetchall()
        cursor.close()

        primary_keys = set(self._get_primary_key_columns(table_name))
        foreign_keys = self.get_foreign_keys(table_name)
        fk_column_names = {fk.column_name for fk in foreign_keys}

        columns: List[ColumnInfo] = []
        for (
            column_name,
            data_type,
            is_nullable,
            column_default,
            char_max_length,
            numeric_precision,
            numeric_scale,
        ) in column_rows:
            # A nextval(...) default marks serial/identity-style columns.
            auto_increment = False
            if isinstance(column_default, str):
                auto_increment = column_default.lower().startswith("nextval(")

            max_length = int(char_max_length) if char_max_length is not None else None
            precision = (
                int(numeric_precision) if numeric_precision is not None else None
            )
            scale = int(numeric_scale) if numeric_scale is not None else None

            columns.append(
                ColumnInfo(
                    name=column_name,
                    data_type=data_type,
                    is_nullable=is_nullable == "YES",
                    is_primary_key=column_name in primary_keys,
                    is_foreign_key=column_name in fk_column_names,
                    default_value=column_default,
                    auto_increment=auto_increment,
                    max_length=max_length,
                    precision=precision,
                    scale=scale,
                )
            )

        return columns

    def get_foreign_keys(self, table_name: str) -> List[ForeignKeyInfo]:
        """Return foreign-key constraints declared on *table_name*."""
        connection = self._require_connection()
        schema = self._schema_name()
        query = """
            SELECT
                kcu.column_name,
                ccu.table_name AS referenced_table,
                ccu.column_name AS referenced_column,
                tc.constraint_name
            FROM information_schema.table_constraints AS tc
            JOIN information_schema.key_column_usage AS kcu
                ON tc.constraint_name = kcu.constraint_name
                AND tc.table_schema = kcu.table_schema
            JOIN information_schema.constraint_column_usage AS ccu
                ON ccu.constraint_name = tc.constraint_name
                AND ccu.table_schema = tc.table_schema
            WHERE tc.table_schema = %s
            AND tc.table_name = %s
            AND tc.constraint_type = 'FOREIGN KEY'
            ORDER BY tc.constraint_name, kcu.ordinal_position
        """

        cursor = connection.cursor()
        cursor.execute(query, (schema, table_name))
        foreign_keys = [
            ForeignKeyInfo(
                column_name=row[0],
                referenced_table=row[1],
                referenced_column=row[2],
                constraint_name=row[3],
            )
            for row in cursor.fetchall()
        ]
        cursor.close()
        return foreign_keys

    def get_table_data(
        self, table_name: str, limit: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """Return rows from *table_name* as dicts, optionally LIMITed.

        Identifiers are composed with psycopg2's sql.Identifier so table
        and schema names are safely quoted; the limit is bound as a
        parameter.
        """
        connection = self._require_connection()
        schema = self._schema_name()
        if sql is None or psycopg2 is None:  # pragma: no cover - import guard
            raise ImportError("psycopg2 is required for PostgreSQL support")

        query = sql.SQL("SELECT * FROM {}.{}").format(
            sql.Identifier(schema), sql.Identifier(table_name)
        )

        params: Optional[Tuple[int, ...]] = None
        if limit is not None:
            query = query + sql.SQL(" LIMIT %s")
            params = (limit,)

        # RealDictCursor yields mapping rows keyed by column name.
        cursor = connection.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        cursor.execute(query, params)
        rows = cursor.fetchall()
        cursor.close()
        return [dict(row) for row in rows]

    def get_table_row_count(self, table_name: str) -> int:
        """Return SELECT COUNT(*) for *table_name*."""
        connection = self._require_connection()
        schema = self._schema_name()
        if sql is None:  # pragma: no cover - import guard
            raise ImportError("psycopg2 is required for PostgreSQL support")

        query = sql.SQL("SELECT COUNT(*) FROM {}.{}").format(
            sql.Identifier(schema), sql.Identifier(table_name)
        )

        cursor = connection.cursor()
        cursor.execute(query)
        count = cursor.fetchone()[0]
        cursor.close()
        return int(count)

    def is_view(self, table_name: str) -> bool:
        """Return True if *table_name* is a view (False if absent)."""
        connection = self._require_connection()
        schema = self._schema_name()
        query = """
            SELECT table_type
            FROM information_schema.tables
            WHERE table_schema = %s
            AND table_name = %s
        """

        cursor = connection.cursor()
        cursor.execute(query, (schema, table_name))
        result = cursor.fetchone()
        cursor.close()
        if result:
            return result[0] == "VIEW"
        return False

    def get_indexes(self, table_name: str) -> List[Dict[str, Any]]:
        """Return index metadata parsed from pg_indexes definitions.

        Each entry has ``name``, ``columns``, ``is_unique``, ``type`` and
        the raw ``definition`` string.
        """
        connection = self._require_connection()
        schema = self._schema_name()
        query = """
            SELECT indexname, indexdef
            FROM pg_indexes
            WHERE schemaname = %s
            AND tablename = %s
            ORDER BY indexname
        """

        cursor = connection.cursor()
        cursor.execute(query, (schema, table_name))

        indexes: List[Dict[str, Any]] = []
        for index_name, index_def in cursor.fetchall():
            index_info: Dict[str, Any] = {
                "name": index_name,
                "columns": self._parse_index_columns(index_def),
                "is_unique": index_def.upper().startswith("CREATE UNIQUE"),
                "type": self._parse_index_type(index_def),
                "definition": index_def,
            }
            indexes.append(index_info)

        cursor.close()
        return indexes

    def _schema_name(self) -> str:
        """Return the schema to query, falling back to "public"."""
        return self.connection_config.get("schema", self._schema or "public")

    def _require_connection(self) -> Any:
        """Return the live connection or raise ConnectionError."""
        if self.connection is None:
            raise ConnectionError("Not connected to database")
        return self.connection

    def _get_primary_key_columns(self, table_name: str) -> List[str]:
        """Return primary-key column names in constraint order."""
        connection = self._require_connection()
        schema = self._schema_name()
        query = """
            SELECT kcu.column_name
            FROM information_schema.table_constraints AS tc
            JOIN information_schema.key_column_usage AS kcu
                ON tc.constraint_name = kcu.constraint_name
                AND tc.table_schema = kcu.table_schema
            WHERE tc.table_schema = %s
            AND tc.table_name = %s
            AND tc.constraint_type = 'PRIMARY KEY'
            ORDER BY kcu.ordinal_position
        """

        cursor = connection.cursor()
        cursor.execute(query, (schema, table_name))
        primary_keys = [row[0] for row in cursor.fetchall()]
        cursor.close()
        return primary_keys

    def _parse_index_columns(self, index_def: str) -> List[str]:
        """Extract the column list from a CREATE INDEX definition string.

        Best-effort text parse of the parenthesized column list; returns
        [] when the definition has no recognizable parentheses.
        """
        if "(" not in index_def or ")" not in index_def:
            return []

        try:
            # Take everything between the first "(" and the last ")".
            columns_part = index_def.split("(", 1)[1].rsplit(")", 1)[0]
        except (IndexError, ValueError):
            return []

        columns = []
        for raw_column in columns_part.split(","):
            column = raw_column.strip().strip('"')
            if column:
                columns.append(column)
        return columns

    def _parse_index_type(self, index_def: str) -> Optional[str]:
        """Extract the access method (btree, gin, ...) from an index definition.

        Looks for the " USING " clause; returns None when absent.
        """
        marker = " USING "
        upper_def = index_def.upper()
        if marker not in upper_def:
            return None

        try:
            start_index = upper_def.index(marker) + len(marker)
            postfix = index_def[start_index:]
            index_type = postfix.split(" ", 1)[0].strip()
            return index_type.lower() if index_type else None
        except ValueError:
            return None