structured2graph 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. structured2graph-0.1.1/.env.example +52 -0
  2. structured2graph-0.1.1/.gitignore +182 -0
  3. structured2graph-0.1.1/LICENSE +21 -0
  4. structured2graph-0.1.1/PKG-INFO +197 -0
  5. structured2graph-0.1.1/PROMPT.md +90 -0
  6. structured2graph-0.1.1/README.md +155 -0
  7. structured2graph-0.1.1/__init__.py +47 -0
  8. structured2graph-0.1.1/core/__init__.py +23 -0
  9. structured2graph-0.1.1/core/hygm/__init__.py +74 -0
  10. structured2graph-0.1.1/core/hygm/hygm.py +2351 -0
  11. structured2graph-0.1.1/core/hygm/models/__init__.py +82 -0
  12. structured2graph-0.1.1/core/hygm/models/graph_models.py +667 -0
  13. structured2graph-0.1.1/core/hygm/models/llm_models.py +229 -0
  14. structured2graph-0.1.1/core/hygm/models/operations.py +176 -0
  15. structured2graph-0.1.1/core/hygm/models/sources.py +68 -0
  16. structured2graph-0.1.1/core/hygm/models/user_operations.py +139 -0
  17. structured2graph-0.1.1/core/hygm/strategies/__init__.py +17 -0
  18. structured2graph-0.1.1/core/hygm/strategies/base.py +36 -0
  19. structured2graph-0.1.1/core/hygm/strategies/deterministic.py +262 -0
  20. structured2graph-0.1.1/core/hygm/strategies/llm.py +904 -0
  21. structured2graph-0.1.1/core/hygm/validation/__init__.py +38 -0
  22. structured2graph-0.1.1/core/hygm/validation/base.py +194 -0
  23. structured2graph-0.1.1/core/hygm/validation/graph_schema_validator.py +687 -0
  24. structured2graph-0.1.1/core/hygm/validation/memgraph_data_validator.py +991 -0
  25. structured2graph-0.1.1/core/migration_agent.py +1369 -0
  26. structured2graph-0.1.1/core/schema/spec.json +155 -0
  27. structured2graph-0.1.1/core/utils/meta_graph.py +108 -0
  28. structured2graph-0.1.1/database/__init__.py +36 -0
  29. structured2graph-0.1.1/database/adapters/__init__.py +11 -0
  30. structured2graph-0.1.1/database/adapters/memgraph.py +318 -0
  31. structured2graph-0.1.1/database/adapters/mysql.py +311 -0
  32. structured2graph-0.1.1/database/adapters/postgresql.py +335 -0
  33. structured2graph-0.1.1/database/analyzer.py +396 -0
  34. structured2graph-0.1.1/database/factory.py +219 -0
  35. structured2graph-0.1.1/database/models.py +209 -0
  36. structured2graph-0.1.1/examples/__init__.py +3 -0
  37. structured2graph-0.1.1/examples/basic_migration.py +59 -0
  38. structured2graph-0.1.1/examples/constraint_operations_example.py +0 -0
  39. structured2graph-0.1.1/main.py +518 -0
  40. structured2graph-0.1.1/output/mapping.json +115 -0
  41. structured2graph-0.1.1/pyproject.toml +79 -0
  42. structured2graph-0.1.1/query_generation/__init__.py +20 -0
  43. structured2graph-0.1.1/query_generation/cypher_generator.py +129 -0
  44. structured2graph-0.1.1/query_generation/schema_utilities.py +88 -0
  45. structured2graph-0.1.1/tests/__init__.py +3 -0
  46. structured2graph-0.1.1/tests/test_integration.py +1523 -0
  47. structured2graph-0.1.1/utils/__init__.py +57 -0
  48. structured2graph-0.1.1/utils/config.py +235 -0
  49. structured2graph-0.1.1/utils/environment.py +404 -0
  50. structured2graph-0.1.1/uv.lock +4333 -0
@@ -0,0 +1,52 @@
1
+ # LLM API Configuration (choose one or more)
2
+ # OpenAI
3
+ OPENAI_API_KEY=your_actual_openai_api_key
4
+
5
+ # Anthropic (Claude)
6
+ # ANTHROPIC_API_KEY=your_actual_anthropic_api_key
7
+
8
+ # Google (Gemini)
9
+ # GOOGLE_API_KEY=your_actual_google_api_key
10
+
11
+ # LLM Provider Configuration (optional - auto-detects from available API keys)
12
+ # LLM_PROVIDER: openai, anthropic, gemini
13
+ # LLM_PROVIDER=openai
14
+ # LLM_MODEL=gpt-4o
15
+ # Defaults by provider:
16
+ # - OpenAI: gpt-4o
17
+ # - Anthropic: claude-3-5-sonnet-20241022
18
+ # - Gemini: gemini-2.0-flash-exp
19
+
20
+ # Migration Defaults (can be overridden via CLI flags)
21
+ # SQL2MG_MODE: automatic, incremental
22
+ SQL2MG_MODE=automatic
23
+ # SQL2MG_STRATEGY: deterministic, llm
24
+ SQL2MG_STRATEGY=deterministic
25
+ # SQL2MG_META_POLICY: auto, reset, skip
26
+ SQL2MG_META_POLICY=auto
27
+ SQL2MG_LOG_LEVEL=INFO
28
+
29
+ # Source Database Selection
30
+ # SOURCE_DB_TYPE: mysql, postgresql
31
+ SOURCE_DB_TYPE=mysql
32
+
33
+ # MySQL Database Configuration (used when SOURCE_DB_TYPE=mysql)
34
+ MYSQL_HOST=host.docker.internal
35
+ MYSQL_USER=root
36
+ MYSQL_PASSWORD=your_mysql_password
37
+ MYSQL_DATABASE=sakila
38
+ MYSQL_PORT=3306
39
+
40
+ # PostgreSQL Database Configuration (used when SOURCE_DB_TYPE=postgresql)
41
+ POSTGRES_HOST=localhost
42
+ POSTGRES_USER=postgres
43
+ POSTGRES_PASSWORD=your_postgres_password
44
+ POSTGRES_DATABASE=postgres
45
+ POSTGRES_PORT=5432
46
+ POSTGRES_SCHEMA=public
47
+
48
+ # Memgraph Database Configuration
49
+ MEMGRAPH_URL=bolt://localhost:7687
50
+ MEMGRAPH_USERNAME=
51
+ MEMGRAPH_PASSWORD=
52
+ MEMGRAPH_DATABASE=memgraph
@@ -0,0 +1,182 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # Ruff stuff:
171
+ .ruff_cache/
172
+
173
+ # PyPI configuration file
174
+ .pypirc
175
+
176
+
177
+ # VS Code
178
+ .vscode/
179
+
180
+
181
+ # Project specific files
182
+ /enterprise-context/sic-agent/sic-scrapper/output/*
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Memgraph Ltd.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,197 @@
1
+ Metadata-Version: 2.4
2
+ Name: structured2graph
3
+ Version: 0.1.1
4
+ Summary: Database migration agent from structured data (e.g. SQL) to graph.
5
+ Project-URL: Homepage, https://github.com/memgraph/ai-toolkit
6
+ Project-URL: Repository, https://github.com/memgraph/ai-toolkit
7
+ Project-URL: Issues, https://github.com/memgraph/ai-toolkit/issues
8
+ Project-URL: Documentation, https://github.com/memgraph/ai-toolkit/tree/main/agents/sql2graph
9
+ Author-email: Memgraph <tech@memgraph.com>
10
+ Maintainer-email: Memgraph <tech@memgraph.com>
11
+ License-Expression: MIT
12
+ License-File: LICENSE
13
+ Keywords: database-migration,etl,graph-database,knowledge-graph,memgraph,sql-to-graph
14
+ Classifier: Development Status :: 3 - Alpha
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Database
23
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
+ Requires-Python: >=3.10
25
+ Requires-Dist: anthropic>=0.40.0
26
+ Requires-Dist: langchain-anthropic>=0.3.0
27
+ Requires-Dist: langchain-core>=1.0.0
28
+ Requires-Dist: langchain-google-genai>=2.0.0
29
+ Requires-Dist: langchain-openai>=0.2.0
30
+ Requires-Dist: langchain>=1.0.0
31
+ Requires-Dist: langgraph>=0.2.0
32
+ Requires-Dist: memgraph-toolbox>=0.1.4
33
+ Requires-Dist: mysql-connector-python>=9.0.0
34
+ Requires-Dist: neo4j>=5.0.0
35
+ Requires-Dist: openai>=1.0.0
36
+ Requires-Dist: psycopg2-binary>=2.9
37
+ Requires-Dist: pydantic>=2.0.0
38
+ Requires-Dist: pymysql>=1.1.0
39
+ Requires-Dist: python-dotenv>=1.0.0
40
+ Requires-Dist: sqlalchemy>=2.0.0
41
+ Description-Content-Type: text/markdown
42
+
43
+ # SQL Database to Graph Migration Agent
44
+
45
+ Intelligent database migration agent that transforms SQL databases (MySQL, PostgreSQL) into graph databases, powered by LLM analysis and LangGraph workflows.
46
+
47
+ ## Overview
48
+
49
+ This package provides a sophisticated migration agent that:
50
+
51
+ - **Analyzes SQL database schemas** - Automatically discovers tables, relationships, and constraints
52
+ - **Generates optimal graph models** - Uses AI to create node and relationship structures
53
+ - **Creates indexes and constraints** - Ensures performance and data integrity
54
+ - **Handles complex relationships** - Converts foreign keys to graph relationships
55
+ - **Incremental refinement** - Review each table, adjust the model
56
+ immediately, then enter the interactive refinement loop once all tables
57
+ are processed
58
+ - **Comprehensive validation** - Verifies migration results and data integrity
59
+
60
+ ## Installation
61
+
62
+ ```bash
63
+ # Install the package
64
+ uv pip install .
65
+
66
+ # Or install in development mode
67
+ uv pip install -e .
68
+ ```
69
+
70
+ ## Quick Start
71
+
72
+ Run the migration agent:
73
+
74
+ ```bash
75
+ uv run main
76
+ ```
77
+
78
+ The agent will guide you through:
79
+
80
+ 1. Environment setup and database connections
81
+ 2. Graph modeling strategy selection
82
+ 3. Automatic or incremental migration mode
83
+ 4. Complete migration workflow with progress tracking
84
+
85
+ > **Incremental review:** The LLM now drafts the entire graph model in a single
86
+ > shot and then walks you through table-level changes detected since the last
87
+ > migration. You only need to approve (or tweak) the differences that matter.
88
+
89
+ You can also preconfigure the workflow using CLI flags or environment variables:
90
+
91
+ ```bash
92
+ uv run main --mode incremental --strategy llm --meta-graph reset --log-level DEBUG
93
+ ```
94
+
95
+ | Option | Environment | Description |
96
+ | -------------------------------------- | -------------------- | ------------------------------------------------------------- |
97
+ | `--mode {automatic,incremental}` | `SQL2MG_MODE` | Selects automatic or incremental modeling flow. |
98
+ | `--strategy {deterministic,llm}` | `SQL2MG_STRATEGY` | Chooses deterministic or LLM-powered HyGM strategy. |
99
+ | `--provider {openai,anthropic,gemini}` | `LLM_PROVIDER` | Selects LLM provider (auto-detects if not specified). |
100
+ | `--model MODEL_NAME` | `LLM_MODEL` | Specifies LLM model name (uses provider default if not set). |
101
+ | `--meta-graph {auto,skip,reset}` | `SQL2MG_META_POLICY` | Controls how stored meta graph data is used (default `auto`). |
102
+ | `--log-level LEVEL` | `SQL2MG_LOG_LEVEL` | Sets logging verbosity (`DEBUG`, `INFO`, etc.). |
103
+
104
+ ## Configuration
105
+
106
+ Set up your environment variables in `.env`:
107
+
108
+ ```bash
109
+ # Select source database (mysql or postgresql)
110
+ SOURCE_DB_TYPE=postgresql
111
+
112
+ # PostgreSQL Database (used when SOURCE_DB_TYPE=postgresql)
113
+ POSTGRES_HOST=localhost
114
+ POSTGRES_PORT=5432
115
+ POSTGRES_DATABASE=pagila
116
+ POSTGRES_USER=username
117
+ POSTGRES_PASSWORD=password
118
+ POSTGRES_SCHEMA=public
119
+
120
+ # MySQL Database (used when SOURCE_DB_TYPE=mysql)
121
+ MYSQL_HOST=localhost
122
+ MYSQL_PORT=3306
123
+ MYSQL_DATABASE=sakila
124
+ MYSQL_USER=username
125
+ MYSQL_PASSWORD=password
126
+
127
+ # Memgraph Database
128
+ MEMGRAPH_URL=bolt://localhost:7687
129
+ MEMGRAPH_USERNAME=
130
+ MEMGRAPH_PASSWORD=
131
+ MEMGRAPH_DATABASE=memgraph
132
+
133
+ # LLM API Keys (for LLM-powered features - choose one or more)
134
+ OPENAI_API_KEY=your_openai_key # For GPT models
135
+ # ANTHROPIC_API_KEY=your_anthropic_key # For Claude models
136
+ # GOOGLE_API_KEY=your_google_key # For Gemini models
137
+
138
+ # LLM Provider Configuration (optional - auto-detects if not set)
139
+ # LLM_PROVIDER=openai # Options: openai, anthropic, gemini
140
+ # LLM_MODEL=gpt-4o-mini # Specific model name
141
+
142
+ # Optional migration defaults (override CLI prompts)
143
+ SQL2MG_MODE=automatic
144
+ SQL2MG_STRATEGY=deterministic
145
+ SQL2MG_META_POLICY=auto
146
+ SQL2MG_LOG_LEVEL=INFO
147
+ ```
148
+
149
+ When switching `SOURCE_DB_TYPE` remember to update the matching credential block and rerun `uv sync` so dependencies like `psycopg2-binary` are installed for PostgreSQL support.
150
+
151
+ Make sure that Memgraph is started with the `--schema-info-enabled=true` flag, since the agent uses the schema information from Memgraph's `SHOW SCHEMA INFO`.
152
+
153
+ ## Multi-LLM Provider Support
154
+
155
+ The agent supports multiple LLM providers for AI-powered graph modeling:
156
+
157
+ ### Supported Providers
158
+
159
+ - **OpenAI** (GPT models) - Default: `gpt-4o-mini`
160
+ - **Anthropic** (Claude models) - Default: `claude-3-5-sonnet-20241022`
161
+ - **Google** (Gemini models) - Default: `gemini-1.5-pro`
162
+
163
+ ### Usage Examples
164
+
165
+ ```bash
166
+ # Auto-detect provider based on API keys
167
+ uv run main --strategy llm
168
+
169
+ # Use specific provider
170
+ uv run main --strategy llm --provider anthropic
171
+
172
+ # Use specific model
173
+ uv run main --strategy llm --provider openai --model gpt-4o
174
+
175
+ # All options together
176
+ uv run main --mode incremental --strategy llm --provider gemini --model gemini-1.5-flash
177
+ ```
178
+
179
+ All providers support **structured outputs** for consistent graph model generation. The system automatically validates schemas using Pydantic models.
180
+
181
+ 📖 **[Full Multi-Provider Documentation](docs/MULTI_PROVIDER_SUPPORT.md)**
182
+
183
+ # Architecture
184
+
185
+ ```
186
+ core/hygm/
187
+ ├── hygm.py # Main orchestrator class
188
+ ├── models/ # Data models and structures
189
+ │ ├── graph_models.py # Core graph representation
190
+ │ ├── llm_models.py # LLM-specific models
191
+ │ ├── operations.py # Interactive operations
192
+ │ └── sources.py # Source tracking
193
+ └── strategies/ # Modeling strategies
194
+ ├── base.py # Abstract interface
195
+ ├── deterministic.py # Rule-based modeling
196
+ └── llm.py # AI-powered modeling
197
+ ```
@@ -0,0 +1,90 @@
1
+ # SQL → Graph Migration Agent Prompt
2
+
3
+ ## TL;DR
4
+
5
+ You are working inside the `agents/sql2graph` package, a UV-managed Python project that turns relational schemas (MySQL/PostgreSQL) into graph schemas and data. The primary entry point is `main.py`, which wires configuration, environment validation, database analyzers, and the HyGM graph-modeling subsystem. Changes usually touch:
6
+
7
+ - `core/` — Orchestrates the migration workflow (`migration_agent.py`) and HyGM graph modeling (`hygm/`).
8
+ - `database/` — Connectors and analyzers for the source RDBMSs.
9
+ - `query_generation/` — Cypher generation helpers.
10
+ - `utils/` — Environment setup, connection probes, and CLI helpers.
11
+
12
+ Always maintain the CLI experience (`main.py`) and respect the line-length < 79 char lint rule.
13
+
14
+ ## Tech Stack & Tooling
15
+
16
+ - Python 3.10+, managed with [uv](https://github.com/astral-sh/uv).
17
+ - Memgraph as the target graph database (Bolt connection).
18
+ - Optional LLM features powered by OpenAI (LangChain / LangGraph patterns inside `core/hygm`).
19
+ - Testing: `pytest` under `tests/` (integration heavy, uses mocks for DB analyzers).
20
+
21
+ ## Core Concepts
22
+
23
+ - **HyGM (Hypothetical Graph Modeling)** lives in `core/hygm/hygm.py` and exposes modeling modes via `ModelingMode`:
24
+ - `AUTOMATIC` – one-shot graph generation.
25
+ - `INCREMENTAL` – table-by-table confirmation flow with an optional refinement
26
+ loop after processing all tables.
27
+ - **GraphModelingStrategy**: `DETERMINISTIC` (rule-based) and `LLM_POWERED` (needs an LLM+API key).
28
+ - **SQLToMemgraphAgent** (`core/migration_agent.py`) coordinates schema analysis, HyGM modeling, query generation, execution, and validation (note: class name retained for compatibility).
29
+ - **Database analyzers** in `database/` introspect MySQL/PostgreSQL schemas and emit a normalized metadata structure consumed by HyGM.
30
+ - **Query generation** in `query_generation/` converts the graph model + metadata into Cypher migrations, indexes, and constraints.
31
+ - **Database data interfaces** in `database/models.py` define the canonical `TableInfo`, `ColumnInfo`, `RelationshipInfo`, and `DatabaseStructure` data classes. These objects flow from analyzers into HyGM via the `to_hygm_format()` helpers, ensuring consistent schema metadata for every modeling mode.
32
+ - **Graph schema structures** in `core/hygm/models/graph_models.py` (e.g., `GraphModel`, `GraphNode`, `GraphRelationship`) capture the in-memory graph representation HyGM produces and later serializes to schema format.
33
+ - **LLM structured output models** in `core/hygm/models/llm_models.py` (`LLMGraphModel`, `LLMGraphNode`, `LLMGraphRelationship`) describe the contract for AI-generated schemas and include `to_graph_model()` utilities to convert LLM responses into the standard `GraphModel` objects.
34
+ - The `GraphModel` serialization format matches the canonical spec in `core/schema/spec.json`, so any changes to the schema data classes should be mirrored against that document.
35
+ - Source tracking helpers in `core/hygm/models/sources.py` annotate nodes, relationships, properties, indexes, and constraints with origin metadata. Preserve these when modifying `GraphModel` so downstream migrations retain the link back to the originating tables or user-applied changes.
36
+
37
+ ## Entry Points & CLI Flow
38
+
39
+ - Run with `uv run main.py` (banner, env checks, connection probes, then the migration workflow).
40
+ - CLI prompts include:
41
+ - Graph modeling mode (automatic / incremental with interactive refinement).
42
+ - Modeling strategy (deterministic / AI-powered).
43
+ - Confirmation dialogs during automatic or incremental flows.
44
+ - Post-session prompts that let users launch the interactive refinement loop
45
+ after reviewing every table in an incremental run.
46
+ - Environment validation happens before migration; failures raise `MigrationEnvironmentError` or `DatabaseConnectionError` from `utils/`.
47
+
48
+ ## Configuration & Environment
49
+
50
+ - `.env` (or env vars) must provide:
51
+ - `SOURCE_DB_*` (host, port, name, user, password, type [`mysql|postgresql`]).
52
+ - `MEMGRAPH_*` connection details.
53
+ - `OPENAI_API_KEY` for LLM features; omit or leave empty to disable LLM strategy.
54
+ - Memgraph must run with `--schema-info-enabled=true` for schema validation.
55
+
56
+ ## Testing & Validation
57
+
58
+ - Install deps: `uv sync` (or `uv pip install -e .`).
59
+ - Run targeted tests: `uv run python -m pytest tests/test_integration.py -v`.
60
+ - Keep graph-modeling logic covered via integration tests; they rely on mocked analyzers.
61
+ - Observe linting: adhere to 79-character lines and existing logging conventions (`logging` module).
62
+
63
+ ## Development Tips
64
+
65
+ - Update `PROMPT.md` when project layout or workflows change.
66
+ - Prefer existing abstractions: use `SQLToMemgraphAgent` methods, HyGM strategies/helpers, and database adapters.
67
+ - For new modeling flows, ensure `ModelingMode` and CLI choices stay in sync.
68
+ - Preserve user-facing prompts/emojis—they guide the interactive experience.
69
+ - When adding LLM-dependent features, guard them when API keys or clients are missing.
70
+ - Document new commands or config expectations in this prompt and `README.md` if user-facing.
71
+
72
+ ## Useful Commands
73
+
74
+ ```bash
75
+ # Sync dependencies
76
+ uv sync
77
+
78
+ # Run the CLI
79
+ uv run main.py
80
+
81
+ # Run tests
82
+ uv run python -m pytest tests -v
83
+ ```
84
+
85
+ ## When Generating Code
86
+
87
+ - Mention which files you change and why, referencing modules above.
88
+ - Explain how to rerun the CLI or relevant tests after modifications.
89
+ - Provide small follow-up suggestions if more validation is needed.
90
+ - Keep output concise but cover context so the next agent run has everything it needs.
@@ -0,0 +1,155 @@
1
+ # SQL Database to Graph Migration Agent
2
+
3
+ Intelligent database migration agent that transforms SQL databases (MySQL, PostgreSQL) into graph databases, powered by LLM analysis and LangGraph workflows.
4
+
5
+ ## Overview
6
+
7
+ This package provides a sophisticated migration agent that:
8
+
9
+ - **Analyzes SQL database schemas** - Automatically discovers tables, relationships, and constraints
10
+ - **Generates optimal graph models** - Uses AI to create node and relationship structures
11
+ - **Creates indexes and constraints** - Ensures performance and data integrity
12
+ - **Handles complex relationships** - Converts foreign keys to graph relationships
13
+ - **Incremental refinement** - Review each table, adjust the model
14
+ immediately, then enter the interactive refinement loop once all tables
15
+ are processed
16
+ - **Comprehensive validation** - Verifies migration results and data integrity
17
+
18
+ ## Installation
19
+
20
+ ```bash
21
+ # Install the package
22
+ uv pip install .
23
+
24
+ # Or install in development mode
25
+ uv pip install -e .
26
+ ```
27
+
28
+ ## Quick Start
29
+
30
+ Run the migration agent:
31
+
32
+ ```bash
33
+ uv run main
34
+ ```
35
+
36
+ The agent will guide you through:
37
+
38
+ 1. Environment setup and database connections
39
+ 2. Graph modeling strategy selection
40
+ 3. Automatic or incremental migration mode
41
+ 4. Complete migration workflow with progress tracking
42
+
43
+ > **Incremental review:** The LLM now drafts the entire graph model in a single
44
+ > shot and then walks you through table-level changes detected since the last
45
+ > migration. You only need to approve (or tweak) the differences that matter.
46
+
47
+ You can also preconfigure the workflow using CLI flags or environment variables:
48
+
49
+ ```bash
50
+ uv run main --mode incremental --strategy llm --meta-graph reset --log-level DEBUG
51
+ ```
52
+
53
+ | Option | Environment | Description |
54
+ | -------------------------------------- | -------------------- | ------------------------------------------------------------- |
55
+ | `--mode {automatic,incremental}` | `SQL2MG_MODE` | Selects automatic or incremental modeling flow. |
56
+ | `--strategy {deterministic,llm}` | `SQL2MG_STRATEGY` | Chooses deterministic or LLM-powered HyGM strategy. |
57
+ | `--provider {openai,anthropic,gemini}` | `LLM_PROVIDER` | Selects LLM provider (auto-detects if not specified). |
58
+ | `--model MODEL_NAME` | `LLM_MODEL` | Specifies LLM model name (uses provider default if not set). |
59
+ | `--meta-graph {auto,skip,reset}` | `SQL2MG_META_POLICY` | Controls how stored meta graph data is used (default `auto`). |
60
+ | `--log-level LEVEL` | `SQL2MG_LOG_LEVEL` | Sets logging verbosity (`DEBUG`, `INFO`, etc.). |
61
+
62
+ ## Configuration
63
+
64
+ Set up your environment variables in `.env`:
65
+
66
+ ```bash
67
+ # Select source database (mysql or postgresql)
68
+ SOURCE_DB_TYPE=postgresql
69
+
70
+ # PostgreSQL Database (used when SOURCE_DB_TYPE=postgresql)
71
+ POSTGRES_HOST=localhost
72
+ POSTGRES_PORT=5432
73
+ POSTGRES_DATABASE=pagila
74
+ POSTGRES_USER=username
75
+ POSTGRES_PASSWORD=password
76
+ POSTGRES_SCHEMA=public
77
+
78
+ # MySQL Database (used when SOURCE_DB_TYPE=mysql)
79
+ MYSQL_HOST=localhost
80
+ MYSQL_PORT=3306
81
+ MYSQL_DATABASE=sakila
82
+ MYSQL_USER=username
83
+ MYSQL_PASSWORD=password
84
+
85
+ # Memgraph Database
86
+ MEMGRAPH_URL=bolt://localhost:7687
87
+ MEMGRAPH_USERNAME=
88
+ MEMGRAPH_PASSWORD=
89
+ MEMGRAPH_DATABASE=memgraph
90
+
91
+ # LLM API Keys (for LLM-powered features - choose one or more)
92
+ OPENAI_API_KEY=your_openai_key # For GPT models
93
+ # ANTHROPIC_API_KEY=your_anthropic_key # For Claude models
94
+ # GOOGLE_API_KEY=your_google_key # For Gemini models
95
+
96
+ # LLM Provider Configuration (optional - auto-detects if not set)
97
+ # LLM_PROVIDER=openai # Options: openai, anthropic, gemini
98
+ # LLM_MODEL=gpt-4o-mini # Specific model name
99
+
100
+ # Optional migration defaults (override CLI prompts)
101
+ SQL2MG_MODE=automatic
102
+ SQL2MG_STRATEGY=deterministic
103
+ SQL2MG_META_POLICY=auto
104
+ SQL2MG_LOG_LEVEL=INFO
105
+ ```
106
+
107
+ When switching `SOURCE_DB_TYPE` remember to update the matching credential block and rerun `uv sync` so dependencies like `psycopg2-binary` are installed for PostgreSQL support.
108
+
109
+ Make sure that Memgraph is started with the `--schema-info-enabled=true` flag, since the agent uses the schema information from Memgraph's `SHOW SCHEMA INFO`.
110
+
111
+ ## Multi-LLM Provider Support
112
+
113
+ The agent supports multiple LLM providers for AI-powered graph modeling:
114
+
115
+ ### Supported Providers
116
+
117
+ - **OpenAI** (GPT models) - Default: `gpt-4o-mini`
118
+ - **Anthropic** (Claude models) - Default: `claude-3-5-sonnet-20241022`
119
+ - **Google** (Gemini models) - Default: `gemini-1.5-pro`
120
+
121
+ ### Usage Examples
122
+
123
+ ```bash
124
+ # Auto-detect provider based on API keys
125
+ uv run main --strategy llm
126
+
127
+ # Use specific provider
128
+ uv run main --strategy llm --provider anthropic
129
+
130
+ # Use specific model
131
+ uv run main --strategy llm --provider openai --model gpt-4o
132
+
133
+ # All options together
134
+ uv run main --mode incremental --strategy llm --provider gemini --model gemini-1.5-flash
135
+ ```
136
+
137
+ All providers support **structured outputs** for consistent graph model generation. The system automatically validates schemas using Pydantic models.
138
+
139
+ 📖 **[Full Multi-Provider Documentation](docs/MULTI_PROVIDER_SUPPORT.md)**
140
+
141
+ # Architecture
142
+
143
+ ```
144
+ core/hygm/
145
+ ├── hygm.py # Main orchestrator class
146
+ ├── models/ # Data models and structures
147
+ │ ├── graph_models.py # Core graph representation
148
+ │ ├── llm_models.py # LLM-specific models
149
+ │ ├── operations.py # Interactive operations
150
+ │ └── sources.py # Source tracking
151
+ └── strategies/ # Modeling strategies
152
+ ├── base.py # Abstract interface
153
+ ├── deterministic.py # Rule-based modeling
154
+ └── llm.py # AI-powered modeling
155
+ ```