structured2graph 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __init__.py +47 -0
- core/__init__.py +23 -0
- core/hygm/__init__.py +74 -0
- core/hygm/hygm.py +2351 -0
- core/hygm/models/__init__.py +82 -0
- core/hygm/models/graph_models.py +667 -0
- core/hygm/models/llm_models.py +229 -0
- core/hygm/models/operations.py +176 -0
- core/hygm/models/sources.py +68 -0
- core/hygm/models/user_operations.py +139 -0
- core/hygm/strategies/__init__.py +17 -0
- core/hygm/strategies/base.py +36 -0
- core/hygm/strategies/deterministic.py +262 -0
- core/hygm/strategies/llm.py +904 -0
- core/hygm/validation/__init__.py +38 -0
- core/hygm/validation/base.py +194 -0
- core/hygm/validation/graph_schema_validator.py +687 -0
- core/hygm/validation/memgraph_data_validator.py +991 -0
- core/migration_agent.py +1369 -0
- core/schema/spec.json +155 -0
- core/utils/meta_graph.py +108 -0
- database/__init__.py +36 -0
- database/adapters/__init__.py +11 -0
- database/adapters/memgraph.py +318 -0
- database/adapters/mysql.py +311 -0
- database/adapters/postgresql.py +335 -0
- database/analyzer.py +396 -0
- database/factory.py +219 -0
- database/models.py +209 -0
- main.py +518 -0
- query_generation/__init__.py +20 -0
- query_generation/cypher_generator.py +129 -0
- query_generation/schema_utilities.py +88 -0
- structured2graph-0.1.1.dist-info/METADATA +197 -0
- structured2graph-0.1.1.dist-info/RECORD +41 -0
- structured2graph-0.1.1.dist-info/WHEEL +4 -0
- structured2graph-0.1.1.dist-info/entry_points.txt +2 -0
- structured2graph-0.1.1.dist-info/licenses/LICENSE +21 -0
- utils/__init__.py +57 -0
- utils/config.py +235 -0
- utils/environment.py +404 -0
utils/environment.py
ADDED
|
@@ -0,0 +1,404 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Environment and Database Configuration Utilities
|
|
3
|
+
|
|
4
|
+
This module handles environment variable validation, database connection
|
|
5
|
+
probing, and configuration setup for the SQL to graph migration agent.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
import logging
|
|
10
|
+
from typing import Any, Dict, List, Tuple, Optional
|
|
11
|
+
from dotenv import load_dotenv
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class MigrationEnvironmentError(Exception):
    """Signal that the migration environment is not usable as configured.

    Raised by ``setup_and_validate_environment`` when environment variable
    validation reports missing variables.
    """
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class DatabaseConnectionError(Exception):
    """Signal that one or more database connection probes failed.

    Raised by ``probe_all_connections`` with a message that lists every
    failing connection.
    """
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Source database types accepted in SOURCE_DB_TYPE; values are compared after
# being stripped and lower-cased (see get_source_db_type).
SUPPORTED_DATABASES = {"mysql", "postgresql"}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def load_environment() -> None:
    """Read a ``.env`` file into the process environment.

    Thin wrapper around python-dotenv's ``load_dotenv()`` called with its
    default arguments.
    """
    load_dotenv()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def get_source_db_type() -> str:
    """Resolve the source database type from ``SOURCE_DB_TYPE``.

    The raw value (``"mysql"`` when the variable is unset) is stripped and
    lower-cased before being checked against ``SUPPORTED_DATABASES``.

    Returns:
        The normalized type when it is supported; otherwise ``"mysql"``,
        after logging a warning that names the rejected value.
    """
    raw_value = os.getenv("SOURCE_DB_TYPE", "mysql")
    normalized = raw_value.strip().lower()

    # Happy path first: a supported value is returned untouched.
    if normalized in SUPPORTED_DATABASES:
        return normalized

    logger.warning(
        "Unsupported SOURCE_DB_TYPE '%s'; defaulting to MySQL",
        normalized,
    )
    return "mysql"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def get_required_environment_variables() -> Dict[str, str]:
    """Describe the environment variables relevant to the selected source.

    Returns:
        Mapping of variable name to a human-readable description. The two
        entries common to every setup come first, followed by the
        PostgreSQL- or MySQL-specific entries depending on
        ``get_source_db_type()``.
    """
    # Note: LLM API keys are optional, at least one needed for LLM strategy
    # - OPENAI_API_KEY: OpenAI (GPT models)
    # - ANTHROPIC_API_KEY: Anthropic (Claude models)
    # - GOOGLE_API_KEY: Google (Gemini models)
    shared: Dict[str, str] = {
        "SOURCE_DB_TYPE": "Source database type (mysql|postgresql)",
        "MEMGRAPH_URL": "Memgraph connection URL (default: bolt://localhost:7687)",
    }

    if get_source_db_type() == "postgresql":
        source_specific = {
            "POSTGRES_HOST": "PostgreSQL host (default: localhost)",
            "POSTGRES_PORT": "PostgreSQL port (default: 5432)",
            "POSTGRES_USER": "PostgreSQL user (default: postgres)",
            "POSTGRES_PASSWORD": "PostgreSQL database password",
            "POSTGRES_DATABASE": "PostgreSQL database name (default: postgres)",
            "POSTGRES_SCHEMA": "PostgreSQL schema (default: public)",
        }
    else:
        source_specific = {
            "MYSQL_HOST": "MySQL host (default: host.docker.internal)",
            "MYSQL_PORT": "MySQL port (default: 3306)",
            "MYSQL_USER": "MySQL user (default: root)",
            "MYSQL_PASSWORD": "MySQL database password",
            "MYSQL_DATABASE": "MySQL database name (default: sakila)",
        }

    # Shared entries stay first so the listing order matches the original.
    return {**shared, **source_specific}
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def get_optional_environment_variables() -> Dict[str, str]:
    """Describe the environment variables that may be left unset.

    Returns:
        Mapping of variable name to description. The Memgraph credentials
        and database name are always present; ``POSTGRES_SCHEMA`` is added
        when the source database type is PostgreSQL.
    """
    descriptions = {
        "MEMGRAPH_USERNAME": "Memgraph username (default: empty)",
        "MEMGRAPH_PASSWORD": "Memgraph password (default: empty)",
        "MEMGRAPH_DATABASE": "Memgraph database name (default: memgraph)",
    }

    if get_source_db_type() != "postgresql":
        return descriptions

    # Only add the schema entry when it is not already described.
    if "POSTGRES_SCHEMA" not in descriptions:
        descriptions["POSTGRES_SCHEMA"] = "PostgreSQL schema (default: public)"
    return descriptions
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def validate_environment_variables() -> Tuple[bool, List[str]]:
    """Validate the environment for the selected source database.

    No variable is currently treated as mandatory: every setting read by
    this module has a default, and LLM API keys are checked separately when
    the LLM strategy is used. The only check performed here is a warning
    when the source database password is empty or unset, in which case a
    passwordless connection will be attempted.

    Returns:
        Tuple of (is_valid, missing_variables). With the current rules the
        list is always empty and ``is_valid`` is always ``True``.
    """
    missing_vars: List[str] = []

    # Resolve the source type exactly once so that an unsupported
    # SOURCE_DB_TYPE is warned about a single time during validation.
    if get_source_db_type() == "postgresql":
        password_var = "POSTGRES_PASSWORD"
    else:
        password_var = "MYSQL_PASSWORD"

    if not os.getenv(password_var):
        logger.warning(
            "%s missing; attempting passwordless connection", password_var
        )

    return not missing_vars, missing_vars
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def get_source_db_config() -> Dict[str, Any]:
    """Build the source database configuration from environment variables.

    Returns:
        For PostgreSQL: ``database_type``, ``host``, ``user``, ``password``,
        ``database``, ``port`` and ``schema``. For MySQL: the same keys
        without ``schema``. Unset variables fall back to the defaults
        visible below; ``port`` is always an ``int``.

    Raises:
        ValueError: If the configured port is set to a non-integer value.
    """
    if get_source_db_type() == "postgresql":
        return {
            "database_type": "postgresql",
            "host": os.getenv("POSTGRES_HOST", "localhost"),
            "user": os.getenv("POSTGRES_USER", "postgres"),
            "password": os.getenv("POSTGRES_PASSWORD", ""),
            "database": os.getenv("POSTGRES_DATABASE", "postgres"),
            "port": _read_port("POSTGRES_PORT", 5432),
            "schema": os.getenv("POSTGRES_SCHEMA", "public"),
        }

    return {
        "database_type": "mysql",
        "host": os.getenv("MYSQL_HOST", "host.docker.internal"),
        "user": os.getenv("MYSQL_USER", "root"),
        "password": os.getenv("MYSQL_PASSWORD", ""),
        "database": os.getenv("MYSQL_DATABASE", "sakila"),
        "port": _read_port("MYSQL_PORT", 3306),
    }


def _read_port(var_name: str, default: int) -> int:
    """Return the integer port stored in *var_name*, or *default* if blank."""
    raw = os.getenv(var_name, "").strip()
    if not raw:
        # A `.env` line such as `MYSQL_PORT=` yields an empty string; treat
        # it like an unset variable instead of crashing in int("").
        return default
    try:
        return int(raw)
    except ValueError:
        # Same exception type as before, but the message names the variable.
        raise ValueError(
            f"{var_name} must be an integer port number, got {raw!r}"
        ) from None
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def get_memgraph_config() -> Dict[str, str]:
    """Collect Memgraph connection settings from the environment.

    Returns:
        Dictionary with the keys ``url``, ``username``, ``password`` and
        ``database``, each read from the matching ``MEMGRAPH_*`` variable
        and falling back to the default listed below when unset.
    """
    # (result key, environment variable, default when unset)
    settings = (
        ("url", "MEMGRAPH_URL", "bolt://localhost:7687"),
        ("username", "MEMGRAPH_USERNAME", ""),
        ("password", "MEMGRAPH_PASSWORD", ""),
        ("database", "MEMGRAPH_DATABASE", "memgraph"),
    )
    return {
        key: os.getenv(env_name, fallback)
        for key, env_name, fallback in settings
    }
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def probe_source_connection(
    source_db_config: Dict[str, Any]
) -> Tuple[bool, Optional[str]]:
    """Test the source database connection using the configured analyzer.

    Args:
        source_db_config: Connection settings. ``database_type`` (default
            ``"mysql"``) selects the analyzer; every other entry is passed
            to ``DatabaseAnalyzerFactory.create_analyzer`` as a keyword
            argument. The mapping itself is not modified.

    Returns:
        ``(True, None)`` when the analyzer connects and the database
        structure can be read; otherwise ``(False, message)`` describing
        the failure. No exception escapes this function.
    """
    try:
        import sys
        from pathlib import Path

        # Make the package root importable so `database.factory` resolves
        # when this module is run from inside the source tree.
        agents_root = Path(__file__).parent.parent
        if str(agents_root) not in sys.path:
            sys.path.insert(0, str(agents_root))

        from database.factory import DatabaseAnalyzerFactory

        config = source_db_config.copy()
        db_type = config.pop("database_type", "mysql")
        analyzer = DatabaseAnalyzerFactory.create_analyzer(db_type, **config)

        if not analyzer.connect():
            return False, "Failed to establish connection"

        try:
            analyzer.get_database_structure()
        finally:
            # Always release the connection, even when reading the
            # structure fails; previously it leaked on that path.
            analyzer.disconnect()
        return True, None

    except ImportError as e:
        return False, f"Missing database dependencies: {e}"
    except Exception as e:  # pylint: disable=broad-except
        return False, f"Connection error: {e}"
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def probe_memgraph_connection(
    memgraph_config: Dict[str, str]
) -> Tuple[bool, Optional[str]]:
    """Test the Memgraph database connection.

    Args:
        memgraph_config: Memgraph connection configuration. The keys
            ``url``, ``username``, ``password`` and ``database`` are read;
            missing keys fall back to ``bolt://localhost:7687``, empty
            credentials and ``memgraph`` respectively.

    Returns:
        Tuple of (is_connected, error_message). ``error_message`` is
        ``None`` on success. No exception escapes this function.
    """
    try:
        from memgraph_toolbox.api.memgraph import Memgraph

        client = Memgraph(
            url=str(memgraph_config.get("url", "bolt://localhost:7687")),
            username=str(memgraph_config.get("username", "")),
            password=str(memgraph_config.get("password", "")),
            database=str(memgraph_config.get("database", "memgraph")),
        )

        try:
            # Any successful round trip proves the connection works.
            client.query("MATCH (n) RETURN count(n) as node_count LIMIT 1")
        finally:
            # Close the client even when the probe query fails; previously
            # it was left open on that path.
            client.close()
        return True, None

    except ImportError as e:
        return False, f"Missing Memgraph dependencies: {e}"
    except Exception as e:  # pylint: disable=broad-except
        return False, f"Connection error: {e}"
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def validate_llm_providers() -> Tuple[bool, List[str], List[str]]:
    """Check each configured LLM provider by sending it a test request.

    A provider is only tried when its API key variable
    (``OPENAI_API_KEY``, ``ANTHROPIC_API_KEY``, ``GOOGLE_API_KEY``) is set
    to a non-empty value. Providers are tried in that order.

    Returns:
        Tuple of (has_valid_provider, valid_providers, error_messages).
        ``valid_providers`` holds the display names of providers whose test
        request succeeded; ``error_messages`` holds one entry per provider
        that was tried and failed.
    """
    working = []
    problems = []

    # Each builder imports its client lazily so that a missing optional
    # package surfaces as ImportError for that provider only.
    def _build_openai(_api_key):
        from langchain_openai import ChatOpenAI

        return ChatOpenAI(model="gpt-4o-mini", temperature=0.1)

    def _build_anthropic(_api_key):
        from langchain_anthropic import ChatAnthropic

        return ChatAnthropic(model="claude-3-5-sonnet-20241022", temperature=0.1)

    def _build_gemini(api_key):
        from langchain_google_genai import ChatGoogleGenerativeAI

        return ChatGoogleGenerativeAI(
            model="gemini-2.0-flash-exp",
            temperature=0.1,
            google_api_key=api_key,
        )

    # (display name, API key variable, package to install, client builder)
    candidates = (
        ("OpenAI", "OPENAI_API_KEY", "langchain-openai", _build_openai),
        ("Anthropic", "ANTHROPIC_API_KEY", "langchain-anthropic", _build_anthropic),
        ("Gemini", "GOOGLE_API_KEY", "langchain-google-genai", _build_gemini),
    )

    for label, key_var, package, build in candidates:
        api_key = os.getenv(key_var)
        if not api_key:
            continue
        try:
            client = build(api_key)
            client.invoke("Test")
            working.append(label)
        except ImportError:
            problems.append(f"{label}: Missing dependencies ({package})")
        except Exception as exc:  # pylint: disable=broad-except
            problems.append(f"{label}: {exc}")

    return len(working) > 0, working, problems
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def setup_and_validate_environment() -> Tuple[Dict[str, Any], Dict[str, str]]:
    """Load the environment, validate it and build both configurations.

    This function only reads configuration; it does not open any database
    connection.

    Returns:
        Tuple of (source_db_config, memgraph_config)

    Raises:
        MigrationEnvironmentError: If environment validation reports
            missing variables.
    """
    load_environment()

    is_valid, missing_vars = validate_environment_variables()
    if not is_valid:
        # One bullet line per missing variable, then the closing hint.
        bullet_lines = "".join(f" - {name}\n" for name in missing_vars)
        raise MigrationEnvironmentError(
            "Missing required environment variables:\n"
            + bullet_lines
            + "\nPlease check your .env file and ensure all required variables are set"
        )

    source_db_config = get_source_db_config()
    memgraph_config = get_memgraph_config()
    logger.info("Environment variables loaded successfully")
    return source_db_config, memgraph_config
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def probe_all_connections(
    source_db_config: Dict[str, Any], memgraph_config: Dict[str, str]
) -> None:
    """Probe all database connections and validate API keys.

    LLM provider problems are only logged as warnings. Source database and
    Memgraph failures are collected and reported together, so both probes
    always run.

    Args:
        source_db_config: Source database connection configuration
        memgraph_config: Memgraph connection configuration

    Raises:
        DatabaseConnectionError: If the source database or Memgraph probe
            fails; the message lists every failing connection.
    """
    errors: List[str] = []

    logger.info("Validating LLM provider API keys...")
    has_valid, valid_providers, llm_errors = validate_llm_providers()
    if has_valid:
        logger.info("✅ Valid LLM providers: %s", ", ".join(valid_providers))
    else:
        logger.warning("⚠️ No valid LLM providers found")

    for error in llm_errors:
        logger.warning(" %s", error)

    db_type = source_db_config.get("database_type", "mysql")
    logger.info("Testing %s connection...", db_type.capitalize())
    source_connected, source_error = probe_source_connection(source_db_config)
    if not source_connected:
        errors.append(f"{db_type}: {source_error}")
    else:
        logger.info(
            "✅ %s connection successful to %s@%s",
            db_type.capitalize(),
            source_db_config.get("database"),
            source_db_config.get("host"),
        )

    logger.info("Testing Memgraph connection...")
    memgraph_connected, memgraph_error = probe_memgraph_connection(memgraph_config)
    if not memgraph_connected:
        errors.append(f"Memgraph: {memgraph_error}")
    else:
        logger.info(
            "✅ Memgraph connection successful to %s",
            # Use the same fallback as the probe: a config without "url"
            # connects to the default and must not raise KeyError here.
            memgraph_config.get("url", "bolt://localhost:7687"),
        )

    if errors:
        details = "".join(f" - {error}\n" for error in errors)
        raise DatabaseConnectionError("Database connection failures:\n" + details)
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
def print_environment_help() -> None:
    """Print environment setup guidance to standard output.

    Emits a fixed checklist and a sample ``.env`` file, followed by the
    required and optional variables with their descriptions as returned by
    ``get_required_environment_variables`` and
    ``get_optional_environment_variables``.
    """
    static_lines = (
        "❌ Setup Error: Missing required environment variables",
        "\nPlease ensure you have:",
        "1. Created a .env file (copy from .env.example)",
        "2. Set your OPENAI_API_KEY",
        "3. Set SOURCE_DB_TYPE to mysql or postgresql",
        "4. Provide the credentials for the selected source database",
        "\nExample .env file:",
        "OPENAI_API_KEY=your_openai_key_here",
        "SOURCE_DB_TYPE=postgresql",
        "POSTGRES_PASSWORD=your_postgres_password",
        "POSTGRES_HOST=localhost",
        "POSTGRES_USER=postgres",
        "POSTGRES_DATABASE=your_database",
        "MEMGRAPH_URL=bolt://localhost:7687",
    )
    for line in static_lines:
        print(line)

    print("\nRequired environment variables:")
    required = get_required_environment_variables()
    for name in required:
        print(f" - {name}: {required[name]}")

    print("\nOptional environment variables:")
    optional = get_optional_environment_variables()
    for name in optional:
        print(f" - {name}: {optional[name]}")
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def print_troubleshooting_help() -> None:
    """Print a fixed, numbered troubleshooting checklist to standard output."""
    checklist = (
        "\nTroubleshooting steps:",
        "1. Check your .env file exists and contains required variables",
        "2. Verify your OpenAI API key is valid",
        "3. Test the source database connection with dedicated probe scripts",
        "4. Ensure Memgraph is running on the specified URL",
        "5. Check network connectivity between services",
    )
    for entry in checklist:
        print(entry)
|