structured2graph-0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. __init__.py +47 -0
  2. core/__init__.py +23 -0
  3. core/hygm/__init__.py +74 -0
  4. core/hygm/hygm.py +2351 -0
  5. core/hygm/models/__init__.py +82 -0
  6. core/hygm/models/graph_models.py +667 -0
  7. core/hygm/models/llm_models.py +229 -0
  8. core/hygm/models/operations.py +176 -0
  9. core/hygm/models/sources.py +68 -0
  10. core/hygm/models/user_operations.py +139 -0
  11. core/hygm/strategies/__init__.py +17 -0
  12. core/hygm/strategies/base.py +36 -0
  13. core/hygm/strategies/deterministic.py +262 -0
  14. core/hygm/strategies/llm.py +904 -0
  15. core/hygm/validation/__init__.py +38 -0
  16. core/hygm/validation/base.py +194 -0
  17. core/hygm/validation/graph_schema_validator.py +687 -0
  18. core/hygm/validation/memgraph_data_validator.py +991 -0
  19. core/migration_agent.py +1369 -0
  20. core/schema/spec.json +155 -0
  21. core/utils/meta_graph.py +108 -0
  22. database/__init__.py +36 -0
  23. database/adapters/__init__.py +11 -0
  24. database/adapters/memgraph.py +318 -0
  25. database/adapters/mysql.py +311 -0
  26. database/adapters/postgresql.py +335 -0
  27. database/analyzer.py +396 -0
  28. database/factory.py +219 -0
  29. database/models.py +209 -0
  30. main.py +518 -0
  31. query_generation/__init__.py +20 -0
  32. query_generation/cypher_generator.py +129 -0
  33. query_generation/schema_utilities.py +88 -0
  34. structured2graph-0.1.1.dist-info/METADATA +197 -0
  35. structured2graph-0.1.1.dist-info/RECORD +41 -0
  36. structured2graph-0.1.1.dist-info/WHEEL +4 -0
  37. structured2graph-0.1.1.dist-info/entry_points.txt +2 -0
  38. structured2graph-0.1.1.dist-info/licenses/LICENSE +21 -0
  39. utils/__init__.py +57 -0
  40. utils/config.py +235 -0
  41. utils/environment.py +404 -0
@@ -0,0 +1,1369 @@
1
+ # flake8: noqa
2
+ """
3
+ SQL Database to Graph Migration Agent
4
+
5
+ This agent analyzes SQL databases, generates appropriate Cypher queries,
6
+ and migrates data to graph databases using LangGraph workflow.
7
+ """
8
+
9
+ import hashlib
10
+ import json
11
+ import logging
12
+ import os
13
+ import sys
14
+ from typing import Dict, List, Any, TypedDict, Optional, cast
15
+ from pathlib import Path
16
+
17
+ # Add parent directories to path for imports
18
+ sys.path.append(str(Path(__file__).parent.parent.parent / "memgraph-toolbox" / "src"))
19
+ sys.path.append(str(Path(__file__).parent.parent / "langchain-memgraph"))
20
+ sys.path.append(str(Path(__file__).parent.parent)) # Add agents root to path
21
+
22
+ from langgraph.graph import StateGraph, END # noqa: E402
23
+ from langchain_core.runnables.config import RunnableConfig # noqa: E402
24
+ from langchain_openai import ChatOpenAI # noqa: E402
25
+ from langchain_anthropic import ChatAnthropic # noqa: E402
26
+ from langchain_google_genai import ChatGoogleGenerativeAI # noqa: E402
27
+ from dotenv import load_dotenv # noqa: E402
28
+
29
+ from query_generation.cypher_generator import CypherGenerator # noqa: E402
30
+ from core.hygm import HyGM, ModelingMode, GraphModelingStrategy # noqa: E402
31
+ from core.hygm.validation import validate_memgraph_data # noqa: E402
32
+ from memgraph_toolbox.api.memgraph import Memgraph # noqa: E402
33
+ from database.factory import DatabaseAnalyzerFactory # noqa: E402
34
+ from core.utils.meta_graph import ( # noqa: E402
35
+ node_key as meta_node_key,
36
+ relationship_key as meta_relationship_key,
37
+ summarize_node as meta_summarize_node,
38
+ summarize_relationship as meta_summarize_relationship,
39
+ summarize_nodes as meta_summarize_nodes,
40
+ summarize_relationships as meta_summarize_relationships,
41
+ )
42
+
43
+ # Load environment variables
44
+ load_dotenv()
45
+
46
+ logger = logging.getLogger(__name__)
47
+
48
+
49
+ class MigrationState(TypedDict):
50
+ """State for the migration workflow."""
51
+
52
+ source_db_config: Dict[str, Any]
53
+ memgraph_config: Optional[Dict[str, str]]
54
+ database_structure: Dict[str, Any]
55
+ graph_model: Any # HyGM GraphModel object
56
+ migration_queries: List[str]
57
+ current_step: str
58
+ errors: List[str]
59
+ completed_tables: List[str]
60
+ total_tables: int
61
+ created_indexes: List[str]
62
+ created_constraints: List[str]
63
+ validation_report: Dict[str, Any] # Post-migration validation results
64
+ existing_meta_graph: Optional[Dict[str, Any]]
65
+
66
+
67
+ class SQLToMemgraphAgent:
68
+ """Agent for migrating SQL databases to graph databases."""
69
+
70
+ def __init__(
71
+ self,
72
+ modeling_mode: ModelingMode = ModelingMode.AUTOMATIC,
73
+ graph_modeling_strategy: GraphModelingStrategy = (
74
+ GraphModelingStrategy.DETERMINISTIC
75
+ ),
76
+ meta_graph_policy: str = "auto",
77
+ llm_provider: Optional[str] = None,
78
+ llm_model: Optional[str] = None,
79
+ ):
80
+ """Initialize the migration agent.
81
+
82
+ Args:
83
+ modeling_mode: Graph modeling mode
84
+ - AUTOMATIC: Generate graph model automatically (default)
85
+ - INCREMENTAL: Review tables and refine interactively
86
+ graph_modeling_strategy: Strategy for graph model creation
87
+ - DETERMINISTIC: Rule-based graph creation (default)
88
+ - LLM_POWERED: LLM generates the graph model
89
+             meta_graph_policy: Meta graph handling policy
+                 ("auto", "skip", or "reset"; defaults to "auto")
90
+ llm_provider: LLM provider name (openai/anthropic/gemini)
91
+ llm_model: Specific model name to use
92
+ """
93
+ # Initialize LLM client if using LLM strategy OR incremental mode
94
+ # (incremental mode needs LLM for natural language modifications)
95
+ self.llm = None
96
+ needs_llm = (
97
+ graph_modeling_strategy == GraphModelingStrategy.LLM_POWERED
98
+ or modeling_mode == ModelingMode.INCREMENTAL
99
+ )
100
+
101
+ if needs_llm:
102
+ # Auto-detect provider from environment if not specified
103
+ if llm_provider is None:
104
+ llm_provider = os.getenv("LLM_PROVIDER")
105
+ if llm_provider:
106
+ llm_provider = llm_provider.lower()
107
+ logger.info("Using LLM provider from environment: %s", llm_provider)
108
+ else:
109
+ # Auto-detect based on available API keys
110
+ if os.getenv("OPENAI_API_KEY"):
111
+ llm_provider = "openai"
112
+ logger.info("Auto-detected LLM provider: OpenAI")
113
+ elif os.getenv("ANTHROPIC_API_KEY"):
114
+ llm_provider = "anthropic"
115
+ logger.info("Auto-detected LLM provider: Anthropic")
116
+ elif os.getenv("GOOGLE_API_KEY"):
117
+ llm_provider = "gemini"
118
+ logger.info("Auto-detected LLM provider: Gemini")
119
+ else:
120
+ raise ValueError(
121
+ "No LLM provider configured. Please set one of: "
122
+ "OPENAI_API_KEY, ANTHROPIC_API_KEY, GOOGLE_API_KEY, "
123
+ "or set LLM_PROVIDER environment variable"
124
+ )
125
+
126
+ # Create LangChain chat model based on provider
127
+ provider_lower = llm_provider.lower()
128
+
129
+ if provider_lower == "openai":
130
+ model = llm_model or os.getenv("LLM_MODEL", "gpt-4o")
131
+ self.llm = ChatOpenAI(model=model, temperature=0.1)
132
+ logger.info("Initialized OpenAI client with model: %s", model)
133
+
134
+ elif provider_lower == "anthropic":
135
+ model = llm_model or os.getenv(
136
+ "LLM_MODEL", "claude-3-5-sonnet-20241022"
137
+ )
138
+ self.llm = ChatAnthropic(model=model, temperature=0.1)
139
+ logger.info("Initialized Anthropic client with model: %s", model)
140
+
141
+ elif provider_lower == "gemini":
142
+ model = llm_model or os.getenv("LLM_MODEL", "gemini-2.0-flash-exp")
143
+ self.llm = ChatGoogleGenerativeAI(
144
+ model=model,
145
+ temperature=0.1,
146
+ google_api_key=os.getenv("GOOGLE_API_KEY"),
147
+ )
148
+ logger.info("Initialized Gemini client with model: %s", model)
149
+
150
+ else:
151
+ raise ValueError(
152
+ f"Unsupported LLM provider: {llm_provider}. "
153
+ "Supported providers: openai, anthropic, gemini"
154
+ )
155
+
156
+ self.database_analyzer = None
157
+ self.cypher_generator = CypherGenerator()
158
+ self.modeling_mode = modeling_mode
159
+ self.graph_modeling_strategy = graph_modeling_strategy
160
+ policy = (meta_graph_policy or "auto").lower()
161
+ if policy not in {"auto", "skip", "reset"}:
162
+ logger.warning(
163
+ "Unknown meta graph policy '%s'; defaulting to auto",
164
+ meta_graph_policy,
165
+ )
166
+ policy = "auto"
167
+ self.meta_graph_policy = policy
168
+
169
+ self.memgraph_client: Optional[Memgraph] = None
170
+ self._existing_meta_graph: Optional[Dict[str, Any]] = None
171
+ self._current_graph_model: Optional[Any] = None
172
+ self._ingestion_plan: Dict[str, Any] = {}
173
+ self._source_signature: Dict[str, str] = {}
174
+
175
+ # Build the workflow graph
176
+ self.workflow = self._build_workflow()
177
+
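For orientation, a minimal construction sketch (illustrative only, not part of the packaged module; the LLM variant assumes a matching API key such as OPENAI_API_KEY, or an LLM_PROVIDER setting, is present in the environment):

    from core.hygm import ModelingMode, GraphModelingStrategy

    # Default: automatic, rule-based modeling; no LLM client is created
    agent = SQLToMemgraphAgent()

    # LLM-assisted, interactive modeling; requires an LLM provider to be configured
    llm_agent = SQLToMemgraphAgent(
        modeling_mode=ModelingMode.INCREMENTAL,
        graph_modeling_strategy=GraphModelingStrategy.LLM_POWERED,
        llm_provider="openai",
        llm_model="gpt-4o",
    )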
178
+ def _get_db_config_for_migrate(self, db_config: Dict[str, Any]) -> str:
179
+ """
180
+         Convert a database config for use with the migrate module in Memgraph.
181
+
182
+         Adjusts localhost/127.0.0.1 to host.docker.internal so that a
+         Memgraph instance running in Docker can reach the source database
+         on the host machine.
183
+ """
184
+ migrate_host = db_config["host"]
185
+ if migrate_host == "localhost" or migrate_host == "127.0.0.1":
186
+ migrate_host = "host.docker.internal"
187
+
188
+ config_lines = [
189
+ f"user: '{db_config['user']}'",
190
+ f"password: '{db_config['password']}'",
191
+ f"host: '{migrate_host}'",
192
+ f"database: '{db_config['database']}'",
193
+ ]
194
+
195
+ port = db_config.get("port")
196
+ if port:
197
+ config_lines.append(f"port: {port}")
198
+
199
+ config_body = ",\n ".join(config_lines)
200
+ return f"""{{
201
+ {config_body}
202
+ }}"""
203
+
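As an illustration of what this helper emits (values are placeholders, whitespace approximate): a source config with host "localhost", user "root", password "pw", database "shop", and port 3306 is rendered into a map literal for Memgraph's migrate procedures, with localhost rewritten to host.docker.internal:

    {
        user: 'root',
        password: 'pw',
        host: 'host.docker.internal',
        database: 'shop',
        port: 3306
    }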
204
+ def _qualify_table_name(self, table_name: str, db_config: Dict[str, Any]) -> str:
205
+ """Add schema qualification when needed for the source database."""
206
+
207
+ if not table_name:
208
+ return table_name
209
+
210
+ db_type = db_config.get("database_type", "mysql")
211
+ schema = db_config.get("schema")
212
+
213
+ if db_type == "postgresql" and schema and "." not in table_name:
214
+ return f"{schema}.{table_name}"
215
+
216
+ return table_name
217
+
218
+ def _compute_source_signature(
219
+ self,
220
+ state: MigrationState,
221
+ ) -> Dict[str, str]:
222
+ """Create a deterministic signature for the source database."""
223
+ config = state.get("source_db_config", {})
224
+ structure = state.get("database_structure", {})
225
+ host = config.get("host", "")
226
+ database = config.get("database", "")
227
+ db_type = (
228
+ structure.get("database_type") or config.get("database_type") or "mysql"
229
+ )
230
+ signature = {
231
+ "host": host,
232
+ "database": database,
233
+ "type": db_type,
234
+ }
235
+ self._source_signature = signature
236
+ return signature
237
+
238
+ def _node_key(self, node: Any) -> str:
239
+ """Generate a stable key for a graph node definition."""
240
+ return meta_node_key(node)
241
+
242
+ def _relationship_key(self, rel: Any) -> str:
243
+ """Generate a stable key for a graph relationship."""
244
+ return meta_relationship_key(rel)
245
+
246
+ def _summarize_node(self, node: Any) -> Dict[str, Any]:
247
+ """Create a JSON-serializable summary for a node definition."""
248
+ return meta_summarize_node(node)
249
+
250
+ def _summarize_relationship(self, rel: Any) -> Dict[str, Any]:
251
+ """Create a JSON-serializable summary for a relationship."""
252
+ return meta_summarize_relationship(rel)
253
+
254
+ def _graph_model_schema(self, model: Any) -> Dict[str, Any]:
255
+ """Convert a graph model to schema format if possible."""
256
+ if hasattr(model, "to_schema_format"):
257
+ return model.to_schema_format()
258
+ return {}
259
+
260
+ def _graph_model_hash(self, schema: Dict[str, Any]) -> str:
261
+ """Compute a stable hash for a schema dictionary."""
262
+ schema_json = json.dumps(schema, sort_keys=True)
263
+ return hashlib.sha256(schema_json.encode("utf-8")).hexdigest()
264
+
265
+ def _build_node_summaries(self, model: Any) -> Dict[str, Any]:
266
+ """Build summaries for all nodes in a model."""
267
+ return meta_summarize_nodes(getattr(model, "nodes", []))
268
+
269
+ def _build_relationship_summaries(self, model: Any) -> Dict[str, Any]:
270
+ """Build summaries for all relationships in a model."""
271
+ return meta_summarize_relationships(getattr(model, "edges", []))
272
+
273
+ def _load_existing_meta_graph(self, state: MigrationState) -> None:
274
+ """Read stored migration metadata from Memgraph if available."""
275
+ if not self.memgraph_client:
276
+ return
277
+
278
+ signature = self._compute_source_signature(state)
279
+ query = (
280
+ "MATCH (meta:MigrationAgent {source_host: $host, "
281
+ "source_database: $database, source_type: $type}) "
282
+ "RETURN meta LIMIT 1"
283
+ )
284
+ result = self.memgraph_client.query(
285
+ query,
286
+ {
287
+ "host": signature["host"],
288
+ "database": signature["database"],
289
+ "type": signature["type"],
290
+ },
291
+ )
292
+
293
+ if result:
294
+ meta = result[0].get("meta", {})
295
+ node_data = meta.get("node_summaries") or "{}"
296
+ rel_data = meta.get("relationship_summaries") or "{}"
297
+ table_counts = meta.get("table_counts") or "{}"
298
+ self._existing_meta_graph = {
299
+ "model_hash": meta.get("model_hash"),
300
+ "node_summaries": json.loads(node_data),
301
+ "relationship_summaries": json.loads(rel_data),
302
+ "table_counts": json.loads(table_counts),
303
+ }
304
+ state["existing_meta_graph"] = self._existing_meta_graph
305
+ logger.info(
306
+ "Loaded existing migration metadata for %s/%s",
307
+ signature["host"],
308
+ signature["database"],
309
+ )
310
+ else:
311
+ self._existing_meta_graph = None
312
+ state["existing_meta_graph"] = None
313
+ logger.info(
314
+ "No existing migration metadata found for %s/%s",
315
+ signature["host"],
316
+ signature["database"],
317
+ )
318
+
319
+ def _calculate_ingestion_plan(
320
+ self,
321
+ graph_model: Any,
322
+ structure: Dict[str, Any],
323
+ ) -> Dict[str, Any]:
324
+ """Determine which nodes and relationships need migration."""
325
+ plan = {
326
+ "nodes": set(),
327
+ "relationships": set(),
328
+ "node_reasons": {},
329
+ "relationship_reasons": {},
330
+ }
331
+
332
+ table_counts = structure.get("table_counts", {}) or {}
333
+ existing = self._existing_meta_graph or {}
334
+ existing_nodes = existing.get("node_summaries", {}) or {}
335
+ existing_rels = existing.get("relationship_summaries", {}) or {}
336
+ existing_counts = existing.get("table_counts", {}) or {}
337
+
338
+ node_keys_by_source: Dict[str, str] = {}
339
+ label_keys: Dict[str, str] = {}
340
+
341
+ for node in getattr(graph_model, "nodes", []):
342
+ key = self._node_key(node)
343
+ summary = self._summarize_node(node)
344
+ source_name = summary.get("source")
345
+ if source_name:
346
+ node_keys_by_source[source_name] = key
347
+ label_key = "|".join(sorted(summary.get("labels", [])))
348
+ label_keys[label_key] = key
349
+
350
+ reasons: List[str] = []
351
+ stored = existing_nodes.get(key)
352
+ if not stored:
353
+ reasons.append("new node definition")
354
+ else:
355
+ if summary["properties"] != stored.get("properties", []):
356
+ reasons.append("properties changed")
357
+ if summary["id_field"] != stored.get("id_field"):
358
+ reasons.append("identifier changed")
359
+
360
+ table_name = summary.get("source")
361
+ if table_name:
362
+ new_count = table_counts.get(table_name)
363
+ old_count = existing_counts.get(table_name)
364
+ if new_count is not None:
365
+ if old_count is None:
366
+ reasons.append("table count unavailable previously")
367
+ elif new_count != old_count:
368
+ if new_count > old_count:
369
+ reasons.append("source data increased")
370
+ else:
371
+ reasons.append("source data changed")
372
+
373
+ if reasons or not existing_nodes:
374
+ plan["nodes"].add(key)
375
+ plan["node_reasons"][key] = reasons or ["initial migration"]
376
+
377
+ for rel in getattr(graph_model, "edges", []):
378
+ key = self._relationship_key(rel)
379
+ summary = self._summarize_relationship(rel)
380
+ reasons: List[str] = []
381
+ stored = existing_rels.get(key)
382
+ if not stored:
383
+ reasons.append("new relationship definition")
384
+ else:
385
+ if summary["mapping"] != stored.get("mapping", {}):
386
+ reasons.append("mapping changed")
387
+ if summary["start"] != stored.get("start", []):
388
+ reasons.append("start labels changed")
389
+ if summary["end"] != stored.get("end", []):
390
+ reasons.append("end labels changed")
391
+
392
+ start_key = None
393
+ end_key = None
394
+ start_table = summary.get("start_table")
395
+ end_table = summary.get("end_table")
396
+ if start_table and start_table in node_keys_by_source:
397
+ start_key = node_keys_by_source[start_table]
398
+ if end_table and end_table in node_keys_by_source:
399
+ end_key = node_keys_by_source[end_table]
400
+ if not start_key:
401
+ label = "|".join(summary.get("start", []))
402
+ start_key = label_keys.get(label)
403
+ if not end_key:
404
+ label = "|".join(summary.get("end", []))
405
+ end_key = label_keys.get(label)
406
+
407
+ dependent_update = False
408
+ if start_key and start_key in plan["nodes"]:
409
+ dependent_update = True
410
+ if end_key and end_key in plan["nodes"]:
411
+ dependent_update = True
412
+ if dependent_update and "dependent node update" not in reasons:
413
+ reasons.append("dependent node update")
414
+
415
+ if reasons or not existing_rels:
416
+ plan["relationships"].add(key)
417
+ plan["relationship_reasons"][key] = reasons or ["initial migration"]
418
+
419
+ self._ingestion_plan = plan
420
+ return plan
421
+
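Illustratively, on a first run (no stored metadata) every definition ends up in the plan; the exact key strings are produced by node_key/relationship_key in core/utils/meta_graph.py, so the keys below are placeholders:

    plan = {
        "nodes": {"<customers-node-key>", "<orders-node-key>"},
        "relationships": {"<placed_by-relationship-key>"},
        "node_reasons": {
            "<customers-node-key>": ["initial migration"],
            "<orders-node-key>": ["initial migration"],
        },
        "relationship_reasons": {"<placed_by-relationship-key>": ["initial migration"]},
    }

On later runs the reasons narrow to entries such as "properties changed" or "source data increased", and definitions with no detected change are left out of the plan.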
422
+ def _store_meta_graph(self, state: MigrationState) -> None:
423
+ """Persist the current graph model metadata to Memgraph."""
424
+ if not self.memgraph_client:
425
+ return
426
+
427
+ graph_model = state.get("graph_model")
428
+ if not graph_model:
429
+ return
430
+
431
+ structure = state.get("database_structure", {})
432
+ schema = self._graph_model_schema(graph_model)
433
+ node_summaries = self._build_node_summaries(graph_model)
434
+ rel_summaries = self._build_relationship_summaries(graph_model)
435
+ table_counts = structure.get("table_counts", {}) or {}
436
+
437
+ model_hash = self._graph_model_hash(schema)
438
+ signature = self._source_signature or self._compute_source_signature(state)
439
+
440
+ query = (
441
+ "MERGE (meta:MigrationAgent {source_host: $host, "
442
+ "source_database: $database, source_type: $type}) "
443
+ "SET meta.last_migrated_at = datetime(), "
444
+ "meta.model_hash = $model_hash, "
445
+ "meta.schema = $schema, "
446
+ "meta.node_summaries = $node_summaries, "
447
+ "meta.relationship_summaries = $relationship_summaries, "
448
+ "meta.table_counts = $table_counts"
449
+ )
450
+
451
+ self.memgraph_client.query(
452
+ query,
453
+ {
454
+ "host": signature.get("host", ""),
455
+ "database": signature.get("database", ""),
456
+ "type": signature.get("type", ""),
457
+ "model_hash": model_hash,
458
+ "schema": json.dumps(schema, sort_keys=True),
459
+ "node_summaries": json.dumps(node_summaries, sort_keys=True),
460
+ "relationship_summaries": json.dumps(
461
+ rel_summaries,
462
+ sort_keys=True,
463
+ ),
464
+ "table_counts": json.dumps(table_counts, sort_keys=True),
465
+ },
466
+ )
467
+
468
+ logger.info(
469
+ "Stored migration metadata for %s/%s",
470
+ signature.get("host", ""),
471
+ signature.get("database", ""),
472
+ )
473
+
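The bookkeeping node written above can be inspected from any Memgraph client; an illustrative check of the stored metadata:

    MATCH (meta:MigrationAgent)
    RETURN meta.source_host, meta.source_database, meta.source_type,
           meta.model_hash, meta.last_migrated_at;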
474
+ def _build_workflow(self) -> StateGraph:
475
+ """Build the LangGraph workflow with clear separation of concerns."""
476
+ workflow = StateGraph(MigrationState)
477
+
478
+ # Add nodes - refactored for better modularity
479
+ workflow.add_node(
480
+ "connect_and_analyze_schema",
481
+ self._connect_and_analyze_schema,
482
+ )
483
+ workflow.add_node(
484
+ "create_graph_model",
485
+ self._create_graph_model,
486
+ )
487
+ workflow.add_node(
488
+ "create_indexes",
489
+ self._create_indexes,
490
+ )
491
+ workflow.add_node(
492
+ "generate_cypher_queries",
493
+ self._generate_cypher_queries,
494
+ )
495
+ workflow.add_node(
496
+ "prepare_target_database",
497
+ self._prepare_target_database,
498
+ )
499
+ workflow.add_node(
500
+ "execute_data_migration",
501
+ self._execute_data_migration,
502
+ )
503
+ workflow.add_node(
504
+ "validate_post_migration",
505
+ self._validate_post_migration,
506
+ )
507
+
508
+ # Add conditional edges for better error handling
509
+ workflow.add_edge(
510
+ "connect_and_analyze_schema",
511
+ "prepare_target_database",
512
+ )
513
+ workflow.add_edge(
514
+ "prepare_target_database",
515
+ "create_graph_model",
516
+ )
517
+ workflow.add_edge("create_graph_model", "create_indexes")
518
+ workflow.add_edge("create_indexes", "generate_cypher_queries")
519
+ workflow.add_edge("generate_cypher_queries", "execute_data_migration")
520
+ workflow.add_edge("execute_data_migration", "validate_post_migration")
521
+ workflow.add_edge("validate_post_migration", END)
522
+
523
+ # Set entry point
524
+ workflow.set_entry_point("connect_and_analyze_schema")
525
+
526
+ # Return the workflow (not compiled) so caller can add checkpointer
527
+ return workflow
528
+
529
+ def _connect_and_analyze_schema(
530
+ self,
531
+ state: MigrationState,
532
+ ) -> MigrationState:
533
+ """Connect to source database and prepare info for HyGM."""
534
+ logger.info("Preparing database connection for HyGM analysis...")
535
+
536
+ try:
537
+ # Initialize database analyzer to test connection
538
+ source_config = state["source_db_config"].copy()
539
+ db_type = source_config.pop("database_type", "mysql")
540
+
541
+ database_analyzer = DatabaseAnalyzerFactory.create_analyzer(
542
+ database_type=db_type,
543
+ **source_config,
544
+ )
545
+
546
+ if not database_analyzer.connect():
547
+ raise Exception("Failed to connect to source database")
548
+
549
+ # Get basic database structure for HyGM
550
+ db_structure = database_analyzer.get_database_structure()
551
+ hygm_data = db_structure.to_hygm_format()
552
+
553
+ # Store the database structure for HyGM
554
+ state["database_structure"] = hygm_data
555
+ state["total_tables"] = len(hygm_data.get("entity_tables", {}))
556
+ state["current_step"] = "Database connection established"
557
+
558
+ logger.info("Database structure prepared for HyGM analysis")
559
+
560
+ except Exception as e:
561
+ logger.error(f"Error connecting to database: {e}")
562
+ state["errors"].append(f"Database connection failed: {e}")
563
+
564
+ return state
565
+
566
+ def _create_graph_model(self, state: MigrationState) -> MigrationState:
567
+ """Create graph model using HyGM based on analyzed schema."""
568
+ logger.info("Creating graph model using HyGM...")
569
+
570
+ try:
571
+ hygm_data = state["database_structure"]
572
+
573
+ # Log the modeling mode being used
574
+ if self.modeling_mode == ModelingMode.INCREMENTAL:
575
+ logger.info(
576
+ "Using incremental graph modeling mode with an "
577
+ "end-of-session interactive refinement option"
578
+ )
579
+ else:
580
+ logger.info("Using automatic graph modeling mode")
581
+
582
+ # Create graph modeler with strategy and mode
583
+ graph_modeler = HyGM(
584
+ llm=self.llm,
585
+ mode=self.modeling_mode,
586
+ strategy=self.graph_modeling_strategy,
587
+ existing_meta_graph=state.get("existing_meta_graph"),
588
+ )
589
+
590
+ # Log the strategy being used
591
+ strategy_name = self.graph_modeling_strategy.value
592
+ logger.info(f"Using {strategy_name} graph modeling strategy")
593
+
594
+ # Generate graph model using new unified interface
595
+ graph_model = graph_modeler.create_graph_model(
596
+ hygm_data,
597
+ domain_context="Database migration to graph database",
598
+ )
599
+
600
+ # Store the graph model in state
601
+ state["graph_model"] = graph_model
602
+
603
+ logger.info(
604
+ f"Graph model created with {len(graph_model.nodes)} "
605
+ f"node types and {len(graph_model.edges)} "
606
+ f"relationship types"
607
+ )
608
+
609
+ state["current_step"] = "Graph model created successfully"
610
+
611
+ except Exception as e:
612
+ logger.error(f"Graph modeling failed: {e}")
613
+ # HyGM is required - propagate the error
614
+ return self._handle_step_error(state, "creating graph model", e)
615
+
616
+ return state
617
+
618
+ def _prepare_target_database(
619
+ self,
620
+ state: MigrationState,
621
+ ) -> MigrationState:
622
+ """Prepare the target Memgraph database for migration."""
623
+ logger.info("Preparing target database for migration...")
624
+
625
+ try:
626
+ # Initialize Memgraph connection
627
+ config_value = state.get("memgraph_config")
628
+ if not config_value:
629
+ raise Exception("Memgraph configuration is required")
630
+ config = cast(Dict[str, str], config_value)
631
+
632
+ url = config.get("url")
633
+ if not url:
634
+ raise Exception("Memgraph configuration must include 'url'")
635
+
636
+ username = config.get("username") or ""
637
+ password = config.get("password") or ""
638
+ database = config.get("database") or "memgraph"
639
+
640
+ self.memgraph_client = Memgraph(
641
+ url=url,
642
+ username=username,
643
+ password=password,
644
+ database=database,
645
+ )
646
+
647
+ # Test Memgraph connection
648
+ test_query = "MATCH (n) RETURN count(n) as node_count LIMIT 1"
649
+ self.memgraph_client.query(test_query)
650
+ logger.info("Memgraph connection established successfully")
651
+
652
+ # Load existing meta graph to plan incremental ingestion
653
+ policy = getattr(self, "meta_graph_policy", "auto")
654
+ if policy == "skip":
655
+ logger.info("Meta graph loading skipped by configuration")
656
+ self._existing_meta_graph = None
657
+ state["existing_meta_graph"] = None
658
+ else:
659
+ self._load_existing_meta_graph(state)
660
+ if self._existing_meta_graph:
661
+ if policy == "reset":
662
+ logger.info(
663
+ "Existing migration metadata ignored due to reset policy",
664
+ )
665
+ self._existing_meta_graph = None
666
+ state["existing_meta_graph"] = None
667
+ else:
668
+ logger.info(
669
+ "Existing migration metadata detected; data will be merged",
670
+ )
671
+ else:
672
+ logger.info(
673
+ "No migration metadata found; treating this as an initial run",
674
+ )
675
+
676
+ state["current_step"] = "Target database prepared successfully"
677
+
678
+ except Exception as e:
679
+ logger.error(f"Error preparing target database: {e}")
680
+ state["errors"].append(f"Database preparation failed: {e}")
681
+ state["current_step"] = "Database preparation failed"
682
+
683
+ return state
684
+
685
+ def _execute_data_migration(self, state: MigrationState) -> MigrationState:
686
+ """Execute the actual data migration queries."""
687
+ logger.info("Executing data migration...")
688
+
689
+ try:
690
+ memgraph_client = self.memgraph_client
691
+ if not memgraph_client:
692
+ raise Exception("Memgraph client is not initialized")
693
+
694
+ queries = state["migration_queries"]
695
+
696
+ # Execute all migration queries sequentially
697
+ successful_queries = 0
698
+ for i, query in enumerate(queries):
699
+                 # Execute only queries that contain actual Cypher; empty and comment-only blocks are skipped
700
+ query_lines = [line.strip() for line in query.strip().split("\n")]
701
+ non_comment_lines = [
702
+ line for line in query_lines if line and not line.startswith("//")
703
+ ]
704
+
705
+ if non_comment_lines: # Has actual Cypher code
706
+ try:
707
+ logger.info(
708
+ "Executing query %d/%d...",
709
+ i + 1,
710
+ len(queries),
711
+ )
712
+ memgraph_client.query(query)
713
+ successful_queries += 1
714
+
715
+ # Log progress for node creation queries
716
+ if "MERGE (n:" in query or "CREATE (n:" in query:
717
+ # Extract table name from comment line
718
+ # Comment format: "// Merge {label} nodes from {table} table (HyGM optimized)"
719
+ table_name = None
720
+ for line in query_lines:
721
+ if (
722
+ line.startswith("//")
723
+ and " from " in line
724
+ and " table" in line
725
+ ):
726
+ try:
727
+ # Extract table name from comment
728
+ parts = (
729
+ line.split(" from ")[1]
730
+ .split(" table")[0]
731
+ .strip()
732
+ )
733
+ table_name = parts
734
+ break
735
+ except (IndexError, AttributeError):
736
+ pass
737
+
738
+ if table_name:
739
+ logger.info(
740
+ f"Successfully migrated data from table: "
741
+ f"{table_name}"
742
+ )
743
+ # Update completed tables list
744
+ if table_name not in state["completed_tables"]:
745
+ state["completed_tables"].append(table_name)
746
+ elif (
747
+ "MERGE (" in query or "CREATE (" in query
748
+ ) and "-[:" in query:
749
+ logger.info("Successfully created relationships")
750
+
751
+ except Exception as e:
752
+ logger.error(f"Failed to execute query {i + 1}: {e}")
753
+ logger.error(f"Query: {query[:100]}...")
754
+ state["errors"].append(f"Query execution failed: {e}")
755
+
756
+ logger.info(
757
+ f"Migration completed: {successful_queries}/{len(queries)} "
758
+ f"queries executed successfully"
759
+ )
760
+ state["current_step"] = "Data migration completed"
761
+
762
+ except Exception as e:
763
+ logger.error(f"Error executing data migration: {e}")
764
+ state["errors"].append(f"Data migration failed: {e}")
765
+
766
+ return state
767
+
768
+ def _execute_queries_with_logging(
769
+ self,
770
+ queries: List[str],
771
+ query_type: str,
772
+ memgraph_client: Memgraph,
773
+ success_list: List[str],
774
+ warning_prefix: str = "warning",
775
+ ) -> None:
776
+ """Execute queries with consistent logging and error handling."""
777
+ for query in queries:
778
+ try:
779
+ logger.info("Creating %s: %s", query_type, query)
780
+ memgraph_client.query(query)
781
+ success_list.append(query)
782
+ except Exception as e:
783
+ # Some queries might already exist, log but continue
784
+ logger.warning(
785
+                     "%s creation %s: %s",
786
+                     query_type.capitalize(),
+                     warning_prefix,
+                     e,
787
+ )
788
+
789
+ def _handle_step_error(
790
+ self,
791
+ state: MigrationState,
792
+ step_name: str,
793
+ error: Exception,
794
+ ) -> MigrationState:
795
+ """Standardized error handling for workflow steps."""
796
+ error_msg = f"Error {step_name}: {error}"
797
+ failure_msg = f"{step_name.capitalize()} failed: {error}"
798
+
799
+ logger.error(error_msg)
800
+ state["errors"].append(failure_msg)
801
+ state["current_step"] = f"{step_name.capitalize()} failed"
802
+
803
+ return state
804
+
805
+ def _create_indexes(self, state: MigrationState) -> MigrationState:
806
+ """Create indexes and constraints in Memgraph before migration."""
807
+ logger.info("Creating HyGM indexes and constraints...")
808
+
809
+ try:
810
+ # Use the existing Memgraph connection from prepare_target_database
811
+ if not self.memgraph_client:
812
+ raise Exception("No Memgraph connection available")
813
+
814
+ # Track created indexes and constraints
815
+ created_indexes = []
816
+ created_constraints = []
817
+
818
+ # Get the HyGM graph model (required)
819
+ graph_model = state.get("graph_model")
820
+ if not graph_model or not hasattr(graph_model, "node_indexes"):
821
+ raise Exception("HyGM graph model with indexes is required")
822
+
823
+ logger.info("Using HyGM-provided indexes and constraints")
824
+
825
+ # Generate index queries from HyGM graph model
826
+ index_queries = self.cypher_generator.generate_index_queries_from_hygm(
827
+ graph_model.node_indexes
828
+ )
829
+
830
+ # Generate constraint queries from HyGM graph model
831
+ constraint_queries = (
832
+ self.cypher_generator.generate_constraint_queries_from_hygm(
833
+ graph_model.node_constraints
834
+ )
835
+ )
836
+
837
+ logger.info(
838
+ "HyGM provided %d indexes and %d constraints",
839
+ len(index_queries),
840
+ len(constraint_queries),
841
+ )
842
+
843
+ # Execute constraint queries first
844
+ self._execute_queries_with_logging(
845
+ constraint_queries,
846
+ "constraint",
847
+ self.memgraph_client,
848
+ created_constraints,
849
+ )
850
+
851
+ # Execute index queries
852
+ self._execute_queries_with_logging(
853
+ index_queries, "index", self.memgraph_client, created_indexes
854
+ )
855
+
856
+ # Store results in state
857
+ state["created_indexes"] = created_indexes
858
+ state["created_constraints"] = created_constraints
859
+ state["current_step"] = "HyGM indexes and constraints created"
860
+
861
+ logger.info(
862
+ "Created %d constraints and %d indexes from HyGM model",
863
+ len(created_constraints),
864
+ len(created_indexes),
865
+ )
866
+
867
+ except Exception as e:
868
+ return self._handle_step_error(state, "creating indexes", e)
869
+
870
+ return state
871
+
872
+ def _generate_cypher_queries(
873
+ self,
874
+ state: MigrationState,
875
+ ) -> MigrationState:
876
+ """Generate merge-based Cypher queries using the ingestion plan."""
877
+ logger.info("Generating Cypher queries based on HyGM graph model...")
878
+
879
+ try:
880
+ source_db_config = state["source_db_config"]
881
+ graph_model = state.get("graph_model")
882
+ if not graph_model:
883
+ raise Exception("HyGM graph model is required for migration")
884
+
885
+ self._current_graph_model = graph_model
886
+
887
+ structure = state.get("database_structure", {})
888
+ plan = self._calculate_ingestion_plan(graph_model, structure)
889
+ nodes_to_migrate = plan["nodes"]
890
+ relationships_to_migrate = plan["relationships"]
891
+
892
+ if not nodes_to_migrate and not relationships_to_migrate:
893
+ logger.info(
894
+ "Schema and table counts already match stored metadata; "
895
+ "no migration queries generated"
896
+ )
897
+ state["migration_queries"] = []
898
+ state["current_step"] = "No new data to migrate"
899
+ return state
900
+
901
+ for node_key in sorted(nodes_to_migrate):
902
+ reasons = plan["node_reasons"].get(node_key, [])
903
+ reason_text = ", ".join(reasons) if reasons else "initial migration"
904
+ logger.info("Node plan %s → %s", node_key, reason_text)
905
+
906
+ for rel_key in sorted(relationships_to_migrate):
907
+ reasons = plan["relationship_reasons"].get(rel_key, [])
908
+ reason_text = ", ".join(reasons) if reasons else "initial migration"
909
+ logger.info("Relationship plan %s → %s", rel_key, reason_text)
910
+
911
+ queries: List[str] = []
912
+ db_config_str = self._get_db_config_for_migrate(source_db_config)
913
+ db_type = source_db_config.get("database_type", "mysql")
914
+ procedure_name = f"migrate.{db_type}"
915
+ logger.info("Using %s procedure for data ingestion", procedure_name)
916
+
917
+ for node_def in graph_model.nodes:
918
+ node_key = self._node_key(node_def)
919
+ if nodes_to_migrate and node_key not in nodes_to_migrate:
920
+ continue
921
+
922
+ source = getattr(node_def, "source", None)
923
+ source_table = getattr(source, "name", None) or "unknown"
924
+ qualified_table = self._qualify_table_name(
925
+ source_table, source_db_config
926
+ )
927
+ node_label = node_def.primary_label
928
+
929
+ properties = [
930
+ prop.key if hasattr(prop, "key") else str(prop)
931
+ for prop in getattr(node_def, "properties", [])
932
+ ]
933
+
934
+ node_mapping = getattr(source, "mapping", {}) if source else {}
935
+ id_field = node_mapping.get("id_field")
936
+ if not id_field and properties:
937
+ id_field = properties[0]
938
+
939
+ if id_field and id_field not in properties:
940
+ properties.append(id_field)
941
+
942
+ if not id_field:
943
+ logger.warning(
944
+ "Skipping node %s: identifier field missing",
945
+ node_label,
946
+ )
947
+ continue
948
+
949
+ if not properties:
950
+ logger.warning(
951
+ "No properties found for node %s from table %s",
952
+ node_label,
953
+ source_table,
954
+ )
955
+ continue
956
+
957
+ properties_str = ", ".join(properties)
958
+ node_query = f"""
959
+ // Merge {node_label} nodes from {source_table} table (HyGM optimized)
960
+ CALL {procedure_name}(
961
+ 'SELECT {properties_str} FROM {qualified_table}',
962
+ {db_config_str}
963
+ )
964
+ YIELD row
965
+ MERGE (n:{node_label} {{{id_field}: row.{id_field}}})
966
+ SET n += row;"""
967
+ queries.append(node_query)
968
+ logger.info("Prepared merge query for %s", node_label)
969
+
970
+ logger.info(
971
+ "Preparing relationship queries for %d definitions",
972
+ len(graph_model.edges),
973
+ )
974
+
975
+ for rel_def in graph_model.edges:
976
+ rel_key = self._relationship_key(rel_def)
977
+ if relationships_to_migrate and rel_key not in relationships_to_migrate:
978
+ continue
979
+
980
+ rel_query = self._generate_hygm_relationship_query(
981
+ rel_def,
982
+ db_config_str,
983
+ source_db_config,
984
+ procedure_name,
985
+ )
986
+ if rel_query:
987
+ queries.append(rel_query)
988
+ logger.info(
989
+ "Prepared merge query for relationship %s",
990
+ rel_def.edge_type,
991
+ )
992
+
993
+ state["migration_queries"] = queries
994
+ state["current_step"] = "Migration queries prepared"
995
+
996
+ logger.info("Generated %d migration queries", len(queries))
997
+
998
+ except Exception as e:
999
+ logger.error(f"Error generating HyGM-based Cypher queries: {e}")
1000
+ return self._handle_step_error(
1001
+ state,
1002
+ "generating cypher queries",
1003
+ e,
1004
+ )
1005
+
1006
+ return state
1007
+
1008
+ def _generate_hygm_relationship_query(
1009
+ self,
1010
+ rel_def,
1011
+ db_config_str: str,
1012
+ source_db_config: Dict[str, Any],
1013
+ procedure_name: str,
1014
+ ) -> str:
1015
+ """Create relationship query from HyGM definition."""
1016
+
1017
+ try:
1018
+ if not rel_def.source or not rel_def.source.mapping:
1019
+ logger.warning(
1020
+ f"No source mapping for relationship {rel_def.edge_type}"
1021
+ )
1022
+ return ""
1023
+
1024
+ rel_name = rel_def.edge_type
1025
+ source_info = rel_def.source.mapping
1026
+
1027
+ # Determine relationship type from HyGM source
1028
+ if rel_def.source.type == "many_to_many":
1029
+ return self._generate_many_to_many_hygm_query(
1030
+ rel_name,
1031
+ rel_def,
1032
+ source_info,
1033
+ db_config_str,
1034
+ source_db_config,
1035
+ procedure_name,
1036
+ )
1037
+ elif rel_def.source.type in ["table", "foreign_key"]:
1038
+ return self._generate_one_to_many_hygm_query(
1039
+ rel_name,
1040
+ rel_def,
1041
+ source_info,
1042
+ db_config_str,
1043
+ source_db_config,
1044
+ procedure_name,
1045
+ )
1046
+ else:
1047
+ logger.warning(
1048
+ "Unsupported relationship type: %s",
1049
+ rel_def.source.type,
1050
+ )
1051
+ return ""
1052
+
1053
+ except Exception as e:
1054
+ logger.error(
1055
+ "Error generating relationship query for %s: %s",
1056
+ rel_def.edge_type,
1057
+ e,
1058
+ )
1059
+ return ""
1060
+
1061
+ def _generate_one_to_many_hygm_query(
1062
+ self,
1063
+ rel_name: str,
1064
+ rel_def,
1065
+ source_info: Dict[str, Any],
1066
+ db_config_str: str,
1067
+ source_db_config: Dict[str, Any],
1068
+ procedure_name: str,
1069
+ ) -> str:
1070
+ """Generate one-to-many relationship query from HyGM mapping."""
1071
+
1072
+ start_node = source_info.get("start_node", "")
1073
+ end_node = source_info.get("end_node", "")
1074
+ from_pk = source_info.get("from_pk")
1075
+
1076
+ if not start_node or not end_node:
1077
+ logger.error("Missing relationship information for %s", rel_name)
1078
+ raise Exception(
1079
+ "HyGM must provide complete relationship mapping for " f"{rel_name}"
1080
+ )
1081
+
1082
+ try:
1083
+ from_table, fk_column = start_node.split(".", 1)
1084
+ to_table, to_column = end_node.split(".", 1)
1085
+ except ValueError:
1086
+ logger.error(
1087
+ "Invalid mapping format for %s: %s",
1088
+ rel_name,
1089
+ source_info,
1090
+ )
1091
+ raise Exception(
1092
+ f"HyGM must provide valid relationship mapping for {rel_name}"
1093
+ )
1094
+
1095
+ if not from_pk:
1096
+ raise Exception(f"HyGM must provide primary key information for {rel_name}")
1097
+
1098
+ from_label = (
1099
+ rel_def.start_node_labels[0] if rel_def.start_node_labels else from_table
1100
+ )
1101
+ to_label = rel_def.end_node_labels[0] if rel_def.end_node_labels else to_table
1102
+
1103
+ qualified_from_table = self._qualify_table_name(from_table, source_db_config)
1104
+
1105
+ select_sql = (
1106
+ f"SELECT {from_pk}, {fk_column} "
1107
+ f"FROM {qualified_from_table} "
1108
+ f"WHERE {fk_column} IS NOT NULL"
1109
+ )
1110
+
1111
+ query = f"""
1112
+ // Merge {rel_name} relationships (HyGM: {from_label} -> {to_label})
1113
+ CALL {procedure_name}(
1114
+ '{select_sql}',
1115
+ {db_config_str}
1116
+ )
1117
+ YIELD row
1118
+ MATCH (from_node:{from_label} {{{from_pk}: row.{from_pk}}})
1119
+ MATCH (to_node:{to_label} {{{to_column}: row.{fk_column}}})
1120
+ MERGE (from_node)-[:{rel_name}]->(to_node);"""
1121
+
1122
+ return query
1123
+
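For illustration, with an assumed orders.customer_id -> customers.id foreign key, Order/Customer labels, and a MySQL source, the rendered query looks roughly like this (connection map elided):

    // Merge PLACED_BY relationships (HyGM: Order -> Customer)
    CALL migrate.mysql(
        'SELECT id, customer_id FROM orders WHERE customer_id IS NOT NULL',
        { user: '...', password: '...', host: 'host.docker.internal', database: '...' }
    )
    YIELD row
    MATCH (from_node:Order {id: row.id})
    MATCH (to_node:Customer {id: row.customer_id})
    MERGE (from_node)-[:PLACED_BY]->(to_node);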
1124
+ def _generate_many_to_many_hygm_query(
1125
+ self,
1126
+ rel_name: str,
1127
+ rel_def,
1128
+ source_info: Dict[str, Any],
1129
+ db_config_str: str,
1130
+ source_db_config: Dict[str, Any],
1131
+ procedure_name: str,
1132
+ ) -> str:
1133
+ """Generate many-to-many relationship query from HyGM mapping."""
1134
+
1135
+ join_table = source_info.get("join_table")
1136
+ from_table = source_info.get("from_table")
1137
+ to_table = source_info.get("to_table")
1138
+ from_fk = source_info.get("join_from_column")
1139
+ to_fk = source_info.get("join_to_column")
1140
+ from_pk = source_info.get("from_column")
1141
+ to_pk = source_info.get("to_column")
1142
+
1143
+ if not all([join_table, from_table, to_table, from_fk, to_fk, from_pk, to_pk]):
1144
+ logger.error(
1145
+ "Missing many-to-many relationship information for %s",
1146
+ rel_name,
1147
+ )
1148
+ raise Exception(
1149
+ "HyGM must provide complete many-to-many mapping for " f"{rel_name}"
1150
+ )
1151
+
1152
+ from_label = (
1153
+ rel_def.start_node_labels[0] if rel_def.start_node_labels else from_table
1154
+ )
1155
+ to_label = rel_def.end_node_labels[0] if rel_def.end_node_labels else to_table
1156
+
1157
+ join_table_name = cast(str, join_table)
1158
+ qualified_join_table = self._qualify_table_name(
1159
+ join_table_name, source_db_config
1160
+ )
1161
+
1162
+ select_sql = f"SELECT {from_fk}, {to_fk} FROM {qualified_join_table}"
1163
+
1164
+ query = f"""
1165
+ // Merge {rel_name} relationships via {join_table}
1166
+ // (HyGM: {from_label} <-> {to_label})
1167
+ CALL {procedure_name}(
1168
+ '{select_sql}',
1169
+ {db_config_str}
1170
+ )
1171
+ YIELD row
1172
+ MATCH (from:{from_label} {{{from_pk}: row.{from_fk}}})
1173
+ MATCH (to:{to_label} {{{to_pk}: row.{to_fk}}})
1174
+ MERGE (from)-[:{rel_name}]->(to);"""
1175
+ return query
1176
+
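Similarly, for an assumed order_items join table linking orders and products (Order/Product labels, a CONTAINS edge), the rendered query would be roughly:

    // Merge CONTAINS relationships via order_items
    // (HyGM: Order <-> Product)
    CALL migrate.mysql(
        'SELECT order_id, product_id FROM order_items',
        { user: '...', password: '...', host: 'host.docker.internal', database: '...' }
    )
    YIELD row
    MATCH (from:Order {id: row.order_id})
    MATCH (to:Product {id: row.product_id})
    MERGE (from)-[:CONTAINS]->(to);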
1177
+ def _validate_post_migration(
1178
+ self,
1179
+ state: MigrationState,
1180
+ ) -> MigrationState:
1181
+ """Validate post-migration results using HyGM schema comparison."""
1182
+ logger.info("Running post-migration validation...")
1183
+
1184
+ try:
1185
+ # Check if we have a graph model to validate against
1186
+ if not state.get("graph_model"):
1187
+ logger.warning("No graph model available for validation")
1188
+ state["validation_report"] = {
1189
+ "success": False,
1190
+ "reason": "No graph model available",
1191
+ }
1192
+ state["current_step"] = "Post-migration validation skipped"
1193
+ return state
1194
+
1195
+ # Reuse existing Memgraph connection from previous steps
1196
+ if not self.memgraph_client:
1197
+ logger.error("No Memgraph connection available for validation")
1198
+ state["validation_report"] = {
1199
+ "success": False,
1200
+ "reason": "No Memgraph connection available",
1201
+ }
1202
+ state["current_step"] = "Post-migration validation failed"
1203
+ return state
1204
+
1205
+ # Get the graph model from state
1206
+ graph_model = state.get("graph_model")
1207
+
1208
+             # Calculate expected data counts from the source database for validation
1209
+ structure = state["database_structure"]
1210
+ expected_nodes = 0
1211
+ table_counts = structure.get("table_counts", {})
1212
+
1213
+ # Default to all migrated tables when nothing specific is selected
1214
+ selected_tables = structure.get("selected_tables", [])
1215
+ if not selected_tables:
1216
+ # Use entity tables (exclude views and system tables)
1217
+ entity_tables = structure.get("entity_tables", {})
1218
+ selected_tables = list(entity_tables.keys())
1219
+
1220
+ for table_name in selected_tables:
1221
+ if table_name in table_counts:
1222
+ expected_nodes += table_counts[table_name]
1223
+
1224
+ # Create expected data counts for the validator
1225
+ expected_data_counts = {
1226
+ "nodes": expected_nodes,
1227
+ "selected_tables": selected_tables,
1228
+ }
1229
+
1230
+ # Run validation using existing connection and data counts
1231
+ logger.info("Executing post-migration validation...")
1232
+ validation_result = validate_memgraph_data(
1233
+ expected_model=graph_model,
1234
+ memgraph_connection=self.memgraph_client,
1235
+ expected_data_counts=expected_data_counts,
1236
+ detailed_report=True,
1237
+ )
1238
+
1239
+ # Store validation results in state
1240
+ state["validation_report"] = {
1241
+ "success": validation_result.success,
1242
+ "summary": validation_result.summary,
1243
+ "validation_score": validation_result.details.get(
1244
+ "validation_score", 0
1245
+ ),
1246
+ "issues": [
1247
+ {
1248
+ "severity": issue.severity.value,
1249
+ "category": issue.category,
1250
+ "message": issue.message,
1251
+ "expected": issue.expected,
1252
+ "actual": issue.actual,
1253
+ "recommendation": issue.recommendation,
1254
+ }
1255
+ for issue in validation_result.issues
1256
+ ],
1257
+ "metrics": validation_result.metrics,
1258
+ }
1259
+
1260
+ # Log validation summary
1261
+ if validation_result.success:
1262
+ logger.info("✅ Post-migration validation PASSED")
1263
+ score = int(validation_result.details.get("validation_score", 0))
1264
+ logger.info(f"Validation score: {score}/100")
1265
+ else:
1266
+ logger.warning("⚠️ Post-migration validation found issues")
1267
+ score = int(validation_result.details.get("validation_score", 0))
1268
+ logger.warning(f"Validation score: {score}/100")
1269
+
1270
+ # Log critical issues
1271
+ critical_issues = [
1272
+ issue
1273
+ for issue in validation_result.issues
1274
+ if issue.severity.value == "CRITICAL"
1275
+ ]
1276
+ if critical_issues:
1277
+ count = len(critical_issues)
1278
+ logger.error(f"Found {count} critical validation issues:")
1279
+ # Show first 3 critical issues
1280
+ for issue in critical_issues[:3]:
1281
+ logger.error(f" - {issue.message}")
1282
+
1283
+ state["current_step"] = "Post-migration validation completed"
1284
+
1285
+ if not state["errors"]:
1286
+ self._store_meta_graph(state)
1287
+
1288
+ except Exception as e:
1289
+ logger.error(f"Error during post-migration validation: {e}")
1290
+ state["errors"].append(f"Post-migration validation failed: {e}")
1291
+ state["validation_report"] = {
1292
+                 "success": False,
1293
+ "reason": f"Validation error: {e}",
1294
+ }
1295
+ state["current_step"] = "Post-migration validation failed"
1296
+
1297
+ return state
1298
+
1299
+ def migrate(
1300
+ self,
1301
+         source_db_config: Dict[str, Any],
1302
+ memgraph_config: Optional[Dict[str, str]] = None,
1303
+ ) -> Dict[str, Any]:
1304
+ """Execute the complete migration workflow."""
1305
+ logger.info("Starting SQL database to graph migration...")
1306
+
1307
+ # Initialize state
1308
+ initial_state = MigrationState(
1309
+ source_db_config=source_db_config,
1310
+ memgraph_config=memgraph_config,
1311
+ database_structure={},
1312
+ graph_model=None,
1313
+ migration_queries=[],
1314
+ current_step="Starting migration",
1315
+ errors=[],
1316
+ completed_tables=[],
1317
+ total_tables=0,
1318
+ created_indexes=[],
1319
+ created_constraints=[],
1320
+ validation_report={},
1321
+ existing_meta_graph=None,
1322
+ )
1323
+
1324
+ try:
1325
+ # Automatic mode compiles workflow without a checkpointer
1326
+ if self.modeling_mode == ModelingMode.AUTOMATIC:
1327
+ compiled_workflow = self.workflow.compile()
1328
+ final_state = compiled_workflow.invoke(initial_state)
1329
+ else:
1330
+ # Incremental mode enables a persistent checkpointer
1331
+ from langgraph.checkpoint.memory import MemorySaver
1332
+
1333
+ memory = MemorySaver()
1334
+ compiled_workflow = self.workflow.compile(checkpointer=memory)
1335
+
1336
+ # Provide required configuration for checkpointer
1337
+ config: RunnableConfig = {
1338
+ "configurable": {"thread_id": "migration_thread_1"}
1339
+ }
1340
+ final_state = compiled_workflow.invoke(
1341
+ initial_state,
1342
+ config=config,
1343
+ )
1344
+
1345
+ # Cleanup connections
1346
+ if self.database_analyzer:
1347
+ self.database_analyzer.disconnect()
1348
+ if self.memgraph_client:
1349
+ self.memgraph_client.close()
1350
+
1351
+ return {
1352
+ "success": len(final_state["errors"]) == 0,
1353
+ "completed_tables": final_state["completed_tables"],
1354
+ "total_tables": final_state["total_tables"],
1355
+ "errors": final_state["errors"],
1356
+ "final_step": final_state["current_step"],
1357
+ "validation_report": final_state.get("validation_report", {}),
1358
+ }
1359
+
1360
+ except Exception as e:
1361
+ logger.error(f"Migration workflow failed: {e}")
1362
+ return {
1363
+ "success": False,
1364
+ "errors": [f"Workflow execution failed: {e}"],
1365
+ "completed_tables": [],
1366
+ "total_tables": 0,
1367
+ "final_step": "Failed",
1368
+ "validation_report": {},
1369
+ }
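Putting the pieces together, a minimal end-to-end invocation might look like the sketch below (connection values are placeholders, not defaults shipped with the package):

    agent = SQLToMemgraphAgent()

    result = agent.migrate(
        source_db_config={
            "database_type": "mysql",
            "host": "localhost",
            "port": 3306,
            "user": "root",
            "password": "secret",
            "database": "shop",
        },
        memgraph_config={
            "url": "bolt://localhost:7687",
            "username": "",
            "password": "",
            "database": "memgraph",
        },
    )

    if result["success"]:
        print("Migrated tables:", result["completed_tables"])
    else:
        print("Errors:", result["errors"])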