structured2graph 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __init__.py +47 -0
- core/__init__.py +23 -0
- core/hygm/__init__.py +74 -0
- core/hygm/hygm.py +2351 -0
- core/hygm/models/__init__.py +82 -0
- core/hygm/models/graph_models.py +667 -0
- core/hygm/models/llm_models.py +229 -0
- core/hygm/models/operations.py +176 -0
- core/hygm/models/sources.py +68 -0
- core/hygm/models/user_operations.py +139 -0
- core/hygm/strategies/__init__.py +17 -0
- core/hygm/strategies/base.py +36 -0
- core/hygm/strategies/deterministic.py +262 -0
- core/hygm/strategies/llm.py +904 -0
- core/hygm/validation/__init__.py +38 -0
- core/hygm/validation/base.py +194 -0
- core/hygm/validation/graph_schema_validator.py +687 -0
- core/hygm/validation/memgraph_data_validator.py +991 -0
- core/migration_agent.py +1369 -0
- core/schema/spec.json +155 -0
- core/utils/meta_graph.py +108 -0
- database/__init__.py +36 -0
- database/adapters/__init__.py +11 -0
- database/adapters/memgraph.py +318 -0
- database/adapters/mysql.py +311 -0
- database/adapters/postgresql.py +335 -0
- database/analyzer.py +396 -0
- database/factory.py +219 -0
- database/models.py +209 -0
- main.py +518 -0
- query_generation/__init__.py +20 -0
- query_generation/cypher_generator.py +129 -0
- query_generation/schema_utilities.py +88 -0
- structured2graph-0.1.1.dist-info/METADATA +197 -0
- structured2graph-0.1.1.dist-info/RECORD +41 -0
- structured2graph-0.1.1.dist-info/WHEEL +4 -0
- structured2graph-0.1.1.dist-info/entry_points.txt +2 -0
- structured2graph-0.1.1.dist-info/licenses/LICENSE +21 -0
- utils/__init__.py +57 -0
- utils/config.py +235 -0
- utils/environment.py +404 -0
core/migration_agent.py
ADDED
@@ -0,0 +1,1369 @@
# flake8: noqa
"""
SQL Database to Graph Migration Agent

This agent analyzes SQL databases, generates appropriate Cypher queries,
and migrates data to graph databases using LangGraph workflow.
"""

import hashlib
import json
import logging
import os
import sys
from typing import Dict, List, Any, TypedDict, Optional, cast
from pathlib import Path

# Add parent directories to path for imports
sys.path.append(str(Path(__file__).parent.parent.parent / "memgraph-toolbox" / "src"))
sys.path.append(str(Path(__file__).parent.parent / "langchain-memgraph"))
sys.path.append(str(Path(__file__).parent.parent))  # Add agents root to path

from langgraph.graph import StateGraph, END  # noqa: E402
from langchain_core.runnables.config import RunnableConfig  # noqa: E402
from langchain_openai import ChatOpenAI  # noqa: E402
from langchain_anthropic import ChatAnthropic  # noqa: E402
from langchain_google_genai import ChatGoogleGenerativeAI  # noqa: E402
from dotenv import load_dotenv  # noqa: E402

from query_generation.cypher_generator import CypherGenerator  # noqa: E402
from core.hygm import HyGM, ModelingMode, GraphModelingStrategy  # noqa: E402
from core.hygm.validation import validate_memgraph_data  # noqa: E402
from memgraph_toolbox.api.memgraph import Memgraph  # noqa: E402
from database.factory import DatabaseAnalyzerFactory  # noqa: E402
from core.utils.meta_graph import (  # noqa: E402
    node_key as meta_node_key,
    relationship_key as meta_relationship_key,
    summarize_node as meta_summarize_node,
    summarize_relationship as meta_summarize_relationship,
    summarize_nodes as meta_summarize_nodes,
    summarize_relationships as meta_summarize_relationships,
)

# Load environment variables
load_dotenv()

logger = logging.getLogger(__name__)


class MigrationState(TypedDict):
    """State for the migration workflow."""

    source_db_config: Dict[str, Any]
    memgraph_config: Optional[Dict[str, str]]
    database_structure: Dict[str, Any]
    graph_model: Any  # HyGM GraphModel object
    migration_queries: List[str]
    current_step: str
    errors: List[str]
    completed_tables: List[str]
    total_tables: int
    created_indexes: List[str]
    created_constraints: List[str]
    validation_report: Dict[str, Any]  # Post-migration validation results
    existing_meta_graph: Optional[Dict[str, Any]]


class SQLToMemgraphAgent:
    """Agent for migrating SQL databases to graph databases."""

    def __init__(
        self,
        modeling_mode: ModelingMode = ModelingMode.AUTOMATIC,
        graph_modeling_strategy: GraphModelingStrategy = (
            GraphModelingStrategy.DETERMINISTIC
        ),
        meta_graph_policy: str = "auto",
        llm_provider: Optional[str] = None,
        llm_model: Optional[str] = None,
    ):
        """Initialize the migration agent.

        Args:
            modeling_mode: Graph modeling mode
                - AUTOMATIC: Generate graph model automatically (default)
                - INCREMENTAL: Review tables and refine interactively
            graph_modeling_strategy: Strategy for graph model creation
                - DETERMINISTIC: Rule-based graph creation (default)
                - LLM_POWERED: LLM generates the graph model
            meta_graph_policy: Meta graph handling policy
            llm_provider: LLM provider name (openai/anthropic/gemini)
            llm_model: Specific model name to use
        """
        # Initialize LLM client if using LLM strategy OR incremental mode
        # (incremental mode needs LLM for natural language modifications)
        self.llm = None
        needs_llm = (
            graph_modeling_strategy == GraphModelingStrategy.LLM_POWERED
            or modeling_mode == ModelingMode.INCREMENTAL
        )

        if needs_llm:
            # Auto-detect provider from environment if not specified
            if llm_provider is None:
                llm_provider = os.getenv("LLM_PROVIDER")
                if llm_provider:
                    llm_provider = llm_provider.lower()
                    logger.info("Using LLM provider from environment: %s", llm_provider)
                else:
                    # Auto-detect based on available API keys
                    if os.getenv("OPENAI_API_KEY"):
                        llm_provider = "openai"
                        logger.info("Auto-detected LLM provider: OpenAI")
                    elif os.getenv("ANTHROPIC_API_KEY"):
                        llm_provider = "anthropic"
                        logger.info("Auto-detected LLM provider: Anthropic")
                    elif os.getenv("GOOGLE_API_KEY"):
                        llm_provider = "gemini"
                        logger.info("Auto-detected LLM provider: Gemini")
                    else:
                        raise ValueError(
                            "No LLM provider configured. Please set one of: "
                            "OPENAI_API_KEY, ANTHROPIC_API_KEY, GOOGLE_API_KEY, "
                            "or set LLM_PROVIDER environment variable"
                        )

            # Create LangChain chat model based on provider
            provider_lower = llm_provider.lower()

            if provider_lower == "openai":
                model = llm_model or os.getenv("LLM_MODEL", "gpt-4o")
                self.llm = ChatOpenAI(model=model, temperature=0.1)
                logger.info("Initialized OpenAI client with model: %s", model)

            elif provider_lower == "anthropic":
                model = llm_model or os.getenv(
                    "LLM_MODEL", "claude-3-5-sonnet-20241022"
                )
                self.llm = ChatAnthropic(model=model, temperature=0.1)
                logger.info("Initialized Anthropic client with model: %s", model)

            elif provider_lower == "gemini":
                model = llm_model or os.getenv("LLM_MODEL", "gemini-2.0-flash-exp")
                self.llm = ChatGoogleGenerativeAI(
                    model=model,
                    temperature=0.1,
                    google_api_key=os.getenv("GOOGLE_API_KEY"),
                )
                logger.info("Initialized Gemini client with model: %s", model)

            else:
                raise ValueError(
                    f"Unsupported LLM provider: {llm_provider}. "
                    "Supported providers: openai, anthropic, gemini"
                )

        self.database_analyzer = None
        self.cypher_generator = CypherGenerator()
        self.modeling_mode = modeling_mode
        self.graph_modeling_strategy = graph_modeling_strategy
        policy = (meta_graph_policy or "auto").lower()
        if policy not in {"auto", "skip", "reset"}:
            logger.warning(
                "Unknown meta graph policy '%s'; defaulting to auto",
                meta_graph_policy,
            )
            policy = "auto"
        self.meta_graph_policy = policy

        self.memgraph_client: Optional[Memgraph] = None
        self._existing_meta_graph: Optional[Dict[str, Any]] = None
        self._current_graph_model: Optional[Any] = None
        self._ingestion_plan: Dict[str, Any] = {}
        self._source_signature: Dict[str, str] = {}

        # Build the workflow graph
        self.workflow = self._build_workflow()
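The constructor only instantiates a chat model when one is actually needed: the LLM-powered strategy, or incremental mode, which uses the LLM for natural-language model refinements. A minimal construction sketch, assuming the package's own import paths as listed above; passing `llm_provider` explicitly bypasses the API-key auto-detection:

```python
from core.migration_agent import SQLToMemgraphAgent
from core.hygm import ModelingMode, GraphModelingStrategy

agent = SQLToMemgraphAgent(
    modeling_mode=ModelingMode.AUTOMATIC,
    graph_modeling_strategy=GraphModelingStrategy.LLM_POWERED,
    llm_provider="openai",  # else LLM_PROVIDER / API-key auto-detection applies
    llm_model="gpt-4o",     # else LLM_MODEL or the provider default
)
```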
    def _get_db_config_for_migrate(self, db_config: Dict[str, Any]) -> str:
        """
        Convert database config for use with migrate module in Memgraph.

        Adjusts localhost/127.0.0.1 to host.docker.internal for Docker.
        """
        migrate_host = db_config["host"]
        if migrate_host == "localhost" or migrate_host == "127.0.0.1":
            migrate_host = "host.docker.internal"

        config_lines = [
            f"user: '{db_config['user']}'",
            f"password: '{db_config['password']}'",
            f"host: '{migrate_host}'",
            f"database: '{db_config['database']}'",
        ]

        port = db_config.get("port")
        if port:
            config_lines.append(f"port: {port}")

        config_body = ",\n            ".join(config_lines)
        return f"""{{
            {config_body}
        }}"""
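As an illustration, a hypothetical source config of user "app", password "secret", host "localhost", database "shop", port 3306 renders to the map literal below (localhost is rewritten so a Dockerized Memgraph can reach the host; the inner whitespace follows the join string above):

```
{
    user: 'app',
    password: 'secret',
    host: 'host.docker.internal',
    database: 'shop',
    port: 3306
}
```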
    def _qualify_table_name(self, table_name: str, db_config: Dict[str, Any]) -> str:
        """Add schema qualification when needed for the source database."""

        if not table_name:
            return table_name

        db_type = db_config.get("database_type", "mysql")
        schema = db_config.get("schema")

        if db_type == "postgresql" and schema and "." not in table_name:
            return f"{schema}.{table_name}"

        return table_name

    def _compute_source_signature(
        self,
        state: MigrationState,
    ) -> Dict[str, str]:
        """Create a deterministic signature for the source database."""
        config = state.get("source_db_config", {})
        structure = state.get("database_structure", {})
        host = config.get("host", "")
        database = config.get("database", "")
        db_type = (
            structure.get("database_type") or config.get("database_type") or "mysql"
        )
        signature = {
            "host": host,
            "database": database,
            "type": db_type,
        }
        self._source_signature = signature
        return signature

    def _node_key(self, node: Any) -> str:
        """Generate a stable key for a graph node definition."""
        return meta_node_key(node)

    def _relationship_key(self, rel: Any) -> str:
        """Generate a stable key for a graph relationship."""
        return meta_relationship_key(rel)

    def _summarize_node(self, node: Any) -> Dict[str, Any]:
        """Create a JSON-serializable summary for a node definition."""
        return meta_summarize_node(node)

    def _summarize_relationship(self, rel: Any) -> Dict[str, Any]:
        """Create a JSON-serializable summary for a relationship."""
        return meta_summarize_relationship(rel)

    def _graph_model_schema(self, model: Any) -> Dict[str, Any]:
        """Convert a graph model to schema format if possible."""
        if hasattr(model, "to_schema_format"):
            return model.to_schema_format()
        return {}

    def _graph_model_hash(self, schema: Dict[str, Any]) -> str:
        """Compute a stable hash for a schema dictionary."""
        schema_json = json.dumps(schema, sort_keys=True)
        return hashlib.sha256(schema_json.encode("utf-8")).hexdigest()

    def _build_node_summaries(self, model: Any) -> Dict[str, Any]:
        """Build summaries for all nodes in a model."""
        return meta_summarize_nodes(getattr(model, "nodes", []))

    def _build_relationship_summaries(self, model: Any) -> Dict[str, Any]:
        """Build summaries for all relationships in a model."""
        return meta_summarize_relationships(getattr(model, "edges", []))
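`_graph_model_hash` leans on `json.dumps(..., sort_keys=True)` so that two logically identical schemas produce the same digest regardless of key order. A self-contained sketch of the same idea, with made-up schema dictionaries:

```python
import hashlib
import json

def model_hash(schema: dict) -> str:
    # Serialize with sorted keys so dict ordering cannot change the digest.
    return hashlib.sha256(
        json.dumps(schema, sort_keys=True).encode("utf-8")
    ).hexdigest()

a = {"nodes": ["Customer"], "edges": ["PLACED"]}
b = {"edges": ["PLACED"], "nodes": ["Customer"]}  # same content, reordered
assert model_hash(a) == model_hash(b)
```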
    def _load_existing_meta_graph(self, state: MigrationState) -> None:
        """Read stored migration metadata from Memgraph if available."""
        if not self.memgraph_client:
            return

        signature = self._compute_source_signature(state)
        query = (
            "MATCH (meta:MigrationAgent {source_host: $host, "
            "source_database: $database, source_type: $type}) "
            "RETURN meta LIMIT 1"
        )
        result = self.memgraph_client.query(
            query,
            {
                "host": signature["host"],
                "database": signature["database"],
                "type": signature["type"],
            },
        )

        if result:
            meta = result[0].get("meta", {})
            node_data = meta.get("node_summaries") or "{}"
            rel_data = meta.get("relationship_summaries") or "{}"
            table_counts = meta.get("table_counts") or "{}"
            self._existing_meta_graph = {
                "model_hash": meta.get("model_hash"),
                "node_summaries": json.loads(node_data),
                "relationship_summaries": json.loads(rel_data),
                "table_counts": json.loads(table_counts),
            }
            state["existing_meta_graph"] = self._existing_meta_graph
            logger.info(
                "Loaded existing migration metadata for %s/%s",
                signature["host"],
                signature["database"],
            )
        else:
            self._existing_meta_graph = None
            state["existing_meta_graph"] = None
            logger.info(
                "No existing migration metadata found for %s/%s",
                signature["host"],
                signature["database"],
            )

    def _calculate_ingestion_plan(
        self,
        graph_model: Any,
        structure: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Determine which nodes and relationships need migration."""
        plan = {
            "nodes": set(),
            "relationships": set(),
            "node_reasons": {},
            "relationship_reasons": {},
        }

        table_counts = structure.get("table_counts", {}) or {}
        existing = self._existing_meta_graph or {}
        existing_nodes = existing.get("node_summaries", {}) or {}
        existing_rels = existing.get("relationship_summaries", {}) or {}
        existing_counts = existing.get("table_counts", {}) or {}

        node_keys_by_source: Dict[str, str] = {}
        label_keys: Dict[str, str] = {}

        for node in getattr(graph_model, "nodes", []):
            key = self._node_key(node)
            summary = self._summarize_node(node)
            source_name = summary.get("source")
            if source_name:
                node_keys_by_source[source_name] = key
            label_key = "|".join(sorted(summary.get("labels", [])))
            label_keys[label_key] = key

            reasons: List[str] = []
            stored = existing_nodes.get(key)
            if not stored:
                reasons.append("new node definition")
            else:
                if summary["properties"] != stored.get("properties", []):
                    reasons.append("properties changed")
                if summary["id_field"] != stored.get("id_field"):
                    reasons.append("identifier changed")

            table_name = summary.get("source")
            if table_name:
                new_count = table_counts.get(table_name)
                old_count = existing_counts.get(table_name)
                if new_count is not None:
                    if old_count is None:
                        reasons.append("table count unavailable previously")
                    elif new_count != old_count:
                        if new_count > old_count:
                            reasons.append("source data increased")
                        else:
                            reasons.append("source data changed")

            if reasons or not existing_nodes:
                plan["nodes"].add(key)
                plan["node_reasons"][key] = reasons or ["initial migration"]

        for rel in getattr(graph_model, "edges", []):
            key = self._relationship_key(rel)
            summary = self._summarize_relationship(rel)
            reasons: List[str] = []
            stored = existing_rels.get(key)
            if not stored:
                reasons.append("new relationship definition")
            else:
                if summary["mapping"] != stored.get("mapping", {}):
                    reasons.append("mapping changed")
                if summary["start"] != stored.get("start", []):
                    reasons.append("start labels changed")
                if summary["end"] != stored.get("end", []):
                    reasons.append("end labels changed")

            start_key = None
            end_key = None
            start_table = summary.get("start_table")
            end_table = summary.get("end_table")
            if start_table and start_table in node_keys_by_source:
                start_key = node_keys_by_source[start_table]
            if end_table and end_table in node_keys_by_source:
                end_key = node_keys_by_source[end_table]
            if not start_key:
                label = "|".join(summary.get("start", []))
                start_key = label_keys.get(label)
            if not end_key:
                label = "|".join(summary.get("end", []))
                end_key = label_keys.get(label)

            dependent_update = False
            if start_key and start_key in plan["nodes"]:
                dependent_update = True
            if end_key and end_key in plan["nodes"]:
                dependent_update = True
            if dependent_update and "dependent node update" not in reasons:
                reasons.append("dependent node update")

            if reasons or not existing_rels:
                plan["relationships"].add(key)
                plan["relationship_reasons"][key] = reasons or ["initial migration"]

        self._ingestion_plan = plan
        return plan
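The plan is a plain dictionary of key sets plus human-readable reasons. For a re-run where only the customers table grew, it might look like the sketch below; the exact key strings come from meta_node_key/meta_relationship_key in core/utils/meta_graph.py (not shown here), so the values are hypothetical:

```python
plan = {
    "nodes": {"Customer|customers"},  # hypothetical key format
    "relationships": set(),
    "node_reasons": {"Customer|customers": ["source data increased"]},
    "relationship_reasons": {},
}
```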
    def _store_meta_graph(self, state: MigrationState) -> None:
        """Persist the current graph model metadata to Memgraph."""
        if not self.memgraph_client:
            return

        graph_model = state.get("graph_model")
        if not graph_model:
            return

        structure = state.get("database_structure", {})
        schema = self._graph_model_schema(graph_model)
        node_summaries = self._build_node_summaries(graph_model)
        rel_summaries = self._build_relationship_summaries(graph_model)
        table_counts = structure.get("table_counts", {}) or {}

        model_hash = self._graph_model_hash(schema)
        signature = self._source_signature or self._compute_source_signature(state)

        query = (
            "MERGE (meta:MigrationAgent {source_host: $host, "
            "source_database: $database, source_type: $type}) "
            "SET meta.last_migrated_at = datetime(), "
            "meta.model_hash = $model_hash, "
            "meta.schema = $schema, "
            "meta.node_summaries = $node_summaries, "
            "meta.relationship_summaries = $relationship_summaries, "
            "meta.table_counts = $table_counts"
        )

        self.memgraph_client.query(
            query,
            {
                "host": signature.get("host", ""),
                "database": signature.get("database", ""),
                "type": signature.get("type", ""),
                "model_hash": model_hash,
                "schema": json.dumps(schema, sort_keys=True),
                "node_summaries": json.dumps(node_summaries, sort_keys=True),
                "relationship_summaries": json.dumps(
                    rel_summaries,
                    sort_keys=True,
                ),
                "table_counts": json.dumps(table_counts, sort_keys=True),
            },
        )

        logger.info(
            "Stored migration metadata for %s/%s",
            signature.get("host", ""),
            signature.get("database", ""),
        )

    def _build_workflow(self) -> StateGraph:
        """Build the LangGraph workflow with clear separation of concerns."""
        workflow = StateGraph(MigrationState)

        # Add nodes - refactored for better modularity
        workflow.add_node(
            "connect_and_analyze_schema",
            self._connect_and_analyze_schema,
        )
        workflow.add_node(
            "create_graph_model",
            self._create_graph_model,
        )
        workflow.add_node(
            "create_indexes",
            self._create_indexes,
        )
        workflow.add_node(
            "generate_cypher_queries",
            self._generate_cypher_queries,
        )
        workflow.add_node(
            "prepare_target_database",
            self._prepare_target_database,
        )
        workflow.add_node(
            "execute_data_migration",
            self._execute_data_migration,
        )
        workflow.add_node(
            "validate_post_migration",
            self._validate_post_migration,
        )

        # Wire the nodes into a linear pipeline; each node handles its own errors
        workflow.add_edge(
            "connect_and_analyze_schema",
            "prepare_target_database",
        )
        workflow.add_edge(
            "prepare_target_database",
            "create_graph_model",
        )
        workflow.add_edge("create_graph_model", "create_indexes")
        workflow.add_edge("create_indexes", "generate_cypher_queries")
        workflow.add_edge("generate_cypher_queries", "execute_data_migration")
        workflow.add_edge("execute_data_migration", "validate_post_migration")
        workflow.add_edge("validate_post_migration", END)

        # Set entry point
        workflow.set_entry_point("connect_and_analyze_schema")

        # Return the workflow (not compiled) so caller can add checkpointer
        return workflow
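Since `_build_workflow` returns the uncompiled `StateGraph`, the caller chooses the compilation mode; the `migrate()` method further down does essentially this:

```python
from langgraph.checkpoint.memory import MemorySaver

compiled = agent.workflow.compile()  # automatic mode, no checkpointer
# or, for incremental mode, with a checkpointer and a thread id:
compiled = agent.workflow.compile(checkpointer=MemorySaver())
final_state = compiled.invoke(
    initial_state,  # the MigrationState dict that migrate() builds
    config={"configurable": {"thread_id": "migration_thread_1"}},
)
```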
    def _connect_and_analyze_schema(
        self,
        state: MigrationState,
    ) -> MigrationState:
        """Connect to source database and prepare info for HyGM."""
        logger.info("Preparing database connection for HyGM analysis...")

        try:
            # Initialize database analyzer to test connection
            source_config = state["source_db_config"].copy()
            db_type = source_config.pop("database_type", "mysql")

            database_analyzer = DatabaseAnalyzerFactory.create_analyzer(
                database_type=db_type,
                **source_config,
            )

            if not database_analyzer.connect():
                raise Exception("Failed to connect to source database")

            # Keep a handle so migrate() can disconnect it during cleanup
            self.database_analyzer = database_analyzer

            # Get basic database structure for HyGM
            db_structure = database_analyzer.get_database_structure()
            hygm_data = db_structure.to_hygm_format()

            # Store the database structure for HyGM
            state["database_structure"] = hygm_data
            state["total_tables"] = len(hygm_data.get("entity_tables", {}))
            state["current_step"] = "Database connection established"

            logger.info("Database structure prepared for HyGM analysis")

        except Exception as e:
            logger.error(f"Error connecting to database: {e}")
            state["errors"].append(f"Database connection failed: {e}")

        return state

    def _create_graph_model(self, state: MigrationState) -> MigrationState:
        """Create graph model using HyGM based on analyzed schema."""
        logger.info("Creating graph model using HyGM...")

        try:
            hygm_data = state["database_structure"]

            # Log the modeling mode being used
            if self.modeling_mode == ModelingMode.INCREMENTAL:
                logger.info(
                    "Using incremental graph modeling mode with an "
                    "end-of-session interactive refinement option"
                )
            else:
                logger.info("Using automatic graph modeling mode")

            # Create graph modeler with strategy and mode
            graph_modeler = HyGM(
                llm=self.llm,
                mode=self.modeling_mode,
                strategy=self.graph_modeling_strategy,
                existing_meta_graph=state.get("existing_meta_graph"),
            )

            # Log the strategy being used
            strategy_name = self.graph_modeling_strategy.value
            logger.info(f"Using {strategy_name} graph modeling strategy")

            # Generate graph model using new unified interface
            graph_model = graph_modeler.create_graph_model(
                hygm_data,
                domain_context="Database migration to graph database",
            )

            # Store the graph model in state
            state["graph_model"] = graph_model

            logger.info(
                f"Graph model created with {len(graph_model.nodes)} "
                f"node types and {len(graph_model.edges)} "
                f"relationship types"
            )

            state["current_step"] = "Graph model created successfully"

        except Exception as e:
            logger.error(f"Graph modeling failed: {e}")
            # HyGM is required - propagate the error
            return self._handle_step_error(state, "creating graph model", e)

        return state

    def _prepare_target_database(
        self,
        state: MigrationState,
    ) -> MigrationState:
        """Prepare the target Memgraph database for migration."""
        logger.info("Preparing target database for migration...")

        try:
            # Initialize Memgraph connection
            config_value = state.get("memgraph_config")
            if not config_value:
                raise Exception("Memgraph configuration is required")
            config = cast(Dict[str, str], config_value)

            url = config.get("url")
            if not url:
                raise Exception("Memgraph configuration must include 'url'")

            username = config.get("username") or ""
            password = config.get("password") or ""
            database = config.get("database") or "memgraph"

            self.memgraph_client = Memgraph(
                url=url,
                username=username,
                password=password,
                database=database,
            )

            # Test Memgraph connection
            test_query = "MATCH (n) RETURN count(n) as node_count LIMIT 1"
            self.memgraph_client.query(test_query)
            logger.info("Memgraph connection established successfully")

            # Load existing meta graph to plan incremental ingestion
            policy = getattr(self, "meta_graph_policy", "auto")
            if policy == "skip":
                logger.info("Meta graph loading skipped by configuration")
                self._existing_meta_graph = None
                state["existing_meta_graph"] = None
            else:
                self._load_existing_meta_graph(state)
                if self._existing_meta_graph:
                    if policy == "reset":
                        logger.info(
                            "Existing migration metadata ignored due to reset policy",
                        )
                        self._existing_meta_graph = None
                        state["existing_meta_graph"] = None
                    else:
                        logger.info(
                            "Existing migration metadata detected; data will be merged",
                        )
                else:
                    logger.info(
                        "No migration metadata found; treating this as an initial run",
                    )

            state["current_step"] = "Target database prepared successfully"

        except Exception as e:
            logger.error(f"Error preparing target database: {e}")
            state["errors"].append(f"Database preparation failed: {e}")
            state["current_step"] = "Database preparation failed"

        return state

    def _execute_data_migration(self, state: MigrationState) -> MigrationState:
        """Execute the actual data migration queries."""
        logger.info("Executing data migration...")

        try:
            memgraph_client = self.memgraph_client
            if not memgraph_client:
                raise Exception("Memgraph client is not initialized")

            queries = state["migration_queries"]

            # Execute all migration queries sequentially
            successful_queries = 0
            for i, query in enumerate(queries):
                # Skip empty queries but keep comment-only blocks for context
                query_lines = [line.strip() for line in query.strip().split("\n")]
                non_comment_lines = [
                    line for line in query_lines if line and not line.startswith("//")
                ]

                if non_comment_lines:  # Has actual Cypher code
                    try:
                        logger.info(
                            "Executing query %d/%d...",
                            i + 1,
                            len(queries),
                        )
                        memgraph_client.query(query)
                        successful_queries += 1

                        # Log progress for node creation queries
                        if "MERGE (n:" in query or "CREATE (n:" in query:
                            # Extract table name from comment line
                            # Comment format: "// Merge {label} nodes from {table} table (HyGM optimized)"
                            table_name = None
                            for line in query_lines:
                                if (
                                    line.startswith("//")
                                    and " from " in line
                                    and " table" in line
                                ):
                                    try:
                                        # Extract table name from comment
                                        parts = (
                                            line.split(" from ")[1]
                                            .split(" table")[0]
                                            .strip()
                                        )
                                        table_name = parts
                                        break
                                    except (IndexError, AttributeError):
                                        pass

                            if table_name:
                                logger.info(
                                    f"Successfully migrated data from table: "
                                    f"{table_name}"
                                )
                                # Update completed tables list
                                if table_name not in state["completed_tables"]:
                                    state["completed_tables"].append(table_name)
                        elif (
                            "MERGE (" in query or "CREATE (" in query
                        ) and "-[:" in query:
                            logger.info("Successfully created relationships")

                    except Exception as e:
                        logger.error(f"Failed to execute query {i + 1}: {e}")
                        logger.error(f"Query: {query[:100]}...")
                        state["errors"].append(f"Query execution failed: {e}")

            logger.info(
                f"Migration completed: {successful_queries}/{len(queries)} "
                f"queries executed successfully"
            )
            state["current_step"] = "Data migration completed"

        except Exception as e:
            logger.error(f"Error executing data migration: {e}")
            state["errors"].append(f"Data migration failed: {e}")

        return state

    def _execute_queries_with_logging(
        self,
        queries: List[str],
        query_type: str,
        memgraph_client: Memgraph,
        success_list: List[str],
        warning_prefix: str = "warning",
    ) -> None:
        """Execute queries with consistent logging and error handling."""
        for query in queries:
            try:
                logger.info("Creating %s: %s", query_type, query)
                memgraph_client.query(query)
                success_list.append(query)
            except Exception as e:
                # Some queries might already exist, log but continue
                logger.warning(
                    f"{query_type.capitalize()} creation {warning_prefix}: %s",
                    e,
                )

    def _handle_step_error(
        self,
        state: MigrationState,
        step_name: str,
        error: Exception,
    ) -> MigrationState:
        """Standardized error handling for workflow steps."""
        error_msg = f"Error {step_name}: {error}"
        failure_msg = f"{step_name.capitalize()} failed: {error}"

        logger.error(error_msg)
        state["errors"].append(failure_msg)
        state["current_step"] = f"{step_name.capitalize()} failed"

        return state

    def _create_indexes(self, state: MigrationState) -> MigrationState:
        """Create indexes and constraints in Memgraph before migration."""
        logger.info("Creating HyGM indexes and constraints...")

        try:
            # Use the existing Memgraph connection from prepare_target_database
            if not self.memgraph_client:
                raise Exception("No Memgraph connection available")

            # Track created indexes and constraints
            created_indexes = []
            created_constraints = []

            # Get the HyGM graph model (required)
            graph_model = state.get("graph_model")
            if not graph_model or not hasattr(graph_model, "node_indexes"):
                raise Exception("HyGM graph model with indexes is required")

            logger.info("Using HyGM-provided indexes and constraints")

            # Generate index queries from HyGM graph model
            index_queries = self.cypher_generator.generate_index_queries_from_hygm(
                graph_model.node_indexes
            )

            # Generate constraint queries from HyGM graph model
            constraint_queries = (
                self.cypher_generator.generate_constraint_queries_from_hygm(
                    graph_model.node_constraints
                )
            )

            logger.info(
                "HyGM provided %d indexes and %d constraints",
                len(index_queries),
                len(constraint_queries),
            )

            # Execute constraint queries first
            self._execute_queries_with_logging(
                constraint_queries,
                "constraint",
                self.memgraph_client,
                created_constraints,
            )

            # Execute index queries
            self._execute_queries_with_logging(
                index_queries, "index", self.memgraph_client, created_indexes
            )

            # Store results in state
            state["created_indexes"] = created_indexes
            state["created_constraints"] = created_constraints
            state["current_step"] = "HyGM indexes and constraints created"

            logger.info(
                "Created %d constraints and %d indexes from HyGM model",
                len(created_constraints),
                len(created_indexes),
            )

        except Exception as e:
            return self._handle_step_error(state, "creating indexes", e)

        return state

    def _generate_cypher_queries(
        self,
        state: MigrationState,
    ) -> MigrationState:
        """Generate merge-based Cypher queries using the ingestion plan."""
        logger.info("Generating Cypher queries based on HyGM graph model...")

        try:
            source_db_config = state["source_db_config"]
            graph_model = state.get("graph_model")
            if not graph_model:
                raise Exception("HyGM graph model is required for migration")

            self._current_graph_model = graph_model

            structure = state.get("database_structure", {})
            plan = self._calculate_ingestion_plan(graph_model, structure)
            nodes_to_migrate = plan["nodes"]
            relationships_to_migrate = plan["relationships"]

            if not nodes_to_migrate and not relationships_to_migrate:
                logger.info(
                    "Schema and table counts already match stored metadata; "
                    "no migration queries generated"
                )
                state["migration_queries"] = []
                state["current_step"] = "No new data to migrate"
                return state

            for node_key in sorted(nodes_to_migrate):
                reasons = plan["node_reasons"].get(node_key, [])
                reason_text = ", ".join(reasons) if reasons else "initial migration"
                logger.info("Node plan %s → %s", node_key, reason_text)

            for rel_key in sorted(relationships_to_migrate):
                reasons = plan["relationship_reasons"].get(rel_key, [])
                reason_text = ", ".join(reasons) if reasons else "initial migration"
                logger.info("Relationship plan %s → %s", rel_key, reason_text)

            queries: List[str] = []
            db_config_str = self._get_db_config_for_migrate(source_db_config)
            db_type = source_db_config.get("database_type", "mysql")
            procedure_name = f"migrate.{db_type}"
            logger.info("Using %s procedure for data ingestion", procedure_name)

            for node_def in graph_model.nodes:
                node_key = self._node_key(node_def)
                if nodes_to_migrate and node_key not in nodes_to_migrate:
                    continue

                source = getattr(node_def, "source", None)
                source_table = getattr(source, "name", None) or "unknown"
                qualified_table = self._qualify_table_name(
                    source_table, source_db_config
                )
                node_label = node_def.primary_label

                properties = [
                    prop.key if hasattr(prop, "key") else str(prop)
                    for prop in getattr(node_def, "properties", [])
                ]

                node_mapping = getattr(source, "mapping", {}) if source else {}
                id_field = node_mapping.get("id_field")
                if not id_field and properties:
                    id_field = properties[0]

                if id_field and id_field not in properties:
                    properties.append(id_field)

                if not id_field:
                    logger.warning(
                        "Skipping node %s: identifier field missing",
                        node_label,
                    )
                    continue

                if not properties:
                    logger.warning(
                        "No properties found for node %s from table %s",
                        node_label,
                        source_table,
                    )
                    continue

                properties_str = ", ".join(properties)
                node_query = f"""
// Merge {node_label} nodes from {source_table} table (HyGM optimized)
CALL {procedure_name}(
    'SELECT {properties_str} FROM {qualified_table}',
    {db_config_str}
)
YIELD row
MERGE (n:{node_label} {{{id_field}: row.{id_field}}})
SET n += row;"""
                queries.append(node_query)
                logger.info("Prepared merge query for %s", node_label)

            logger.info(
                "Preparing relationship queries for %d definitions",
                len(graph_model.edges),
            )

            for rel_def in graph_model.edges:
                rel_key = self._relationship_key(rel_def)
                if relationships_to_migrate and rel_key not in relationships_to_migrate:
                    continue

                rel_query = self._generate_hygm_relationship_query(
                    rel_def,
                    db_config_str,
                    source_db_config,
                    procedure_name,
                )
                if rel_query:
                    queries.append(rel_query)
                    logger.info(
                        "Prepared merge query for relationship %s",
                        rel_def.edge_type,
                    )

            state["migration_queries"] = queries
            state["current_step"] = "Migration queries prepared"

            logger.info("Generated %d migration queries", len(queries))

        except Exception as e:
            logger.error(f"Error generating HyGM-based Cypher queries: {e}")
            return self._handle_step_error(
                state,
                "generating cypher queries",
                e,
            )

        return state
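For a hypothetical customers table mapped to a Customer node with id as its identifier field, the node template above expands to roughly the following query (connection map elided); this is illustrative output, not code from the package:

```cypher
// Merge Customer nodes from customers table (HyGM optimized)
CALL migrate.mysql(
    'SELECT id, name, email FROM customers',
    {user: '...', password: '...', host: '...', database: '...'}
)
YIELD row
MERGE (n:Customer {id: row.id})
SET n += row;
```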
    def _generate_hygm_relationship_query(
        self,
        rel_def,
        db_config_str: str,
        source_db_config: Dict[str, Any],
        procedure_name: str,
    ) -> str:
        """Create relationship query from HyGM definition."""

        try:
            if not rel_def.source or not rel_def.source.mapping:
                logger.warning(
                    f"No source mapping for relationship {rel_def.edge_type}"
                )
                return ""

            rel_name = rel_def.edge_type
            source_info = rel_def.source.mapping

            # Determine relationship type from HyGM source
            if rel_def.source.type == "many_to_many":
                return self._generate_many_to_many_hygm_query(
                    rel_name,
                    rel_def,
                    source_info,
                    db_config_str,
                    source_db_config,
                    procedure_name,
                )
            elif rel_def.source.type in ["table", "foreign_key"]:
                return self._generate_one_to_many_hygm_query(
                    rel_name,
                    rel_def,
                    source_info,
                    db_config_str,
                    source_db_config,
                    procedure_name,
                )
            else:
                logger.warning(
                    "Unsupported relationship type: %s",
                    rel_def.source.type,
                )
                return ""

        except Exception as e:
            logger.error(
                "Error generating relationship query for %s: %s",
                rel_def.edge_type,
                e,
            )
            return ""

    def _generate_one_to_many_hygm_query(
        self,
        rel_name: str,
        rel_def,
        source_info: Dict[str, Any],
        db_config_str: str,
        source_db_config: Dict[str, Any],
        procedure_name: str,
    ) -> str:
        """Generate one-to-many relationship query from HyGM mapping."""

        start_node = source_info.get("start_node", "")
        end_node = source_info.get("end_node", "")
        from_pk = source_info.get("from_pk")

        if not start_node or not end_node:
            logger.error("Missing relationship information for %s", rel_name)
            raise Exception(
                "HyGM must provide complete relationship mapping for " f"{rel_name}"
            )

        try:
            from_table, fk_column = start_node.split(".", 1)
            to_table, to_column = end_node.split(".", 1)
        except ValueError:
            logger.error(
                "Invalid mapping format for %s: %s",
                rel_name,
                source_info,
            )
            raise Exception(
                f"HyGM must provide valid relationship mapping for {rel_name}"
            )

        if not from_pk:
            raise Exception(f"HyGM must provide primary key information for {rel_name}")

        from_label = (
            rel_def.start_node_labels[0] if rel_def.start_node_labels else from_table
        )
        to_label = rel_def.end_node_labels[0] if rel_def.end_node_labels else to_table

        qualified_from_table = self._qualify_table_name(from_table, source_db_config)

        select_sql = (
            f"SELECT {from_pk}, {fk_column} "
            f"FROM {qualified_from_table} "
            f"WHERE {fk_column} IS NOT NULL"
        )

        query = f"""
// Merge {rel_name} relationships (HyGM: {from_label} -> {to_label})
CALL {procedure_name}(
    '{select_sql}',
    {db_config_str}
)
YIELD row
MATCH (from_node:{from_label} {{{from_pk}: row.{from_pk}}})
MATCH (to_node:{to_label} {{{to_column}: row.{fk_column}}})
MERGE (from_node)-[:{rel_name}]->(to_node);"""

        return query
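With a hypothetical mapping of start_node="orders.customer_id", end_node="customers.id", from_pk="id", and labels Order/Customer for an ORDERED_BY edge, the one-to-many template expands to roughly this illustrative output:

```cypher
// Merge ORDERED_BY relationships (HyGM: Order -> Customer)
CALL migrate.mysql(
    'SELECT id, customer_id FROM orders WHERE customer_id IS NOT NULL',
    {user: '...', password: '...', host: '...', database: '...'}
)
YIELD row
MATCH (from_node:Order {id: row.id})
MATCH (to_node:Customer {id: row.customer_id})
MERGE (from_node)-[:ORDERED_BY]->(to_node);
```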
    def _generate_many_to_many_hygm_query(
        self,
        rel_name: str,
        rel_def,
        source_info: Dict[str, Any],
        db_config_str: str,
        source_db_config: Dict[str, Any],
        procedure_name: str,
    ) -> str:
        """Generate many-to-many relationship query from HyGM mapping."""

        join_table = source_info.get("join_table")
        from_table = source_info.get("from_table")
        to_table = source_info.get("to_table")
        from_fk = source_info.get("join_from_column")
        to_fk = source_info.get("join_to_column")
        from_pk = source_info.get("from_column")
        to_pk = source_info.get("to_column")

        if not all([join_table, from_table, to_table, from_fk, to_fk, from_pk, to_pk]):
            logger.error(
                "Missing many-to-many relationship information for %s",
                rel_name,
            )
            raise Exception(
                "HyGM must provide complete many-to-many mapping for " f"{rel_name}"
            )

        from_label = (
            rel_def.start_node_labels[0] if rel_def.start_node_labels else from_table
        )
        to_label = rel_def.end_node_labels[0] if rel_def.end_node_labels else to_table

        join_table_name = cast(str, join_table)
        qualified_join_table = self._qualify_table_name(
            join_table_name, source_db_config
        )

        select_sql = f"SELECT {from_fk}, {to_fk} FROM {qualified_join_table}"

        query = f"""
// Merge {rel_name} relationships via {join_table}
// (HyGM: {from_label} <-> {to_label})
CALL {procedure_name}(
    '{select_sql}',
    {db_config_str}
)
YIELD row
MATCH (from:{from_label} {{{from_pk}: row.{from_fk}}})
MATCH (to:{to_label} {{{to_pk}: row.{to_fk}}})
MERGE (from)-[:{rel_name}]->(to);"""
        return query

    def _validate_post_migration(
        self,
        state: MigrationState,
    ) -> MigrationState:
        """Validate post-migration results using HyGM schema comparison."""
        logger.info("Running post-migration validation...")

        try:
            # Check if we have a graph model to validate against
            if not state.get("graph_model"):
                logger.warning("No graph model available for validation")
                state["validation_report"] = {
                    "success": False,
                    "reason": "No graph model available",
                }
                state["current_step"] = "Post-migration validation skipped"
                return state

            # Reuse existing Memgraph connection from previous steps
            if not self.memgraph_client:
                logger.error("No Memgraph connection available for validation")
                state["validation_report"] = {
                    "success": False,
                    "reason": "No Memgraph connection available",
                }
                state["current_step"] = "Post-migration validation failed"
                return state

            # Get the graph model from state
            graph_model = state.get("graph_model")

            # Calculate expected data counts from the source database
            structure = state["database_structure"]
            expected_nodes = 0
            table_counts = structure.get("table_counts", {})

            # Default to all migrated tables when nothing specific is selected
            selected_tables = structure.get("selected_tables", [])
            if not selected_tables:
                # Use entity tables (exclude views and system tables)
                entity_tables = structure.get("entity_tables", {})
                selected_tables = list(entity_tables.keys())

            for table_name in selected_tables:
                if table_name in table_counts:
                    expected_nodes += table_counts[table_name]

            # Create expected data counts for the validator
            expected_data_counts = {
                "nodes": expected_nodes,
                "selected_tables": selected_tables,
            }

            # Run validation using existing connection and data counts
            logger.info("Executing post-migration validation...")
            validation_result = validate_memgraph_data(
                expected_model=graph_model,
                memgraph_connection=self.memgraph_client,
                expected_data_counts=expected_data_counts,
                detailed_report=True,
            )

            # Store validation results in state
            state["validation_report"] = {
                "success": validation_result.success,
                "summary": validation_result.summary,
                "validation_score": validation_result.details.get(
                    "validation_score", 0
                ),
                "issues": [
                    {
                        "severity": issue.severity.value,
                        "category": issue.category,
                        "message": issue.message,
                        "expected": issue.expected,
                        "actual": issue.actual,
                        "recommendation": issue.recommendation,
                    }
                    for issue in validation_result.issues
                ],
                "metrics": validation_result.metrics,
            }

            # Log validation summary
            if validation_result.success:
                logger.info("✅ Post-migration validation PASSED")
                score = int(validation_result.details.get("validation_score", 0))
                logger.info(f"Validation score: {score}/100")
            else:
                logger.warning("⚠️ Post-migration validation found issues")
                score = int(validation_result.details.get("validation_score", 0))
                logger.warning(f"Validation score: {score}/100")

                # Log critical issues
                critical_issues = [
                    issue
                    for issue in validation_result.issues
                    if issue.severity.value == "CRITICAL"
                ]
                if critical_issues:
                    count = len(critical_issues)
                    logger.error(f"Found {count} critical validation issues:")
                    # Show first 3 critical issues
                    for issue in critical_issues[:3]:
                        logger.error(f" - {issue.message}")

            state["current_step"] = "Post-migration validation completed"

            if not state["errors"]:
                self._store_meta_graph(state)

        except Exception as e:
            logger.error(f"Error during post-migration validation: {e}")
            state["errors"].append(f"Post-migration validation failed: {e}")
            state["validation_report"] = {
                "validation_performed": False,
                "reason": f"Validation error: {e}",
            }
            state["current_step"] = "Post-migration validation failed"

        return state

    def migrate(
        self,
        source_db_config: Dict[str, str],
        memgraph_config: Optional[Dict[str, str]] = None,
    ) -> Dict[str, Any]:
        """Execute the complete migration workflow."""
        logger.info("Starting SQL database to graph migration...")

        # Initialize state
        initial_state = MigrationState(
            source_db_config=source_db_config,
            memgraph_config=memgraph_config,
            database_structure={},
            graph_model=None,
            migration_queries=[],
            current_step="Starting migration",
            errors=[],
            completed_tables=[],
            total_tables=0,
            created_indexes=[],
            created_constraints=[],
            validation_report={},
            existing_meta_graph=None,
        )

        try:
            # Automatic mode compiles workflow without a checkpointer
            if self.modeling_mode == ModelingMode.AUTOMATIC:
                compiled_workflow = self.workflow.compile()
                final_state = compiled_workflow.invoke(initial_state)
            else:
                # Incremental mode enables a persistent checkpointer
                from langgraph.checkpoint.memory import MemorySaver

                memory = MemorySaver()
                compiled_workflow = self.workflow.compile(checkpointer=memory)

                # Provide required configuration for checkpointer
                config: RunnableConfig = {
                    "configurable": {"thread_id": "migration_thread_1"}
                }
                final_state = compiled_workflow.invoke(
                    initial_state,
                    config=config,
                )

            # Cleanup connections
            if self.database_analyzer:
                self.database_analyzer.disconnect()
            if self.memgraph_client:
                self.memgraph_client.close()

            return {
                "success": len(final_state["errors"]) == 0,
                "completed_tables": final_state["completed_tables"],
                "total_tables": final_state["total_tables"],
                "errors": final_state["errors"],
                "final_step": final_state["current_step"],
                "validation_report": final_state.get("validation_report", {}),
            }

        except Exception as e:
            logger.error(f"Migration workflow failed: {e}")
            return {
                "success": False,
                "errors": [f"Workflow execution failed: {e}"],
                "completed_tables": [],
                "total_tables": 0,
                "final_step": "Failed",
                "validation_report": {},
            }
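Taken together, a minimal end-to-end run looks like the sketch below; the connection values are placeholders, but the dictionary keys and result fields are the ones this module actually reads and returns:

```python
from core.migration_agent import SQLToMemgraphAgent

agent = SQLToMemgraphAgent()  # deterministic modeling; no LLM required

result = agent.migrate(
    source_db_config={
        "database_type": "mysql",
        "host": "localhost",
        "port": 3306,
        "user": "app",
        "password": "secret",
        "database": "shop",
    },
    memgraph_config={"url": "bolt://localhost:7687"},
)

if result["success"]:
    print("Migrated tables:", result["completed_tables"])
else:
    print("Errors:", result["errors"])
```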