structured2graph 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. __init__.py +47 -0
  2. core/__init__.py +23 -0
  3. core/hygm/__init__.py +74 -0
  4. core/hygm/hygm.py +2351 -0
  5. core/hygm/models/__init__.py +82 -0
  6. core/hygm/models/graph_models.py +667 -0
  7. core/hygm/models/llm_models.py +229 -0
  8. core/hygm/models/operations.py +176 -0
  9. core/hygm/models/sources.py +68 -0
  10. core/hygm/models/user_operations.py +139 -0
  11. core/hygm/strategies/__init__.py +17 -0
  12. core/hygm/strategies/base.py +36 -0
  13. core/hygm/strategies/deterministic.py +262 -0
  14. core/hygm/strategies/llm.py +904 -0
  15. core/hygm/validation/__init__.py +38 -0
  16. core/hygm/validation/base.py +194 -0
  17. core/hygm/validation/graph_schema_validator.py +687 -0
  18. core/hygm/validation/memgraph_data_validator.py +991 -0
  19. core/migration_agent.py +1369 -0
  20. core/schema/spec.json +155 -0
  21. core/utils/meta_graph.py +108 -0
  22. database/__init__.py +36 -0
  23. database/adapters/__init__.py +11 -0
  24. database/adapters/memgraph.py +318 -0
  25. database/adapters/mysql.py +311 -0
  26. database/adapters/postgresql.py +335 -0
  27. database/analyzer.py +396 -0
  28. database/factory.py +219 -0
  29. database/models.py +209 -0
  30. main.py +518 -0
  31. query_generation/__init__.py +20 -0
  32. query_generation/cypher_generator.py +129 -0
  33. query_generation/schema_utilities.py +88 -0
  34. structured2graph-0.1.1.dist-info/METADATA +197 -0
  35. structured2graph-0.1.1.dist-info/RECORD +41 -0
  36. structured2graph-0.1.1.dist-info/WHEEL +4 -0
  37. structured2graph-0.1.1.dist-info/entry_points.txt +2 -0
  38. structured2graph-0.1.1.dist-info/licenses/LICENSE +21 -0
  39. utils/__init__.py +57 -0
  40. utils/config.py +235 -0
  41. utils/environment.py +404 -0
@@ -0,0 +1,904 @@
1
+ """
2
+ LLM-powered modeling strategy for Hypothetical Graph Modeling (HyGM).
3
+
4
+ This strategy uses AI/LLM models to create sophisticated graph models.
5
+ Supports multiple providers: OpenAI, Anthropic, Gemini via LangChain.
6
+ """
7
+
8
+ import logging
9
+ from typing import Dict, Any, Optional, List, TYPE_CHECKING
10
+ from langchain_core.language_models import BaseChatModel
11
+
12
+ if TYPE_CHECKING:
13
+ from core.hygm.models.graph_models import GraphModel
14
+
15
+ try:
16
+ from .base import BaseModelingStrategy
17
+ except ImportError:
18
+ from core.hygm.strategies.base import BaseModelingStrategy
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class LLMStrategy(BaseModelingStrategy):
    """
    LLM-powered graph modeling strategy using AI for intelligent mapping.

    Uses LangChain's BaseChatModel interface to support multiple providers.

    The strategy prompts a chat model for a structured graph design
    (via ``with_structured_output``) and then converts that response back
    into the internal ``GraphModel`` format, re-attaching the technical
    source-table/foreign-key details needed by the migration agent.
    """
30
+ def __init__(
31
+ self,
32
+ llm_client: Optional[BaseChatModel] = None,
33
+ model_name: str = "gpt-4o-mini",
34
+ temperature: float = 0.1,
35
+ ):
36
+ """
37
+ Initialize LLM strategy.
38
+
39
+ Args:
40
+ llm_client: LangChain chat model (ChatOpenAI/ChatAnthropic/ChatGoogleGenerativeAI)
41
+ model_name: Model to use for graph generation
42
+ temperature: Temperature for generation (lower=more deterministic)
43
+ """
44
+ self.llm_client = llm_client
45
+ self.model_name = model_name
46
+ self.temperature = temperature
47
+ self._database_structure = {}
48
+ self._current_llm_model = None
49
+
50
+ def get_strategy_name(self) -> str:
51
+ """Return the name of this strategy."""
52
+ return "llm"
53
+
54
+ def create_model(
55
+ self,
56
+ database_structure: Dict[str, Any],
57
+ domain_context: Optional[str] = None,
58
+ user_operation_context: Optional[str] = None,
59
+ ) -> "GraphModel":
60
+ """
61
+ Create a sophisticated graph model using LLM analysis.
62
+
63
+ Args:
64
+ database_structure: Database schema and structure
65
+ domain_context: Optional domain context for better modeling
66
+ user_operation_context: Context about user operations to preserve
67
+
68
+ Returns:
69
+ GraphModel: AI-generated graph model
70
+
71
+ Raises:
72
+ ValueError: If no LLM client is provided
73
+ Exception: If LLM model creation fails
74
+ """
75
+ logger.info("Creating LLM-powered graph model...")
76
+
77
+ # Store database structure for relationship mapping
78
+ self._database_structure = database_structure
79
+
80
+ if not self.llm_client:
81
+ raise ValueError(
82
+ "No LLM client provided for LLM-powered modeling. "
83
+ "Please configure OpenAI API key or use deterministic strategy."
84
+ )
85
+
86
+ return self._create_llm_model(
87
+ database_structure, domain_context, user_operation_context
88
+ )
89
+
90
    def _create_llm_model(
        self,
        database_structure: Dict[str, Any],
        domain_context: Optional[str] = None,
        user_operation_context: Optional[str] = None,
    ) -> "GraphModel":
        """Create model using LLM structured output.

        Builds a modeling prompt, invokes the chat client with
        LangChain's structured-output wrapper bound to ``LLMGraphModel``,
        and converts the response into the internal GraphModel format.

        Raises:
            ImportError: if the LLM models module cannot be imported.
            ValueError: if no LLM client is available.
            RuntimeError: wrapping any other failure during generation.
        """
        logger.info("Using LLM to generate graph model...")

        try:
            # Import the structured output model (kept local so the
            # strategy module imports cleanly without it).
            from core.hygm.models.llm_models import LLMGraphModel

            # Prepare the prompt from the schema plus optional contexts.
            prompt = self._build_modeling_prompt(
                database_structure, domain_context, user_operation_context
            )

            # Call LLM with unified client interface.
            if not self.llm_client:
                raise ValueError("No LLM client available")

            # System message frames the task for the chat model.
            system_message = (
                "You are an expert database architect specializing "
                "in converting relational schemas to graph models. "
                "Analyze the provided database structure and create "
                "an optimal graph model that preserves relationships "
                "and enables efficient querying."
            )

            # Generate the structured output using LangChain's
            # with_structured_output; the result is an LLMGraphModel.
            structured_llm = self.llm_client.with_structured_output(LLMGraphModel)
            llm_model = structured_llm.invoke(
                [
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": prompt},
                ]
            )

            logger.info(
                "LLM generated %d nodes and %d relationships",
                len(llm_model.nodes),
                len(llm_model.relationships),
            )

            # Extract and convert LLM response to internal graph model.
            return self._convert_llm_to_graph_model(llm_model)

        except ImportError as e:
            # Missing project module — surface as ImportError unchanged.
            error_msg = f"Missing LLM models module: {e}"
            logger.error(error_msg)
            raise ImportError(error_msg) from e
        except Exception as e:
            # Any other failure (network, validation, conversion) is
            # normalized to RuntimeError for callers.
            error_msg = f"LLM model creation failed: {e}"
            logger.error(error_msg)
            raise RuntimeError(error_msg) from e
147
+
148
+ def _build_modeling_prompt(
149
+ self,
150
+ database_structure: Dict[str, Any],
151
+ domain_context: Optional[str] = None,
152
+ user_operation_context: Optional[str] = None,
153
+ ) -> str:
154
+ """Build the prompt for LLM graph modeling."""
155
+
156
+ prompt_parts = [
157
+ "Convert this relational database schema to an optimal graph model.",
158
+ "Analyze the database structure and create nodes and relationships that:",
159
+ "",
160
+ "Database Structure:",
161
+ str(database_structure),
162
+ "",
163
+ "Requirements:",
164
+ "- Create semantic node labels (not just table names)",
165
+ "- Identify meaningful relationships based on foreign keys",
166
+ "- Include relevant properties from source tables on nodes",
167
+ "- Set appropriate primary keys for each node",
168
+ "- **CRITICAL: Recommend indexes for ALL relation properties**",
169
+ "- **CRITICAL: Both relationship ends MUST have indexes**",
170
+ "- Include unique constraints for primary keys and unique fields",
171
+ "- Use descriptive relationship names (e.g., 'OWNS', 'BELONGS_TO')",
172
+ "- Consider one-to-many, many-to-many, and one-to-one types",
173
+ "- Optimize for both data integrity and query performance",
174
+ "",
175
+ "Index Guidelines:",
176
+ "- Primary keys should always have indexes",
177
+ "- Foreign key properties should always have indexes",
178
+ "- Unique fields should have indexes",
179
+ "- Properties in WHERE clauses should have indexes",
180
+ "",
181
+ "Constraint Guidelines:",
182
+ "- Primary key properties should have unique constraints",
183
+ "- Unique business fields should have unique constraints",
184
+ "- Consider data integrity requirements from source database",
185
+ ]
186
+
187
+ # Add user operation context if provided (critical for preserving user changes)
188
+ if user_operation_context:
189
+ prompt_parts.extend(
190
+ [
191
+ "",
192
+ "⚠️ CRITICAL REQUIREMENT:",
193
+ user_operation_context,
194
+ "⚠️ YOU MUST PRESERVE ALL USER CHANGES LISTED ABOVE.",
195
+ "DO NOT REVERT ANY USER OPERATIONS WHEN CREATING THE MODEL.",
196
+ "",
197
+ ]
198
+ )
199
+
200
+ if domain_context:
201
+ prompt_parts.extend(
202
+ [
203
+ "",
204
+ f"Domain Context: {domain_context}",
205
+ "Use this context to create more semantically meaningful "
206
+ "models.",
207
+ ]
208
+ )
209
+
210
+ return "\n".join(prompt_parts)
211
+
212
    def _convert_llm_to_graph_model(self, llm_model) -> "GraphModel":
        """Convert LLM response to internal GraphModel format.

        Walks the LLM output (nodes, relationships, per-node index and
        constraint lists) and rebuilds it as GraphModel objects, while
        re-attaching source-table / foreign-key details from the stored
        database structure so the migration agent can execute it.
        """
        # Store the LLM model for use in relationship mapping
        # (helpers like _get_source_table_for_node read it from self).
        self._current_llm_model = llm_model

        from core.hygm.models.graph_models import (
            GraphModel,
            GraphNode,
            GraphRelationship,
            GraphProperty,
            GraphIndex,
            GraphConstraint,
        )
        from core.hygm.models.sources import (
            NodeSource,
            PropertySource,
            RelationshipSource,
            IndexSource,
            ConstraintSource,
        )

        # Convert nodes - preserve original table mapping
        nodes = []
        for llm_node in llm_model.nodes:
            source = NodeSource(
                type="table",  # Keep as table source for migration
                name=llm_node.source_table,  # Use actual source table name
                location=f"database.schema.{llm_node.source_table}",
                mapping={"labels": llm_node.labels},
            )

            # Each property is traced back to "<table>.<column>".
            properties = []
            for prop_name in llm_node.properties:
                field_path = f"{llm_node.source_table}.{prop_name}"
                prop_source = PropertySource(field=field_path)
                graph_prop = GraphProperty(key=prop_name, source=prop_source)
                properties.append(graph_prop)

            node = GraphNode(
                labels=llm_node.labels, properties=properties, source=source
            )
            nodes.append(node)

        # Convert relationships - preserve database structure mapping
        relationships = []
        for llm_rel in llm_model.relationships:
            # Find the source database relationship information
            db_rel_info = self._find_database_relationship(llm_rel)

            if db_rel_info:
                # Use actual database structure for migration mapping
                from_table = db_rel_info.get("from_table", "")
                location = f"database.schema.{from_table}"
                rel_source = RelationshipSource(
                    type=db_rel_info.get("source_type", "table"),
                    name=db_rel_info.get("constraint_name", llm_rel.name),
                    location=location,
                    mapping=db_rel_info.get("mapping", {}),
                )
            else:
                # Get source tables for the relationship nodes
                from_table = self._get_source_table_for_node(llm_rel.from_node)
                to_table = self._get_source_table_for_node(llm_rel.to_node)

                # If we can't find direct mapping, create basic table mapping
                if from_table and to_table:
                    # Get primary keys for proper mapping
                    from_pk = self._get_table_primary_key(from_table)
                    to_pk = self._get_table_primary_key(to_table)

                    mapping = {
                        "start_node": f"{from_table}.{from_pk}",
                        "end_node": f"{to_table}.{to_pk}",
                        "from_pk": from_pk,
                        "edge_type": llm_rel.name,
                        "directionality": llm_rel.directionality,
                    }
                    location = f"database.schema.{from_table}"
                else:
                    # No table mapping at all: placeholder endpoints so
                    # downstream consumers still see the expected keys.
                    mapping = {
                        "edge_type": llm_rel.name,
                        "directionality": llm_rel.directionality,
                        # Still need start_node and end_node for migration agent
                        "start_node": "unknown.id",
                        "end_node": "unknown.id",
                        "from_pk": "id",
                    }
                    location = "ai_analysis"

                # Fallback for relationships without database mapping
                # Use "table" as a supported type for migration agent
                rel_source = RelationshipSource(
                    type="table",
                    name=llm_rel.name,
                    location=location,
                    mapping=mapping,
                )

            # Relationship properties come straight from the LLM output
            # (no backing column), hence the "llm." field prefix.
            properties = []
            for prop_name in llm_rel.properties:
                prop_source = PropertySource(field=f"llm.{prop_name}")
                graph_prop = GraphProperty(key=prop_name, source=prop_source)
                properties.append(graph_prop)

            # Map LLM node names to actual node labels
            start_labels = self._map_node_name_to_labels(llm_rel.from_node, llm_model)
            end_labels = self._map_node_name_to_labels(llm_rel.to_node, llm_model)

            relationship = GraphRelationship(
                edge_type=llm_rel.name,
                start_node_labels=start_labels,
                end_node_labels=end_labels,
                properties=properties,
                source=rel_source,
                directionality=llm_rel.directionality,
            )
            relationships.append(relationship)

        # Convert indexes from node-level index specifications
        indexes = []
        for llm_node in llm_model.nodes:
            for index_prop in llm_node.indexes:
                index_source = IndexSource(
                    origin="llm_recommendation",
                    reason=f"Index recommended by LLM for {llm_node.name}."
                    f"{index_prop}",
                    created_by="ai_analysis",
                    index_name=None,
                    migrated_from=None,
                )

                graph_index = GraphIndex(
                    labels=llm_node.labels,
                    properties=[index_prop],
                    type="btree",  # Default index type
                    source=index_source,
                )
                indexes.append(graph_index)

        # Convert constraints from node-level constraint specifications
        constraints = []
        for llm_node in llm_model.nodes:
            for constraint_prop in llm_node.constraints:
                constraint_source = ConstraintSource(
                    origin="llm_recommendation",
                    constraint_name=f"ai_unique_constraint_{llm_node.name}_"
                    f"{constraint_prop}",
                    migrated_from="ai_analysis",
                )

                graph_constraint = GraphConstraint(
                    type="unique",  # Assume unique constraints
                    labels=llm_node.labels,
                    properties=[constraint_prop],
                    source=constraint_source,
                )
                constraints.append(graph_constraint)

        return GraphModel(
            nodes=nodes,
            edges=relationships,
            node_indexes=indexes,
            node_constraints=constraints,
        )
376
+
377
    def _find_database_relationship(self, llm_rel) -> Dict[str, Any]:
        """
        Find the original database relationship information for an LLM
        relationship.

        This method matches LLM-generated relationships back to the original
        database structure to preserve technical migration details.

        Matching order: (1) by the LLM nodes' declared source tables,
        (2) by fuzzy table-name/node-name matching, (3) by inferring from
        foreign keys. Returns {} only if every stage fails.
        """
        if not hasattr(self, "_database_structure"):
            return {}

        relationships = self._database_structure.get("relationships", [])
        entity_tables = self._database_structure.get("entity_tables", {})

        # First, try to get source table names from LLM model context
        from_table_name = self._get_source_table_for_node(llm_rel.from_node)
        to_table_name = self._get_source_table_for_node(llm_rel.to_node)

        # Try to match by source table names if available
        if from_table_name and to_table_name:
            for db_rel in relationships:
                # Relationships may arrive as dicts or as objects with
                # attributes; support both shapes.
                if isinstance(db_rel, dict):
                    from_table = db_rel.get("from_table", "").lower()
                    to_table = db_rel.get("to_table", "").lower()
                    rel_type = db_rel.get("relationship_type", "one_to_many")
                else:
                    # Handle object format
                    from_table = db_rel.from_table.lower()
                    to_table = db_rel.to_table.lower()
                    rel_type = db_rel.relationship_type

                # Check if this database relationship matches the source tables
                if (
                    from_table == from_table_name.lower()
                    and to_table == to_table_name.lower()
                ):
                    return self._build_relationship_mapping(db_rel, llm_rel, rel_type)

                # Also try reverse direction
                if (
                    to_table == from_table_name.lower()
                    and from_table == to_table_name.lower()
                ):
                    return self._build_relationship_mapping(
                        db_rel, llm_rel, rel_type, reverse=True
                    )

        # Fallback: Try to match by node names and relationship semantics
        for db_rel in relationships:
            if isinstance(db_rel, dict):
                from_table = db_rel.get("from_table", "").lower()
                to_table = db_rel.get("to_table", "").lower()
                rel_type = db_rel.get("relationship_type", "one_to_many")
            else:
                # Handle object format
                from_table = db_rel.from_table.lower()
                to_table = db_rel.to_table.lower()
                rel_type = db_rel.relationship_type

            # Check if this database relationship matches the LLM relationship
            if self._tables_match_nodes(
                from_table, llm_rel.from_node
            ) and self._tables_match_nodes(to_table, llm_rel.to_node):
                return self._build_relationship_mapping(db_rel, llm_rel, rel_type)

            # Also try reverse direction for bidirectional relationships
            if self._tables_match_nodes(
                to_table, llm_rel.from_node
            ) and self._tables_match_nodes(from_table, llm_rel.to_node):
                return self._build_relationship_mapping(
                    db_rel, llm_rel, rel_type, reverse=True
                )

        # If no direct match found, try to infer from foreign keys
        return self._infer_relationship_from_foreign_keys(llm_rel, entity_tables)
452
+
453
+ def _get_source_table_for_node(self, node_name: str) -> str:
454
+ """Get the source table name for a given LLM node name."""
455
+ # This should be called within _convert_llm_to_graph_model
456
+ # where we have access to the LLM model context
457
+ if hasattr(self, "_current_llm_model") and self._current_llm_model:
458
+ for llm_node in self._current_llm_model.nodes:
459
+ # Check by node name or any of the labels
460
+ if llm_node.name.lower() == node_name.lower() or any(
461
+ label.lower() == node_name.lower() for label in llm_node.labels
462
+ ):
463
+ return llm_node.source_table
464
+
465
+ # Also check if node_name matches any variation of source_table
466
+ source_table = llm_node.source_table
467
+ if self._tables_match_nodes(source_table, node_name):
468
+ return source_table
469
+ return ""
470
+
471
    def _build_relationship_mapping(self, db_rel, llm_rel, rel_type, reverse=False):
        """Build relationship mapping from database relationship.

        Produces the migration-agent dict (source_type, constraint_name,
        from/to tables, and a column-level ``mapping``). When ``reverse``
        is True the from/to sides of the database relationship are
        swapped, including the join-table columns.
        """
        # db_rel may be a plain dict or an object; the reverse flag just
        # flips which key/attribute feeds each side.
        if isinstance(db_rel, dict):
            from_table = db_rel.get("to_table" if reverse else "from_table", "")
            to_table = db_rel.get("from_table" if reverse else "to_table", "")
            from_col = db_rel.get("to_column" if reverse else "from_column", "id")
            to_col = db_rel.get("from_column" if reverse else "to_column", "id")
            join_table = db_rel.get("join_table")
            join_from_col = db_rel.get(
                "join_to_column" if reverse else "join_from_column"
            )
            join_to_col = db_rel.get(
                "join_from_column" if reverse else "join_to_column"
            )
        else:
            from_table = db_rel.to_table if reverse else db_rel.from_table
            to_table = db_rel.from_table if reverse else db_rel.to_table
            from_col = db_rel.to_column if reverse else db_rel.from_column
            to_col = db_rel.from_column if reverse else db_rel.to_column
            join_table = getattr(db_rel, "join_table", None)
            join_from_col = getattr(
                db_rel, "join_to_column" if reverse else "join_from_column", None
            )
            join_to_col = getattr(
                db_rel, "join_from_column" if reverse else "join_to_column", None
            )

        # Get primary keys for the tables from database structure
        from_table_pk = self._get_table_primary_key(from_table)

        mapping = {
            "start_node": f"{from_table}.{from_col}",
            "end_node": f"{to_table}.{to_col}",
            "edge_type": llm_rel.name,
            "from_pk": from_table_pk,  # Add primary key for migration agent
        }

        # Add many-to-many specific information if available
        if rel_type == "many_to_many" and join_table:
            mapping.update(
                {
                    "join_table": join_table,
                    "join_from_column": join_from_col,
                    "join_to_column": join_to_col,
                    "from_table": from_table,
                    "to_table": to_table,
                    "from_column": from_col,
                    "to_column": to_col,
                }
            )

        # Determine source type
        source_type = "many_to_many" if rel_type == "many_to_many" else "table"

        # Get constraint name with proper handling.
        # NOTE(review): the dict branch takes db_rel["constraint_name"]
        # even if it is None/empty, while the object branch falls back to
        # llm_rel.name in that case — confirm this asymmetry is intended.
        constraint_name = llm_rel.name
        if isinstance(db_rel, dict):
            constraint_name = db_rel.get("constraint_name", llm_rel.name)
        elif hasattr(db_rel, "constraint_name") and db_rel.constraint_name:
            constraint_name = db_rel.constraint_name

        return {
            "source_type": source_type,
            "constraint_name": constraint_name,
            "from_table": from_table,
            "to_table": to_table,
            "mapping": mapping,
        }
539
+
540
    def _infer_relationship_from_foreign_keys(self, llm_rel, entity_tables):
        """Infer relationship mapping from foreign key information.

        Resolves both endpoints of the LLM relationship to entity tables,
        then searches each table's foreign keys (forward, then reverse)
        for a column pair connecting them. Falls back to
        _infer_problematic_relationships when FK evidence is missing.
        """
        # Find tables that match the relationship nodes
        from_table_name = None
        to_table_name = None

        for table_name in entity_tables.keys():
            if self._tables_match_nodes(table_name, llm_rel.from_node):
                from_table_name = table_name
            if self._tables_match_nodes(table_name, llm_rel.to_node):
                to_table_name = table_name

        if not from_table_name or not to_table_name:
            logger.warning(
                "Could not find matching tables for relationship %s: "
                "from_node=%s->%s, to_node=%s->%s",
                llm_rel.name,
                llm_rel.from_node,
                from_table_name,
                llm_rel.to_node,
                to_table_name,
            )

            # Additional debug info: show available node mappings
            if hasattr(self, "_current_llm_model") and self._current_llm_model:
                logger.debug("Available LLM nodes:")
                for node in self._current_llm_model.nodes:
                    logger.debug(
                        "  Node: %s -> Source table: %s (Labels: %s)",
                        node.name,
                        node.source_table,
                        node.labels,
                    )

            # Try additional inference for known problematic patterns
            inferred_mapping = self._infer_problematic_relationships(
                llm_rel, entity_tables
            )
            if inferred_mapping:
                return inferred_mapping

            return {}

        # Check if from_table has a foreign key to to_table
        from_table_info = entity_tables.get(from_table_name, {})
        foreign_keys = from_table_info.get("foreign_keys", [])

        for fk in foreign_keys:
            # FK entries may be dicts or objects; note the attribute is
            # "column_name" while the dict key is "column".
            if isinstance(fk, dict):
                referenced_table = fk.get("referenced_table", "")
                fk_column = fk.get("column", "")
                referenced_column = fk.get("referenced_column", "")
            else:
                referenced_table = fk.referenced_table
                fk_column = fk.column_name
                referenced_column = fk.referenced_column

            if referenced_table.lower() == to_table_name.lower():
                # Found a matching foreign key
                from_table_pk = self._get_table_primary_key(from_table_name)
                mapping = {
                    "start_node": f"{from_table_name}.{fk_column}",
                    "end_node": f"{to_table_name}.{referenced_column}",
                    "edge_type": llm_rel.name,
                    "from_pk": from_table_pk,
                }

                return {
                    "source_type": "table",
                    "constraint_name": llm_rel.name,
                    "from_table": from_table_name,
                    "to_table": to_table_name,
                    "mapping": mapping,
                }

        # Check reverse direction
        to_table_info = entity_tables.get(to_table_name, {})
        to_foreign_keys = to_table_info.get("foreign_keys", [])

        for fk in to_foreign_keys:
            if isinstance(fk, dict):
                referenced_table = fk.get("referenced_table", "")
                fk_column = fk.get("column", "")
                referenced_column = fk.get("referenced_column", "")
            else:
                referenced_table = fk.referenced_table
                fk_column = fk.column_name
                referenced_column = fk.referenced_column

            if referenced_table.lower() == from_table_name.lower():
                # Found a reverse foreign key - reverse direction
                to_table_pk = self._get_table_primary_key(to_table_name)
                mapping = {
                    "start_node": f"{to_table_name}.{fk_column}",
                    "end_node": f"{from_table_name}.{referenced_column}",
                    "edge_type": llm_rel.name,
                    "from_pk": to_table_pk,
                }

                return {
                    "source_type": "table",
                    "constraint_name": llm_rel.name,
                    "from_table": to_table_name,
                    "to_table": from_table_name,
                    "mapping": mapping,
                }

        # Try additional inference for problematic relationships
        return self._infer_problematic_relationships(llm_rel, entity_tables)
649
+
650
+ def _infer_problematic_relationships(self, llm_rel, entity_tables):
651
+ """Handle relationship patterns using generic inference strategies."""
652
+
653
+ # Try to infer relationship based on table name patterns
654
+ from_candidates = []
655
+ to_candidates = []
656
+
657
+ # Look for tables that could match the relationship nodes
658
+ for table_name in entity_tables.keys():
659
+ if self._tables_match_nodes(table_name, llm_rel.from_node):
660
+ from_candidates.append(table_name)
661
+ if self._tables_match_nodes(table_name, llm_rel.to_node):
662
+ to_candidates.append(table_name)
663
+
664
+ # If we can't find exact matches, try pattern inference
665
+ if not from_candidates and not to_candidates:
666
+ return {}
667
+
668
+ # Use the first matching candidates
669
+ from_table = from_candidates[0] if from_candidates else None
670
+ to_table = to_candidates[0] if to_candidates else None
671
+
672
+ if not from_table or not to_table:
673
+ logger.warning(
674
+ "Could not find complete table mapping for relationship %s",
675
+ llm_rel.name,
676
+ )
677
+ return {}
678
+
679
+ # Try to infer foreign key column based on common patterns
680
+ fk_column = self._infer_foreign_key_column(from_table, to_table, entity_tables)
681
+
682
+ # Get primary key for from_table
683
+ from_table_pk = self._get_table_primary_key(from_table)
684
+
685
+ mapping = {
686
+ "start_node": f"{from_table}.{fk_column}",
687
+ "end_node": f"{to_table}.{fk_column}",
688
+ "edge_type": llm_rel.name,
689
+ "from_pk": from_table_pk,
690
+ }
691
+
692
+ logger.info(
693
+ "Inferred relationship mapping for %s: %s.%s -> %s.%s",
694
+ llm_rel.name,
695
+ from_table,
696
+ fk_column,
697
+ to_table,
698
+ fk_column,
699
+ )
700
+
701
+ return {
702
+ "source_type": "table", # Use supported type for migration agent
703
+ "constraint_name": llm_rel.name,
704
+ "from_table": from_table,
705
+ "to_table": to_table,
706
+ "mapping": mapping,
707
+ "relationship_type": "one_to_many", # Default assumption
708
+ }
709
+
710
    def _infer_foreign_key_column(self, from_table, to_table, entity_tables):
        """Infer the most likely foreign key column name.

        Preference order: a declared FK from from_table to to_table, a
        column shared by both tables (favoring title_id/name_id/id),
        domain-specific title/name heuristics, any *_id column, the
        first column of from_table, and finally the literal "id".
        """

        # Check actual foreign keys first
        from_table_info = entity_tables.get(from_table, {})
        foreign_keys = from_table_info.get("foreign_keys", [])

        for fk in foreign_keys:
            if isinstance(fk, dict):
                referenced_table = fk.get("referenced_table", "")
                fk_column = fk.get("column", "")
            else:
                referenced_table = fk.referenced_table
                fk_column = fk.column_name

            if referenced_table.lower() == to_table.lower():
                return fk_column

        # Get actual column names from both tables
        from_columns = []
        to_columns = []

        # Extract column names from from_table (dict or object columns)
        from_table_columns = from_table_info.get("columns", [])
        for col in from_table_columns:
            if isinstance(col, dict):
                from_columns.append(col.get("name", ""))
            else:
                from_columns.append(getattr(col, "name", str(col)))

        # Extract column names from to_table
        to_table_info = entity_tables.get(to_table, {})
        to_table_columns = to_table_info.get("columns", [])
        for col in to_table_columns:
            if isinstance(col, dict):
                to_columns.append(col.get("name", ""))
            else:
                to_columns.append(getattr(col, "name", str(col)))

        # Find common columns between the two tables
        common_columns = set(from_columns) & set(to_columns)
        if common_columns:
            # Prefer ID-like columns
            for col in ["title_id", "name_id", "id"]:
                if col in common_columns:
                    return col
            # Return the first common column.
            # NOTE(review): set iteration order is arbitrary, so this
            # pick is nondeterministic when several columns are common.
            return list(common_columns)[0]

        # Try to infer based on table names
        # For title-related relationships, use title_id
        if "title" in from_table.lower() or "title" in to_table.lower():
            if "title_id" in from_columns:
                return "title_id"
            if "title_id" in to_columns:
                return "title_id"

        # For name/person-related relationships, use name_id
        if (
            "name" in from_table.lower()
            or "name" in to_table.lower()
            or "person" in from_table.lower()
            or "person" in to_table.lower()
        ):
            if "name_id" in from_columns:
                return "name_id"
            if "name_id" in to_columns:
                return "name_id"

        # Look for any ID-like column in from_table
        for col in from_columns:
            if col.lower().endswith("_id") or col.lower() == "id":
                return col

        # Look for any ID-like column in to_table
        for col in to_columns:
            if col.lower().endswith("_id") or col.lower() == "id":
                return col

        # Ultimate fallback - use first column from from_table
        if from_columns:
            return from_columns[0]

        # Last resort
        return "id"
795
+
796
+ def _tables_match_nodes(self, table_name: str, node_name: str) -> bool:
797
+ """Check if a table name matches a node name (flexible matching)."""
798
+ table_lower = table_name.lower()
799
+ node_lower = node_name.lower()
800
+
801
+ # Direct match
802
+ if table_lower == node_lower:
803
+ return True
804
+
805
+ # Singularized match (e.g., "titles" table -> "Title" node)
806
+ if table_lower.rstrip("s") == node_lower:
807
+ return True
808
+
809
+ # Pluralized match (e.g., "title" node -> "Titles" table)
810
+ if table_lower == node_lower + "s":
811
+ return True
812
+
813
+ # Handle underscores to camelcase
814
+ # (e.g., "alias_attributes" -> "AliasAttribute")
815
+ table_camel = "".join(word.capitalize() for word in table_lower.split("_"))
816
+ if table_camel.lower() == node_lower:
817
+ return True
818
+
819
+ # Handle trailing underscores (e.g., "names_" -> "Name")
820
+ if table_lower.rstrip("_") == node_lower:
821
+ return True
822
+ if table_lower.rstrip("_s") == node_lower:
823
+ return True
824
+
825
+ # Handle reverse: node with underscores to table
826
+ node_parts = node_lower.replace("_", " ").split()
827
+ if len(node_parts) > 1:
828
+ # Convert CamelCase to snake_case for comparison
829
+ import re
830
+
831
+ snake_case = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", node_name)
832
+ snake_case = re.sub("([a-z0-9])([A-Z])", r"\1_\2", snake_case).lower()
833
+ if table_lower == snake_case:
834
+ return True
835
+
836
+ # Handle specific patterns that are common across databases
837
+ specific_mappings = {
838
+ # Common pattern variations
839
+ "genres": ["genre"],
840
+ "ratings": ["rating"],
841
+ }
842
+
843
+ # Check if table has specific mapping to node
844
+ table_base = table_lower.split("_")[-1] # Get last part after underscore
845
+ if table_base in specific_mappings:
846
+ return node_lower in specific_mappings[table_base]
847
+
848
+ # Check reverse mapping (node to table)
849
+ for table_pattern, node_patterns in specific_mappings.items():
850
+ if node_lower in node_patterns:
851
+ # Check if table ends with this pattern
852
+ if table_lower.endswith(table_pattern) or table_lower.endswith(
853
+ table_pattern + "s"
854
+ ):
855
+ return True
856
+
857
+ # Handle prefix patterns like "title_" + concept
858
+ if "_" in table_lower:
859
+ table_parts = table_lower.split("_")
860
+ if len(table_parts) == 2:
861
+ prefix, suffix = table_parts
862
+ # Pattern: prefix_suffix -> Suffix (e.g., title_genres -> Genre)
863
+ if suffix.rstrip("s") == node_lower:
864
+ return True
865
+
866
+ return False
867
+
868
+ def _map_node_name_to_labels(self, node_name: str, llm_model) -> List[str]:
869
+ """Map LLM node name to the actual labels defined in the model."""
870
+ for llm_node in llm_model.nodes:
871
+ if llm_node.name.lower() == node_name.lower() or any(
872
+ label.lower() == node_name.lower() for label in llm_node.labels
873
+ ):
874
+ return llm_node.labels
875
+
876
+ # Fallback to the node name itself as label
877
+ return [node_name]
878
+
879
+ def _get_table_primary_key(self, table_name: str) -> str:
880
+ """Get the primary key column name for a table from database structure."""
881
+ if not hasattr(self, "_database_structure"):
882
+ return f"{table_name}_id" # Default fallback
883
+
884
+ # Check entity tables first
885
+ entity_tables = self._database_structure.get("entity_tables", {})
886
+ table_info = entity_tables.get(table_name)
887
+
888
+ if table_info:
889
+ # Get primary keys from table info
890
+ primary_keys = table_info.get("primary_keys", [])
891
+ if primary_keys:
892
+ return primary_keys[0] # Return first primary key
893
+
894
+ # Fallback: look in schema for primary key
895
+ schema = table_info.get("schema", [])
896
+ for col_info in schema:
897
+ if isinstance(col_info, dict):
898
+ if col_info.get("key") == "PRI":
899
+ return col_info.get("field", f"{table_name}_id")
900
+ elif hasattr(col_info, "is_primary_key") and col_info.is_primary_key:
901
+ return col_info.name
902
+
903
+ # Final fallback: conventional naming
904
+ return f"{table_name}_id"