structured2graph 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. __init__.py +47 -0
  2. core/__init__.py +23 -0
  3. core/hygm/__init__.py +74 -0
  4. core/hygm/hygm.py +2351 -0
  5. core/hygm/models/__init__.py +82 -0
  6. core/hygm/models/graph_models.py +667 -0
  7. core/hygm/models/llm_models.py +229 -0
  8. core/hygm/models/operations.py +176 -0
  9. core/hygm/models/sources.py +68 -0
  10. core/hygm/models/user_operations.py +139 -0
  11. core/hygm/strategies/__init__.py +17 -0
  12. core/hygm/strategies/base.py +36 -0
  13. core/hygm/strategies/deterministic.py +262 -0
  14. core/hygm/strategies/llm.py +904 -0
  15. core/hygm/validation/__init__.py +38 -0
  16. core/hygm/validation/base.py +194 -0
  17. core/hygm/validation/graph_schema_validator.py +687 -0
  18. core/hygm/validation/memgraph_data_validator.py +991 -0
  19. core/migration_agent.py +1369 -0
  20. core/schema/spec.json +155 -0
  21. core/utils/meta_graph.py +108 -0
  22. database/__init__.py +36 -0
  23. database/adapters/__init__.py +11 -0
  24. database/adapters/memgraph.py +318 -0
  25. database/adapters/mysql.py +311 -0
  26. database/adapters/postgresql.py +335 -0
  27. database/analyzer.py +396 -0
  28. database/factory.py +219 -0
  29. database/models.py +209 -0
  30. main.py +518 -0
  31. query_generation/__init__.py +20 -0
  32. query_generation/cypher_generator.py +129 -0
  33. query_generation/schema_utilities.py +88 -0
  34. structured2graph-0.1.1.dist-info/METADATA +197 -0
  35. structured2graph-0.1.1.dist-info/RECORD +41 -0
  36. structured2graph-0.1.1.dist-info/WHEEL +4 -0
  37. structured2graph-0.1.1.dist-info/entry_points.txt +2 -0
  38. structured2graph-0.1.1.dist-info/licenses/LICENSE +21 -0
  39. utils/__init__.py +57 -0
  40. utils/config.py +235 -0
  41. utils/environment.py +404 -0
@@ -0,0 +1,991 @@
1
+ """
2
+ Memgraph Data Validation Module.
3
+
4
+ This module provides comprehensive validation functionality for Memgraph
5
+ databases after migration, including schema validation (nodes, relationships,
6
+ indexes, constraints) and data count validation. It compares the expected
7
+ GraphModel specification with the actual Memgraph database state to ensure
8
+ migration success.
9
+ """
10
+
11
import logging
from typing import Any, Dict, List, Optional

from ..models.graph_models import GraphModel
from .base import (
    BaseValidator,
    ValidationCategory,
    ValidationResult,
    ValidationSeverity,
)
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class MemgraphDataValidator(BaseValidator):
    """
    Validates Memgraph data and schema against an expected GraphModel.

    Performs post-migration validation of a Memgraph database: schema
    checks (node labels and properties, relationship types, indexes,
    constraints) plus data-count verification, all compared against the
    expected GraphModel specification.
    """

    def __init__(self, memgraph_connection):
        """
        Initialize the validator.

        Args:
            memgraph_connection: Connection to the Memgraph database
                (either a MemgraphAdapter or a raw client connection).
        """
        super().__init__()
        self.connection = memgraph_connection
        # Data counts are cached so repeated lookups don't re-query.
        self._cached_data_counts = None
47
+
48
+ def validate(self, expected_model: GraphModel, **kwargs) -> ValidationResult:
49
+ """
50
+ Abstract method implementation for BaseValidator.
51
+
52
+ This is the main entry point for validation.
53
+ """
54
+ return self.validate_post_migration(expected_model)
55
+
56
+ def validate_post_migration(
57
+ self, expected_model: GraphModel, expected_data_counts: Dict[str, int] = None
58
+ ) -> ValidationResult:
59
+ """
60
+ Validate the migrated schema and data against expected model.
61
+
62
+ Args:
63
+ expected_model: GraphModel representing expected schema
64
+ expected_data_counts: Optional dict with expected node/relationship counts
65
+ Format: {"nodes": 12345, "relationships": 6789, "selected_tables": ["table1", "table2"]}
66
+
67
+ Returns:
68
+ ValidationResult with detailed comparison results
69
+ """
70
+ self.reset() # Reset state using parent method
71
+
72
+ try:
73
+ # Get actual schema from Memgraph
74
+ actual_schema = self._get_actual_schema()
75
+ nodes_count = len(actual_schema.get("nodes", []))
76
+ rels_count = len(actual_schema.get("relationships", []))
77
+ logger.info(
78
+ "Retrieved actual schema: %d nodes, %d relationships",
79
+ nodes_count,
80
+ rels_count,
81
+ )
82
+
83
+ # Convert expected model to comparable format
84
+ expected_schema = self._convert_model_to_schema_info(expected_model)
85
+ exp_nodes = len(expected_schema.get("nodes", []))
86
+ exp_rels = len(expected_schema.get("relationships", []))
87
+ logger.info(
88
+ "Expected schema: %d nodes, %d relationships", exp_nodes, exp_rels
89
+ )
90
+
91
+ # Perform schema validations
92
+ self._validate_node_labels(expected_schema, actual_schema)
93
+ self._validate_node_properties(expected_schema, actual_schema)
94
+ self._validate_relationships(expected_schema, actual_schema)
95
+ self._validate_indexes(expected_schema, actual_schema)
96
+ self._validate_constraints(expected_schema, actual_schema)
97
+
98
+ # Perform data count validation if expected counts provided
99
+ if expected_data_counts:
100
+ self._validate_data_counts(expected_data_counts)
101
+
102
+ # Calculate metrics using the base class structure
103
+ self._update_metrics(expected_schema, actual_schema, expected_data_counts)
104
+
105
+ # Generate results
106
+ critical_issues = [
107
+ issue
108
+ for issue in self.issues
109
+ if issue.severity == ValidationSeverity.CRITICAL
110
+ ]
111
+ success = not any(critical_issues)
112
+ summary = self._generate_summary()
113
+
114
+ return ValidationResult(
115
+ validation_type="memgraph_data_validation",
116
+ success=success,
117
+ summary=summary,
118
+ issues=self.issues,
119
+ metrics=self.metrics,
120
+ details={
121
+ "expected_schema": expected_schema,
122
+ "actual_schema": actual_schema,
123
+ "validation_score": self._calculate_validation_score(),
124
+ "data_counts": self._get_actual_data_counts()
125
+ if expected_data_counts
126
+ else None,
127
+ },
128
+ )
129
+
130
+ except Exception as e:
131
+ logger.error("Validation failed with error: %s", str(e))
132
+ self.add_issue(
133
+ ValidationSeverity.CRITICAL,
134
+ ValidationCategory.SCHEMA_MISMATCH,
135
+ f"Validation process failed: {str(e)}",
136
+ recommendation="Check database connection and schema access",
137
+ )
138
+
139
+ return ValidationResult(
140
+ validation_type="memgraph_data_validation",
141
+ success=False,
142
+ summary=f"Validation failed: {str(e)}",
143
+ issues=self.issues,
144
+ metrics=self.metrics,
145
+ )
146
+
147
+ def _get_actual_schema(self) -> Dict[str, Any]:
148
+ """
149
+ Get actual schema from Memgraph database.
150
+
151
+ Returns:
152
+ Structured schema information from Memgraph
153
+ """
154
+ # Handle different connection types
155
+ if hasattr(self.connection, "get_schema_info"):
156
+ # MemgraphAdapter interface
157
+ return self._parse_memgraph_adapter_schema(self.connection)
158
+ else:
159
+ # Raw connection - execute SHOW SCHEMA INFO
160
+ return self._parse_raw_connection_schema(self.connection)
161
+
162
+ def _parse_raw_connection_schema(self, connection) -> Dict[str, Any]:
163
+ """
164
+ Parse schema from raw Memgraph connection.
165
+
166
+ Args:
167
+ connection: Raw database connection (Memgraph client)
168
+
169
+ Returns:
170
+ Structured schema information
171
+ """
172
+ try:
173
+ # Check if it's a Memgraph client with .query() method
174
+ if hasattr(connection, "query"):
175
+ result = connection.query("SHOW SCHEMA INFO;")
176
+ # Convert query result to list to access records
177
+ records = list(result)
178
+
179
+ if records and "schema" in records[0]:
180
+ # Parse the JSON schema from the first record
181
+ import json
182
+
183
+ schema_json = records[0]["schema"]
184
+ schema_data = json.loads(schema_json)
185
+ return self._parse_memgraph_json_schema(schema_data)
186
+ else:
187
+ logger.warning("SHOW SCHEMA INFO returned unexpected format")
188
+ return {
189
+ "nodes": [],
190
+ "relationships": [],
191
+ "indexes": [],
192
+ "constraints": [],
193
+ }
194
+ else:
195
+ # Fallback for cursor-based connections
196
+ cursor = connection.cursor()
197
+ cursor.execute("SHOW SCHEMA INFO;")
198
+ result = cursor.fetchall()
199
+ return self._parse_schema_info_result(result)
200
+ except Exception as e:
201
+ logger.error(f"Failed to get schema from connection: {e}")
202
+ return {"nodes": [], "relationships": [], "indexes": [], "constraints": []}
203
+
204
+ def _parse_schema_info_result(self, schema_info_rows) -> Dict[str, Any]:
205
+ """
206
+ Parse the result of SHOW SCHEMA INFO query.
207
+
208
+ Args:
209
+ schema_info_rows: Raw result from SHOW SCHEMA INFO
210
+
211
+ Returns:
212
+ Structured schema information
213
+ """
214
+ nodes = {}
215
+ relationships = {}
216
+ indexes = []
217
+ constraints = []
218
+
219
+ for row in schema_info_rows:
220
+ if len(row) >= 3:
221
+ element_type = row[0] # "node" or "relationship"
222
+ element_name = row[1] # label or relationship type
223
+ properties = row[2] if len(row) > 2 else {}
224
+
225
+ if element_type == "node":
226
+ if element_name not in nodes:
227
+ # Handle both single labels and label combinations
228
+ if isinstance(element_name, str):
229
+ labels = [element_name]
230
+ else:
231
+ labels = element_name
232
+ nodes[element_name] = {"labels": labels, "properties": {}}
233
+ if isinstance(properties, dict):
234
+ nodes[element_name]["properties"].update(properties)
235
+
236
+ elif element_type == "relationship":
237
+ if element_name not in relationships:
238
+ relationships[element_name] = {
239
+ "type": element_name,
240
+ "properties": {},
241
+ }
242
+ if isinstance(properties, dict):
243
+ rel_props = relationships[element_name]["properties"]
244
+ rel_props.update(properties)
245
+
246
+ return {
247
+ "nodes": list(nodes.values()),
248
+ "relationships": list(relationships.values()),
249
+ "indexes": indexes,
250
+ "constraints": constraints,
251
+ }
252
+
253
+ def _parse_memgraph_adapter_schema(self, adapter) -> Dict[str, Any]:
254
+ """
255
+ Parse schema info from MemgraphAdapter.
256
+
257
+ Args:
258
+ adapter: MemgraphAdapter instance
259
+
260
+ Returns:
261
+ Structured schema information
262
+ """
263
+ try:
264
+ # Get schema info using adapter methods
265
+ schema_info_rows = adapter.get_schema_info()
266
+ parsed_schema = self._parse_schema_info_result(schema_info_rows)
267
+
268
+ # Get additional info
269
+ indexes = adapter.get_indexes()
270
+ constraints = adapter.get_constraints()
271
+
272
+ parsed_schema["indexes"] = indexes
273
+ parsed_schema["constraints"] = constraints
274
+
275
+ return parsed_schema
276
+
277
+ except Exception as e:
278
+ logger.error(f"Failed to parse MemgraphAdapter schema: {e}")
279
+ # Fallback to empty schema
280
+ return {"nodes": [], "relationships": [], "indexes": [], "constraints": []}
281
+
282
+ def _parse_memgraph_json_schema(
283
+ self, schema_data: Dict[str, Any]
284
+ ) -> Dict[str, Any]:
285
+ """
286
+ Parse Memgraph JSON schema format and extract data counts.
287
+
288
+ Args:
289
+ schema_data: JSON schema data from SHOW SCHEMA INFO
290
+
291
+ Returns:
292
+ Structured schema information with data counts
293
+ """
294
+ nodes = []
295
+ relationships = []
296
+ indexes = []
297
+ constraints = []
298
+
299
+ # Track data counts from schema info
300
+ total_nodes = 0
301
+ total_relationships = 0
302
+
303
+ # Parse nodes
304
+ for node_data in schema_data.get("nodes", []):
305
+ node_info = {"labels": node_data.get("labels", []), "properties": {}}
306
+
307
+ # Calculate node count from property types
308
+ node_count = 0
309
+ for prop in node_data.get("properties", []):
310
+ prop_name = prop.get("key", "")
311
+ prop_types = prop.get("types", [])
312
+ if prop_types:
313
+ # Get the primary type and its count
314
+ primary_type = prop_types[0].get("type", "String")
315
+ # Sum counts across all types for this property (excluding Null)
316
+ prop_count = sum(
317
+ type_def.get("count", 0)
318
+ for type_def in prop_types
319
+ if type_def.get("type", "") != "Null"
320
+ )
321
+ # Use the highest property count as the node count estimate
322
+ node_count = max(node_count, prop_count)
323
+ node_info["properties"][prop_name] = primary_type
324
+
325
+ # Store the node count in the node info
326
+ node_info["node_count"] = node_count
327
+ total_nodes += node_count
328
+ nodes.append(node_info)
329
+
330
+ # Parse relationships
331
+ for edge_data in schema_data.get("edges", []):
332
+ rel_info = {"type": edge_data.get("type", ""), "properties": {}}
333
+
334
+ # Calculate relationship count from property types
335
+ rel_count = 0
336
+ properties = edge_data.get("properties", [])
337
+
338
+ if properties:
339
+ # If relationship has properties, count from property types
340
+ for prop in properties:
341
+ prop_name = prop.get("key", "")
342
+ prop_types = prop.get("types", [])
343
+ if prop_types:
344
+ primary_type = prop_types[0].get("type", "String")
345
+ # Sum counts across all types for this property (excluding Null)
346
+ prop_count = sum(
347
+ type_def.get("count", 0)
348
+ for type_def in prop_types
349
+ if type_def.get("type", "") != "Null"
350
+ )
351
+ # Use the highest property count as the relationship count estimate
352
+ rel_count = max(rel_count, prop_count)
353
+ rel_info["properties"][prop_name] = primary_type
354
+ else:
355
+ # If relationship has no properties, we need to count differently
356
+ # For now, we'll mark it as unknown and use a fallback query later
357
+ rel_count = -1 # Mark as needs counting
358
+
359
+ # Store the relationship count
360
+ rel_info["relationship_count"] = rel_count
361
+ if rel_count > 0:
362
+ total_relationships += rel_count
363
+ relationships.append(rel_info)
364
+
365
+ # Parse indexes (unchanged)
366
+ for index_data in schema_data.get("node_indexes", []):
367
+ index_info = {
368
+ "type": "node",
369
+ "labels": index_data.get("labels", []),
370
+ "properties": index_data.get("properties", []),
371
+ "index_type": index_data.get("type", "label+properties"),
372
+ }
373
+ indexes.append(index_info)
374
+
375
+ # Parse constraints (unchanged)
376
+ for constraint_data in schema_data.get("node_constraints", []):
377
+ constraint_info = {
378
+ "type": "node",
379
+ "labels": constraint_data.get("labels", []),
380
+ "properties": constraint_data.get("properties", []),
381
+ "constraint_type": constraint_data.get("type", "unique"),
382
+ }
383
+ constraints.append(constraint_info)
384
+
385
+ return {
386
+ "nodes": nodes,
387
+ "relationships": relationships,
388
+ "indexes": indexes,
389
+ "constraints": constraints,
390
+ "data_counts": {
391
+ "total_nodes": total_nodes,
392
+ "total_relationships": total_relationships,
393
+ },
394
+ }
395
+
396
+ def _convert_model_to_schema_info(self, model: GraphModel) -> Dict[str, Any]:
397
+ """
398
+ Convert GraphModel to schema info format for comparison.
399
+
400
+ Args:
401
+ model: GraphModel to convert
402
+
403
+ Returns:
404
+ Schema info format compatible with Memgraph output
405
+ """
406
+ nodes = []
407
+ for node in model.nodes:
408
+ node_info = {"labels": node.labels, "properties": {}}
409
+
410
+ # Convert properties to simple format
411
+ for prop in node.properties:
412
+ if hasattr(prop, "key"):
413
+ # Determine expected type from GraphProperty types
414
+ prop_types = prop.types if hasattr(prop, "types") else []
415
+ primary_type = self._get_primary_type(prop_types)
416
+ node_info["properties"][prop.key] = primary_type
417
+ else:
418
+ # Handle string properties
419
+ node_info["properties"][str(prop)] = "String"
420
+
421
+ nodes.append(node_info)
422
+
423
+ relationships = []
424
+ for edge in model.edges:
425
+ rel_info = {"type": edge.edge_type, "properties": {}}
426
+
427
+ # Convert properties
428
+ for prop in edge.properties:
429
+ if hasattr(prop, "key"):
430
+ prop_types = prop.types if hasattr(prop, "types") else []
431
+ primary_type = self._get_primary_type(prop_types)
432
+ rel_info["properties"][prop.key] = primary_type
433
+ else:
434
+ rel_info["properties"][str(prop)] = "String"
435
+
436
+ relationships.append(rel_info)
437
+
438
+ # Convert indexes
439
+ indexes = []
440
+ for index in model.node_indexes:
441
+ index_info = {
442
+ "type": "node",
443
+ "labels": index.labels,
444
+ "properties": index.properties,
445
+ "index_type": index.type,
446
+ }
447
+ indexes.append(index_info)
448
+
449
+ for index in model.edge_indexes:
450
+ index_info = {
451
+ "type": "edge",
452
+ "edge_type": index.edge_type,
453
+ "properties": index.properties,
454
+ "index_type": index.type,
455
+ }
456
+ indexes.append(index_info)
457
+
458
+ # Convert constraints
459
+ constraints = []
460
+ for constraint in model.node_constraints:
461
+ constraint_info = {
462
+ "type": "node",
463
+ "labels": constraint.labels,
464
+ "properties": constraint.properties,
465
+ "constraint_type": constraint.type,
466
+ }
467
+ constraints.append(constraint_info)
468
+
469
+ for constraint in model.edge_constraints:
470
+ constraint_info = {
471
+ "type": "edge",
472
+ "edge_type": constraint.edge_type,
473
+ "properties": constraint.properties,
474
+ "constraint_type": constraint.type,
475
+ }
476
+ constraints.append(constraint_info)
477
+
478
+ return {
479
+ "nodes": nodes,
480
+ "relationships": relationships,
481
+ "indexes": indexes,
482
+ "constraints": constraints,
483
+ }
484
+
485
+ def _get_primary_type(self, type_list: List[Dict[str, Any]]) -> str:
486
+ """Get the primary type from a list of type definitions."""
487
+ if not type_list:
488
+ return "String"
489
+
490
+ # Find the type with highest count, excluding Null
491
+ max_count = 0
492
+ primary_type = "String"
493
+
494
+ for type_def in type_list:
495
+ if type_def.get("type", "") != "Null":
496
+ count = type_def.get("count", 0)
497
+ if count > max_count:
498
+ max_count = count
499
+ primary_type = type_def.get("type", "String")
500
+
501
+ return primary_type
502
+
503
+ def _validate_node_labels(self, expected: Dict[str, Any], actual: Dict[str, Any]):
504
+ """Validate that all expected node labels exist in Memgraph."""
505
+ expected_labels = {tuple(node["labels"]) for node in expected["nodes"]}
506
+ actual_labels = {tuple(node["labels"]) for node in actual["nodes"]}
507
+
508
+ # Check for missing labels
509
+ missing_labels = expected_labels - actual_labels
510
+ for labels in missing_labels:
511
+ self.add_issue(
512
+ ValidationSeverity.CRITICAL,
513
+ ValidationCategory.SCHEMA_MISMATCH,
514
+ f"Missing node labels in Memgraph: {list(labels)}",
515
+ expected=list(labels),
516
+ actual=None,
517
+ recommendation="Check migration script for node creation issues",
518
+ )
519
+
520
+ # Check for unexpected labels
521
+ extra_labels = actual_labels - expected_labels
522
+ for labels in extra_labels:
523
+ self.add_issue(
524
+ ValidationSeverity.WARNING,
525
+ ValidationCategory.SCHEMA_MISMATCH,
526
+ f"Unexpected node labels in Memgraph: {list(labels)}",
527
+ expected=None,
528
+ actual=list(labels),
529
+ recommendation="Verify if these labels were intentionally created",
530
+ )
531
+
532
+ def _validate_node_properties(
533
+ self, expected: Dict[str, Any], actual: Dict[str, Any]
534
+ ):
535
+ """Validate node properties match expected schema."""
536
+ # Create lookup dictionaries
537
+ expected_nodes = {tuple(node["labels"]): node for node in expected["nodes"]}
538
+ actual_nodes = {tuple(node["labels"]): node for node in actual["nodes"]}
539
+
540
+ for labels, expected_node in expected_nodes.items():
541
+ if labels not in actual_nodes:
542
+ continue # Already handled in label validation
543
+
544
+ actual_node = actual_nodes[labels]
545
+ expected_props = expected_node.get("properties", {})
546
+ actual_props = actual_node.get("properties", {})
547
+
548
+ # Check for missing properties
549
+ missing_props = set(expected_props.keys()) - set(actual_props.keys())
550
+ for prop in missing_props:
551
+ self.add_issue(
552
+ ValidationSeverity.CRITICAL,
553
+ ValidationCategory.SCHEMA_MISMATCH,
554
+ f"Missing property '{prop}' on node {list(labels)}",
555
+ expected=prop,
556
+ actual=None,
557
+ recommendation="Check property mapping in migration script",
558
+ )
559
+
560
+ # TODO: Re-enable property type mismatch validation once type
561
+ # mapping is stabilized
562
+ # Check for type mismatches
563
+ # for prop, expected_type in expected_props.items():
564
+ # if prop in actual_props:
565
+ # actual_type = actual_props[prop]
566
+ # if not self._types_compatible(expected_type, actual_type):
567
+ # self.add_issue(
568
+ # ValidationSeverity.WARNING,
569
+ # ValidationCategory.SCHEMA_MISMATCH,
570
+ # f"Property '{prop}' type mismatch on node {list(labels)}",
571
+ # expected=expected_type,
572
+ # actual=actual_type,
573
+ # recommendation="Verify data transformation logic",
574
+ # )
575
+
576
+ def _validate_relationships(self, expected: Dict[str, Any], actual: Dict[str, Any]):
577
+ """Validate relationship types and properties."""
578
+ expected_rels = {rel["type"]: rel for rel in expected["relationships"]}
579
+ actual_rels = {rel["type"]: rel for rel in actual["relationships"]}
580
+
581
+ # Check for missing relationship types
582
+ missing_rels = set(expected_rels.keys()) - set(actual_rels.keys())
583
+ for rel_type in missing_rels:
584
+ self.add_issue(
585
+ ValidationSeverity.CRITICAL,
586
+ ValidationCategory.SCHEMA_MISMATCH,
587
+ f"Missing relationship type: {rel_type}",
588
+ expected=rel_type,
589
+ actual=None,
590
+ recommendation="Check relationship creation in migration script",
591
+ )
592
+
593
+ # Check relationship properties
594
+ for rel_type, expected_rel in expected_rels.items():
595
+ if rel_type not in actual_rels:
596
+ continue
597
+
598
+ actual_rel = actual_rels[rel_type]
599
+ expected_props = expected_rel.get("properties", {})
600
+ actual_props = actual_rel.get("properties", {})
601
+
602
+ missing_props = set(expected_props.keys()) - set(actual_props.keys())
603
+ for prop in missing_props:
604
+ self.add_issue(
605
+ ValidationSeverity.WARNING,
606
+ ValidationCategory.SCHEMA_MISMATCH,
607
+ f"Missing property '{prop}' on relationship {rel_type}",
608
+ expected=prop,
609
+ actual=None,
610
+ recommendation="Check relationship property mapping",
611
+ )
612
+
613
+ def _validate_indexes(self, expected: Dict[str, Any], actual: Dict[str, Any]):
614
+ """Validate that expected indexes exist in Memgraph."""
615
+ expected_indexes = expected.get("indexes", [])
616
+ actual_indexes = actual.get("indexes", [])
617
+
618
+ # Create comparable representations
619
+ expected_index_keys = set()
620
+ for idx in expected_indexes:
621
+ if idx.get("type") == "node":
622
+ key = (
623
+ "node",
624
+ tuple(idx.get("labels", [])),
625
+ tuple(idx.get("properties", [])),
626
+ )
627
+ else:
628
+ key = (
629
+ "edge",
630
+ idx.get("edge_type", ""),
631
+ tuple(idx.get("properties", [])),
632
+ )
633
+ expected_index_keys.add(key)
634
+
635
+ actual_index_keys = set()
636
+ for idx in actual_indexes:
637
+ if idx.get("type") == "node":
638
+ key = (
639
+ "node",
640
+ tuple(idx.get("labels", [])),
641
+ tuple(idx.get("properties", [])),
642
+ )
643
+ else:
644
+ key = (
645
+ "edge",
646
+ idx.get("edge_type", ""),
647
+ tuple(idx.get("properties", [])),
648
+ )
649
+ actual_index_keys.add(key)
650
+
651
+ # Check for missing indexes
652
+ missing_indexes = expected_index_keys - actual_index_keys
653
+ for index_key in missing_indexes:
654
+ self.add_issue(
655
+ ValidationSeverity.WARNING,
656
+ ValidationCategory.PERFORMANCE,
657
+ f"Missing index: {index_key}",
658
+ expected=index_key,
659
+ actual=None,
660
+ recommendation="Consider creating missing indexes for performance",
661
+ )
662
+
663
+ def _validate_constraints(self, expected: Dict[str, Any], actual: Dict[str, Any]):
664
+ """Validate that expected constraints exist in Memgraph."""
665
+ expected_constraints = expected.get("constraints", [])
666
+ actual_constraints = actual.get("constraints", [])
667
+
668
+ # Create comparable representations
669
+ expected_constraint_keys = set()
670
+ for const in expected_constraints:
671
+ if const.get("type") == "node":
672
+ key = (
673
+ "node",
674
+ tuple(const.get("labels", [])),
675
+ tuple(const.get("properties", [])),
676
+ const.get("constraint_type"),
677
+ )
678
+ else:
679
+ key = (
680
+ "edge",
681
+ const.get("edge_type", ""),
682
+ tuple(const.get("properties", [])),
683
+ const.get("constraint_type"),
684
+ )
685
+ expected_constraint_keys.add(key)
686
+
687
+ actual_constraint_keys = set()
688
+ for const in actual_constraints:
689
+ if const.get("type") == "node":
690
+ key = (
691
+ "node",
692
+ tuple(const.get("labels", [])),
693
+ tuple(const.get("properties", [])),
694
+ const.get("constraint_type"),
695
+ )
696
+ else:
697
+ key = (
698
+ "edge",
699
+ const.get("edge_type", ""),
700
+ tuple(const.get("properties", [])),
701
+ const.get("constraint_type"),
702
+ )
703
+ actual_constraint_keys.add(key)
704
+
705
+ # Check for missing constraints
706
+ missing_constraints = expected_constraint_keys - actual_constraint_keys
707
+ for constraint_key in missing_constraints:
708
+ self.add_issue(
709
+ ValidationSeverity.CRITICAL,
710
+ ValidationCategory.DATA_INTEGRITY,
711
+ f"Missing constraint: {constraint_key}",
712
+ expected=constraint_key,
713
+ actual=None,
714
+ recommendation="Ensure data integrity by creating missing constraints",
715
+ )
716
+
717
+ def _validate_data_counts(self, expected_data_counts: Dict[str, int]):
718
+ """
719
+ Validate actual data counts against expected counts.
720
+
721
+ Args:
722
+ expected_data_counts: Expected counts with keys like "nodes", "relationships"
723
+ """
724
+ try:
725
+ actual_counts = self._get_actual_data_counts()
726
+
727
+ # Validate node count
728
+ if "nodes" in expected_data_counts:
729
+ expected_nodes = expected_data_counts["nodes"]
730
+ actual_nodes = actual_counts["nodes"]
731
+
732
+ if actual_nodes != expected_nodes:
733
+ severity = (
734
+ ValidationSeverity.CRITICAL
735
+ if abs(actual_nodes - expected_nodes) > expected_nodes * 0.1
736
+ else ValidationSeverity.WARNING
737
+ )
738
+ self.add_issue(
739
+ severity,
740
+ ValidationCategory.DATA_INTEGRITY,
741
+ f"Node count mismatch: expected {expected_nodes}, got {actual_nodes}",
742
+ expected=expected_nodes,
743
+ actual=actual_nodes,
744
+ recommendation="Check migration completeness and data source consistency",
745
+ )
746
+ else:
747
+ logger.info(f"✅ Node count validation passed: {actual_nodes} nodes")
748
+
749
+ # Validate relationship count
750
+ if "relationships" in expected_data_counts:
751
+ expected_rels = expected_data_counts["relationships"]
752
+ actual_rels = actual_counts["relationships"]
753
+
754
+ # Relationships can vary more due to optional FKs, so be more lenient
755
+ if actual_rels < expected_rels * 0.5:
756
+ self.add_issue(
757
+ ValidationSeverity.WARNING,
758
+ ValidationCategory.DATA_INTEGRITY,
759
+ f"Low relationship count: expected ~{expected_rels}, got {actual_rels}",
760
+ expected=expected_rels,
761
+ actual=actual_rels,
762
+ recommendation="Check foreign key constraints and data completeness",
763
+ )
764
+ else:
765
+ logger.info(
766
+ f"✅ Relationship count acceptable: {actual_rels} relationships"
767
+ )
768
+
769
+ except Exception as e:
770
+ logger.error(f"Error validating data counts: {e}")
771
+ self.add_issue(
772
+ ValidationSeverity.WARNING,
773
+ ValidationCategory.DATA_INTEGRITY,
774
+ f"Data count validation failed: {str(e)}",
775
+ recommendation="Check database connection and query permissions",
776
+ )
777
+
778
+ def _get_actual_data_counts(self) -> Dict[str, int]:
779
+ """
780
+ Get actual node and relationship counts from Memgraph schema info.
781
+
782
+ Returns:
783
+ Dictionary with "nodes" and "relationships" counts
784
+ """
785
+ # Return cached result if available
786
+ if self._cached_data_counts is not None:
787
+ return self._cached_data_counts
788
+
789
+ try:
790
+ # Use the already-retrieved schema info which contains counts
791
+ actual_schema = self._get_actual_schema()
792
+
793
+ # Extract counts from schema data
794
+ data_counts = actual_schema.get("data_counts", {})
795
+ if data_counts:
796
+ nodes = data_counts.get("total_nodes", 0)
797
+ relationships = data_counts.get("total_relationships", 0)
798
+
799
+ # Check if we need to count relationships with fallback query
800
+ # This happens when relationships don't have properties
801
+ if relationships == 0 and actual_schema.get("relationships"):
802
+ # Check if any relationships were marked as needing count
803
+ needs_counting = any(
804
+ rel.get("relationship_count", 0) == -1
805
+ for rel in actual_schema.get("relationships", [])
806
+ )
807
+
808
+ if needs_counting:
809
+ # Use fallback query for relationships
810
+ relationships = self._count_relationships_fallback()
811
+ logger.info("Used fallback query for relationship count")
812
+
813
+ result = {"nodes": nodes, "relationships": relationships}
814
+
815
+ # Cache the result
816
+ self._cached_data_counts = result
817
+ logger.info(
818
+ "Data counts from schema: %d nodes, %d rel", nodes, relationships
819
+ )
820
+ return result
821
+
822
+ # Fallback: if schema doesn't have counts, calculate from node lists
823
+ # This happens when using non-JSON schema format
824
+ nodes = len(actual_schema.get("nodes", []))
825
+ relationships = len(actual_schema.get("relationships", []))
826
+
827
+ result = {
828
+ "nodes": nodes, # This will be type count, not data count
829
+ "relationships": relationships,
830
+ }
831
+
832
+ # Cache the result
833
+ self._cached_data_counts = result
834
+ logger.warning(
835
+ "Schema info didn't contain data counts, " "using type counts instead"
836
+ )
837
+ logger.info(
838
+ "Schema-based counts: %d node types, %d rel types", nodes, relationships
839
+ )
840
+ return result
841
+
842
+ except (ValueError, KeyError, AttributeError) as e:
843
+ logger.error("Failed to get data counts from schema: %s", str(e))
844
+ result = {"nodes": 0, "relationships": 0}
845
+ self._cached_data_counts = result
846
+ return result
847
+
848
+ def _count_relationships_fallback(self) -> int:
849
+ """
850
+ Fallback method to count relationships using direct Cypher query.
851
+ Used when relationships don't have properties and can't be counted.
852
+
853
+ Returns:
854
+ Total number of relationships in the database
855
+ """
856
+ try:
857
+ if hasattr(self.connection, "query"):
858
+ # Direct connection with query method
859
+ query = "MATCH ()-[r]->() RETURN count(r) as rel_count"
860
+ rel_result = self.connection.query(query)
861
+ rel_count = rel_result[0]["rel_count"] if rel_result else 0
862
+ else:
863
+ # Cursor-based connection
864
+ cursor = self.connection.cursor()
865
+ cursor.execute("MATCH ()-[r]->() RETURN count(r) as rel_count")
866
+ rel_count = cursor.fetchone()[0]
867
+
868
+ logger.debug("Fallback relationship count: %d", rel_count)
869
+ return rel_count
870
+
871
+ except (ValueError, KeyError, AttributeError) as e:
872
+ logger.error("Fallback relationship counting failed: %s", str(e))
873
+ return 0
874
+
875
+ def _types_compatible(self, expected_type: str, actual_type: str) -> bool:
876
+ """Check if actual type is compatible with expected type."""
877
+ # Define type compatibility mappings
878
+ compatible_types = {
879
+ "String": ["String", "TEXT", "VARCHAR"],
880
+ "Integer": ["Integer", "INT", "BIGINT"],
881
+ "Float": ["Float", "DOUBLE", "DECIMAL"],
882
+ "Boolean": ["Boolean", "BOOL"],
883
+ "Date": ["Date", "DATE"],
884
+ "LocalDateTime": ["LocalDateTime", "DATETIME", "TIMESTAMP"],
885
+ "LocalTime": ["LocalTime", "TIME"],
886
+ }
887
+
888
+ expected_compatible = compatible_types.get(expected_type, [expected_type])
889
+ return actual_type in expected_compatible
890
+
891
+ def _update_metrics(
892
+ self,
893
+ expected: Dict[str, Any],
894
+ actual: Dict[str, Any],
895
+ expected_data_counts: Dict[str, int] = None,
896
+ ):
897
+ """Update validation metrics using the base ValidationMetrics."""
898
+ # Update basic counts
899
+ self.metrics.tables_total = len(expected.get("nodes", []))
900
+ self.metrics.tables_covered = len(actual.get("nodes", []))
901
+
902
+ # Relationships
903
+ self.metrics.relationships_total = len(expected.get("relationships", []))
904
+ self.metrics.relationships_covered = len(actual.get("relationships", []))
905
+
906
+ # Indexes
907
+ self.metrics.indexes_total = len(expected.get("indexes", []))
908
+ self.metrics.indexes_covered = len(actual.get("indexes", []))
909
+
910
+ # Constraints
911
+ self.metrics.constraints_total = len(expected.get("constraints", []))
912
+ self.metrics.constraints_covered = len(actual.get("constraints", []))
913
+
914
+ # Add data count metrics if available
915
+ if expected_data_counts:
916
+ actual_counts = self._get_actual_data_counts()
917
+ # Store additional metrics (can be accessed via metrics object)
918
+ self.metrics.data_nodes_expected = expected_data_counts.get("nodes", 0)
919
+ self.metrics.data_nodes_actual = actual_counts.get("nodes", 0)
920
+ self.metrics.data_relationships_expected = expected_data_counts.get(
921
+ "relationships", 0
922
+ )
923
+ self.metrics.data_relationships_actual = actual_counts.get(
924
+ "relationships", 0
925
+ )
926
+
927
+ # Calculate coverage percentage
928
+ self.metrics.calculate_coverage()
929
+
930
+ def _calculate_validation_score(self) -> float:
931
+ """Calculate a validation score (0-100)."""
932
+ if not self.issues:
933
+ return 100.0
934
+
935
+ # Weight different severities
936
+ critical_weight = 10
937
+ warning_weight = 3
938
+ info_weight = 1
939
+
940
+ total_penalty = sum(
941
+ critical_weight
942
+ if issue.severity == ValidationSeverity.CRITICAL
943
+ else warning_weight
944
+ if issue.severity == ValidationSeverity.WARNING
945
+ else info_weight
946
+ for issue in self.issues
947
+ )
948
+
949
+ # Calculate score (max penalty of 100)
950
+ max_penalty = 100
951
+ score = max(0, 100 - (total_penalty * 100 / max_penalty))
952
+ return round(score, 2)
953
+
954
+
955
def validate_memgraph_data(
    expected_model: GraphModel,
    memgraph_connection,
    # String annotation: a default of None requires an Optional type
    # (PEP 484); kept as a string so no new runtime import is needed.
    expected_data_counts: "Optional[Dict[str, int]]" = None,
    detailed_report: bool = True,
) -> ValidationResult:
    """
    Convenience function for post-migration Memgraph data validation.

    Args:
        expected_model: Expected GraphModel/spec.json
        memgraph_connection: Connection to Memgraph database
        expected_data_counts: Optional expected node/relationship counts
        detailed_report: Whether to include detailed issue information

    Returns:
        ValidationResult with comparison results
    """
    validator = MemgraphDataValidator(memgraph_connection)
    result = validator.validate_post_migration(expected_model, expected_data_counts)

    if detailed_report:
        logger.info("Validation Summary: %s", result.summary)
        validation_score = result.details.get("validation_score", 0)
        # %s preserves the two-decimal score produced by the validator
        # (e.g. 87.5/100); %d would silently truncate it to an integer.
        logger.info("Validation Score: %s/100", validation_score)

        for issue in result.issues:
            # Critical issues are logged as errors, everything else as
            # warnings, so severity is visible in log filtering.
            log_level = (
                logging.ERROR
                if issue.severity == ValidationSeverity.CRITICAL
                else logging.WARNING
            )
            logger.log(log_level, "%s: %s", issue.category, issue.message)
            if issue.recommendation:
                logger.info("  Recommendation: %s", issue.recommendation)

    return result