structured2graph 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __init__.py +47 -0
- core/__init__.py +23 -0
- core/hygm/__init__.py +74 -0
- core/hygm/hygm.py +2351 -0
- core/hygm/models/__init__.py +82 -0
- core/hygm/models/graph_models.py +667 -0
- core/hygm/models/llm_models.py +229 -0
- core/hygm/models/operations.py +176 -0
- core/hygm/models/sources.py +68 -0
- core/hygm/models/user_operations.py +139 -0
- core/hygm/strategies/__init__.py +17 -0
- core/hygm/strategies/base.py +36 -0
- core/hygm/strategies/deterministic.py +262 -0
- core/hygm/strategies/llm.py +904 -0
- core/hygm/validation/__init__.py +38 -0
- core/hygm/validation/base.py +194 -0
- core/hygm/validation/graph_schema_validator.py +687 -0
- core/hygm/validation/memgraph_data_validator.py +991 -0
- core/migration_agent.py +1369 -0
- core/schema/spec.json +155 -0
- core/utils/meta_graph.py +108 -0
- database/__init__.py +36 -0
- database/adapters/__init__.py +11 -0
- database/adapters/memgraph.py +318 -0
- database/adapters/mysql.py +311 -0
- database/adapters/postgresql.py +335 -0
- database/analyzer.py +396 -0
- database/factory.py +219 -0
- database/models.py +209 -0
- main.py +518 -0
- query_generation/__init__.py +20 -0
- query_generation/cypher_generator.py +129 -0
- query_generation/schema_utilities.py +88 -0
- structured2graph-0.1.1.dist-info/METADATA +197 -0
- structured2graph-0.1.1.dist-info/RECORD +41 -0
- structured2graph-0.1.1.dist-info/WHEEL +4 -0
- structured2graph-0.1.1.dist-info/entry_points.txt +2 -0
- structured2graph-0.1.1.dist-info/licenses/LICENSE +21 -0
- utils/__init__.py +57 -0
- utils/config.py +235 -0
- utils/environment.py +404 -0
|
@@ -0,0 +1,687 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Graph Schema Validator for graph models.
|
|
3
|
+
|
|
4
|
+
This module provides comprehensive validation of GraphModel objects against
|
|
5
|
+
the original database structure to ensure complete coverage and correctness
|
|
6
|
+
before migration begins. This is Type 1 validation in the two-tier system.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
from typing import Dict, Any, TYPE_CHECKING
|
|
11
|
+
from .base import (
|
|
12
|
+
BaseValidator,
|
|
13
|
+
ValidationResult,
|
|
14
|
+
ValidationSeverity,
|
|
15
|
+
ValidationCategory,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from ..models.graph_models import GraphModel
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class GraphSchemaValidator(BaseValidator):
|
|
25
|
+
"""
|
|
26
|
+
Validates GraphModel against original database structure.
|
|
27
|
+
|
|
28
|
+
This validator ensures that the GraphModel properly represents
|
|
29
|
+
all tables, properties, relationships, indexes, and constraints
|
|
30
|
+
from the source database before migration begins.
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
    def validate(
        self, graph_model: "GraphModel", database_structure: Dict[str, Any]
    ) -> ValidationResult:
        """
        Perform comprehensive graph schema validation.

        Runs in two phases: coverage validations (tables, properties,
        relationships, indexes, constraints) followed by quality
        validations (consistency, naming, performance). Any CRITICAL
        issue makes the overall result unsuccessful.

        Args:
            graph_model: GraphModel to validate
            database_structure: Original database structure from data_interface

        Returns:
            ValidationResult with detailed validation results
        """
        # Clear issues/metrics accumulated by any previous run.
        self.reset()

        logger.info("Starting graph schema validation...")

        try:
            # Basic validation checks: both inputs must be present.
            if not graph_model:
                self.add_issue(
                    ValidationSeverity.CRITICAL,
                    ValidationCategory.STRUCTURE,
                    "Graph model is not provided",
                    recommendation="Ensure graph model is created",
                )

            if not database_structure:
                self.add_issue(
                    ValidationSeverity.CRITICAL,
                    ValidationCategory.STRUCTURE,
                    "Database structure is not provided",
                    recommendation="Ensure database structure is extracted",
                )

            # If basic checks fail, return early — the detailed
            # validations below would only raise on missing inputs.
            critical_issues = [
                issue
                for issue in self.issues
                if issue.severity == ValidationSeverity.CRITICAL
            ]
            if critical_issues:
                return ValidationResult(
                    validation_type="graph_schema",
                    success=False,
                    summary="Validation failed: Missing required inputs",
                    issues=self.issues,
                    metrics=self.metrics,
                )

            # Core validations (coverage of source database elements)
            self._validate_table_coverage(graph_model, database_structure)
            self._validate_property_coverage(graph_model, database_structure)
            self._validate_relationship_coverage(graph_model, database_structure)
            self._validate_index_coverage(graph_model, database_structure)
            self._validate_constraint_coverage(graph_model, database_structure)

            # Quality validations (model-internal checks only)
            self._validate_schema_consistency(graph_model)
            self._validate_naming_conventions(graph_model)
            self._validate_performance_considerations(graph_model)

            # Calculate final metrics from the per-category counters
            # populated by the validations above.
            self.metrics.calculate_coverage()

            # Generate summary; success means no CRITICAL issues at all.
            summary = self._generate_summary()
            success = not any(
                issue.severity == ValidationSeverity.CRITICAL for issue in self.issues
            )

            result = ValidationResult(
                validation_type="graph_schema",
                success=success,
                summary=summary,
                issues=self.issues,
                metrics=self.metrics,
                details={
                    "database_structure_summary": self._get_db_summary(
                        database_structure
                    ),
                    "graph_model_summary": self._get_model_summary(graph_model),
                },
            )

            logger.info("Graph schema validation completed: %s", summary)
            return result

        except Exception as e:
            # Any unexpected failure is converted into a CRITICAL issue
            # and a failed result rather than propagating to the caller.
            logger.error("Graph schema validation failed: %s", str(e))
            self.add_issue(
                ValidationSeverity.CRITICAL,
                ValidationCategory.STRUCTURE,
                f"Validation process failed: {str(e)}",
                recommendation=("Check graph model and database structure format"),
            )

            return ValidationResult(
                validation_type="graph_schema",
                success=False,
                summary=f"Validation failed: {str(e)}",
                issues=self.issues,
                metrics=self.metrics,
            )
def _validate_table_coverage(self, graph_model, database_structure):
|
|
139
|
+
"""Validate that all entity tables are represented as nodes."""
|
|
140
|
+
logger.debug("Validating table coverage...")
|
|
141
|
+
|
|
142
|
+
# Handle both new structured format and legacy format
|
|
143
|
+
if hasattr(database_structure, "entity_tables"):
|
|
144
|
+
# New structured format - work directly with objects
|
|
145
|
+
entity_tables = database_structure.entity_tables
|
|
146
|
+
else:
|
|
147
|
+
# Legacy format fallback
|
|
148
|
+
entity_tables = database_structure.get("entity_tables", {})
|
|
149
|
+
|
|
150
|
+
self.metrics.tables_total = len(entity_tables)
|
|
151
|
+
|
|
152
|
+
if not entity_tables:
|
|
153
|
+
self.add_issue(
|
|
154
|
+
ValidationSeverity.WARNING,
|
|
155
|
+
ValidationCategory.COVERAGE,
|
|
156
|
+
"No entity tables found in database structure",
|
|
157
|
+
recommendation="Verify database structure extraction",
|
|
158
|
+
)
|
|
159
|
+
return
|
|
160
|
+
|
|
161
|
+
# Get tables covered by nodes
|
|
162
|
+
covered_tables = set()
|
|
163
|
+
for node in graph_model.nodes:
|
|
164
|
+
if node.source and hasattr(node.source, "name"):
|
|
165
|
+
covered_tables.add(node.source.name)
|
|
166
|
+
|
|
167
|
+
self.metrics.tables_covered = len(covered_tables)
|
|
168
|
+
|
|
169
|
+
# Check for missing tables
|
|
170
|
+
missing_tables = set(entity_tables.keys()) - covered_tables
|
|
171
|
+
if missing_tables:
|
|
172
|
+
# Get details about missing tables
|
|
173
|
+
missing_table_details = []
|
|
174
|
+
for table_name in sorted(missing_tables):
|
|
175
|
+
if hasattr(database_structure, "entity_tables"):
|
|
176
|
+
table_info = entity_tables[table_name]
|
|
177
|
+
column_count = len(table_info.columns)
|
|
178
|
+
pk_count = len(table_info.primary_keys)
|
|
179
|
+
fk_count = len(table_info.foreign_keys)
|
|
180
|
+
else:
|
|
181
|
+
table_info = entity_tables[table_name]
|
|
182
|
+
column_count = len(table_info.get("schema", []))
|
|
183
|
+
pk_count = len(table_info.get("primary_keys", []))
|
|
184
|
+
fk_count = len(table_info.get("foreign_keys", []))
|
|
185
|
+
|
|
186
|
+
missing_table_details.append(
|
|
187
|
+
f"'{table_name}' ({column_count} cols, {pk_count} PKs, "
|
|
188
|
+
f"{fk_count} FKs)"
|
|
189
|
+
)
|
|
190
|
+
|
|
191
|
+
details_str = "; ".join(missing_table_details)
|
|
192
|
+
|
|
193
|
+
self.add_issue(
|
|
194
|
+
ValidationSeverity.CRITICAL,
|
|
195
|
+
ValidationCategory.COVERAGE,
|
|
196
|
+
(
|
|
197
|
+
f"Missing {len(missing_tables)} entity tables in graph "
|
|
198
|
+
f"model: {details_str}. These tables should be "
|
|
199
|
+
f"represented as nodes in the graph model."
|
|
200
|
+
),
|
|
201
|
+
expected=list(entity_tables.keys()),
|
|
202
|
+
actual=list(covered_tables),
|
|
203
|
+
recommendation=(
|
|
204
|
+
"Create nodes for each missing table. For example:\n"
|
|
205
|
+
+ "\n".join(
|
|
206
|
+
[
|
|
207
|
+
f" - Add '{table}' node with appropriate labels"
|
|
208
|
+
for table in sorted(missing_tables)
|
|
209
|
+
]
|
|
210
|
+
)
|
|
211
|
+
),
|
|
212
|
+
details={"missing_tables": list(missing_tables)},
|
|
213
|
+
)
|
|
214
|
+
|
|
215
|
+
logger.debug(
|
|
216
|
+
"Table coverage: %d/%d tables covered",
|
|
217
|
+
self.metrics.tables_covered,
|
|
218
|
+
self.metrics.tables_total,
|
|
219
|
+
)
|
|
220
|
+
|
|
221
|
+
def _validate_property_coverage(self, graph_model, database_structure):
|
|
222
|
+
"""Validate that all table columns are represented as properties."""
|
|
223
|
+
logger.debug("Validating property coverage...")
|
|
224
|
+
|
|
225
|
+
# Handle both new structured format and legacy format
|
|
226
|
+
if hasattr(database_structure, "entity_tables"):
|
|
227
|
+
# New structured format - work directly with objects
|
|
228
|
+
entity_tables = database_structure.entity_tables
|
|
229
|
+
foreign_key_columns = self._get_foreign_key_columns(database_structure)
|
|
230
|
+
|
|
231
|
+
total_properties = 0
|
|
232
|
+
covered_properties = 0
|
|
233
|
+
missing_by_table = {}
|
|
234
|
+
|
|
235
|
+
for table_name, table_info in entity_tables.items():
|
|
236
|
+
# Get column names directly from ColumnInfo objects
|
|
237
|
+
table_columns = {col.name for col in table_info.columns}
|
|
238
|
+
total_properties += len(table_columns)
|
|
239
|
+
|
|
240
|
+
# Find corresponding node
|
|
241
|
+
node = self._find_node_for_table(graph_model, table_name)
|
|
242
|
+
if not node:
|
|
243
|
+
missing_by_table[table_name] = list(table_columns)
|
|
244
|
+
continue
|
|
245
|
+
|
|
246
|
+
# Check property coverage for this node
|
|
247
|
+
node_properties = {prop.key for prop in node.properties}
|
|
248
|
+
missing_props = table_columns - node_properties
|
|
249
|
+
|
|
250
|
+
# Separate foreign key columns from regular missing properties
|
|
251
|
+
table_foreign_keys = foreign_key_columns.get(table_name, set())
|
|
252
|
+
missing_foreign_keys = missing_props & table_foreign_keys
|
|
253
|
+
missing_regular_props = missing_props - table_foreign_keys
|
|
254
|
+
|
|
255
|
+
# Count coverage: covered + foreign keys that became relationships
|
|
256
|
+
covered_properties += len(table_columns) - len(missing_regular_props)
|
|
257
|
+
|
|
258
|
+
# Only report regular properties as missing (not foreign keys)
|
|
259
|
+
if missing_regular_props:
|
|
260
|
+
missing_by_table[table_name] = list(missing_regular_props)
|
|
261
|
+
|
|
262
|
+
# Log foreign keys that became relationships (for debugging)
|
|
263
|
+
if missing_foreign_keys:
|
|
264
|
+
logger.debug(
|
|
265
|
+
"Table %s: %d foreign key columns became relationships: %s",
|
|
266
|
+
table_name,
|
|
267
|
+
len(missing_foreign_keys),
|
|
268
|
+
missing_foreign_keys,
|
|
269
|
+
)
|
|
270
|
+
else:
|
|
271
|
+
# Legacy format fallback
|
|
272
|
+
entity_tables = database_structure.get("entity_tables", {})
|
|
273
|
+
foreign_key_columns = self._get_foreign_key_columns(database_structure)
|
|
274
|
+
|
|
275
|
+
total_properties = 0
|
|
276
|
+
covered_properties = 0
|
|
277
|
+
missing_by_table = {}
|
|
278
|
+
|
|
279
|
+
for table_name, table_info in entity_tables.items():
|
|
280
|
+
table_columns = {col["field"] for col in table_info.get("schema", [])}
|
|
281
|
+
total_properties += len(table_columns)
|
|
282
|
+
|
|
283
|
+
# Find corresponding node
|
|
284
|
+
node = self._find_node_for_table(graph_model, table_name)
|
|
285
|
+
if not node:
|
|
286
|
+
missing_by_table[table_name] = list(table_columns)
|
|
287
|
+
continue
|
|
288
|
+
|
|
289
|
+
# Check property coverage for this node
|
|
290
|
+
node_properties = {prop.key for prop in node.properties}
|
|
291
|
+
missing_props = table_columns - node_properties
|
|
292
|
+
|
|
293
|
+
# Separate foreign key columns from regular missing properties
|
|
294
|
+
table_foreign_keys = foreign_key_columns.get(table_name, set())
|
|
295
|
+
missing_foreign_keys = missing_props & table_foreign_keys
|
|
296
|
+
missing_regular_props = missing_props - table_foreign_keys
|
|
297
|
+
|
|
298
|
+
# Count coverage: covered + foreign keys that became relationships
|
|
299
|
+
covered_properties += len(table_columns) - len(missing_regular_props)
|
|
300
|
+
|
|
301
|
+
# Only report regular properties as missing (not foreign keys)
|
|
302
|
+
if missing_regular_props:
|
|
303
|
+
missing_by_table[table_name] = list(missing_regular_props)
|
|
304
|
+
|
|
305
|
+
# Log foreign keys that became relationships (for debugging)
|
|
306
|
+
if missing_foreign_keys:
|
|
307
|
+
logger.debug(
|
|
308
|
+
"Table %s: %d foreign key columns became relationships: %s",
|
|
309
|
+
table_name,
|
|
310
|
+
len(missing_foreign_keys),
|
|
311
|
+
missing_foreign_keys,
|
|
312
|
+
)
|
|
313
|
+
|
|
314
|
+
self.metrics.properties_total = total_properties
|
|
315
|
+
self.metrics.properties_covered = covered_properties
|
|
316
|
+
|
|
317
|
+
if missing_by_table:
|
|
318
|
+
total_missing = sum(len(props) for props in missing_by_table.values())
|
|
319
|
+
|
|
320
|
+
# Create detailed message about missing properties
|
|
321
|
+
missing_details = []
|
|
322
|
+
for table_name, missing_props in missing_by_table.items():
|
|
323
|
+
props_str = ", ".join(sorted(missing_props))
|
|
324
|
+
missing_details.append(f"Table '{table_name}': {props_str}")
|
|
325
|
+
|
|
326
|
+
details_message = "; ".join(missing_details)
|
|
327
|
+
|
|
328
|
+
self.add_issue(
|
|
329
|
+
ValidationSeverity.CRITICAL,
|
|
330
|
+
ValidationCategory.COVERAGE,
|
|
331
|
+
(
|
|
332
|
+
f"Missing {total_missing} non-foreign-key properties "
|
|
333
|
+
f"across {len(missing_by_table)} tables. Details: "
|
|
334
|
+
f"{details_message}. These properties may contain "
|
|
335
|
+
"important data that should be preserved in "
|
|
336
|
+
"the graph model."
|
|
337
|
+
),
|
|
338
|
+
expected=f"{total_properties} properties",
|
|
339
|
+
actual=f"{covered_properties} properties",
|
|
340
|
+
recommendation=(
|
|
341
|
+
"Add missing properties to corresponding nodes:\n"
|
|
342
|
+
+ "\n".join(
|
|
343
|
+
[
|
|
344
|
+
f" - Add to '{table}' node: {', '.join(props)}"
|
|
345
|
+
for table, props in missing_by_table.items()
|
|
346
|
+
]
|
|
347
|
+
)
|
|
348
|
+
+ "\nNote: Foreign key columns are correctly modeled "
|
|
349
|
+
"as relationships, not properties."
|
|
350
|
+
),
|
|
351
|
+
details={"missing_by_table": missing_by_table},
|
|
352
|
+
)
|
|
353
|
+
|
|
354
|
+
logger.debug(
|
|
355
|
+
"Property coverage: %d/%d properties covered (including foreign keys as relationships)",
|
|
356
|
+
covered_properties,
|
|
357
|
+
total_properties,
|
|
358
|
+
)
|
|
359
|
+
|
|
360
|
+
def _validate_relationship_coverage(self, graph_model, database_structure):
|
|
361
|
+
"""Validate that foreign keys are represented as relationships."""
|
|
362
|
+
logger.debug("Validating relationship coverage...")
|
|
363
|
+
|
|
364
|
+
relationships = database_structure.get("relationships", [])
|
|
365
|
+
self.metrics.relationships_total = len(relationships)
|
|
366
|
+
|
|
367
|
+
if not relationships:
|
|
368
|
+
self.add_issue(
|
|
369
|
+
ValidationSeverity.INFO,
|
|
370
|
+
ValidationCategory.COVERAGE,
|
|
371
|
+
"No relationships found in database structure",
|
|
372
|
+
recommendation="Verify foreign key extraction is correct",
|
|
373
|
+
)
|
|
374
|
+
return
|
|
375
|
+
|
|
376
|
+
# Simple coverage check - count modeled vs database relationships
|
|
377
|
+
modeled_count = len(graph_model.edges)
|
|
378
|
+
self.metrics.relationships_covered = min(modeled_count, len(relationships))
|
|
379
|
+
|
|
380
|
+
if modeled_count < len(relationships):
|
|
381
|
+
# Get details about database relationships
|
|
382
|
+
db_relationship_details = []
|
|
383
|
+
for rel in relationships[:5]: # Show first 5 for brevity
|
|
384
|
+
if isinstance(rel, dict):
|
|
385
|
+
from_table = rel.get("from_table", "unknown")
|
|
386
|
+
to_table = rel.get("to_table", "unknown")
|
|
387
|
+
column = rel.get("column", "unknown")
|
|
388
|
+
db_relationship_details.append(
|
|
389
|
+
f"{from_table}.{column} -> {to_table}"
|
|
390
|
+
)
|
|
391
|
+
else:
|
|
392
|
+
# Handle object format if needed
|
|
393
|
+
db_relationship_details.append(str(rel))
|
|
394
|
+
|
|
395
|
+
missing_count = len(relationships) - modeled_count
|
|
396
|
+
details_str = "; ".join(db_relationship_details)
|
|
397
|
+
if len(relationships) > 5:
|
|
398
|
+
details_str += f" (and {len(relationships) - 5} more)"
|
|
399
|
+
|
|
400
|
+
self.add_issue(
|
|
401
|
+
ValidationSeverity.WARNING,
|
|
402
|
+
ValidationCategory.COVERAGE,
|
|
403
|
+
(
|
|
404
|
+
f"Fewer relationships modeled ({modeled_count}) than in "
|
|
405
|
+
f"database ({len(relationships)}). Missing "
|
|
406
|
+
f"{missing_count} potential relationships. Database "
|
|
407
|
+
f"relationships include: {details_str}. These foreign key "
|
|
408
|
+
"relationships may represent important "
|
|
409
|
+
"connections that should be modeled as graph "
|
|
410
|
+
"relationships."
|
|
411
|
+
),
|
|
412
|
+
expected=f"{len(relationships)} relationships",
|
|
413
|
+
actual=f"{modeled_count} relationships",
|
|
414
|
+
recommendation=(
|
|
415
|
+
"Review database foreign keys and consider adding "
|
|
416
|
+
"missing relationships."
|
|
417
|
+
),
|
|
418
|
+
)
|
|
419
|
+
|
|
420
|
+
logger.debug(
|
|
421
|
+
"Relationship coverage: %d/%d relationships covered",
|
|
422
|
+
self.metrics.relationships_covered,
|
|
423
|
+
len(relationships),
|
|
424
|
+
)
|
|
425
|
+
|
|
426
|
+
def _validate_index_coverage(self, graph_model, database_structure):
|
|
427
|
+
"""Validate that important indexes are planned."""
|
|
428
|
+
logger.debug("Validating index coverage...")
|
|
429
|
+
|
|
430
|
+
# Get planned indexes from model
|
|
431
|
+
planned_indexes = len(graph_model.node_indexes) + len(graph_model.edge_indexes)
|
|
432
|
+
self.metrics.indexes_covered = planned_indexes
|
|
433
|
+
|
|
434
|
+
# Handle both new structured format and legacy format
|
|
435
|
+
if hasattr(database_structure, "entity_tables"):
|
|
436
|
+
# New structured format - work directly with objects
|
|
437
|
+
db_indexes_count = sum(
|
|
438
|
+
len(table_info.indexes)
|
|
439
|
+
for table_info in database_structure.entity_tables.values()
|
|
440
|
+
)
|
|
441
|
+
else:
|
|
442
|
+
# Legacy format fallback
|
|
443
|
+
entity_tables = database_structure.get("entity_tables", {})
|
|
444
|
+
db_indexes_count = 0
|
|
445
|
+
for table_info in entity_tables.values():
|
|
446
|
+
db_indexes_count += len(table_info.get("indexes", []))
|
|
447
|
+
|
|
448
|
+
self.metrics.indexes_total = db_indexes_count
|
|
449
|
+
|
|
450
|
+
if db_indexes_count > 0 and planned_indexes == 0:
|
|
451
|
+
self.add_issue(
|
|
452
|
+
ValidationSeverity.WARNING,
|
|
453
|
+
ValidationCategory.PERFORMANCE,
|
|
454
|
+
f"No indexes planned, but {db_indexes_count} exist in source database",
|
|
455
|
+
expected="Indexes planned for performance",
|
|
456
|
+
actual="No indexes planned",
|
|
457
|
+
recommendation="Consider adding indexes for frequently queried properties",
|
|
458
|
+
)
|
|
459
|
+
|
|
460
|
+
logger.debug(
|
|
461
|
+
"Index planning: %d indexes planned vs %d in source database",
|
|
462
|
+
planned_indexes,
|
|
463
|
+
db_indexes_count,
|
|
464
|
+
)
|
|
465
|
+
|
|
466
|
+
def _validate_constraint_coverage(self, graph_model, database_structure):
|
|
467
|
+
"""Validate that database constraints are represented."""
|
|
468
|
+
logger.debug("Validating constraint coverage...")
|
|
469
|
+
|
|
470
|
+
# Get planned constraints from model
|
|
471
|
+
planned_constraints = len(graph_model.node_constraints) + len(
|
|
472
|
+
graph_model.edge_constraints
|
|
473
|
+
)
|
|
474
|
+
self.metrics.constraints_covered = planned_constraints
|
|
475
|
+
|
|
476
|
+
# Handle both new structured format and legacy format
|
|
477
|
+
if hasattr(database_structure, "entity_tables"):
|
|
478
|
+
# New structured format - work directly with objects
|
|
479
|
+
db_constraints_count = 0
|
|
480
|
+
for table_info in database_structure.entity_tables.values():
|
|
481
|
+
db_constraints_count += len(table_info.primary_keys)
|
|
482
|
+
db_constraints_count += len(table_info.foreign_keys)
|
|
483
|
+
else:
|
|
484
|
+
# Legacy format fallback
|
|
485
|
+
entity_tables = database_structure.get("entity_tables", {})
|
|
486
|
+
db_constraints_count = 0
|
|
487
|
+
for table_info in entity_tables.values():
|
|
488
|
+
db_constraints_count += len(table_info.get("primary_keys", []))
|
|
489
|
+
db_constraints_count += len(table_info.get("foreign_keys", []))
|
|
490
|
+
|
|
491
|
+
self.metrics.constraints_total = db_constraints_count
|
|
492
|
+
|
|
493
|
+
if db_constraints_count > 0 and planned_constraints == 0:
|
|
494
|
+
self.add_issue(
|
|
495
|
+
ValidationSeverity.WARNING,
|
|
496
|
+
ValidationCategory.CONSISTENCY,
|
|
497
|
+
f"No constraints planned, but {db_constraints_count} "
|
|
498
|
+
"exist in source",
|
|
499
|
+
expected="Constraints planned for data integrity",
|
|
500
|
+
actual="No constraints planned",
|
|
501
|
+
recommendation="Consider adding constraints for data " "integrity",
|
|
502
|
+
)
|
|
503
|
+
|
|
504
|
+
logger.debug(
|
|
505
|
+
"Constraint planning: %d constraints planned vs %d in source",
|
|
506
|
+
planned_constraints,
|
|
507
|
+
db_constraints_count,
|
|
508
|
+
)
|
|
509
|
+
|
|
510
|
+
def _validate_schema_consistency(self, graph_model):
|
|
511
|
+
"""Validate internal consistency of the graph model."""
|
|
512
|
+
logger.debug("Validating schema consistency...")
|
|
513
|
+
|
|
514
|
+
# Check for duplicate node labels
|
|
515
|
+
seen_labels = set()
|
|
516
|
+
for node in graph_model.nodes:
|
|
517
|
+
for label in node.labels:
|
|
518
|
+
if label in seen_labels:
|
|
519
|
+
self.add_issue(
|
|
520
|
+
ValidationSeverity.WARNING,
|
|
521
|
+
ValidationCategory.CONSISTENCY,
|
|
522
|
+
f"Duplicate node label '{label}' found",
|
|
523
|
+
recommendation="Ensure node labels are unique",
|
|
524
|
+
)
|
|
525
|
+
seen_labels.add(label)
|
|
526
|
+
|
|
527
|
+
# Check for orphaned relationships
|
|
528
|
+
node_labels = set()
|
|
529
|
+
for node in graph_model.nodes:
|
|
530
|
+
node_labels.update(node.labels)
|
|
531
|
+
|
|
532
|
+
for edge in graph_model.edges:
|
|
533
|
+
# Check start node labels
|
|
534
|
+
missing_start = set(edge.start_node_labels) - node_labels
|
|
535
|
+
if missing_start:
|
|
536
|
+
self.add_issue(
|
|
537
|
+
ValidationSeverity.CRITICAL,
|
|
538
|
+
ValidationCategory.CONSISTENCY,
|
|
539
|
+
f"Relationship '{edge.edge_type}' references missing "
|
|
540
|
+
f"start node labels: {missing_start}",
|
|
541
|
+
recommendation="Ensure all relationship endpoints "
|
|
542
|
+
"reference existing node labels",
|
|
543
|
+
)
|
|
544
|
+
|
|
545
|
+
# Check end node labels
|
|
546
|
+
missing_end = set(edge.end_node_labels) - node_labels
|
|
547
|
+
if missing_end:
|
|
548
|
+
self.add_issue(
|
|
549
|
+
ValidationSeverity.CRITICAL,
|
|
550
|
+
ValidationCategory.CONSISTENCY,
|
|
551
|
+
f"Relationship '{edge.edge_type}' references missing "
|
|
552
|
+
f"end node labels: {missing_end}",
|
|
553
|
+
recommendation="Ensure all relationship endpoints "
|
|
554
|
+
"reference existing node labels",
|
|
555
|
+
)
|
|
556
|
+
|
|
557
|
+
def _validate_naming_conventions(self, graph_model):
|
|
558
|
+
"""Validate naming conventions and best practices."""
|
|
559
|
+
logger.debug("Validating naming conventions...")
|
|
560
|
+
|
|
561
|
+
# Check node label conventions
|
|
562
|
+
for node in graph_model.nodes:
|
|
563
|
+
for label in node.labels:
|
|
564
|
+
if not label[0].isupper():
|
|
565
|
+
self.add_issue(
|
|
566
|
+
ValidationSeverity.INFO,
|
|
567
|
+
ValidationCategory.CONSISTENCY,
|
|
568
|
+
f"Node label '{label}' should start with uppercase",
|
|
569
|
+
recommendation="Use PascalCase for node labels",
|
|
570
|
+
)
|
|
571
|
+
|
|
572
|
+
# Check relationship type conventions
|
|
573
|
+
for edge in graph_model.edges:
|
|
574
|
+
if not edge.edge_type.isupper():
|
|
575
|
+
self.add_issue(
|
|
576
|
+
ValidationSeverity.INFO,
|
|
577
|
+
ValidationCategory.CONSISTENCY,
|
|
578
|
+
f"Relationship type '{edge.edge_type}' should be " "uppercase",
|
|
579
|
+
recommendation="Use UPPER_CASE for relationship types",
|
|
580
|
+
)
|
|
581
|
+
|
|
582
|
+
def _validate_performance_considerations(self, graph_model):
|
|
583
|
+
"""Validate performance-related aspects."""
|
|
584
|
+
logger.debug("Validating performance considerations...")
|
|
585
|
+
|
|
586
|
+
# Check for nodes without indexes on key properties
|
|
587
|
+
for node in graph_model.nodes:
|
|
588
|
+
key_properties = [
|
|
589
|
+
prop.key
|
|
590
|
+
for prop in node.properties
|
|
591
|
+
if "id" in prop.key.lower() or "key" in prop.key.lower()
|
|
592
|
+
]
|
|
593
|
+
|
|
594
|
+
if key_properties:
|
|
595
|
+
# Check if any of these have planned indexes
|
|
596
|
+
has_index = any(
|
|
597
|
+
set(key_properties) & set(index.properties)
|
|
598
|
+
for index in graph_model.node_indexes
|
|
599
|
+
if index.labels
|
|
600
|
+
and any(label in node.labels for label in index.labels)
|
|
601
|
+
)
|
|
602
|
+
|
|
603
|
+
if not has_index:
|
|
604
|
+
node_label = "/".join(node.labels)
|
|
605
|
+
key_props = ", ".join(key_properties)
|
|
606
|
+
self.add_issue(
|
|
607
|
+
ValidationSeverity.INFO,
|
|
608
|
+
ValidationCategory.PERFORMANCE,
|
|
609
|
+
f"Node {node_label} has key properties without " "indexes",
|
|
610
|
+
recommendation=f"Consider adding indexes for: " f"{key_props}",
|
|
611
|
+
)
|
|
612
|
+
|
|
613
|
+
# Helper methods
|
|
614
|
+
|
|
615
|
+
def _get_foreign_key_columns(self, database_structure):
|
|
616
|
+
"""
|
|
617
|
+
Extract foreign key columns from database structure.
|
|
618
|
+
|
|
619
|
+
Returns a dict mapping table_name -> set of foreign key column names.
|
|
620
|
+
"""
|
|
621
|
+
foreign_key_columns = {}
|
|
622
|
+
|
|
623
|
+
# Check if we have the new DatabaseStructure model
|
|
624
|
+
if hasattr(database_structure, "entity_tables"):
|
|
625
|
+
# New structured format - work directly with objects
|
|
626
|
+
for table_name, table_info in database_structure.entity_tables.items():
|
|
627
|
+
fk_columns = {fk.column_name for fk in table_info.foreign_keys}
|
|
628
|
+
if fk_columns:
|
|
629
|
+
foreign_key_columns[table_name] = fk_columns
|
|
630
|
+
else:
|
|
631
|
+
# Legacy format fallback
|
|
632
|
+
entity_tables = database_structure.get("entity_tables", {})
|
|
633
|
+
for table_name, table_info in entity_tables.items():
|
|
634
|
+
foreign_keys = table_info.get("foreign_keys", [])
|
|
635
|
+
fk_columns = set()
|
|
636
|
+
|
|
637
|
+
for fk in foreign_keys:
|
|
638
|
+
if isinstance(fk, dict):
|
|
639
|
+
# Handle dict format
|
|
640
|
+
if "column" in fk:
|
|
641
|
+
fk_columns.add(fk["column"])
|
|
642
|
+
elif "column_name" in fk:
|
|
643
|
+
fk_columns.add(fk["column_name"])
|
|
644
|
+
else:
|
|
645
|
+
# Handle object format
|
|
646
|
+
if hasattr(fk, "column_name"):
|
|
647
|
+
fk_columns.add(fk.column_name)
|
|
648
|
+
|
|
649
|
+
if fk_columns:
|
|
650
|
+
foreign_key_columns[table_name] = fk_columns
|
|
651
|
+
|
|
652
|
+
return foreign_key_columns
|
|
653
|
+
|
|
654
|
+
def _find_node_for_table(self, graph_model, table_name: str):
|
|
655
|
+
"""Find the node that represents a given table."""
|
|
656
|
+
for node in graph_model.nodes:
|
|
657
|
+
if node.source and hasattr(node.source, "name"):
|
|
658
|
+
if node.source.name == table_name:
|
|
659
|
+
return node
|
|
660
|
+
return None
|
|
661
|
+
|
|
662
|
+
def _get_model_summary(self, graph_model) -> Dict[str, Any]:
|
|
663
|
+
"""Get a summary of the graph model."""
|
|
664
|
+
return {
|
|
665
|
+
"nodes": len(graph_model.nodes),
|
|
666
|
+
"relationships": len(graph_model.edges),
|
|
667
|
+
"node_indexes": len(graph_model.node_indexes),
|
|
668
|
+
"edge_indexes": len(graph_model.edge_indexes),
|
|
669
|
+
"node_constraints": len(graph_model.node_constraints),
|
|
670
|
+
"edge_constraints": len(graph_model.edge_constraints),
|
|
671
|
+
"node_labels": [
|
|
672
|
+
label for node in graph_model.nodes for label in node.labels
|
|
673
|
+
],
|
|
674
|
+
"relationship_types": [edge.edge_type for edge in graph_model.edges],
|
|
675
|
+
}
|
|
676
|
+
|
|
677
|
+
def _get_db_summary(self, database_structure) -> Dict[str, Any]:
|
|
678
|
+
"""Get a summary of the database structure."""
|
|
679
|
+
entity_tables = database_structure.get("entity_tables", {})
|
|
680
|
+
relationships = database_structure.get("relationships", [])
|
|
681
|
+
|
|
682
|
+
return {
|
|
683
|
+
"entity_tables": len(entity_tables),
|
|
684
|
+
"relationships": len(relationships),
|
|
685
|
+
"database_type": database_structure.get("database_type", "unknown"),
|
|
686
|
+
"database_name": database_structure.get("database_name", "unknown"),
|
|
687
|
+
}
|