@intentsolutionsio/nosql-data-modeler 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,492 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Validates a NoSQL schema against best practices and common errors.
4
+
5
+ This script analyzes NoSQL database schemas (MongoDB, DynamoDB, etc.) for
6
+ compliance with best practices, performance guidelines, and common pitfalls.
7
+ """
8
+
9
+ import argparse
10
+ import json
11
+ import sys
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ from typing import Dict, List, Any, Tuple
15
+
16
+
17
class NoSQLSchemaValidator:
    """Validates NoSQL schemas against best practices.

    A schema is a JSON object whose fields are declared either under a
    top-level ``"properties"`` mapping or directly as top-level keys whose
    values are definition objects (objects containing ``"type"`` or
    ``"properties"``). Embedded documents are followed through nested
    ``"properties"`` mappings and through array ``"items"`` definitions.

    Attributes:
        schema: The loaded schema object (empty until a schema is loaded).
        issues: Issues collected by the latest ``run_all_validations`` call.
    """

    # Best practice rules
    BEST_PRACTICES = {
        "naming": {
            "rule": "Naming conventions should be consistent",
            "checks": [
                "Use camelCase or snake_case consistently",
                "Avoid single-letter field names",
                "Use descriptive names"
            ]
        },
        "indexing": {
            "rule": "Index strategy is defined",
            "checks": [
                "Frequently queried fields are indexed",
                "Composite indexes are defined for common queries",
                "Index overhead is considered"
            ]
        },
        "denormalization": {
            "rule": "Denormalization is used appropriately",
            "checks": [
                "Denormalization reduces query complexity",
                "Duplicate data is managed intentionally",
                "Update patterns are considered"
            ]
        },
        "data_types": {
            "rule": "Data types are appropriate",
            "checks": [
                "Numeric fields use appropriate numeric types",
                "Dates use datetime types",
                "IDs use consistent types"
            ]
        },
        "document_size": {
            "rule": "Document size is reasonable",
            "checks": [
                "Documents don't exceed size limits (16MB for MongoDB)",
                "Array fields don't grow unbounded",
                "Large nested objects are avoided"
            ]
        }
    }

    # Common anti-patterns
    ANTI_PATTERNS = [
        {
            "name": "unbounded_arrays",
            "description": "Arrays that can grow without limits",
            "severity": "high",
            "recommendation": "Cap array size or use separate collections"
        },
        {
            "name": "deeply_nested",
            "description": "Deeply nested document structures (>3 levels)",
            "severity": "medium",
            "recommendation": "Flatten structure or normalize data"
        },
        {
            "name": "no_indexes",
            "description": "Frequently queried fields without indexes",
            "severity": "high",
            "recommendation": "Add indexes for query performance"
        },
        {
            "name": "inconsistent_types",
            "description": "Field with inconsistent data types across documents",
            "severity": "medium",
            "recommendation": "Enforce schema validation or add type hints"
        },
        {
            "name": "circular_references",
            "description": "Circular document references",
            "severity": "high",
            "recommendation": "Use one-way references or denormalization"
        },
        {
            "name": "missing_ids",
            "description": "Documents or arrays without ID fields",
            "severity": "high",
            "recommendation": "Add unique IDs for referencing"
        }
    ]

    # Values accepted for a field's "type". Kept as tuples (not sets) so that
    # an unhashable "type" value such as a list is reported as unknown instead
    # of raising TypeError during the membership test.
    _KNOWN_TYPES = ("string", "number", "boolean", "object", "array", "date", "null")
    _ID_TYPES = ("string", "number")

    def __init__(self):
        """Initialize schema validator."""
        self.schema = {}
        self.issues = []

    def load_schema(self, filepath: str) -> bool:
        """
        Load schema from JSON file.

        The file is read as UTF-8. On any failure an error is printed to
        stderr and the previously loaded schema is left untouched.

        Args:
            filepath: Path to schema JSON file

        Returns:
            True if successful, False otherwise (unreadable file, invalid
            JSON or text encoding, or a top-level value that is not an object)
        """
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except (OSError, ValueError) as e:
            # OSError covers missing/unreadable paths; ValueError covers both
            # json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading schema: {e}", file=sys.stderr)
            return False

        if not isinstance(loaded, dict):
            # Every validation calls dict methods on the schema, so reject
            # arrays and scalars here instead of crashing later.
            print(
                "Error loading schema: top-level JSON value must be an object, "
                f"got {type(loaded).__name__}",
                file=sys.stderr,
            )
            return False

        self.schema = loaded
        return True

    def validate_naming_convention(self) -> List[Dict[str, Any]]:
        """
        Check naming convention consistency.

        Only real field names are classified; schema keywords such as
        ``indexes`` or ``max_items`` are not. Names that fit every convention
        (a single lowercase word, e.g. ``email`` or ``_id``) are not counted.

        Returns:
            List of issues found
        """
        issues = []
        naming_styles = {"camelCase": 0, "snake_case": 0, "PascalCase": 0}

        for field in self._extract_all_fields():
            style = self._classify_name(field)
            if style is not None:
                naming_styles[style] += 1

        # More than one style in use means the schema mixes conventions.
        styles_in_use = [count for count in naming_styles.values() if count > 0]
        if len(styles_in_use) > 1:
            issues.append({
                "severity": "medium",
                "type": "naming_inconsistency",
                "message": "Inconsistent naming convention across fields",
                "details": naming_styles
            })

        return issues

    def validate_indexes(self) -> List[Dict[str, Any]]:
        """
        Check indexing strategy.

        Returns:
            List of issues found (a missing or an empty ``indexes`` entry)
        """
        issues = []

        if "indexes" not in self.schema:
            issues.append({
                "severity": "medium",
                "type": "missing_indexes",
                "message": "No indexes defined in schema",
                "recommendation": "Define indexes for frequently queried fields"
            })
        elif not self.schema["indexes"]:
            issues.append({
                "severity": "medium",
                "type": "empty_indexes",
                "message": "Indexes array is empty",
                "recommendation": "Add indexes for query optimization"
            })

        return issues

    def validate_document_structure(self) -> List[Dict[str, Any]]:
        """
        Check document structure for anti-patterns.

        Reports array fields without ``max_items`` (nested fields are
        reported by dotted path) and documents nested more than 3 levels.

        Returns:
            List of issues found
        """
        issues = []

        # Check for unbounded arrays
        for path, _name, field_def in self._iter_fields():
            if field_def.get("type") == "array" and "max_items" not in field_def:
                issues.append({
                    "severity": "high",
                    "type": "unbounded_array",
                    "field": path,
                    "message": f"Array field '{path}' has no maximum size limit",
                    "recommendation": "Set max_items or use separate collection"
                })

        # Check for deeply nested structures
        depth = self._calculate_nesting_depth(self.schema)
        if depth > 3:
            issues.append({
                "severity": "medium",
                "type": "deeply_nested",
                "message": f"Document nesting depth is {depth} levels (recommended: ≤3)",
                "recommendation": "Flatten structure or normalize data"
            })

        return issues

    def validate_data_types(self) -> List[Dict[str, Any]]:
        """
        Check data type consistency.

        Reports fields whose ``type`` is not a recognised type name and ID
        fields whose type is neither ``string`` nor ``number``.

        Returns:
            List of issues found
        """
        issues = []

        for path, name, field_def in self._iter_fields():
            field_type = field_def.get("type")

            # Check for type mismatches
            if field_type not in self._KNOWN_TYPES:
                issues.append({
                    "severity": "medium",
                    "type": "unknown_type",
                    "field": path,
                    "message": f"Unknown type '{field_type}' for field '{path}'"
                })

            # Check for ID fields without proper type
            if self._is_id_field(name) and field_type not in self._ID_TYPES:
                issues.append({
                    "severity": "high",
                    "type": "invalid_id_type",
                    "field": path,
                    "message": f"ID field '{path}' should be string or number, not {field_type}"
                })

        return issues

    def validate_references(self) -> List[Dict[str, Any]]:
        """
        Check for missing or circular references.

        No reference rule is implemented: a single schema file cannot tell
        whether a ``ref`` to another collection resolves, so nothing is
        reported. The method is kept so that ``run_all_validations`` has a
        stable hook for future reference checks.

        Returns:
            List of issues found (currently always empty)
        """
        # NOTE(review): a "ref" equal to the schema's own "$id" is a
        # self-reference (common for tree structures); decide whether that
        # should be reported before adding a rule here.
        return []

    def _extract_all_fields(self) -> List[str]:
        """Extract all field names from schema, including nested fields."""
        return [name for _path, name, _definition in self._iter_fields()]

    def _extract_fields(self, obj: Dict) -> Dict[str, Dict]:
        """Extract top-level fields with their definitions.

        Entries whose definition is not an object are skipped so callers can
        rely on every returned definition being a dict.
        """
        if not isinstance(obj, dict):
            return {}

        properties = obj.get("properties")
        if isinstance(properties, dict):
            return {
                name: definition
                for name, definition in properties.items()
                if isinstance(definition, dict)
            }

        # No "properties" mapping: treat top-level keys that look like field
        # definitions as the fields.
        return {
            key: value
            for key, value in obj.items()
            if isinstance(value, dict) and ("type" in value or "properties" in value)
        }

    @staticmethod
    def _embedded_fields(definition: Dict) -> Dict[str, Dict]:
        """Return the named sub-fields declared directly inside a definition.

        Sub-fields come from the definition's ``properties`` mapping and from
        the ``properties`` of its array ``items`` definition(s).
        """
        embedded = {}
        cursor = definition
        while isinstance(cursor, dict):
            properties = cursor.get("properties")
            if isinstance(properties, dict):
                for name, child in properties.items():
                    if isinstance(child, dict):
                        embedded[name] = child
            # Array elements sit at the same document level as the array.
            cursor = cursor.get("items")
        return embedded

    def _iter_fields(self, fields=None, prefix: str = ""):
        """Yield ``(path, name, definition)`` for every field, depth first.

        ``path`` is the dotted path from the document root and ``name`` is
        the field's own name.
        """
        if fields is None:
            fields = self._extract_fields(self.schema)

        for name, definition in fields.items():
            path = f"{prefix}.{name}" if prefix else name
            yield path, name, definition
            yield from self._iter_fields(self._embedded_fields(definition), path)

    @staticmethod
    def _classify_name(name: str):
        """Return the naming style of a field name, or None if it is neutral.

        Leading/trailing underscores are ignored, so ``_id`` is neutral.
        """
        core = name.strip("_")
        if not core:
            return None
        if "_" in core:
            return "snake_case"
        if core[0].isupper():
            return "PascalCase"
        if any(char.isupper() for char in core):
            return "camelCase"
        # A single lowercase word is valid in both camelCase and snake_case.
        return None

    @staticmethod
    def _is_id_field(name: str) -> bool:
        """Tell whether a field name denotes an identifier.

        Matches ``id``/``_id``, snake_case ``*_id``/``id_*`` and camelCase
        ``*Id``/``*ID``; words that merely contain the letters "id"
        (``video``, ``isValid``) do not match.
        """
        core = name.strip("_")
        lowered = core.lower()
        if lowered == "id" or lowered.endswith("_id") or lowered.startswith("id_"):
            return True
        # camelCase suffix: require a lowercase letter or digit before it so
        # all-caps words such as "ANDROID" are not treated as IDs.
        return (
            len(core) > 2
            and core[-2:] in ("Id", "ID")
            and (core[-3].islower() or core[-3].isdigit())
        )

    def _calculate_nesting_depth(self, obj: Dict, current_depth: int = 0) -> int:
        """Calculate maximum document nesting depth.

        Top-level fields are level 1 and each embedded object adds a level.
        ``properties`` wrappers and definition objects do not count as
        levels; array elements stay on the level of their array field.

        Args:
            obj: Schema object whose fields are measured
            current_depth: Depth of ``obj`` itself (0 for a schema root)

        Returns:
            Deepest field level found, or ``current_depth`` without fields
        """
        if not isinstance(obj, dict):
            return current_depth

        def deepest(definition: Dict, level: int) -> int:
            nested = self._embedded_fields(definition)
            return max(
                (deepest(child, level + 1) for child in nested.values()),
                default=level,
            )

        return max(
            (
                deepest(definition, current_depth + 1)
                for definition in self._extract_fields(obj).values()
            ),
            default=current_depth,
        )

    def run_all_validations(self) -> Dict[str, Any]:
        """
        Run all validation checks.

        The collected issues are also stored on ``self.issues``.

        Returns:
            Validation results dictionary with ``timestamp``, ``schema_name``,
            ``validations`` (list of issues) and ``summary`` (count per
            severity)
        """
        results = {
            "timestamp": datetime.now().isoformat(),
            "schema_name": self.schema.get("$id", "unknown"),
            "validations": [],
            "summary": {
                "critical": 0,
                "high": 0,
                "medium": 0,
                "low": 0
            }
        }

        # Run all validation methods
        validation_methods = [
            self.validate_naming_convention,
            self.validate_indexes,
            self.validate_document_structure,
            self.validate_data_types,
            self.validate_references
        ]

        for method in validation_methods:
            results["validations"].extend(method())

        # Count by severity
        summary = results["summary"]
        for issue in results["validations"]:
            severity = issue.get("severity", "low")
            summary[severity] = summary.get(severity, 0) + 1

        self.issues = list(results["validations"])
        return results
357
+
358
+
359
def format_validation_report(results: Dict[str, Any]) -> str:
    """
    Format validation results into readable report.

    Issues are listed from most to least severe. Severity labels outside
    critical/high/medium/low are listed last instead of aborting the report.

    Args:
        results: Validation results dictionary; reads ``schema_name``,
            ``summary`` and ``validations``, all optional

    Returns:
        Formatted report string
    """
    # Built once; unknown severities rank after every known one.
    severity_rank = {"critical": 0, "high": 1, "medium": 2, "low": 3}
    separator = "=" * 70

    report = [
        f"\n{separator}",
        "NoSQL Schema Validation Report",
        f"Schema: {results.get('schema_name', 'unknown')}",
        f"{separator}\n",
    ]

    summary = results.get("summary", {})
    report.append("Summary:")
    report.append(f" Critical: {summary.get('critical', 0)}")
    report.append(f" High: {summary.get('high', 0)}")
    report.append(f" Medium: {summary.get('medium', 0)}")
    report.append(f" Low: {summary.get('low', 0)}")
    report.append("")

    validations = results.get("validations", [])

    if not validations:
        report.append("✓ No issues found - Schema follows best practices!\n")
    else:
        report.append("Issues Found:\n")

        def rank(issue: Dict[str, Any]) -> int:
            label = str(issue.get("severity", "low")).lower()
            return severity_rank.get(label, len(severity_rank))

        # sorted() is stable, so issues of equal severity keep their order.
        for issue in sorted(validations, key=rank):
            severity = str(issue.get("severity", "low")).upper()
            type_name = issue.get("type", "unknown")
            message = issue.get("message", "")

            report.append(f"[{severity}] {type_name}")
            report.append(f" {message}")

            if "field" in issue:
                report.append(f" Field: {issue['field']}")

            if "recommendation" in issue:
                report.append(f" → {issue['recommendation']}")

            report.append("")

    report.append(f"{separator}\n")

    return "\n".join(report)
409
+
410
+
411
def main():
    """Main entry point for schema validation.

    Parses the command line, loads the schema, runs every validation and
    prints the report to stdout in the selected format. With ``--output`` the
    same report is also written to that file as UTF-8.

    Exits with status 1 when the schema cannot be loaded, when an unexpected
    error occurs, or when the summary counts at least one critical issue;
    otherwise exits with status 0.
    """
    parser = argparse.ArgumentParser(
        description="Validate NoSQL schema against best practices",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s --schema schema.json
  %(prog)s --schema user-schema.json --output report.json
  %(prog)s --schema product-schema.json --format json
"""
    )

    parser.add_argument(
        "--schema",
        required=True,
        help="Path to JSON schema file"
    )
    parser.add_argument(
        "--output",
        help="Output file for validation report (written in the selected --format)"
    )
    parser.add_argument(
        "--format",
        default="text",
        choices=["text", "json"],
        help="Output format"
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Print detailed output"
    )

    args = parser.parse_args()

    try:
        validator = NoSQLSchemaValidator()

        if args.verbose:
            print(f"Loading schema from {args.schema}...", file=sys.stderr)

        if not validator.load_schema(args.schema):
            sys.exit(1)

        if args.verbose:
            print("Running validations...", file=sys.stderr)

        results = validator.run_all_validations()

        # Render once; the same text goes to stdout and to the output file.
        if args.format == "json":
            output = json.dumps(results, indent=2)
        else:
            output = format_validation_report(results)

        print(output)

        # Save to file if requested
        if args.output:
            # The text report contains non-ASCII symbols, so the encoding is
            # pinned rather than left to the platform default.
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(output)

            if args.verbose:
                print(f"\nResults saved to {args.output}", file=sys.stderr)

        # Exit code based on critical issues
        if results["summary"].get("critical", 0) > 0:
            sys.exit(1)
        else:
            sys.exit(0)

    except Exception as e:
        # Top-level boundary: SystemExit is not an Exception subclass, so the
        # sys.exit() calls above pass through untouched.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
490
+
491
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()