@intentsolutionsio/nosql-data-modeler 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +17 -0
- package/LICENSE +21 -0
- package/README.md +35 -0
- package/agents/nosql-agent.md +36 -0
- package/package.json +38 -0
- package/skills/modeling-nosql-data/SKILL.md +86 -0
- package/skills/modeling-nosql-data/assets/README.md +7 -0
- package/skills/modeling-nosql-data/references/README.md +4 -0
- package/skills/modeling-nosql-data/scripts/README.md +7 -0
- package/skills/modeling-nosql-data/scripts/generate_sample_data.py +391 -0
- package/skills/modeling-nosql-data/scripts/migrate_schema.py +455 -0
- package/skills/modeling-nosql-data/scripts/validate_schema.py +492 -0
|
@@ -0,0 +1,492 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Validates a NoSQL schema against best practices and common errors.
|
|
4
|
+
|
|
5
|
+
This script analyzes NoSQL database schemas (MongoDB, DynamoDB, etc.) for
|
|
6
|
+
compliance with best practices, performance guidelines, and common pitfalls.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import argparse
|
|
10
|
+
import json
|
|
11
|
+
import sys
|
|
12
|
+
from datetime import datetime
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Dict, List, Any, Tuple
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class NoSQLSchemaValidator:
    """Validates NoSQL schemas against best practices.

    A schema is a JSON object. Field definitions are read either from a
    ``"properties"`` mapping or, when that key is absent, from every key
    whose value is a dict containing ``"type"`` or ``"properties"``.
    Each ``validate_*`` method returns a list of issue dicts carrying at
    least ``severity``, ``type`` and ``message``.
    """

    # Best practice rules
    BEST_PRACTICES = {
        "naming": {
            "rule": "Naming conventions should be consistent",
            "checks": [
                "Use camelCase or snake_case consistently",
                "Avoid single-letter field names",
                "Use descriptive names"
            ]
        },
        "indexing": {
            "rule": "Index strategy is defined",
            "checks": [
                "Frequently queried fields are indexed",
                "Composite indexes are defined for common queries",
                "Index overhead is considered"
            ]
        },
        "denormalization": {
            "rule": "Denormalization is used appropriately",
            "checks": [
                "Denormalization reduces query complexity",
                "Duplicate data is managed intentionally",
                "Update patterns are considered"
            ]
        },
        "data_types": {
            "rule": "Data types are appropriate",
            "checks": [
                "Numeric fields use appropriate numeric types",
                "Dates use datetime types",
                "IDs use consistent types"
            ]
        },
        "document_size": {
            "rule": "Document size is reasonable",
            "checks": [
                "Documents don't exceed size limits (16MB for MongoDB)",
                "Array fields don't grow unbounded",
                "Large nested objects are avoided"
            ]
        }
    }

    # Common anti-patterns
    ANTI_PATTERNS = [
        {
            "name": "unbounded_arrays",
            "description": "Arrays that can grow without limits",
            "severity": "high",
            "recommendation": "Cap array size or use separate collections"
        },
        {
            "name": "deeply_nested",
            "description": "Deeply nested document structures (>3 levels)",
            "severity": "medium",
            "recommendation": "Flatten structure or normalize data"
        },
        {
            "name": "no_indexes",
            "description": "Frequently queried fields without indexes",
            "severity": "high",
            "recommendation": "Add indexes for query performance"
        },
        {
            "name": "inconsistent_types",
            "description": "Field with inconsistent data types across documents",
            "severity": "medium",
            "recommendation": "Enforce schema validation or add type hints"
        },
        {
            "name": "circular_references",
            "description": "Circular document references",
            "severity": "high",
            "recommendation": "Use one-way references or denormalization"
        },
        {
            "name": "missing_ids",
            "description": "Documents or arrays without ID fields",
            "severity": "high",
            "recommendation": "Add unique IDs for referencing"
        }
    ]

    # Type names accepted by validate_data_types.
    _KNOWN_TYPES = ("string", "number", "boolean", "object", "array", "date", "null")
    # Type names accepted for ID fields.
    _ID_TYPES = ("string", "number")

    def __init__(self):
        """Initialize schema validator with an empty schema."""
        self.schema = {}
        self.issues = []

    def load_schema(self, filepath: str) -> bool:
        """
        Load schema from JSON file.

        The schema is stored on ``self.schema`` only when the file could be
        read, parsed, and holds a JSON object; otherwise the previous
        schema is left untouched and an error is printed to stderr.

        Args:
            filepath: Path to schema JSON file

        Returns:
            True if successful, False otherwise
        """
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except (OSError, ValueError) as e:
            # OSError covers missing files, directories and permission
            # problems; ValueError covers json.JSONDecodeError and
            # UnicodeDecodeError.
            print(f"Error loading schema: {e}", file=sys.stderr)
            return False

        if not isinstance(loaded, dict):
            # Every validator treats the schema as a mapping.
            print(
                "Error loading schema: top-level JSON value must be an object, "
                f"got {type(loaded).__name__}",
                file=sys.stderr,
            )
            return False

        self.schema = loaded
        return True

    def validate_naming_convention(self) -> List[Dict[str, Any]]:
        """
        Check naming convention consistency.

        Each field name is classified as snake_case (contains an
        underscore), PascalCase (starts with an upper-case letter) or
        camelCase (starts lower-case and contains an upper-case letter).
        Leading ``_``/``$`` characters are ignored. Single lower-case words
        such as ``email`` fit every style and are therefore not counted.

        Returns:
            List of issues found
        """
        issues = []
        naming_styles = {"camelCase": 0, "snake_case": 0, "PascalCase": 0}

        for field in self._extract_all_fields():
            bare = field.lstrip("_$")
            if not bare:
                # Empty or prefix-only key: nothing to classify.
                continue
            if "_" in bare:
                naming_styles["snake_case"] += 1
            elif bare[0].isupper():
                naming_styles["PascalCase"] += 1
            elif any(ch.isupper() for ch in bare):
                naming_styles["camelCase"] += 1

        styles_in_use = [count for count in naming_styles.values() if count > 0]
        if len(styles_in_use) > 1:
            issues.append({
                "severity": "medium",
                "type": "naming_inconsistency",
                "message": "Inconsistent naming convention across fields",
                "details": naming_styles
            })

        return issues

    def validate_indexes(self) -> List[Dict[str, Any]]:
        """
        Check indexing strategy.

        Reports a missing ``"indexes"`` key and an empty ``"indexes"``
        value as separate medium-severity issues.

        Returns:
            List of issues found
        """
        issues = []

        if "indexes" not in self.schema:
            issues.append({
                "severity": "medium",
                "type": "missing_indexes",
                "message": "No indexes defined in schema",
                "recommendation": "Define indexes for frequently queried fields"
            })
        elif not self.schema.get("indexes", []):
            issues.append({
                "severity": "medium",
                "type": "empty_indexes",
                "message": "Indexes array is empty",
                "recommendation": "Add indexes for query optimization"
            })

        return issues

    def validate_document_structure(self) -> List[Dict[str, Any]]:
        """
        Check document structure for anti-patterns.

        Flags top-level array fields without ``max_items`` and schemas
        whose nesting depth exceeds 3.

        Returns:
            List of issues found
        """
        issues = []

        # Check for unbounded arrays
        for field, field_def in self._extract_fields(self.schema).items():
            if field_def.get("type") == "array" and "max_items" not in field_def:
                issues.append({
                    "severity": "high",
                    "type": "unbounded_array",
                    "field": field,
                    "message": f"Array field '{field}' has no maximum size limit",
                    "recommendation": "Set max_items or use separate collection"
                })

        # Check for deeply nested structures
        depth = self._calculate_nesting_depth(self.schema)
        if depth > 3:
            issues.append({
                "severity": "medium",
                "type": "deeply_nested",
                "message": f"Document nesting depth is {depth} levels (recommended: ≤3)",
                "recommendation": "Flatten structure or normalize data"
            })

        return issues

    def validate_data_types(self) -> List[Dict[str, Any]]:
        """
        Check data type consistency.

        Flags top-level fields whose ``type`` is not one of the known type
        names, and ID fields whose type is neither string nor number.

        Returns:
            List of issues found
        """
        issues = []

        for field, field_def in self._extract_fields(self.schema).items():
            field_type = field_def.get("type")

            # Check for type mismatches
            if field_type not in self._KNOWN_TYPES:
                issues.append({
                    "severity": "medium",
                    "type": "unknown_type",
                    "field": field,
                    "message": f"Unknown type '{field_type}' for field '{field}'"
                })

            # Check for ID fields without proper type
            if self._is_id_field(field) and field_type not in self._ID_TYPES:
                issues.append({
                    "severity": "high",
                    "type": "invalid_id_type",
                    "field": field,
                    "message": f"ID field '{field}' should be string or number, not {field_type}"
                })

        return issues

    def validate_references(self) -> List[Dict[str, Any]]:
        """
        Placeholder for missing/circular reference checks.

        No reference rule is implemented yet, so this method reports
        nothing; it exists so run_all_validations has a stable hook.

        Returns:
            List of issues found (currently always empty)
        """
        return []

    @staticmethod
    def _is_id_field(name: str) -> bool:
        """Return True when ``name`` contains ``id`` as a separate word.

        Matches ``id``, ``_id``, ``user_id``, ``id_number``, ``userId``,
        ``userID`` and ``idNumber``; does not match words that merely
        contain the letters, such as ``isValid``, ``video`` or ``width``.
        """
        bare = name.lstrip("_")
        lowered = bare.lower()
        if lowered == "id" or lowered.endswith("_id") or lowered.startswith("id_"):
            return True
        if len(bare) <= 2:
            return False
        # camelCase/PascalCase suffix; the character before the suffix must
        # not be upper-case so acronyms such as GRID are not matched.
        if bare.endswith(("Id", "ID")) and not bare[-3].isupper():
            return True
        # camelCase prefix, e.g. idNumber.
        return bare.startswith("id") and bare[2].isupper()

    def _extract_all_fields(self) -> List[str]:
        """Extract all field names from schema, including nested ones.

        Only keys of field-definition maps are collected (using the same
        rules as _extract_fields), so schema keywords such as
        ``max_items`` or ``indexes`` are not mistaken for field names.
        Array element schemas under ``"items"`` are descended into.
        """
        names = []

        def walk(node):
            if not isinstance(node, dict):
                return
            properties = node.get("properties")
            if isinstance(properties, dict):
                candidates = properties
            else:
                candidates = {
                    key: value
                    for key, value in node.items()
                    if key != "items"
                    and isinstance(value, dict)
                    and ("type" in value or "properties" in value)
                }
            for key, definition in candidates.items():
                names.append(key)
                walk(definition)
            walk(node.get("items"))

        walk(self.schema)
        return names

    def _extract_fields(self, obj: Dict) -> Dict[str, Dict]:
        """Extract fields with their definitions from one schema level.

        Uses ``obj["properties"]`` when it is a mapping; otherwise every
        key whose value is a dict containing ``"type"`` or
        ``"properties"``. Non-dict definitions are skipped so callers can
        safely call ``.get`` on every returned value.
        """
        if not isinstance(obj, dict):
            return {}

        properties = obj.get("properties")
        if isinstance(properties, dict):
            return {
                key: value
                for key, value in properties.items()
                if isinstance(value, dict)
            }

        return {
            key: value
            for key, value in obj.items()
            if isinstance(value, dict) and ("type" in value or "properties" in value)
        }

    def _calculate_nesting_depth(self, obj: Dict, current_depth: int = 0) -> int:
        """Calculate maximum nesting depth of dicts inside ``obj``.

        Every dict level counts, including wrapper keys such as
        ``"properties"``. Lists are transparent: each dict element of a
        list is measured, not only the first one.
        """
        max_depth = current_depth

        if isinstance(obj, dict):
            for value in obj.values():
                if isinstance(value, dict):
                    children = [value]
                elif isinstance(value, list):
                    children = [item for item in value if isinstance(item, dict)]
                else:
                    continue
                for child in children:
                    depth = self._calculate_nesting_depth(child, current_depth + 1)
                    max_depth = max(max_depth, depth)

        return max_depth

    def run_all_validations(self) -> Dict[str, Any]:
        """
        Run all validation checks.

        Returns:
            Validation results dictionary with ``timestamp``,
            ``schema_name`` (the schema's ``$id`` or ``"unknown"``),
            ``validations`` (all issues) and ``summary`` (issue count per
            severity).
        """
        results = {
            "timestamp": datetime.now().isoformat(),
            "schema_name": self.schema.get("$id", "unknown"),
            "validations": [],
            "summary": {
                "critical": 0,
                "high": 0,
                "medium": 0,
                "low": 0
            }
        }

        # Run all validation methods
        validation_methods = [
            self.validate_naming_convention,
            self.validate_indexes,
            self.validate_document_structure,
            self.validate_data_types,
            self.validate_references
        ]

        for method in validation_methods:
            results["validations"].extend(method())

        # Count by severity
        summary = results["summary"]
        for issue in results["validations"]:
            severity = issue.get("severity", "low")
            summary[severity] = summary.get(severity, 0) + 1

        return results
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def format_validation_report(results: Dict[str, Any]) -> str:
    """
    Format validation results into readable report.

    Issues are listed from most to least severe (critical, high, medium,
    low). Issues with an unrecognised severity are listed last instead of
    aborting the report; issues of equal severity keep their input order.

    Args:
        results: Validation results dictionary. Reads ``schema_name``
            (default "unknown"), ``summary`` and ``validations``.

    Returns:
        Formatted report string
    """
    severity_rank = {"critical": 0, "high": 1, "medium": 2, "low": 3}
    rule = "=" * 70

    def rank(issue):
        # str() keeps the lookup safe for unhashable or non-string values.
        return severity_rank.get(str(issue.get("severity", "low")), len(severity_rank))

    summary = results.get("summary", {})
    report = [
        f"\n{rule}",
        "NoSQL Schema Validation Report",
        f"Schema: {results.get('schema_name', 'unknown')}",
        f"{rule}\n",
        "Summary:",
        f" Critical: {summary.get('critical', 0)}",
        f" High: {summary.get('high', 0)}",
        f" Medium: {summary.get('medium', 0)}",
        f" Low: {summary.get('low', 0)}",
        "",
    ]

    validations = results.get("validations", [])

    if not validations:
        report.append("✓ No issues found - Schema follows best practices!\n")
    else:
        report.append("Issues Found:\n")

        for issue in sorted(validations, key=rank):
            severity = str(issue.get("severity", "low")).upper()
            type_name = issue.get("type", "unknown")
            message = issue.get("message", "")

            report.append(f"[{severity}] {type_name}")
            report.append(f" {message}")

            if "field" in issue:
                report.append(f" Field: {issue['field']}")

            if "recommendation" in issue:
                report.append(f" → {issue['recommendation']}")

            report.append("")

    report.append(f"{rule}\n")

    return "\n".join(report)
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
def main():
    """Main entry point for schema validation.

    Parses the command line, loads the schema, runs every validation and
    prints the report to stdout in the requested format. With --output the
    same report is also written to that file. Exits with status 1 when the
    schema cannot be loaded, when any critical issue is reported, or on an
    unexpected error; otherwise exits with status 0.
    """
    parser = argparse.ArgumentParser(
        description="Validate NoSQL schema against best practices",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s --schema schema.json
%(prog)s --schema user-schema.json --output report.json
%(prog)s --schema product-schema.json --format json
"""
    )

    parser.add_argument(
        "--schema",
        required=True,
        help="Path to JSON schema file"
    )
    parser.add_argument(
        "--output",
        help="Output file for validation report (JSON)"
    )
    parser.add_argument(
        "--format",
        default="text",
        choices=["text", "json"],
        help="Output format"
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Print detailed output"
    )

    args = parser.parse_args()

    # sys.exit raises SystemExit, which is not an Exception subclass, so
    # the exit calls inside this try block are not caught by the handler.
    try:
        validator = NoSQLSchemaValidator()

        if args.verbose:
            print(f"Loading schema from {args.schema}...", file=sys.stderr)

        if not validator.load_schema(args.schema):
            sys.exit(1)

        if args.verbose:
            print("Running validations...", file=sys.stderr)

        results = validator.run_all_validations()

        # Output results
        if args.format == "json":
            output = json.dumps(results, indent=2)
        else:
            output = format_validation_report(results)

        print(output)

        # Save to file if requested
        if args.output:
            # Explicit UTF-8: the text report contains non-ASCII characters
            # that the platform default encoding may not be able to encode.
            with open(args.output, "w", encoding="utf-8") as f:
                # `output` already holds the report in the selected format.
                f.write(output)

            if args.verbose:
                print(f"\nResults saved to {args.output}", file=sys.stderr)

        # Exit code based on critical issues
        sys.exit(1 if results["summary"].get("critical", 0) > 0 else 0)

    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|