@intentsolutionsio/nosql-data-modeler 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +17 -0
- package/LICENSE +21 -0
- package/README.md +35 -0
- package/agents/nosql-agent.md +36 -0
- package/package.json +38 -0
- package/skills/modeling-nosql-data/SKILL.md +86 -0
- package/skills/modeling-nosql-data/assets/README.md +7 -0
- package/skills/modeling-nosql-data/references/README.md +4 -0
- package/skills/modeling-nosql-data/scripts/README.md +7 -0
- package/skills/modeling-nosql-data/scripts/generate_sample_data.py +391 -0
- package/skills/modeling-nosql-data/scripts/migrate_schema.py +455 -0
- package/skills/modeling-nosql-data/scripts/validate_schema.py +492 -0
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "nosql-data-modeler",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Database plugin for nosql-data-modeler",
|
|
5
|
+
"author": {
|
|
6
|
+
"name": "Claude Code Plugins",
|
|
7
|
+
"email": "[email protected]"
|
|
8
|
+
},
|
|
9
|
+
"repository": "https://github.com/jeremylongshore/claude-code-plugins",
|
|
10
|
+
"license": "MIT",
|
|
11
|
+
"keywords": [
|
|
12
|
+
"database",
|
|
13
|
+
"backend",
|
|
14
|
+
"data",
|
|
15
|
+
"agent-skills"
|
|
16
|
+
]
|
|
17
|
+
}
|
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024-2026 Jeremy Longshore & Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# NoSQL Data Modeler Plugin
|
|
2
|
+
|
|
3
|
+
Database plugin for nosql-data-modeler
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
/plugin install nosql-data-modeler@claude-code-plugins-plus
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
/nosql-data-modeler
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Features
|
|
18
|
+
|
|
19
|
+
- Database best practices
|
|
20
|
+
- Multi-database support
|
|
21
|
+
- Production-ready implementations
|
|
22
|
+
- Comprehensive documentation
|
|
23
|
+
|
|
24
|
+
## Requirements
|
|
25
|
+
|
|
26
|
+
- Database access
|
|
27
|
+
- Appropriate permissions
|
|
28
|
+
|
|
29
|
+
## Files
|
|
30
|
+
|
|
31
|
+
- `agents/nosql-agent.md` - Main plugin logic (agent definition)
|
|
32
|
+
|
|
33
|
+
## License
|
|
34
|
+
|
|
35
|
+
MIT
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: nosql-agent
|
|
3
|
+
description: Design NoSQL data models
|
|
4
|
+
---
|
|
5
|
+
# NoSQL Data Modeler
|
|
6
|
+
|
|
7
|
+
Design efficient NoSQL data models for document and key-value databases.
|
|
8
|
+
|
|
9
|
+
## NoSQL Modeling Principles
|
|
10
|
+
|
|
11
|
+
1. **Embed vs Reference**: Denormalization for performance
|
|
12
|
+
2. **Access Patterns**: Design for queries, not normalization
|
|
13
|
+
3. **Sharding Keys**: Distribute data evenly
|
|
14
|
+
4. **Indexes**: Support query patterns
|
|
15
|
+
|
|
16
|
+
## MongoDB Example
|
|
17
|
+
|
|
18
|
+
```javascript
|
|
19
|
+
// User document with embedded posts (1-to-few)
|
|
20
|
+
{
|
|
21
|
+
_id: ObjectId("..."),
|
|
22
|
+
email: "user@example.com",
|
|
23
|
+
profile: {
|
|
24
|
+
name: "John Doe",
|
|
25
|
+
avatar: "url"
|
|
26
|
+
},
|
|
27
|
+
posts: [
|
|
28
|
+
{ title: "Post 1", content: "..." },
|
|
29
|
+
{ title: "Post 2", content: "..." }
|
|
30
|
+
]
|
|
31
|
+
}
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## When to Activate
|
|
35
|
+
|
|
36
|
+
Design NoSQL schemas for MongoDB, DynamoDB, Cassandra, etc.
|
package/package.json
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@intentsolutionsio/nosql-data-modeler",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Database plugin for nosql-data-modeler",
|
|
5
|
+
"keywords": [
|
|
6
|
+
"database",
|
|
7
|
+
"backend",
|
|
8
|
+
"data",
|
|
9
|
+
"agent-skills",
|
|
10
|
+
"claude-code",
|
|
11
|
+
"claude-plugin",
|
|
12
|
+
"tonsofskills"
|
|
13
|
+
],
|
|
14
|
+
"repository": {
|
|
15
|
+
"type": "git",
|
|
16
|
+
"url": "git+https://github.com/jeremylongshore/claude-code-plugins-plus-skills.git",
|
|
17
|
+
"directory": "plugins/database/nosql-data-modeler"
|
|
18
|
+
},
|
|
19
|
+
"homepage": "https://tonsofskills.com/plugins/nosql-data-modeler",
|
|
20
|
+
"bugs": "https://github.com/jeremylongshore/claude-code-plugins-plus-skills/issues",
|
|
21
|
+
"license": "MIT",
|
|
22
|
+
"author": {
|
|
23
|
+
"name": "Claude Code Plugins",
|
|
24
|
+
"email": "[email protected]"
|
|
25
|
+
},
|
|
26
|
+
"publishConfig": {
|
|
27
|
+
"access": "public"
|
|
28
|
+
},
|
|
29
|
+
"files": [
|
|
30
|
+
"README.md",
|
|
31
|
+
".claude-plugin",
|
|
32
|
+
"skills",
|
|
33
|
+
"agents"
|
|
34
|
+
],
|
|
35
|
+
"scripts": {
|
|
36
|
+
"postinstall": "node -e \"console.log(\\\"\\\\n→ This npm package is a tracking/proof artifact. Install the plugin via:\\\\n ccpi install nosql-data-modeler\\\\n or /plugin install nosql-data-modeler@claude-code-plugins-plus in Claude Code\\\\n\\\")\""
|
|
37
|
+
}
|
|
38
|
+
}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: modeling-nosql-data
|
|
3
|
+
description: |
|
|
4
|
+
Use when you need to work with NoSQL data modeling.
|
|
5
|
+
This skill provides NoSQL database design with comprehensive guidance and automation.
|
|
6
|
+
Trigger with phrases like "model NoSQL data", "design document structure",
|
|
7
|
+
or "optimize NoSQL schema".
|
|
8
|
+
|
|
9
|
+
allowed-tools: Read, Write, Edit, Grep, Glob, Bash(psql:*), Bash(mysql:*), Bash(mongosh:*)
|
|
10
|
+
version: 1.0.0
|
|
11
|
+
author: Jeremy Longshore <jeremy@intentsolutions.io>
|
|
12
|
+
license: MIT
|
|
13
|
+
compatible-with: claude-code, codex, openclaw
|
|
14
|
+
tags: [database, modeling-nosql]
|
|
15
|
+
---
|
|
16
|
+
# NoSQL Data Modeler
|
|
17
|
+
|
|
18
|
+
## Overview
|
|
19
|
+
|
|
20
|
+
Design data models for NoSQL databases including MongoDB (document), DynamoDB (key-value/wide-column), Redis (key-value), and Cassandra (wide-column). Unlike relational modeling where normalization drives design, NoSQL modeling starts from access patterns and query requirements, then shapes the data to serve those patterns efficiently.
|
|
21
|
+
|
|
22
|
+
## Prerequisites
|
|
23
|
+
|
|
24
|
+
- `mongosh`, `aws dynamodb` CLI, `redis-cli`, or `cqlsh` installed depending on target database
|
|
25
|
+
- Documented list of application access patterns (read/write queries the application performs)
|
|
26
|
+
- Expected data volumes (document count, average document size, growth rate)
|
|
27
|
+
- Read/write ratio and latency requirements for each access pattern
|
|
28
|
+
- Understanding of consistency requirements (strong vs. eventual consistency)
|
|
29
|
+
|
|
30
|
+
## Instructions
|
|
31
|
+
|
|
32
|
+
1. Catalog all application access patterns as a table with columns: pattern name, query description, frequency (queries/sec), latency requirement, and data fields accessed. This drives every modeling decision.
|
|
33
|
+
|
|
34
|
+
2. For MongoDB document modeling, apply the embedding vs. referencing decision framework:
|
|
35
|
+
- **Embed** when: data is always accessed together, child data has no independent lifecycle, cardinality is bounded (1:few), and updates are infrequent.
|
|
36
|
+
- **Reference** when: data has independent access patterns, cardinality is unbounded (1:many/many:many), child documents are large, or data is shared across parents.
|
|
37
|
+
|
|
38
|
+
3. Design document schemas that match query patterns. If the application needs "all orders for a customer with line items," embed line items inside the order document. If the application needs "all products across all orders," use references to a products collection.
|
|
39
|
+
|
|
40
|
+
4. For DynamoDB, design the partition key and sort key to support the primary access pattern with a single-table design. Use composite sort keys (e.g., `ORDER#2024-01-15#12345`) for hierarchical data. Plan GSIs (Global Secondary Indexes) for secondary access patterns, keeping total GSI count under 5.
|
|
41
|
+
|
|
42
|
+
5. Evaluate denormalization trade-offs: duplicating data across documents reduces read latency but increases write complexity and storage. Denormalize data that changes rarely (user names, product categories) but reference data that changes frequently (prices, inventory counts).
|
|
43
|
+
|
|
44
|
+
6. Handle one-to-many relationships by choosing between embedding (small arrays), child referencing (parent stores child IDs), or parent referencing (child stores parent ID). For unbounded one-to-many, always use parent referencing to avoid document size limits (16MB in MongoDB).
|
|
45
|
+
|
|
46
|
+
7. Model many-to-many relationships using an array of references in each document or a dedicated junction collection. For DynamoDB, use adjacency list patterns with inverted GSIs.
|
|
47
|
+
|
|
48
|
+
8. Plan for schema evolution by using schema versioning fields (`schemaVersion: 2`), writing migration scripts that update documents in batches, and ensuring application code handles both old and new document shapes during rollout.
|
|
49
|
+
|
|
50
|
+
9. Validate the model against access patterns by running sample queries with `explain()` in MongoDB or examining consumed capacity units in DynamoDB. Verify that primary access patterns require only single-partition reads.
|
|
51
|
+
|
|
52
|
+
10. Document the final data model with sample documents, index definitions, and the access pattern mapping that justifies each modeling decision.
|
|
53
|
+
|
|
54
|
+
## Output
|
|
55
|
+
|
|
56
|
+
- **Data model diagrams** showing document/collection structure, embedded vs. referenced relationships
|
|
57
|
+
- **Sample documents** in JSON format for each collection/table with realistic data
|
|
58
|
+
- **Index definitions** including compound indexes, partial indexes, and TTL indexes
|
|
59
|
+
- **Access pattern mapping** table linking each query to its supporting collection and index
|
|
60
|
+
- **Migration scripts** for evolving schemas from existing relational models to NoSQL
|
|
61
|
+
|
|
62
|
+
## Error Handling
|
|
63
|
+
|
|
64
|
+
| Error | Cause | Solution |
|
|
65
|
+
|-------|-------|---------|
|
|
66
|
+
| Document exceeds 16MB size limit (MongoDB) | Unbounded array growth from embedding too many child documents | Switch from embedding to referencing; use the bucket pattern to chunk large arrays into fixed-size sub-documents |
|
|
67
|
+
| Hot partition in DynamoDB | Partition key with low cardinality causes uneven distribution | Add a random suffix or use a composite key; distribute writes across partitions with write sharding |
|
|
68
|
+
| High read latency on referenced documents | Too many round trips to resolve references (N+1 query problem) | Denormalize frequently accessed reference data; use `$lookup` aggregation for server-side joins; batch reference resolution |
|
|
69
|
+
| Inconsistent denormalized data | Write to source succeeds but denormalized copies not updated | Implement change streams (MongoDB) or DynamoDB Streams to propagate updates; use transactional writes where supported |
|
|
70
|
+
| Query requires full collection scan | Missing index on query filter fields | Create compound indexes matching query predicates and sort order; use `explain()` to verify index usage |
|
|
71
|
+
|
|
72
|
+
## Examples
|
|
73
|
+
|
|
74
|
+
**E-commerce product catalog in MongoDB**: Products embed variant arrays (size, color, price) since variants are always accessed with the product. Reviews reference the product by ID since reviews are accessed independently and grow unboundedly. A compound index on `{category: 1, price: 1}` supports filtered browsing.
|
|
75
|
+
|
|
76
|
+
**Social media feed in DynamoDB single-table design**: Partition key is `USER#userId`, sort key is `POST#timestamp` for user timeline queries. A GSI with partition key `HASHTAG#tag` and sort key `timestamp` supports hashtag feeds. User profile data uses sort key `PROFILE` on the same partition.
|
|
77
|
+
|
|
78
|
+
**IoT sensor data in Cassandra**: Partition key is `sensor_id`, clustering column is `timestamp DESC`. Each partition holds one sensor's readings, ordered by time. TTL of 90 days automatically expires old readings. Materialized views support queries by location and sensor type.
|
|
79
|
+
|
|
80
|
+
## Resources
|
|
81
|
+
|
|
82
|
+
- MongoDB data modeling patterns: https://www.mongodb.com/docs/manual/data-modeling/
|
|
83
|
+
- DynamoDB single-table design: https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/bp-modeling-nosql.html
|
|
84
|
+
- Cassandra data modeling guide: https://cassandra.apache.org/doc/latest/cassandra/data_modeling/
|
|
85
|
+
- Redis data structures: https://redis.io/docs/data-types/
|
|
86
|
+
- NoSQL design patterns catalog: https://www.mongodb.com/docs/manual/applications/data-models/
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
# Assets
|
|
2
|
+
|
|
3
|
+
Bundled resources for nosql-data-modeler skill
|
|
4
|
+
|
|
5
|
+
- [ ] schema_templates/: Templates for common NoSQL schemas (e.g., user profile, product catalog).
|
|
6
|
+
- [ ] sample_data/: Sample data files corresponding to the schema templates.
|
|
7
|
+
- [ ] diagrams/: Visual diagrams illustrating different NoSQL data modeling patterns.
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
# Scripts
|
|
2
|
+
|
|
3
|
+
Bundled resources for nosql-data-modeler skill
|
|
4
|
+
|
|
5
|
+
- [x] validate_schema.py: Validates a NoSQL schema against best practices and common errors.
|
|
6
|
+
- [x] generate_sample_data.py: Generates sample data based on the defined schema for testing purposes.
|
|
7
|
+
- [x] migrate_schema.py: Migrates a schema from one NoSQL database type to another (e.g., MongoDB to DynamoDB).
|
|
@@ -0,0 +1,391 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Generates sample data based on defined schema for testing purposes.
|
|
4
|
+
|
|
5
|
+
This script creates realistic sample data documents based on a NoSQL schema,
|
|
6
|
+
useful for testing queries, indexes, and application logic.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import argparse
|
|
10
|
+
import json
|
|
11
|
+
import random
|
|
12
|
+
import sys
|
|
13
|
+
from datetime import datetime, timedelta
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Dict, List, Any, Union
|
|
16
|
+
from uuid import uuid4
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class SampleDataGenerator:
    """Generate sample documents from a JSON-Schema-like definition.

    The schema is a dictionary that may contain a ``properties`` mapping
    (field name -> field definition), a ``required`` list, and/or extra
    root-level keys whose values are field definitions (dicts with a
    ``type`` key). All randomness comes from the ``random`` module, so
    seeding it makes the generated values repeatable (``date`` values are
    still relative to the current time).
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Initialize generator.

        Args:
            schema: Schema definition dictionary
        """
        self.schema = schema
        # Documents produced by the most recent generate_documents() call.
        self.generated_data: List[Dict[str, Any]] = []

    @staticmethod
    def generate_value(field_def: Dict[str, Any]) -> Any:
        """
        Generate a sample value for a field.

        Args:
            field_def: Field definition with type and constraints. A
                missing ``type`` is treated as ``"string"``.

        Returns:
            Generated value, or None when the type is not recognised.

        Raises:
            ValueError: If an explicit maximum/maxItems is smaller than the
                corresponding minimum for integer or array fields.
            IndexError: If ``enum`` is present but empty.
        """
        field_type = field_def.get("type", "string")

        if field_type == "string":
            if "enum" in field_def:
                return random.choice(field_def["enum"])

            format_type = field_def.get("format")
            if format_type == "email":
                return f"user{random.randint(1, 1000)}@example.com"
            if format_type == "uuid":
                # Build the UUID from the `random` module instead of
                # uuid4(): uuid4() reads OS entropy and would ignore
                # random.seed(), breaking reproducible output.
                from uuid import UUID

                return str(UUID(int=random.getrandbits(128), version=4))
            if format_type == "url":
                return f"https://example.com/{random.randint(1, 100)}"

            # Plain strings and unrecognised formats share one fallback so
            # a string field never silently becomes None.
            return f"Sample {field_def.get('description', 'value')}"

        if field_type == "number":
            low = field_def.get("minimum", 0)
            # Keep the default upper bound at or above the minimum so the
            # result never falls below a large user-supplied minimum.
            high = field_def.get("maximum", max(low, 1000))
            return random.uniform(low, high)

        if field_type == "integer":
            low = field_def.get("minimum", 0)
            # Same reasoning; randint would raise on an empty range.
            high = field_def.get("maximum", max(low, 100))
            return random.randint(low, high)

        if field_type == "boolean":
            return random.choice([True, False])

        if field_type == "date":
            # ISO-8601 timestamp somewhere within the past year.
            days_ago = random.randint(0, 365)
            return (datetime.now() - timedelta(days=days_ago)).isoformat()

        if field_type == "array":
            min_items = field_def.get("minItems", 1)
            max_items = field_def.get("maxItems", max(min_items, 5))
            count = random.randint(min_items, max_items)

            if "items" in field_def:
                item_def = field_def["items"]
                return [
                    SampleDataGenerator.generate_value(item_def)
                    for _ in range(count)
                ]
            return [f"item{i}" for i in range(count)]

        if field_type == "object":
            return {
                prop_name: SampleDataGenerator.generate_value(prop_def)
                for prop_name, prop_def in field_def.get("properties", {}).items()
            }

        return None

    def generate_document(self) -> Dict[str, Any]:
        """
        Generate a complete sample document.

        Fields under ``properties`` are always emitted when listed in
        ``required``; other fields are emitted with roughly 70%
        probability. Root-level keys (other than ``type``, ``properties``,
        ``required`` and ``$``-prefixed keys) whose value is a dict with a
        ``type`` are always emitted.

        Returns:
            Sample document dictionary
        """
        document: Dict[str, Any] = {}

        # Handle top-level properties
        required_fields = self.schema.get("required", [])
        if "properties" in self.schema:
            for field_name, field_def in self.schema["properties"].items():
                # Short-circuit keeps required fields from consuming a
                # random draw.
                if field_name in required_fields or random.random() > 0.3:
                    document[field_name] = self.generate_value(field_def)

        # Handle root-level type definitions
        reserved = ("type", "properties", "required")
        for field_name, field_def in self.schema.items():
            if field_name.startswith("$") or field_name in reserved:
                continue
            if isinstance(field_def, dict) and "type" in field_def:
                document[field_name] = self.generate_value(field_def)

        return document

    def generate_documents(self, count: int) -> List[Dict[str, Any]]:
        """
        Generate multiple sample documents.

        Replaces any previously generated data on the instance.

        Args:
            count: Number of documents to generate

        Returns:
            List of sample documents
        """
        self.generated_data = [self.generate_document() for _ in range(count)]
        return self.generated_data

    def export_json(self, filepath: str) -> bool:
        """
        Export generated data as a single JSON array.

        Args:
            filepath: Path to export to

        Returns:
            True if successful, False otherwise (the error is printed to
            stderr)
        """
        try:
            # Explicit UTF-8 so output does not depend on the locale.
            with open(filepath, "w", encoding="utf-8") as f:
                json.dump(self.generated_data, f, indent=2)
            return True
        except Exception as e:
            print(f"Error exporting JSON: {e}", file=sys.stderr)
            return False

    def export_jsonl(self, filepath: str) -> bool:
        """
        Export generated data as JSONL (one JSON document per line).

        Args:
            filepath: Path to export to

        Returns:
            True if successful, False otherwise (the error is printed to
            stderr)
        """
        try:
            with open(filepath, "w", encoding="utf-8") as f:
                f.writelines(json.dumps(doc) + "\n" for doc in self.generated_data)
            return True
        except Exception as e:
            print(f"Error exporting JSONL: {e}", file=sys.stderr)
            return False

    def export_csv(self, filepath: str) -> bool:
        """
        Export flattened sample data as CSV.

        Nested objects become dotted column names and lists are stored as
        JSON strings. The header is the sorted union of all flattened keys;
        documents missing a key get an empty cell.

        Args:
            filepath: Path to export to

        Returns:
            True if successful, False if there is no generated data or the
            write fails (write errors are printed to stderr)
        """
        try:
            import csv

            if not self.generated_data:
                return False

            # Flatten documents and collect all keys
            flattened = [self._flatten_dict(doc) for doc in self.generated_data]
            all_keys = sorted({key for flat in flattened for key in flat})

            with open(filepath, "w", newline="", encoding="utf-8") as f:
                writer = csv.DictWriter(f, fieldnames=all_keys)
                writer.writeheader()
                writer.writerows(flattened)

            return True
        except Exception as e:
            print(f"Error exporting CSV: {e}", file=sys.stderr)
            return False

    def _flatten_dict(self, d: Dict, parent_key: str = '', sep: str = '.') -> Dict:
        """
        Flatten nested dictionary.

        Args:
            d: Dictionary to flatten
            parent_key: Parent key for nesting
            sep: Separator for nested keys

        Returns:
            Flattened dictionary; list values are JSON-encoded strings
        """
        flat: Dict[str, Any] = {}

        for k, v in d.items():
            new_key = f"{parent_key}{sep}{k}" if parent_key else k

            if isinstance(v, dict):
                flat.update(self._flatten_dict(v, new_key, sep=sep))
            elif isinstance(v, list):
                # Lists cannot be spread over columns; keep them as JSON.
                flat[new_key] = json.dumps(v)
            else:
                flat[new_key] = v

        return flat
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def load_schema(filepath: str) -> Dict[str, Any]:
    """
    Load schema from JSON file.

    This is a CLI helper: instead of propagating the error to the caller it
    prints a message to stderr and terminates the process.

    Args:
        filepath: Path to schema file

    Returns:
        Schema dictionary

    Raises:
        SystemExit: With exit code 1 if the file doesn't exist or is not
            valid JSON
    """
    try:
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Error: Schema file not found: {filepath}", file=sys.stderr)
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in schema: {e}", file=sys.stderr)
        sys.exit(1)
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def main():
    """Main entry point for sample data generation.

    Parses command-line arguments, loads the schema, generates the
    requested number of documents and then prints and/or exports them.
    With neither --output nor --print, data is written to
    ``sample_data.<format>`` in the current directory. Note that --print
    emits JSON for every format except ``jsonl``.

    Always terminates via ``sys.exit``: 0 on success, 1 on any failure
    (including a failed export).
    """
    parser = argparse.ArgumentParser(
        description="Generate sample data based on NoSQL schema",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Generate 10 documents as JSON
  %(prog)s --schema user-schema.json --count 10

  # Generate 100 documents as JSONL
  %(prog)s --schema order-schema.json --count 100 --format jsonl

  # Export as CSV for spreadsheet analysis
  %(prog)s --schema product-schema.json --count 50 --format csv --output products.csv

  # Print to stdout
  %(prog)s --schema schema.json --count 5 --print
        """
    )

    parser.add_argument(
        "--schema",
        required=True,
        help="Path to JSON schema file"
    )
    parser.add_argument(
        "--count",
        type=int,
        default=10,
        help="Number of sample documents to generate (default: 10)"
    )
    parser.add_argument(
        "--format",
        default="json",
        choices=["json", "jsonl", "csv"],
        help="Output format"
    )
    parser.add_argument(
        "--output",
        help="Output file path"
    )
    parser.add_argument(
        "--print",
        action="store_true",
        help="Print generated data to stdout"
    )
    parser.add_argument(
        "--seed",
        type=int,
        help="Random seed for reproducible data"
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Print detailed output"
    )

    args = parser.parse_args()

    # Set random seed if provided. Compare against None so that an explicit
    # `--seed 0` is honoured rather than treated as "no seed".
    if args.seed is not None:
        random.seed(args.seed)

    try:
        # Load schema
        if args.verbose:
            print(f"Loading schema from {args.schema}...", file=sys.stderr)

        schema = load_schema(args.schema)

        # Generate data
        if args.verbose:
            print(f"Generating {args.count} sample documents...", file=sys.stderr)

        generator = SampleDataGenerator(schema)
        generator.generate_documents(args.count)

        # One exporter per --format choice; argparse guarantees the key exists.
        exporters = {
            "csv": generator.export_csv,
            "jsonl": generator.export_jsonl,
            "json": generator.export_json,
        }
        export = exporters[args.format]

        # Print to stdout if requested
        if args.print:
            if args.format == "jsonl":
                for doc in generator.generated_data:
                    print(json.dumps(doc))
            else:
                print(json.dumps(generator.generated_data, indent=2))

        # Export to file
        if args.output:
            if args.verbose:
                print(f"Exporting to {args.output}...", file=sys.stderr)

            if not export(args.output):
                sys.exit(1)

            if args.verbose:
                print(f"✓ Generated {args.count} documents", file=sys.stderr)
                print(f"✓ Saved to {args.output}", file=sys.stderr)
            sys.exit(0)
        elif not args.print:
            # If no output file and not printing, export to default location
            default_file = f"sample_data.{args.format}"

            # A failed default export must not be reported as success.
            if not export(default_file):
                sys.exit(1)

            if args.verbose:
                print(f"✓ Data saved to {default_file}", file=sys.stderr)

        sys.exit(0)

    except Exception as e:
        # SystemExit is not an Exception subclass, so the sys.exit() calls
        # above pass through this handler untouched.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
|