mkschema 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mkschema-0.1.0/LICENSE +21 -0
- mkschema-0.1.0/PKG-INFO +109 -0
- mkschema-0.1.0/README.md +86 -0
- mkschema-0.1.0/pyproject.toml +38 -0
- mkschema-0.1.0/setup.cfg +4 -0
- mkschema-0.1.0/src/mkschema/__init__.py +3 -0
- mkschema-0.1.0/src/mkschema/__main__.py +6 -0
- mkschema-0.1.0/src/mkschema/cli.py +126 -0
- mkschema-0.1.0/src/mkschema/core.py +134 -0
- mkschema-0.1.0/src/mkschema.egg-info/PKG-INFO +109 -0
- mkschema-0.1.0/src/mkschema.egg-info/SOURCES.txt +13 -0
- mkschema-0.1.0/src/mkschema.egg-info/dependency_links.txt +1 -0
- mkschema-0.1.0/src/mkschema.egg-info/entry_points.txt +2 -0
- mkschema-0.1.0/src/mkschema.egg-info/top_level.txt +1 -0
- mkschema-0.1.0/tests/test_core.py +85 -0
mkschema-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 mkschema contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
mkschema-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mkschema
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Generate a JSON Schema from sample JSON — merge multiple samples (or NDJSON) with type + format inference and required-key detection. Zero dependencies.
|
|
5
|
+
Author: yyfjj
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/jjdoor/mkschema-py
|
|
8
|
+
Project-URL: Repository, https://github.com/jjdoor/mkschema-py
|
|
9
|
+
Project-URL: Issues, https://github.com/jjdoor/mkschema-py/issues
|
|
10
|
+
Keywords: json-schema,schema,json,generate,infer,ndjson,openapi,cli,codegen,validation
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Topic :: Software Development :: Code Generators
|
|
18
|
+
Classifier: Topic :: Utilities
|
|
19
|
+
Requires-Python: >=3.8
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Dynamic: license-file
|
|
23
|
+
|
|
24
|
+
# mkschema
|
|
25
|
+
|
|
26
|
+
**Generate a JSON Schema from real JSON — and feed it more than one sample.**
|
|
27
|
+
You have an API response, a config, a pile of log records, and you want a JSON
|
|
28
|
+
Schema for validation, docs, or contract tests. Hand-writing it is tedious;
|
|
29
|
+
most generators take a *single* example and over-fit it — every field marked
|
|
30
|
+
required, types pinned to whatever that one record happened to contain.
|
|
31
|
+
`mkschema` merges **many** samples: a field in *every* sample is `required`, a
|
|
32
|
+
field in only some is optional, and differing types are unioned. **Zero
|
|
33
|
+
dependencies, no network.**
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install mkschema
|
|
37
|
+
|
|
38
|
+
$ printf '{"id":1,"name":"Ada","age":30}\n{"id":2,"age":30.5}\n' | mkschema --ndjson -
|
|
39
|
+
|
|
40
|
+
{
|
|
41
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
42
|
+
"type": "object",
|
|
43
|
+
"properties": {
|
|
44
|
+
"age": { "type": "number" }, // 30 and 30.5 unioned to number
|
|
45
|
+
"id": { "type": "integer" },
|
|
46
|
+
"name": { "type": "string" }
|
|
47
|
+
},
|
|
48
|
+
"required": ["age", "id"] // name was missing from the 2nd sample, so it's optional
|
|
49
|
+
}
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
> This is the Python build. A behavior-equivalent Node build is on npm:
|
|
53
|
+
> `npx mkschema` (<https://github.com/jjdoor/mkschema>).
|
|
54
|
+
|
|
55
|
+
## Usage
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
mkschema sample.json # infer from one file
|
|
59
|
+
mkschema a.json b.json c.json # merge several samples into one schema
|
|
60
|
+
mkschema --ndjson records.ndjson # one sample per line (logs, exports)
|
|
61
|
+
cat response.json | mkschema - # read a JSON value from stdin
|
|
62
|
+
mkschema users.json --title User --id https://ex.com/user.schema.json
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Schema goes to **stdout**, so redirect it: `mkschema data.json > schema.json`.
|
|
66
|
+
|
|
67
|
+
## What it infers
|
|
68
|
+
|
|
69
|
+
- **Types** — `null`, `boolean`, `integer`, `number`, `string`, `array`,
|
|
70
|
+
`object`. Numbers are classified by value, so `5.0` is an `integer` and the
|
|
71
|
+
Python and Node builds agree.
|
|
72
|
+
- **String `format`** — `date-time`, `date`, `email`, `uuid`, `ipv4`, `uri`
|
|
73
|
+
(kept only when *all* samples of a field agree).
|
|
74
|
+
- **`required`** — the intersection across samples: a key present in every
|
|
75
|
+
sample. (One sample ⇒ everything required.)
|
|
76
|
+
- **Arrays** — `items` is the merge of all element schemas, so `[1, "x"]`
|
|
77
|
+
becomes `{ "type": ["integer", "string"] }`.
|
|
78
|
+
- **Unions** — a field that is an integer in one sample and a float in another
|
|
79
|
+
becomes `number`; genuinely different types become a sorted `type` array.
|
|
80
|
+
|
|
81
|
+
## Options
|
|
82
|
+
|
|
83
|
+
| Flag | Effect |
|
|
84
|
+
|------|--------|
|
|
85
|
+
| `--ndjson <src>` | Treat each line of `<src>` (a file, or `-` for stdin) as a separate sample |
|
|
86
|
+
| `--title <name>` | Set the schema `title` |
|
|
87
|
+
| `--id <uri>` | Set `$id` |
|
|
88
|
+
| `-` | Read one JSON value from stdin |
|
|
89
|
+
| `-v`, `--version` · `-h`, `--help` | Show the version · show the help text |
|
|
90
|
+
|
|
91
|
+
## Notes
|
|
92
|
+
|
|
93
|
+
- **Output is draft 2020-12** JSON Schema, deterministic (properties and
|
|
94
|
+
`required` are sorted) so it diffs cleanly in version control.
|
|
95
|
+
- **Same tool, two builds.** A behavior-equivalent Node build is on npm
|
|
96
|
+
(`npx mkschema`); use whichever your stack has.
|
|
97
|
+
- It infers structure, not constraints — add your own `minLength`, `enum`,
|
|
98
|
+
`pattern`, etc. afterward. mkschema gives you the scaffold from real data.
|
|
99
|
+
|
|
100
|
+
## Exit codes
|
|
101
|
+
|
|
102
|
+
| Code | Meaning |
|
|
103
|
+
|------|---------|
|
|
104
|
+
| `0` | schema written |
|
|
105
|
+
| `2` | error (no input, invalid JSON, unreadable file) |
|
|
106
|
+
|
|
107
|
+
## License
|
|
108
|
+
|
|
109
|
+
MIT
|
mkschema-0.1.0/README.md
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# mkschema
|
|
2
|
+
|
|
3
|
+
**Generate a JSON Schema from real JSON — and feed it more than one sample.**
|
|
4
|
+
You have an API response, a config, a pile of log records, and you want a JSON
|
|
5
|
+
Schema for validation, docs, or contract tests. Hand-writing it is tedious;
|
|
6
|
+
most generators take a *single* example and over-fit it — every field marked
|
|
7
|
+
required, types pinned to whatever that one record happened to contain.
|
|
8
|
+
`mkschema` merges **many** samples: a field in *every* sample is `required`, a
|
|
9
|
+
field in only some is optional, and differing types are unioned. **Zero
|
|
10
|
+
dependencies, no network.**
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install mkschema
|
|
14
|
+
|
|
15
|
+
$ printf '{"id":1,"name":"Ada","age":30}\n{"id":2,"age":30.5}\n' | mkschema --ndjson -
|
|
16
|
+
|
|
17
|
+
{
|
|
18
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
19
|
+
"type": "object",
|
|
20
|
+
"properties": {
|
|
21
|
+
"age": { "type": "number" }, // 30 and 30.5 unioned to number
|
|
22
|
+
"id": { "type": "integer" },
|
|
23
|
+
"name": { "type": "string" }
|
|
24
|
+
},
|
|
25
|
+
"required": ["age", "id"] // name was missing from the 2nd sample, so it's optional
|
|
26
|
+
}
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
> This is the Python build. A behavior-equivalent Node build is on npm:
|
|
30
|
+
> `npx mkschema` (<https://github.com/jjdoor/mkschema>).
|
|
31
|
+
|
|
32
|
+
## Usage
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
mkschema sample.json # infer from one file
|
|
36
|
+
mkschema a.json b.json c.json # merge several samples into one schema
|
|
37
|
+
mkschema --ndjson records.ndjson # one sample per line (logs, exports)
|
|
38
|
+
cat response.json | mkschema - # read a JSON value from stdin
|
|
39
|
+
mkschema users.json --title User --id https://ex.com/user.schema.json
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
Schema goes to **stdout**, so redirect it: `mkschema data.json > schema.json`.
|
|
43
|
+
|
|
44
|
+
## What it infers
|
|
45
|
+
|
|
46
|
+
- **Types** — `null`, `boolean`, `integer`, `number`, `string`, `array`,
|
|
47
|
+
`object`. Numbers are classified by value, so `5.0` is an `integer` and the
|
|
48
|
+
Python and Node builds agree.
|
|
49
|
+
- **String `format`** — `date-time`, `date`, `email`, `uuid`, `ipv4`, `uri`
|
|
50
|
+
(kept only when *all* samples of a field agree).
|
|
51
|
+
- **`required`** — the intersection across samples: a key present in every
|
|
52
|
+
sample. (One sample ⇒ everything required.)
|
|
53
|
+
- **Arrays** — `items` is the merge of all element schemas, so `[1, "x"]`
|
|
54
|
+
becomes `{ "type": ["integer", "string"] }`.
|
|
55
|
+
- **Unions** — a field that is an integer in one sample and a float in another
|
|
56
|
+
becomes `number`; genuinely different types become a sorted `type` array.
|
|
57
|
+
|
|
58
|
+
## Options
|
|
59
|
+
|
|
60
|
+
| Flag | Effect |
|
|
61
|
+
|------|--------|
|
|
62
|
+
| `--ndjson <src>` | Treat each line of `<src>` (a file, or `-` for stdin) as a separate sample |
|
|
63
|
+
| `--title <name>` | Set the schema `title` |
|
|
64
|
+
| `--id <uri>` | Set `$id` |
|
|
65
|
+
| `-` | Read one JSON value from stdin |
|
|
66
|
+
| `-v`, `--version` · `-h`, `--help` | Show the version · show the help text |
|
|
67
|
+
|
|
68
|
+
## Notes
|
|
69
|
+
|
|
70
|
+
- **Output is draft 2020-12** JSON Schema, deterministic (properties and
|
|
71
|
+
`required` are sorted) so it diffs cleanly in version control.
|
|
72
|
+
- **Same tool, two builds.** A behavior-equivalent Node build is on npm
|
|
73
|
+
(`npx mkschema`); use whichever your stack has.
|
|
74
|
+
- It infers structure, not constraints — add your own `minLength`, `enum`,
|
|
75
|
+
`pattern`, etc. afterward. mkschema gives you the scaffold from real data.
|
|
76
|
+
|
|
77
|
+
## Exit codes
|
|
78
|
+
|
|
79
|
+
| Code | Meaning |
|
|
80
|
+
|------|---------|
|
|
81
|
+
| `0` | schema written |
|
|
82
|
+
| `2` | error (no input, invalid JSON, unreadable file) |
|
|
83
|
+
|
|
84
|
+
## License
|
|
85
|
+
|
|
86
|
+
MIT
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "mkschema"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Generate a JSON Schema from sample JSON — merge multiple samples (or NDJSON) with type + format inference and required-key detection. Zero dependencies."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "yyfjj" }]
|
|
13
|
+
keywords = ["json-schema", "schema", "json", "generate", "infer", "ndjson", "openapi", "cli", "codegen", "validation"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Environment :: Console",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Operating System :: OS Independent",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Topic :: Software Development :: Code Generators",
|
|
22
|
+
"Topic :: Utilities",
|
|
23
|
+
]
|
|
24
|
+
dependencies = []
|
|
25
|
+
|
|
26
|
+
[project.urls]
|
|
27
|
+
Homepage = "https://github.com/jjdoor/mkschema-py"
|
|
28
|
+
Repository = "https://github.com/jjdoor/mkschema-py"
|
|
29
|
+
Issues = "https://github.com/jjdoor/mkschema-py/issues"
|
|
30
|
+
|
|
31
|
+
[project.scripts]
|
|
32
|
+
mkschema = "mkschema.cli:main"
|
|
33
|
+
|
|
34
|
+
[tool.setuptools]
|
|
35
|
+
package-dir = { "" = "src" }
|
|
36
|
+
|
|
37
|
+
[tool.setuptools.packages.find]
|
|
38
|
+
where = ["src"]
|
mkschema-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""mkschema command-line interface."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
from . import core
|
|
8
|
+
|
|
9
|
+
VERSION = "0.1.0"
|
|
10
|
+
|
|
11
|
+
HELP = """mkschema — generate a JSON Schema from sample JSON. Feed several samples to merge.
|
|
12
|
+
|
|
13
|
+
Usage
|
|
14
|
+
mkschema <sample.json> [more.json ...] Infer a schema (multiple files merge as samples)
|
|
15
|
+
mkschema - Read one JSON value from stdin
|
|
16
|
+
mkschema --ndjson <file|-> Treat each line as a separate sample
|
|
17
|
+
|
|
18
|
+
Options
|
|
19
|
+
--title <name> Set the schema "title"
|
|
20
|
+
--id <uri> Set "$id"
|
|
21
|
+
--ndjson <src> Merge newline-delimited JSON samples (a file, or - for stdin)
|
|
22
|
+
-h, --help Show this help
|
|
23
|
+
-v, --version Show version
|
|
24
|
+
|
|
25
|
+
Merging samples: a key present in EVERY sample is "required"; a key in only some
|
|
26
|
+
is optional; differing types are unioned. Output is a JSON Schema (draft 2020-12)
|
|
27
|
+
written to stdout.
|
|
28
|
+
|
|
29
|
+
Exit 0 ok · 2 error (no input / invalid JSON / unreadable file)
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def fail(msg):
    """Report ``msg`` on stderr with a ``mkschema:`` prefix and exit with status 2."""
    print(f"mkschema: {msg}", file=sys.stderr)
    raise SystemExit(2)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def read_src(path):
    """Return the full text of ``path``, or of stdin when ``path`` is ``"-"``.

    Input is decoded as UTF-8 and newlines are left untranslated. Any failure
    to read or to decode is reported through ``fail`` rather than surfacing as
    a traceback.
    """
    source = "stdin" if path == "-" else path
    try:
        if path == "-":
            # Raw read, no newline translation (matches Node).
            return sys.stdin.buffer.read().decode("utf-8")
        with open(path, "r", encoding="utf-8", newline="") as f:
            return f.read()
    except FileNotFoundError:
        fail(f"cannot read {path}: no such file")
    except OSError as e:
        fail(f"cannot read {source}: {e}")
    except UnicodeDecodeError as e:
        # UnicodeDecodeError derives from ValueError, not OSError, so without
        # this clause non-UTF-8 input would escape as an uncaught exception.
        fail(f"cannot read {source}: not valid UTF-8 ({e})")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _reject_const(c):
|
|
51
|
+
raise ValueError(f"{c} is not valid JSON")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def parse_json(text, label):
    """Decode ``text`` as JSON and return the value.

    ``label`` names the input in the error message. On any decoding error the
    problem is reported through ``fail``.
    """
    try:
        # parse_constant rejects NaN/Infinity/-Infinity, which JSON.parse also rejects.
        value = json.loads(text, parse_constant=_reject_const)
    except ValueError as err:
        fail(f"{label}: not valid JSON ({err})")
    else:
        return value
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def main(argv=None):
    """Run the mkschema command line and return the exit status.

    ``argv`` is the argument list without the program name; when it is
    ``None`` the arguments are taken from ``sys.argv[1:]``. Returns 0 after
    printing the help text, the version, or the generated schema. Every error
    path goes through ``fail``.
    """
    try:
        import signal

        # Die quietly on a broken pipe (Unix), like Node.
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)
    except (AttributeError, ValueError, OSError):
        # Best effort: SIGPIPE may be missing or not settable here.
        pass

    argv = list(sys.argv[1:] if argv is None else argv)

    # Options that consume the next argument, mapped to the error reported
    # when that argument is missing. A repeated option keeps its last value.
    missing_value = {
        "--title": "--title needs a value",
        "--id": "--id needs a value",
        "--ndjson": "--ndjson needs a file or -",
    }
    values = dict.fromkeys(missing_value)
    paths = []

    i = 0
    while i < len(argv):
        a = argv[i]
        if a in ("-h", "--help"):
            sys.stdout.write(HELP)
            return 0
        if a in ("-v", "--version"):
            sys.stdout.write(VERSION + "\n")
            return 0
        if a in missing_value:
            i += 1
            if i >= len(argv):
                fail(missing_value[a])
            values[a] = argv[i]
        elif a == "-" or not a.startswith("-"):
            # A bare "-" is the stdin pseudo-path, not an option.
            paths.append(a)
        else:
            fail(f"unknown option: {a} (try --help)")
        i += 1

    samples = []
    ndjson = values["--ndjson"]
    if ndjson is not None:
        # In this mode only the --ndjson source is read; positional paths
        # are not consulted.
        text = read_src(ndjson)
        label = "stdin" if ndjson == "-" else ndjson
        # Number physical lines so that an error points at the real line in
        # the input even when blank lines were skipped before it.
        for lineno, line in enumerate(re.split(r"\r?\n", text), start=1):
            if re.fullmatch(r"[ \t\n\v\f\r]*", line):
                continue
            samples.append(parse_json(line, f"{label} line {lineno}"))
        if not samples:
            fail("no JSON samples found in --ndjson input")
    else:
        if not paths:
            fail("no input — pass a JSON file, -, or --ndjson (try --help)")
        for p in paths:
            samples.append(parse_json(read_src(p), "stdin" if p == "-" else p))

    schema = core.generate(samples, title=values["--title"], id=values["--id"])
    sys.stdout.write(json.dumps(schema, indent=2, ensure_ascii=False) + "\n")
    return 0
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""mkschema core — pure JSON Schema inference from samples. No fs, no network.
|
|
2
|
+
|
|
3
|
+
`infer` turns one JSON value into a schema node; `merge` combines two nodes;
|
|
4
|
+
`generate` reduces many samples into one JSON Schema (draft 2020-12). Feeding
|
|
5
|
+
multiple samples is the point: a key present in *every* sample is `required`, a
|
|
6
|
+
key in only some is optional, and differing types are unioned — so the schema
|
|
7
|
+
fits real data instead of over-fitting one example.
|
|
8
|
+
|
|
9
|
+
Mirrors the Node build byte-for-byte (same key order, same inference).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
|
|
14
|
+
DRAFT = "https://json-schema.org/draft/2020-12/schema"
|
|
15
|
+
|
|
16
|
+
# String formats, tried in order; ASCII digit classes so both builds agree.
|
|
17
|
+
# `\Z` (absolute end), not `$`, to match JS `$` semantics — Python `$` also matches
|
|
18
|
+
# before a trailing newline, which a JSON `\n`-escaped value could contain.
|
|
19
|
+
_FORMATS = [
|
|
20
|
+
("date-time", re.compile(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}[Tt][0-9]{2}:[0-9]{2}:[0-9]{2}(?:\.[0-9]+)?(?:[Zz]|[+-][0-9]{2}:?[0-9]{2})?\Z")),
|
|
21
|
+
("date", re.compile(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}\Z")),
|
|
22
|
+
("uuid", re.compile(r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\Z")),
|
|
23
|
+
("uri", re.compile(r"^[a-zA-Z][a-zA-Z0-9+.\-]*://\S+\Z")), # before email: URI with userinfo isn't an email
|
|
24
|
+
("email", re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+\Z")),
|
|
25
|
+
("ipv4", re.compile(r"^(?:[0-9]{1,3}\.){3}[0-9]{1,3}\Z")),
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def infer_type(v):
    """Return the JSON Schema type name for the decoded JSON value ``v``.

    Numbers are classified by VALUE: a float with no fractional part (5.0)
    is reported as "integer", so the Python and Node builds agree. Anything
    that is not None, bool, int, float, str or list is reported as "object".
    """
    if v is None:
        return "null"
    # bool must be tested before int, because bool is a subclass of int.
    if isinstance(v, bool):
        return "boolean"
    if isinstance(v, float):
        return "number" if not v.is_integer() else "integer"
    for py_type, json_name in ((int, "integer"), (str, "string"), (list, "array")):
        if isinstance(v, py_type):
            return json_name
    return "object"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def infer_format(s):
    """Return the name of the first entry in ``_FORMATS`` whose pattern matches
    the string ``s``, or None when no pattern matches."""
    return next((name for name, rx in _FORMATS if rx.match(s)), None)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def infer(v):
    """Build a schema node describing the single JSON value ``v``.

    Strings carry a "format" when one is recognised; non-empty arrays carry
    "items" merged from every element; objects list their properties in
    sorted key order and mark all of them as required.
    """
    kind = infer_type(v)

    if kind == "string":
        node = {"type": "string"}
        fmt = infer_format(v)
        if fmt:
            node["format"] = fmt
        return node

    if kind == "array":
        node = {"type": "array"}
        if v:
            # An empty array says nothing about its elements, so no "items".
            node["items"] = merge_all([infer(e) for e in v])
        return node

    if kind == "object":
        names = sorted(v.keys())
        node = {"type": "object", "properties": {}}
        for name in names:
            node["properties"][name] = infer(v[name])
        if names:
            node["required"] = list(names)
        return node

    return {"type": kind}
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _type_set(node):
|
|
74
|
+
return list(node["type"]) if isinstance(node["type"], list) else [node["type"]]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def merge(a, b):
    """Merge two schema nodes into one node that describes both.

    The type names of ``a`` and ``b`` are unioned, with "integer" dropped
    when "number" is also present. If a single type remains:

    * object — properties are unioned (shared keys are merged recursively)
      and "required" is the intersection of both "required" lists;
    * array — "items" are merged when both sides have them, otherwise
      whichever side has "items" is kept;
    * string — "format" is kept only when both sides carry the same format;
    * anything else — just ``{"type": t}``.

    If several types remain, the result is only ``{"type": [...]}`` with the
    names sorted; properties, items and format are not carried over.
    Neither input node is modified.
    """
    type_names = set(_type_set(a)) | set(_type_set(b))
    if "integer" in type_names and "number" in type_names:
        # "number" already admits integers, so listing both is redundant.
        type_names.discard("integer")
    types = sorted(type_names)

    if len(types) != 1:
        return {"type": types}

    t = types[0]
    if t == "object":
        a_props, b_props = a.get("properties", {}), b.get("properties", {})
        properties = {}
        for k in sorted(set(a_props) | set(b_props)):
            if k in a_props and k in b_props:
                properties[k] = merge(a_props[k], b_props[k])
            else:
                properties[k] = a_props[k] if k in a_props else b_props[k]
        # Set lookup keeps the intersection linear in the number of keys;
        # testing against the list made every merge quadratic.
        b_required = set(b.get("required", []))
        required = sorted(k for k in a.get("required", []) if k in b_required)
        node = {"type": "object", "properties": properties}
        if required:
            node["required"] = required
        return node

    if t == "array":
        node = {"type": "array"}
        a_items, b_items = a.get("items"), b.get("items")
        if a_items is not None and b_items is not None:
            node["items"] = merge(a_items, b_items)
        elif a_items is not None or b_items is not None:
            node["items"] = a_items if a_items is not None else b_items
        return node

    if t == "string":
        node = {"type": "string"}
        if a.get("format") and a.get("format") == b.get("format"):
            node["format"] = a["format"]
        return node

    return {"type": t}
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def merge_all(nodes):
    """Fold ``nodes`` left to right with ``merge`` and return the result.

    Returns None when ``nodes`` is empty and the node itself when there is
    only one.
    """
    merged = None
    for node in nodes:
        if merged is None:
            merged = node
            continue
        merged = merge(merged, node)
    return merged
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def generate(samples, title=None, id=None):
    """Generate a JSON Schema (draft 2020-12) from one or more sample values.

    Args:
        samples: iterable of decoded JSON values; each is inferred on its own
            and the resulting nodes are merged into one.
        title: written as "title" when truthy.
        id: written as "$id" when truthy.

    Returns:
        A dict whose keys appear in the order "$schema", "$id", "title",
        followed by the keys of the merged node.

    Raises:
        ValueError: if ``samples`` is empty. With nothing to merge there is no
            node to describe; previously this surfaced as an opaque TypeError
            from ``dict.update(None)``.
    """
    nodes = [infer(s) for s in samples]
    if not nodes:
        raise ValueError("generate() needs at least one sample")
    schema = {"$schema": DRAFT}
    if id:
        schema["$id"] = id
    if title:
        schema["title"] = title
    schema.update(merge_all(nodes))
    return schema
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mkschema
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Generate a JSON Schema from sample JSON — merge multiple samples (or NDJSON) with type + format inference and required-key detection. Zero dependencies.
|
|
5
|
+
Author: yyfjj
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/jjdoor/mkschema-py
|
|
8
|
+
Project-URL: Repository, https://github.com/jjdoor/mkschema-py
|
|
9
|
+
Project-URL: Issues, https://github.com/jjdoor/mkschema-py/issues
|
|
10
|
+
Keywords: json-schema,schema,json,generate,infer,ndjson,openapi,cli,codegen,validation
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Topic :: Software Development :: Code Generators
|
|
18
|
+
Classifier: Topic :: Utilities
|
|
19
|
+
Requires-Python: >=3.8
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Dynamic: license-file
|
|
23
|
+
|
|
24
|
+
# mkschema
|
|
25
|
+
|
|
26
|
+
**Generate a JSON Schema from real JSON — and feed it more than one sample.**
|
|
27
|
+
You have an API response, a config, a pile of log records, and you want a JSON
|
|
28
|
+
Schema for validation, docs, or contract tests. Hand-writing it is tedious;
|
|
29
|
+
most generators take a *single* example and over-fit it — every field marked
|
|
30
|
+
required, types pinned to whatever that one record happened to contain.
|
|
31
|
+
`mkschema` merges **many** samples: a field in *every* sample is `required`, a
|
|
32
|
+
field in only some is optional, and differing types are unioned. **Zero
|
|
33
|
+
dependencies, no network.**
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install mkschema
|
|
37
|
+
|
|
38
|
+
$ printf '{"id":1,"name":"Ada","age":30}\n{"id":2,"age":30.5}\n' | mkschema --ndjson -
|
|
39
|
+
|
|
40
|
+
{
|
|
41
|
+
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
|
42
|
+
"type": "object",
|
|
43
|
+
"properties": {
|
|
44
|
+
"age": { "type": "number" }, // 30 and 30.5 unioned to number
|
|
45
|
+
"id": { "type": "integer" },
|
|
46
|
+
"name": { "type": "string" }
|
|
47
|
+
},
|
|
48
|
+
"required": ["age", "id"] // name was missing from the 2nd sample, so it's optional
|
|
49
|
+
}
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
> This is the Python build. A behavior-equivalent Node build is on npm:
|
|
53
|
+
> `npx mkschema` (<https://github.com/jjdoor/mkschema>).
|
|
54
|
+
|
|
55
|
+
## Usage
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
mkschema sample.json # infer from one file
|
|
59
|
+
mkschema a.json b.json c.json # merge several samples into one schema
|
|
60
|
+
mkschema --ndjson records.ndjson # one sample per line (logs, exports)
|
|
61
|
+
cat response.json | mkschema - # read a JSON value from stdin
|
|
62
|
+
mkschema users.json --title User --id https://ex.com/user.schema.json
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Schema goes to **stdout**, so redirect it: `mkschema data.json > schema.json`.
|
|
66
|
+
|
|
67
|
+
## What it infers
|
|
68
|
+
|
|
69
|
+
- **Types** — `null`, `boolean`, `integer`, `number`, `string`, `array`,
|
|
70
|
+
`object`. Numbers are classified by value, so `5.0` is an `integer` and the
|
|
71
|
+
Python and Node builds agree.
|
|
72
|
+
- **String `format`** — `date-time`, `date`, `email`, `uuid`, `ipv4`, `uri`
|
|
73
|
+
(kept only when *all* samples of a field agree).
|
|
74
|
+
- **`required`** — the intersection across samples: a key present in every
|
|
75
|
+
sample. (One sample ⇒ everything required.)
|
|
76
|
+
- **Arrays** — `items` is the merge of all element schemas, so `[1, "x"]`
|
|
77
|
+
becomes `{ "type": ["integer", "string"] }`.
|
|
78
|
+
- **Unions** — a field that is an integer in one sample and a float in another
|
|
79
|
+
becomes `number`; genuinely different types become a sorted `type` array.
|
|
80
|
+
|
|
81
|
+
## Options
|
|
82
|
+
|
|
83
|
+
| Flag | Effect |
|
|
84
|
+
|------|--------|
|
|
85
|
+
| `--ndjson <src>` | Treat each line of `<src>` (a file, or `-` for stdin) as a separate sample |
|
|
86
|
+
| `--title <name>` | Set the schema `title` |
|
|
87
|
+
| `--id <uri>` | Set `$id` |
|
|
88
|
+
| `-` | Read one JSON value from stdin |
|
|
89
|
+
| `-v`, `--version` · `-h`, `--help` | Show the version · show the help text |
|
|
90
|
+
|
|
91
|
+
## Notes
|
|
92
|
+
|
|
93
|
+
- **Output is draft 2020-12** JSON Schema, deterministic (properties and
|
|
94
|
+
`required` are sorted) so it diffs cleanly in version control.
|
|
95
|
+
- **Same tool, two builds.** A behavior-equivalent Node build is on npm
|
|
96
|
+
(`npx mkschema`); use whichever your stack has.
|
|
97
|
+
- It infers structure, not constraints — add your own `minLength`, `enum`,
|
|
98
|
+
`pattern`, etc. afterward. mkschema gives you the scaffold from real data.
|
|
99
|
+
|
|
100
|
+
## Exit codes
|
|
101
|
+
|
|
102
|
+
| Code | Meaning |
|
|
103
|
+
|------|---------|
|
|
104
|
+
| `0` | schema written |
|
|
105
|
+
| `2` | error (no input, invalid JSON, unreadable file) |
|
|
106
|
+
|
|
107
|
+
## License
|
|
108
|
+
|
|
109
|
+
MIT
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/mkschema/__init__.py
|
|
5
|
+
src/mkschema/__main__.py
|
|
6
|
+
src/mkschema/cli.py
|
|
7
|
+
src/mkschema/core.py
|
|
8
|
+
src/mkschema.egg-info/PKG-INFO
|
|
9
|
+
src/mkschema.egg-info/SOURCES.txt
|
|
10
|
+
src/mkschema.egg-info/dependency_links.txt
|
|
11
|
+
src/mkschema.egg-info/entry_points.txt
|
|
12
|
+
src/mkschema.egg-info/top_level.txt
|
|
13
|
+
tests/test_core.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
mkschema
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from mkschema.core import infer_type, infer_format, infer, merge, merge_all, generate
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def test_infer_type_by_value():
    """Each kind of JSON value maps to its schema type name."""
    expectations = [
        (None, "null"),
        (True, "boolean"),
        (5, "integer"),
        (5.0, "integer"),  # value-based, so Node & Python agree
        (5.5, "number"),
        ("x", "string"),
        ([], "array"),
        ({}, "object"),
    ]
    for value, expected in expectations:
        assert infer_type(value) == expected
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def test_infer_format():
    """One representative string per recognised format, plus a plain string."""
    recognised = {
        "2026-06-19T08:00:00Z": "date-time",
        "2026-06-19": "date",
        "a@b.co": "email",
        "550e8400-e29b-41d4-a716-446655440000": "uuid",
        "10.0.0.1": "ipv4",
        "https://x.io/y": "uri",
    }
    for text, expected in recognised.items():
        assert infer_format(text) == expected
    assert infer_format("just a string") is None
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_infer_format_uri_userinfo_and_datetime_anchor():
    """URIs with userinfo stay URIs, and date-time is anchored at both ends."""
    # uri is tried before email, so an "@" inside a URI does not make it an email
    for uri in ("https://user@github.com/repo", "https://user:pw@host.com/p"):
        assert infer_format(uri) == "uri"
    # plain email unaffected
    assert infer_format("a@b.co") == "email"
    # fractional seconds + offset
    assert infer_format("2026-06-19T08:00:00.5+02:00") == "date-time"
    # anchored: trailing junk rejected
    assert infer_format("2026-06-19T08:00:00 then junk") is None
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def test_infer_object_required_sorted_recurse():
    """An object node recurses into its values and lists every key as required, sorted."""
    expected = {
        "type": "object",
        "properties": {"a": {"type": "string"}, "b": {"type": "integer"}},
        "required": ["a", "b"],
    }
    actual = infer({"b": 1, "a": "x"})
    assert actual == expected
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_infer_array_items_merged():
    """Array "items" is the merge of the element nodes; an empty array has none."""
    cases = [
        ([1, 2, 3], {"type": "array", "items": {"type": "integer"}}),
        ([1, "x"], {"type": "array", "items": {"type": ["integer", "string"]}}),
        ([], {"type": "array"}),
    ]
    for value, expected in cases:
        assert infer(value) == expected
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_infer_string_format():
    """A string that matches a known format carries it on its node."""
    node = infer("2026-06-19")
    assert node == {"type": "string", "format": "date"}
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def test_merge_integer_number_collapses():
    """integer merged with number collapses to plain number."""
    whole, real = {"type": "integer"}, {"type": "number"}
    assert merge(whole, real) == {"type": "number"}
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_merge_mixed_types_to_sorted_array():
    """Different types merge into a sorted "type" list with nothing else."""
    text_node = {"type": "string"}
    object_node = {"type": "object", "properties": {}}
    merged = merge(text_node, object_node)
    assert merged == {"type": ["object", "string"]}
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_merge_objects_union_props_intersect_required():
    """Merged objects keep every property but require only the shared keys."""
    merged = merge(
        infer({"id": 1, "name": "x"}),
        infer({"id": 2, "email": "a@b.co"}),
    )
    property_names = sorted(merged["properties"].keys())
    assert property_names == ["email", "id", "name"]
    assert merged["required"] == ["id"]
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def test_merge_string_format_only_when_agree():
    """A string "format" survives a merge only when both sides carry the same one."""
    date = {"type": "string", "format": "date"}
    email = {"type": "string", "format": "email"}
    plain = {"type": "string"}

    assert merge(dict(date), dict(date)) == {"type": "string", "format": "date"}
    assert merge(dict(date), email) == {"type": "string"}
    assert merge(dict(date), plain) == {"type": "string"}
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def test_merge_all_required_across_all_samples():
    """A key missing from any one sample drops out of "required" but stays a property."""
    sample_values = [{"a": 1, "b": 2}, {"a": 1, "b": 2}, {"a": 1}]
    merged = merge_all([infer(value) for value in sample_values])
    assert merged["required"] == ["a"]
    assert sorted(merged["properties"].keys()) == ["a", "b"]
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def test_generate_wraps_and_orders():
    """generate() puts $schema, $id and title ahead of the inferred node's keys."""
    schema = generate([{"n": 1}], title="Thing", id="https://x/s.json")

    expected_order = ["$schema", "$id", "title", "type", "properties", "required"]
    assert list(schema.keys()) == expected_order

    assert schema["$schema"] == "https://json-schema.org/draft/2020-12/schema"
    assert schema["title"] == "Thing"
    assert schema["$id"] == "https://x/s.json"
    assert schema["properties"] == {"n": {"type": "integer"}}
|