structurize 2.16.6__py3-none-any.whl → 2.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avrotize/__init__.py +1 -0
- avrotize/_version.py +3 -3
- avrotize/avrotocsharp.py +74 -10
- avrotize/avrotojava.py +1130 -51
- avrotize/avrotopython.py +4 -2
- avrotize/commands.json +671 -53
- avrotize/common.py +6 -1
- avrotize/jsonstoavro.py +518 -49
- avrotize/structuretocpp.py +697 -0
- avrotize/structuretocsv.py +365 -0
- avrotize/structuretodatapackage.py +659 -0
- avrotize/structuretodb.py +1125 -0
- avrotize/structuretogo.py +720 -0
- avrotize/structuretographql.py +502 -0
- avrotize/structuretoiceberg.py +355 -0
- avrotize/structuretojava.py +853 -0
- avrotize/structuretokusto.py +639 -0
- avrotize/structuretomd.py +322 -0
- avrotize/structuretoproto.py +764 -0
- avrotize/structuretorust.py +714 -0
- avrotize/structuretoxsd.py +679 -0
- {structurize-2.16.6.dist-info → structurize-2.17.0.dist-info}/METADATA +1 -1
- {structurize-2.16.6.dist-info → structurize-2.17.0.dist-info}/RECORD +27 -14
- {structurize-2.16.6.dist-info → structurize-2.17.0.dist-info}/WHEEL +0 -0
- {structurize-2.16.6.dist-info → structurize-2.17.0.dist-info}/entry_points.txt +0 -0
- {structurize-2.16.6.dist-info → structurize-2.17.0.dist-info}/licenses/LICENSE +0 -0
- {structurize-2.16.6.dist-info → structurize-2.17.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,659 @@
|
|
|
1
|
+
"""Convert JSON Structure schemas to Data Package format."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import sys
|
|
5
|
+
from typing import Dict, List, Optional, Set, cast
|
|
6
|
+
from datapackage import Package
|
|
7
|
+
|
|
8
|
+
from avrotize.common import get_longest_namespace_prefix
|
|
9
|
+
|
|
10
|
+
# Recursive type alias for any JSON value: object, array, scalar, or null.
# NOTE: uses PEP 604 `|` union syntax, so this module requires Python 3.10+.
JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | bool | int | None
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class StructureToDataPackageConverter:
    """Converts JSON Structure schemas into Frictionless Data Package descriptors."""

    def __init__(self) -> None:
        # $id URI -> schema dict; lets resolve_ref() handle cross-document $refs.
        self.schema_registry: Dict[str, Dict] = {}
        # "namespace.name" -> schema dict; lets type conversion expand named types.
        self.named_type_cache: Dict[str, JsonNode] = {}
|
|
19
|
+
|
|
20
|
+
def get_fullname(self, namespace: str, name: str) -> str:
|
|
21
|
+
"""Get the full name of a record type."""
|
|
22
|
+
return f"{namespace}.{name}" if namespace else name
|
|
23
|
+
|
|
24
|
+
def resolve_ref(self, ref: str, context_schema: Optional[Dict] = None) -> Optional[Dict]:
|
|
25
|
+
"""Resolves a $ref to the actual schema definition."""
|
|
26
|
+
# Check if it's an absolute URI reference (schema with $id)
|
|
27
|
+
if not ref.startswith('#/'):
|
|
28
|
+
if ref in self.schema_registry:
|
|
29
|
+
return self.schema_registry[ref]
|
|
30
|
+
return None
|
|
31
|
+
|
|
32
|
+
# Handle fragment-only references (internal to document)
|
|
33
|
+
path = ref[2:].split('/')
|
|
34
|
+
schema = context_schema if context_schema else None
|
|
35
|
+
|
|
36
|
+
if schema is None:
|
|
37
|
+
return None
|
|
38
|
+
|
|
39
|
+
for part in path:
|
|
40
|
+
if not isinstance(schema, dict) or part not in schema:
|
|
41
|
+
return None
|
|
42
|
+
schema = schema[part]
|
|
43
|
+
|
|
44
|
+
return cast(Dict, schema)
|
|
45
|
+
|
|
46
|
+
def register_schema_ids(self, schema: Dict, base_uri: str = '') -> None:
|
|
47
|
+
"""Recursively registers schemas with $id keywords."""
|
|
48
|
+
if not isinstance(schema, dict):
|
|
49
|
+
return
|
|
50
|
+
|
|
51
|
+
# Register this schema if it has an $id
|
|
52
|
+
if '$id' in schema:
|
|
53
|
+
schema_id = schema['$id']
|
|
54
|
+
# Handle relative URIs
|
|
55
|
+
if base_uri and not schema_id.startswith(('http://', 'https://', 'urn:')):
|
|
56
|
+
from urllib.parse import urljoin
|
|
57
|
+
schema_id = urljoin(base_uri, schema_id)
|
|
58
|
+
self.schema_registry[schema_id] = schema
|
|
59
|
+
base_uri = schema_id
|
|
60
|
+
|
|
61
|
+
# Recursively process definitions
|
|
62
|
+
if 'definitions' in schema:
|
|
63
|
+
for def_name, def_schema in schema['definitions'].items():
|
|
64
|
+
if isinstance(def_schema, dict):
|
|
65
|
+
self.register_schema_ids(def_schema, base_uri)
|
|
66
|
+
|
|
67
|
+
# Recursively process properties
|
|
68
|
+
if 'properties' in schema:
|
|
69
|
+
for prop_name, prop_schema in schema['properties'].items():
|
|
70
|
+
if isinstance(prop_schema, dict):
|
|
71
|
+
self.register_schema_ids(prop_schema, base_uri)
|
|
72
|
+
|
|
73
|
+
# Recursively process items, values, etc.
|
|
74
|
+
for key in ['items', 'values', 'additionalProperties']:
|
|
75
|
+
if key in schema and isinstance(schema[key], dict):
|
|
76
|
+
self.register_schema_ids(schema[key], base_uri)
|
|
77
|
+
|
|
78
|
+
    def convert_structure_to_datapackage(self, structure_schema_path: str,
                                         structure_record_type: Optional[str],
                                         datapackage_path: str) -> None:
        """Convert a JSON Structure schema file to a Data Package descriptor file.

        Args:
            structure_schema_path: Path to the schema file; may contain a
                single schema object or a list of schema objects.
            structure_record_type: Optional record name (plain or
                namespace-qualified) selecting one entry when the file holds
                a list of schemas.
            datapackage_path: Output path for the generated descriptor JSON.

        Exits the process with status 1 when the requested record type is not
        found, or when the document is neither an object nor a list.
        """
        with open(structure_schema_path, "r", encoding="utf-8") as f:
            schema_json = f.read()

        # Parse the schema as a JSON object
        schema = json.loads(schema_json)

        # Register schema IDs for cross-references
        if isinstance(schema, dict):
            self.register_schema_ids(schema)
        elif isinstance(schema, list):
            for s in schema:
                if isinstance(s, dict):
                    self.register_schema_ids(s)

        # Populate named_type_cache so type conversion can expand named types.
        self.cache_named_types(schema)

        # Handle list of schemas or single schema
        if isinstance(schema, list):
            if structure_record_type:
                # Match on bare name or "namespace.name".
                schema = next(
                    (x for x in schema
                     if isinstance(x, dict) and
                     (x.get("name") == structure_record_type or
                      str(x.get("namespace", "")) + "." + str(x.get("name", "")) == structure_record_type)),
                    None)
                if schema is None:
                    print(f"No top-level record type {structure_record_type} found in the JSON Structure schema")
                    sys.exit(1)
            schemas_to_convert = schema if isinstance(schema, list) else [schema]
        elif isinstance(schema, dict):
            # Single schema - convert it to a list
            if 'type' in schema and schema['type'] == 'object':
                schemas_to_convert = [schema]
            elif 'definitions' in schema or '$defs' in schema:
                # Schema with definitions/$defs - extract object types
                # NOTE(review): 'definitions' wins when both keys are present.
                schemas_to_convert = []
                definitions = schema.get('definitions', schema.get('$defs', {}))
                self._extract_object_schemas(definitions, schemas_to_convert)
                # Also include root if it's an object
                if schema.get('type') == 'object':
                    schemas_to_convert.insert(0, schema)
            else:
                schemas_to_convert = [schema]
        else:
            print("Expected a single JSON Structure schema as a JSON object, or a list of schema records")
            sys.exit(1)

        # Calculate longest namespace prefix
        longest_namespace_prefix = self._get_longest_namespace_prefix(schemas_to_convert)
        self.create_datapackage_for_schemas(schemas_to_convert, datapackage_path, longest_namespace_prefix)
|
|
132
|
+
|
|
133
|
+
def _extract_object_schemas(self, definitions: Dict, schemas_to_convert: List[Dict]) -> None:
|
|
134
|
+
"""Extract object type schemas from definitions recursively."""
|
|
135
|
+
for name, definition in definitions.items():
|
|
136
|
+
if isinstance(definition, dict):
|
|
137
|
+
if definition.get('type') == 'object':
|
|
138
|
+
# Add name if not present
|
|
139
|
+
if 'name' not in definition:
|
|
140
|
+
definition['name'] = name
|
|
141
|
+
schemas_to_convert.append(definition)
|
|
142
|
+
elif 'definitions' in definition:
|
|
143
|
+
# Nested definitions
|
|
144
|
+
self._extract_object_schemas(definition['definitions'], schemas_to_convert)
|
|
145
|
+
|
|
146
|
+
def _get_longest_namespace_prefix(self, schemas: List[Dict]) -> str:
|
|
147
|
+
"""Calculate the longest common namespace prefix from schemas."""
|
|
148
|
+
if not schemas:
|
|
149
|
+
return ""
|
|
150
|
+
|
|
151
|
+
namespaces = []
|
|
152
|
+
for schema in schemas:
|
|
153
|
+
if isinstance(schema, dict):
|
|
154
|
+
ns = schema.get('namespace', '')
|
|
155
|
+
if ns:
|
|
156
|
+
namespaces.append(ns)
|
|
157
|
+
|
|
158
|
+
if not namespaces:
|
|
159
|
+
return ""
|
|
160
|
+
|
|
161
|
+
# Find common prefix
|
|
162
|
+
if len(namespaces) == 1:
|
|
163
|
+
return namespaces[0]
|
|
164
|
+
|
|
165
|
+
# Split by dots and find common parts
|
|
166
|
+
parts_lists = [ns.split('.') for ns in namespaces]
|
|
167
|
+
common_parts = []
|
|
168
|
+
|
|
169
|
+
for i in range(min(len(p) for p in parts_lists)):
|
|
170
|
+
part = parts_lists[0][i]
|
|
171
|
+
if all(p[i] == part for p in parts_lists):
|
|
172
|
+
common_parts.append(part)
|
|
173
|
+
else:
|
|
174
|
+
break
|
|
175
|
+
|
|
176
|
+
return '.'.join(common_parts)
|
|
177
|
+
|
|
178
|
+
    def create_datapackage_for_schemas(self, schemas: List[Dict],
                                       datapackage_path: str,
                                       namespace_prefix: str) -> None:
        """Create a Data Package descriptor file from object-typed schemas.

        Args:
            schemas: JSON Structure schemas; non-dict entries and entries whose
                ``type`` is not ``object`` are skipped.
            datapackage_path: Output path for the descriptor JSON.
            namespace_prefix: Common namespace prefix stripped from each
                schema's namespace when deriving resource (table) names; also
                used as the package name (falls back to "datapackage").

        NOTE(review): relies on the third-party ``datapackage.Package`` API
        (``add_resource`` / ``commit`` / ``descriptor``) — confirm against the
        installed library version.
        """
        package = Package()
        data_package_resources = []

        for schema in schemas:
            if not isinstance(schema, dict):
                continue

            # Skip non-object types
            if schema.get('type') != 'object':
                continue

            name = str(schema.get("name", "UnnamedTable"))
            namespace = str(schema.get("namespace", ""))

            # Remove common namespace prefix
            if namespace.startswith(namespace_prefix):
                namespace = namespace[len(namespace_prefix):].strip(".")

            table_name = f"{namespace}_{name}" if namespace else name
            # Data Package resource names must be lowercase with hyphens/underscores only
            table_name = table_name.lower().replace(" ", "-")
            properties = schema.get("properties", {})

            # Create the Data Package schema
            resource_schema: Dict[str, List[JsonNode]] = {
                "fields": []
            }

            for prop_name, prop_schema in properties.items():
                column_name = prop_name
                column_type = self.convert_structure_type_to_datapackage_type(prop_schema, schema)
                field_schema = {"name": column_name, "type": column_type}

                # Add description from doc or description (only if prop_schema is a dict)
                if isinstance(prop_schema, dict):
                    # Handle title
                    if "title" in prop_schema:
                        field_schema["title"] = prop_schema["title"]

                    # 'description' takes precedence over the Avro-style 'doc'.
                    if "description" in prop_schema:
                        field_schema["description"] = prop_schema["description"]
                    elif "doc" in prop_schema:
                        field_schema["description"] = prop_schema["doc"]

                    # Handle $comment (add to description)
                    if "$comment" in prop_schema:
                        comment = prop_schema["$comment"]
                        if "description" in field_schema:
                            field_schema["description"] += f" [Comment: {comment}]"
                        else:
                            field_schema["description"] = f"[Comment: {comment}]"

                    # Handle examples
                    if "examples" in prop_schema:
                        field_schema["examples"] = prop_schema["examples"]

                    # Handle default values
                    if "default" in prop_schema:
                        field_schema["default"] = prop_schema["default"]

                    # Handle const (fixed value) — modeled as a single-member enum.
                    if "const" in prop_schema:
                        if 'constraints' not in field_schema:
                            field_schema['constraints'] = {}
                        field_schema['constraints']['enum'] = [prop_schema["const"]]
                        if "description" in field_schema:
                            field_schema["description"] += f" (constant value)"
                        else:
                            field_schema["description"] = "Constant value"

                    # Handle readOnly/writeOnly — recorded in the description only.
                    if "readOnly" in prop_schema and prop_schema["readOnly"]:
                        if "description" in field_schema:
                            field_schema["description"] += " (read-only)"
                        else:
                            field_schema["description"] = "Read-only field"

                    if "writeOnly" in prop_schema and prop_schema["writeOnly"]:
                        if "description" in field_schema:
                            field_schema["description"] += " (write-only)"
                        else:
                            field_schema["description"] = "Write-only field"

                    # Handle deprecated
                    if "deprecated" in prop_schema and prop_schema["deprecated"]:
                        if "description" in field_schema:
                            field_schema["description"] += " (DEPRECATED)"
                        else:
                            field_schema["description"] = "DEPRECATED"

                    # Add format constraints if applicable
                    self._add_field_constraints(field_schema, prop_schema)

                resource_schema["fields"].append(field_schema)

            resource = {
                "name": table_name,
                "data": [],  # Empty data array for schema-only package
                "schema": resource_schema
            }

            # Add resource title if available
            if "title" in schema:
                resource["title"] = schema["title"]

            # Add resource description if available
            if "description" in schema:
                resource["description"] = schema["description"]
            elif "doc" in schema:
                resource["description"] = schema["doc"]

            # Handle abstract types
            if schema.get("abstract", False):
                if "description" in resource:
                    resource["description"] += " (Abstract type - cannot be instantiated directly)"
                else:
                    resource["description"] = "Abstract type - cannot be instantiated directly"

            # Handle $extends (inheritance)
            if "$extends" in schema:
                extends_ref = schema["$extends"]
                if "description" in resource:
                    resource["description"] += f" (Extends: {extends_ref})"
                else:
                    resource["description"] = f"Extends: {extends_ref}"

            # Handle $offers (add-in system)
            if "$offers" in schema:
                offers = schema["$offers"]
                if "description" in resource:
                    resource["description"] += f" (Offers: {', '.join(offers.keys()) if isinstance(offers, dict) else str(offers)})"
                else:
                    resource["description"] = f"Offers: {', '.join(offers.keys()) if isinstance(offers, dict) else str(offers)}"

            # Handle $uses (add-in system)
            if "$uses" in schema:
                uses = schema["$uses"]
                uses_str = ', '.join(uses) if isinstance(uses, list) else str(uses)
                if "description" in resource:
                    resource["description"] += f" (Uses add-ins: {uses_str})"
                else:
                    resource["description"] = f"Uses add-ins: {uses_str}"

            # Handle deprecated at schema level
            if schema.get("deprecated", False):
                if "description" in resource:
                    resource["description"] += " (DEPRECATED)"
                else:
                    resource["description"] = "DEPRECATED"

            data_package_resources.append(resource)

        # Add resources to the Data Package
        for resource in data_package_resources:
            package.add_resource(resource)

        # Save the Data Package
        package.descriptor["name"] = namespace_prefix if namespace_prefix else "datapackage"
        package.commit()

        with open(datapackage_path, "w", encoding="utf-8") as f:
            f.write(json.dumps(package.descriptor, indent=2))
|
|
344
|
+
|
|
345
|
+
def _add_field_constraints(self, field_schema: Dict, prop_schema: Dict) -> None:
|
|
346
|
+
"""Add Data Package field constraints from JSON Structure annotations."""
|
|
347
|
+
# Add format for specific types
|
|
348
|
+
prop_type = prop_schema.get('type', '')
|
|
349
|
+
|
|
350
|
+
# Date/time formats
|
|
351
|
+
if prop_type == 'date':
|
|
352
|
+
field_schema['format'] = 'date'
|
|
353
|
+
elif prop_type == 'datetime' or prop_type == 'timestamp':
|
|
354
|
+
field_schema['format'] = 'datetime'
|
|
355
|
+
elif prop_type == 'time':
|
|
356
|
+
field_schema['format'] = 'time'
|
|
357
|
+
elif prop_type == 'duration':
|
|
358
|
+
field_schema['format'] = 'duration'
|
|
359
|
+
elif prop_type == 'uri':
|
|
360
|
+
field_schema['format'] = 'uri'
|
|
361
|
+
elif prop_type == 'uuid':
|
|
362
|
+
field_schema['format'] = 'uuid'
|
|
363
|
+
elif prop_type == 'binary':
|
|
364
|
+
field_schema['format'] = 'binary'
|
|
365
|
+
|
|
366
|
+
# Handle format keyword for additional string formats
|
|
367
|
+
if 'format' in prop_schema:
|
|
368
|
+
field_schema['format'] = prop_schema['format']
|
|
369
|
+
|
|
370
|
+
# String constraints
|
|
371
|
+
if 'maxLength' in prop_schema:
|
|
372
|
+
if 'constraints' not in field_schema:
|
|
373
|
+
field_schema['constraints'] = {}
|
|
374
|
+
field_schema['constraints']['maxLength'] = prop_schema['maxLength']
|
|
375
|
+
|
|
376
|
+
if 'minLength' in prop_schema:
|
|
377
|
+
if 'constraints' not in field_schema:
|
|
378
|
+
field_schema['constraints'] = {}
|
|
379
|
+
field_schema['constraints']['minLength'] = prop_schema['minLength']
|
|
380
|
+
|
|
381
|
+
if 'pattern' in prop_schema:
|
|
382
|
+
if 'constraints' not in field_schema:
|
|
383
|
+
field_schema['constraints'] = {}
|
|
384
|
+
field_schema['constraints']['pattern'] = prop_schema['pattern']
|
|
385
|
+
|
|
386
|
+
# Numeric constraints
|
|
387
|
+
if 'minimum' in prop_schema:
|
|
388
|
+
if 'constraints' not in field_schema:
|
|
389
|
+
field_schema['constraints'] = {}
|
|
390
|
+
field_schema['constraints']['minimum'] = prop_schema['minimum']
|
|
391
|
+
|
|
392
|
+
if 'maximum' in prop_schema:
|
|
393
|
+
if 'constraints' not in field_schema:
|
|
394
|
+
field_schema['constraints'] = {}
|
|
395
|
+
field_schema['constraints']['maximum'] = prop_schema['maximum']
|
|
396
|
+
|
|
397
|
+
if 'exclusiveMinimum' in prop_schema:
|
|
398
|
+
if 'constraints' not in field_schema:
|
|
399
|
+
field_schema['constraints'] = {}
|
|
400
|
+
# Data Package doesn't have exclusiveMinimum, so we document it
|
|
401
|
+
field_schema['constraints']['minimum'] = prop_schema['exclusiveMinimum']
|
|
402
|
+
if 'description' in field_schema:
|
|
403
|
+
field_schema['description'] += f" (exclusive minimum: {prop_schema['exclusiveMinimum']})"
|
|
404
|
+
else:
|
|
405
|
+
field_schema['description'] = f"Exclusive minimum: {prop_schema['exclusiveMinimum']}"
|
|
406
|
+
|
|
407
|
+
if 'exclusiveMaximum' in prop_schema:
|
|
408
|
+
if 'constraints' not in field_schema:
|
|
409
|
+
field_schema['constraints'] = {}
|
|
410
|
+
# Data Package doesn't have exclusiveMaximum, so we document it
|
|
411
|
+
field_schema['constraints']['maximum'] = prop_schema['exclusiveMaximum']
|
|
412
|
+
if 'description' in field_schema:
|
|
413
|
+
field_schema['description'] += f" (exclusive maximum: {prop_schema['exclusiveMaximum']})"
|
|
414
|
+
else:
|
|
415
|
+
field_schema['description'] = f"Exclusive maximum: {prop_schema['exclusiveMaximum']}"
|
|
416
|
+
|
|
417
|
+
if 'multipleOf' in prop_schema:
|
|
418
|
+
# Data Package doesn't have multipleOf, document in description
|
|
419
|
+
multiple_of = prop_schema['multipleOf']
|
|
420
|
+
if 'description' in field_schema:
|
|
421
|
+
field_schema['description'] += f" (multiple of {multiple_of})"
|
|
422
|
+
else:
|
|
423
|
+
field_schema['description'] = f"Must be multiple of {multiple_of}"
|
|
424
|
+
|
|
425
|
+
# Decimal precision/scale
|
|
426
|
+
if 'precision' in prop_schema or 'scale' in prop_schema:
|
|
427
|
+
precision = prop_schema.get('precision')
|
|
428
|
+
scale = prop_schema.get('scale')
|
|
429
|
+
desc_parts = []
|
|
430
|
+
if precision:
|
|
431
|
+
desc_parts.append(f"precision: {precision}")
|
|
432
|
+
if scale:
|
|
433
|
+
desc_parts.append(f"scale: {scale}")
|
|
434
|
+
precision_desc = f" ({', '.join(desc_parts)})"
|
|
435
|
+
if 'description' in field_schema:
|
|
436
|
+
field_schema['description'] += precision_desc
|
|
437
|
+
else:
|
|
438
|
+
field_schema['description'] = precision_desc.strip('() ')
|
|
439
|
+
|
|
440
|
+
# Array constraints
|
|
441
|
+
if 'maxItems' in prop_schema:
|
|
442
|
+
# Data Package doesn't have maxItems, document in description
|
|
443
|
+
if 'description' in field_schema:
|
|
444
|
+
field_schema['description'] += f" (max items: {prop_schema['maxItems']})"
|
|
445
|
+
else:
|
|
446
|
+
field_schema['description'] = f"Maximum {prop_schema['maxItems']} items"
|
|
447
|
+
|
|
448
|
+
if 'minItems' in prop_schema:
|
|
449
|
+
# Data Package doesn't have minItems, document in description
|
|
450
|
+
if 'description' in field_schema:
|
|
451
|
+
field_schema['description'] += f" (min items: {prop_schema['minItems']})"
|
|
452
|
+
else:
|
|
453
|
+
field_schema['description'] = f"Minimum {prop_schema['minItems']} items"
|
|
454
|
+
|
|
455
|
+
if 'uniqueItems' in prop_schema and prop_schema['uniqueItems']:
|
|
456
|
+
if 'constraints' not in field_schema:
|
|
457
|
+
field_schema['constraints'] = {}
|
|
458
|
+
field_schema['constraints']['unique'] = True
|
|
459
|
+
|
|
460
|
+
# Enum values
|
|
461
|
+
if 'enum' in prop_schema:
|
|
462
|
+
if 'constraints' not in field_schema:
|
|
463
|
+
field_schema['constraints'] = {}
|
|
464
|
+
field_schema['constraints']['enum'] = prop_schema['enum']
|
|
465
|
+
|
|
466
|
+
# Content metadata
|
|
467
|
+
if 'contentEncoding' in prop_schema:
|
|
468
|
+
# Store as custom property
|
|
469
|
+
field_schema['contentEncoding'] = prop_schema['contentEncoding']
|
|
470
|
+
|
|
471
|
+
if 'contentMediaType' in prop_schema:
|
|
472
|
+
# Store as custom property
|
|
473
|
+
field_schema['contentMediaType'] = prop_schema['contentMediaType']
|
|
474
|
+
|
|
475
|
+
def convert_structure_type_to_datapackage_type(self, structure_type: JsonNode,
|
|
476
|
+
context_schema: Optional[Dict] = None) -> str:
|
|
477
|
+
"""Convert a JSON Structure type to a Data Package type."""
|
|
478
|
+
if isinstance(structure_type, list):
|
|
479
|
+
# Union type
|
|
480
|
+
item_count = len(structure_type)
|
|
481
|
+
if item_count == 1:
|
|
482
|
+
return self.convert_structure_type_to_datapackage_type(structure_type[0], context_schema)
|
|
483
|
+
elif item_count == 2:
|
|
484
|
+
# Check for nullable union (type + null)
|
|
485
|
+
first = structure_type[0]
|
|
486
|
+
second = structure_type[1]
|
|
487
|
+
if isinstance(first, str) and first == "null":
|
|
488
|
+
return self.convert_structure_type_to_datapackage_type(second, context_schema)
|
|
489
|
+
elif isinstance(second, str) and second == "null":
|
|
490
|
+
return self.convert_structure_type_to_datapackage_type(first, context_schema)
|
|
491
|
+
# Complex union - default to string
|
|
492
|
+
return "string"
|
|
493
|
+
elif isinstance(structure_type, dict):
|
|
494
|
+
# Handle allOf (merge all schemas)
|
|
495
|
+
if 'allOf' in structure_type:
|
|
496
|
+
# For allOf, we typically take the most specific type
|
|
497
|
+
# In Data Package context, we'll use the first concrete type
|
|
498
|
+
for sub_schema in structure_type['allOf']:
|
|
499
|
+
if isinstance(sub_schema, dict) and 'type' in sub_schema:
|
|
500
|
+
return self.convert_structure_type_to_datapackage_type(sub_schema, context_schema)
|
|
501
|
+
return "object" # Default to object for allOf
|
|
502
|
+
|
|
503
|
+
# Handle oneOf (one of the schemas must match)
|
|
504
|
+
if 'oneOf' in structure_type:
|
|
505
|
+
# For oneOf, we use string as it's the most flexible
|
|
506
|
+
# Could potentially be a union in more sophisticated implementations
|
|
507
|
+
return "string"
|
|
508
|
+
|
|
509
|
+
# Handle anyOf (any of the schemas may match)
|
|
510
|
+
if 'anyOf' in structure_type:
|
|
511
|
+
# Similar to oneOf, use string for flexibility
|
|
512
|
+
return "string"
|
|
513
|
+
|
|
514
|
+
# Handle not (negation)
|
|
515
|
+
if 'not' in structure_type:
|
|
516
|
+
# Can't directly represent negation, default to string
|
|
517
|
+
return "string"
|
|
518
|
+
|
|
519
|
+
# Handle if/then/else (conditional schemas)
|
|
520
|
+
if 'if' in structure_type:
|
|
521
|
+
# Use 'then' schema if present, else 'else' schema, else string
|
|
522
|
+
if 'then' in structure_type:
|
|
523
|
+
return self.convert_structure_type_to_datapackage_type(structure_type['then'], context_schema)
|
|
524
|
+
elif 'else' in structure_type:
|
|
525
|
+
return self.convert_structure_type_to_datapackage_type(structure_type['else'], context_schema)
|
|
526
|
+
return "string"
|
|
527
|
+
|
|
528
|
+
# Handle $ref
|
|
529
|
+
if '$ref' in structure_type:
|
|
530
|
+
ref_schema = self.resolve_ref(structure_type['$ref'], context_schema)
|
|
531
|
+
if ref_schema:
|
|
532
|
+
return self.convert_structure_type_to_datapackage_type(ref_schema, context_schema)
|
|
533
|
+
return "string"
|
|
534
|
+
|
|
535
|
+
# Handle enum
|
|
536
|
+
if 'enum' in structure_type:
|
|
537
|
+
# Enums are represented as strings with enum constraint
|
|
538
|
+
return "string"
|
|
539
|
+
|
|
540
|
+
# Get the type field
|
|
541
|
+
type_name = structure_type.get("type")
|
|
542
|
+
|
|
543
|
+
# Handle case where type itself is a dict with $ref
|
|
544
|
+
if isinstance(type_name, dict):
|
|
545
|
+
return self.convert_structure_type_to_datapackage_type(type_name, context_schema)
|
|
546
|
+
|
|
547
|
+
if type_name == "array":
|
|
548
|
+
return "array"
|
|
549
|
+
elif type_name == "set":
|
|
550
|
+
return "array" # Sets are represented as arrays in Data Package
|
|
551
|
+
elif type_name == "map":
|
|
552
|
+
return "object"
|
|
553
|
+
elif type_name == "object":
|
|
554
|
+
return "object"
|
|
555
|
+
elif type_name == "choice":
|
|
556
|
+
return "string" # Choices default to string
|
|
557
|
+
elif type_name == "tuple":
|
|
558
|
+
return "array" # Tuples are arrays with fixed structure
|
|
559
|
+
elif type_name:
|
|
560
|
+
return self.map_scalar_type(type_name)
|
|
561
|
+
else:
|
|
562
|
+
return "string"
|
|
563
|
+
elif isinstance(structure_type, str):
|
|
564
|
+
# Check named type cache
|
|
565
|
+
if structure_type in self.named_type_cache:
|
|
566
|
+
return self.convert_structure_type_to_datapackage_type(
|
|
567
|
+
self.named_type_cache[structure_type], context_schema)
|
|
568
|
+
return self.map_scalar_type(structure_type)
|
|
569
|
+
|
|
570
|
+
return "string"
|
|
571
|
+
|
|
572
|
+
def cache_named_types(self, structure_type: JsonNode) -> None:
|
|
573
|
+
"""Add an encountered type to the cache of named types."""
|
|
574
|
+
if isinstance(structure_type, list):
|
|
575
|
+
for item in structure_type:
|
|
576
|
+
self.cache_named_types(item)
|
|
577
|
+
elif isinstance(structure_type, dict):
|
|
578
|
+
# Cache this type if it has a name
|
|
579
|
+
if structure_type.get("name"):
|
|
580
|
+
full_name = self.get_fullname(
|
|
581
|
+
str(structure_type.get("namespace", "")),
|
|
582
|
+
str(structure_type.get("name")))
|
|
583
|
+
self.named_type_cache[full_name] = structure_type
|
|
584
|
+
|
|
585
|
+
# Recursively cache types in properties
|
|
586
|
+
if "properties" in structure_type:
|
|
587
|
+
for prop_name, prop_schema in structure_type["properties"].items():
|
|
588
|
+
if isinstance(prop_schema, dict):
|
|
589
|
+
self.cache_named_types(prop_schema)
|
|
590
|
+
|
|
591
|
+
# Recursively cache types in definitions
|
|
592
|
+
if "definitions" in structure_type:
|
|
593
|
+
for def_name, def_schema in structure_type["definitions"].items():
|
|
594
|
+
if isinstance(def_schema, dict):
|
|
595
|
+
self.cache_named_types(def_schema)
|
|
596
|
+
|
|
597
|
+
# Cache types in array items, map values, etc.
|
|
598
|
+
for key in ['items', 'values', 'additionalProperties']:
|
|
599
|
+
if key in structure_type and isinstance(structure_type[key], dict):
|
|
600
|
+
self.cache_named_types(structure_type[key])
|
|
601
|
+
|
|
602
|
+
def map_scalar_type(self, type_name: str) -> str:
|
|
603
|
+
"""Map a JSON Structure scalar type to a Data Package scalar type."""
|
|
604
|
+
# JSON Structure Core primitive types mapping
|
|
605
|
+
scalar_type_mapping = {
|
|
606
|
+
# JSON primitive types
|
|
607
|
+
"null": "string",
|
|
608
|
+
"boolean": "boolean",
|
|
609
|
+
"string": "string",
|
|
610
|
+
"number": "number",
|
|
611
|
+
"integer": "integer",
|
|
612
|
+
|
|
613
|
+
# Extended primitive types - integers
|
|
614
|
+
"int8": "integer",
|
|
615
|
+
"uint8": "integer",
|
|
616
|
+
"int16": "integer",
|
|
617
|
+
"uint16": "integer",
|
|
618
|
+
"int32": "integer",
|
|
619
|
+
"uint32": "integer",
|
|
620
|
+
"int64": "integer",
|
|
621
|
+
"uint64": "integer",
|
|
622
|
+
"int128": "integer",
|
|
623
|
+
"uint128": "integer",
|
|
624
|
+
|
|
625
|
+
# Extended primitive types - floats
|
|
626
|
+
"float8": "number",
|
|
627
|
+
"float": "number",
|
|
628
|
+
"double": "number",
|
|
629
|
+
"binary32": "number",
|
|
630
|
+
"binary64": "number",
|
|
631
|
+
"decimal": "number",
|
|
632
|
+
|
|
633
|
+
# Extended primitive types - other
|
|
634
|
+
"binary": "string", # Base64-encoded in JSON
|
|
635
|
+
"date": "date",
|
|
636
|
+
"time": "time",
|
|
637
|
+
"datetime": "datetime",
|
|
638
|
+
"timestamp": "datetime",
|
|
639
|
+
"duration": "duration",
|
|
640
|
+
"uuid": "string",
|
|
641
|
+
"uri": "string",
|
|
642
|
+
"jsonpointer": "string",
|
|
643
|
+
|
|
644
|
+
# Special type
|
|
645
|
+
"any": "any"
|
|
646
|
+
}
|
|
647
|
+
return scalar_type_mapping.get(type_name, "string")
|
|
648
|
+
|
|
649
|
+
|
|
650
|
+
def convert_structure_to_datapackage(structure_schema_path: str,
                                     structure_record_type: Optional[str],
                                     datapackage_path: str) -> None:
    """Module-level convenience wrapper around StructureToDataPackageConverter.

    Reads the JSON Structure schema at *structure_schema_path* and writes the
    resulting Data Package descriptor to *datapackage_path*.  When
    *structure_record_type* is given and the file holds a list of schemas,
    only the matching record type is converted.
    """
    StructureToDataPackageConverter().convert_structure_to_datapackage(
        structure_schema_path, structure_record_type, datapackage_path)
|
|
656
|
+
|
|
657
|
+
|
|
658
|
+
# Example usage:
|
|
659
|
+
# convert_structure_to_datapackage("schema.struct.json", "MyRecord", "datapackage.json")
|