structurize 2.16.2-py3-none-any.whl → 2.16.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- avrotize/__init__.py +63 -63
- avrotize/__main__.py +5 -5
- avrotize/_version.py +34 -34
- avrotize/asn1toavro.py +160 -160
- avrotize/avrotize.py +152 -152
- avrotize/avrotocpp.py +483 -483
- avrotize/avrotocsharp.py +992 -992
- avrotize/avrotocsv.py +121 -121
- avrotize/avrotodatapackage.py +173 -173
- avrotize/avrotodb.py +1383 -1383
- avrotize/avrotogo.py +476 -476
- avrotize/avrotographql.py +197 -197
- avrotize/avrotoiceberg.py +210 -210
- avrotize/avrotojava.py +1023 -1023
- avrotize/avrotojs.py +250 -250
- avrotize/avrotojsons.py +481 -481
- avrotize/avrotojstruct.py +345 -345
- avrotize/avrotokusto.py +363 -363
- avrotize/avrotomd.py +137 -137
- avrotize/avrotools.py +168 -168
- avrotize/avrotoparquet.py +208 -208
- avrotize/avrotoproto.py +358 -358
- avrotize/avrotopython.py +622 -622
- avrotize/avrotorust.py +435 -435
- avrotize/avrotots.py +598 -598
- avrotize/avrotoxsd.py +344 -344
- avrotize/commands.json +2493 -2433
- avrotize/common.py +828 -828
- avrotize/constants.py +4 -4
- avrotize/csvtoavro.py +131 -131
- avrotize/datapackagetoavro.py +76 -76
- avrotize/dependency_resolver.py +348 -348
- avrotize/jsonstoavro.py +1698 -1698
- avrotize/jsonstostructure.py +2642 -2642
- avrotize/jstructtoavro.py +878 -878
- avrotize/kstructtoavro.py +93 -93
- avrotize/kustotoavro.py +455 -455
- avrotize/parquettoavro.py +157 -157
- avrotize/proto2parser.py +497 -497
- avrotize/proto3parser.py +402 -402
- avrotize/prototoavro.py +382 -382
- avrotize/structuretocsharp.py +2005 -2005
- avrotize/structuretojsons.py +498 -498
- avrotize/structuretopython.py +772 -772
- avrotize/structuretots.py +653 -0
- avrotize/xsdtoavro.py +413 -413
- {structurize-2.16.2.dist-info → structurize-2.16.5.dist-info}/METADATA +848 -805
- structurize-2.16.5.dist-info/RECORD +52 -0
- {structurize-2.16.2.dist-info → structurize-2.16.5.dist-info}/licenses/LICENSE +200 -200
- structurize-2.16.2.dist-info/RECORD +0 -51
- {structurize-2.16.2.dist-info → structurize-2.16.5.dist-info}/WHEEL +0 -0
- {structurize-2.16.2.dist-info → structurize-2.16.5.dist-info}/entry_points.txt +0 -0
- {structurize-2.16.2.dist-info → structurize-2.16.5.dist-info}/top_level.txt +0 -0
avrotize/avrotomd.py
CHANGED
@@ -1,137 +1,137 @@
All 137 lines are removed and re-added; the old and new content shown in the diff is identical, so the file is reproduced once here:

# coding: utf-8
"""
Module to convert Avro schema to a comprehensive README.md.
"""

import json
import os
from jinja2 import Environment, FileSystemLoader

from avrotize.common import render_template

class AvroToMarkdownConverter:
    """
    Class to convert Avro schema to a comprehensive README.md.
    """

    def __init__(self, avro_schema_path, markdown_path):
        """
        Initialize the converter with file paths.

        :param avro_schema_path: Path to the Avro schema file.
        :param markdown_path: Path to save the README.md file.
        """
        self.avro_schema_path = avro_schema_path
        self.markdown_path = markdown_path
        self.records = {}
        self.enums = {}
        self.fixeds = {}

    def convert(self):
        """
        Convert Avro schema to Markdown and save to file.
        """
        with open(self.avro_schema_path, 'r', encoding='utf-8') as file:
            avro_schemas = json.load(file)

        schema_name = os.path.splitext(os.path.basename(self.avro_schema_path))[0]
        if isinstance(avro_schemas, dict):
            self.extract_named_types(avro_schemas)
        elif isinstance(avro_schemas, list):
            for schema in avro_schemas:
                self.extract_named_types(schema)

        self.generate_markdown(schema_name)

    def extract_named_types(self, schema, parent_namespace: str = ''):
        """
        Extract all named types (records, enums, fixed) from the schema.
        """

        if isinstance(schema, dict):
            ns = schema.get('namespace', parent_namespace)
            if schema['type'] == 'record':
                self.records.setdefault(ns, []).append(schema)
            elif schema['type'] == 'enum':
                self.enums.setdefault(ns, []).append(schema)
            elif schema['type'] == 'fixed':
                self.fixeds.setdefault(ns, []).append(schema)
            if 'fields' in schema:
                for field in schema['fields']:
                    self.extract_named_types(field['type'], ns)
            if 'items' in schema:
                self.extract_named_types(schema['items'], ns)
            if 'values' in schema:
                self.extract_named_types(schema['values'], ns)
        elif isinstance(schema, list):
            for sub_schema in schema:
                self.extract_named_types(sub_schema, '')

    def generate_markdown(self, schema_name: str):
        """
        Generate markdown content from the extracted types using Jinja2 template.

        :param schema_name: The name of the schema file.
        :return: Markdown content as a string.
        """
        render_template("avrotomd/README.md.jinja", self.markdown_path,
                        schema_name = schema_name,
                        records = self.records,
                        enums = self.enums,
                        fixeds = self.fixeds,
                        generate_field_markdown = self.generate_field_markdown
                        )

    def generate_field_markdown(self, field, level):
        """
        Generate markdown content for a single field.

        :param field: Avro field as a dictionary.
        :param level: The current level of nesting.
        :return: Markdown content as a string.
        """
        field_md = []
        heading = "#" * level
        field_md.append(f"{heading} {field['name']}\n")
        field_md.append(f"- **Type:** {self.get_avro_type(field['type'])}")
        if 'doc' in field:
            field_md.append(f"- **Description:** {field['doc']}")
        if 'default' in field:
            field_md.append(f"- **Default:** {field['default']}")
        if isinstance(field['type'], dict) and field['type'].get('logicalType'):
            field_md.append(f"- **Logical Type:** {field['type']['logicalType']}")
        if 'symbols' in field.get('type', {}):
            field_md.append(f"- **Symbols:** {', '.join(field['type']['symbols'])}")
        field_md.append("\n")
        return "\n".join(field_md)

    def get_avro_type(self, avro_type):
        """
        Get Avro type as a string.

        :param avro_type: Avro type as a string or dictionary.
        :return: Avro type as a string.
        """
        if isinstance(avro_type, list):
            return " | ".join([self.get_avro_type(t) for t in avro_type])
        if isinstance(avro_type, dict):
            type_name = avro_type.get('type', 'unknown')
            namespace = avro_type.get('namespace', '')
            if type_name in [r['name'] for r in self.records.get(namespace, [])]:
                return f"[{type_name}](#record-{namespace.lower()}-{type_name.lower()})"
            if type_name in [e['name'] for e in self.enums.get(namespace, [])]:
                return f"[{type_name}](#enum-{namespace.lower()}-{type_name.lower()})"
            if type_name in [f['name'] for f in self.fixeds.get(namespace, [])]:
                return f"[{type_name}](#fixed-{namespace.lower()}-{type_name.lower()})"
            return type_name
        return avro_type

def convert_avro_to_markdown(avro_schema_path, markdown_path):
    """
    Convert an Avro schema file to a README.md file.

    :param avro_schema_path: Path to the Avro schema file.
    :param markdown_path: Path to save the README.md file.
    """
    converter = AvroToMarkdownConverter(avro_schema_path, markdown_path)
    converter.convert()
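
For context, a minimal usage sketch of the converter defined above; the file paths are placeholders for illustration and do not come from this diff:

from avrotize.avrotomd import convert_avro_to_markdown

# Render a README.md describing the named types found in an Avro schema file.
# "schemas/order.avsc" and "docs/README.md" are illustrative paths only.
convert_avro_to_markdown("schemas/order.avsc", "docs/README.md")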
avrotize/avrotools.py
CHANGED
@@ -1,168 +1,168 @@
All 168 lines are removed and re-added; the old and new content shown in the diff is identical, so the file is reproduced once here:

""" Avro Tools Module """

import json
import hashlib
import base64
from typing import Dict, List, cast

JsonNode = Dict[str, 'JsonNode'] | List['JsonNode'] | str | int | bool | None

def transform_to_pcf(schema_json: str) -> str:
    """
    Transforms an Avro schema into its Parsing Canonical Form (PCF).

    :param schema_json: The Avro schema as a JSON string.
    :return: The Parsing Canonical Form (PCF) as a JSON string.
    """
    schema = json.loads(schema_json)
    canonical_schema = canonicalize_schema(schema)
    return json.dumps(canonical_schema, separators=(',', ':'))

def avsc_to_pcf(schema_file: str) -> None:
    """ Convert an Avro schema file to its Parsing Canonical Form (PCF)."""
    with open(schema_file, 'r', encoding='utf-8') as file:
        schema = json.load(file)
    print(transform_to_pcf(json.dumps(schema)))

def canonicalize_schema(schema: JsonNode, namespace:str="") -> JsonNode:
    """
    Recursively processes the schema to convert it to the Parsing Canonical Form (PCF).

    :param schema: The Avro schema as a dictionary.
    :param namespace: The current namespace for resolving names.
    :return: The canonicalized schema as a dictionary.
    """
    if isinstance(schema, str):
        return schema
    elif isinstance(schema, dict):
        if 'type' in schema and isinstance(schema['type'], str):
            if schema['type'] in PRIMITIVE_TYPES:
                return schema['type']
            if '.' not in schema['type'] and namespace:
                schema['type'] = namespace + '.' + schema['type']

        if 'name' in schema and '.' not in cast(str,schema['name']) and namespace:
            schema['name'] = namespace + '.' + cast(str,schema['name'])

        canonical = {}
        for field in FIELD_ORDER:
            if field in schema:
                value = schema[field]
                if field == 'fields' and isinstance(value, list):
                    value = [canonicalize_schema(f, cast(str,schema.get('namespace', namespace))) for f in value]
                elif field == 'symbols' or field == 'items' or field == 'values':
                    value = canonicalize_schema(value, namespace)
                elif isinstance(value, dict):
                    value = canonicalize_schema(value, namespace)
                elif isinstance(value, list):
                    value = [canonicalize_schema(v, namespace) for v in value]
                elif isinstance(value, str):
                    value = normalize_string(value)
                elif isinstance(value, int):
                    value = normalize_integer(value)
                canonical[field] = value
        return canonical
    elif isinstance(schema, list):
        return [canonicalize_schema(s, namespace) for s in schema]
    raise ValueError("Invalid schema: " + str(schema))

def normalize_string(value):
    """
    Normalizes JSON string literals by replacing escaped characters with their UTF-8 equivalents.

    :param value: The string value to normalize.
    :return: The normalized string.
    """
    return value.encode('utf-8').decode('unicode_escape')

def normalize_integer(value):
    """
    Normalizes JSON integer literals by removing leading zeros.

    :param value: The integer value to normalize.
    :return: The normalized integer.
    """
    return int(value)

def fingerprint_sha256(schema_json):
    """
    Generates a SHA-256 fingerprint for the given Avro schema.

    :param schema_json: The Avro schema as a JSON string.
    :return: The SHA-256 fingerprint as a base64 string.
    """
    pcf = transform_to_pcf(schema_json)
    sha256_hash = hashlib.sha256(pcf.encode('utf-8')).digest()
    return base64.b64encode(sha256_hash).decode('utf-8')

def fingerprint_md5(schema_json):
    """
    Generates an MD5 fingerprint for the given Avro schema.

    :param schema_json: The Avro schema as a JSON string.
    :return: The MD5 fingerprint as a base64 string.
    """
    pcf = transform_to_pcf(schema_json)
    md5_hash = hashlib.md5(pcf.encode('utf-8')).digest()
    return base64.b64encode(md5_hash).decode('utf-8')

def fingerprint_rabin(schema_json):
    """
    Generates a 64-bit Rabin fingerprint for the given Avro schema.

    :param schema_json: The Avro schema as a JSON string.
    :return: The Rabin fingerprint as a base64 string.
    """
    pcf = transform_to_pcf(schema_json).encode('utf-8')
    fp = fingerprint64(pcf)
    return base64.b64encode(fp.to_bytes(8, 'big')).decode('utf-8')

def fingerprint64(buf):
    """
    Computes a 64-bit Rabin fingerprint.

    :param buf: The input byte buffer.
    :return: The 64-bit Rabin fingerprint.
    """
    if FP_TABLE is None:
        init_fp_table()
    fp = EMPTY
    for byte in buf:
        fp = (fp >> 8) ^ FP_TABLE[(fp ^ byte) & 0xff]
    return fp

def init_fp_table():
    """
    Initializes the fingerprint table for the Rabin fingerprint algorithm.
    """
    global FP_TABLE
    FP_TABLE = []
    for i in range(256):
        fp = i
        for _ in range(8):
            fp = (fp >> 1) ^ (EMPTY & -(fp & 1))
        FP_TABLE.append(fp)

PRIMITIVE_TYPES = {"null", "boolean", "int", "long", "float", "double", "bytes", "string"}
FIELD_ORDER = ["name", "type", "fields", "symbols", "items", "values", "size"]

EMPTY = 0xc15d213aa4d7a795
FP_TABLE = None

class PCFSchemaResult:
    def __init__(self, pcf: str, sha256: str, md5: str, rabin: str) -> None:
        self.pcf = pcf
        self.sha256 = sha256
        self.md5 = md5
        self.rabin = rabin

def pcf_schema(schema_json):
    """
    Wrapper function to provide PCF transformation and fingerprinting.

    :param schema_json: The Avro schema as a JSON string.
    :return: An instance of the PCFSchemaResult class containing the PCF and fingerprints (SHA-256, MD5, and Rabin) as base64 strings.
    """
    pcf = transform_to_pcf(schema_json)
    return PCFSchemaResult(pcf, fingerprint_sha256(schema_json), fingerprint_md5(schema_json), fingerprint_rabin(schema_json))
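
For context, a minimal usage sketch of the fingerprinting helpers defined above; the schema literal is illustrative only and does not come from this diff:

import json

from avrotize.avrotools import pcf_schema

# Placeholder record schema used only to demonstrate the API.
schema_json = json.dumps({
    "type": "record",
    "name": "User",
    "namespace": "example",
    "fields": [{"name": "id", "type": "long"}],
})

result = pcf_schema(schema_json)
print(result.pcf)     # Parsing Canonical Form as a compact JSON string
print(result.sha256)  # base64-encoded SHA-256 fingerprint of the PCF
print(result.rabin)   # base64-encoded 64-bit Rabin fingerprint of the PCF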