linkml 1.9.4rc1__py3-none-any.whl → 1.9.5rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- linkml/cli/main.py +4 -0
- linkml/generators/__init__.py +2 -0
- linkml/generators/common/build.py +5 -20
- linkml/generators/common/template.py +289 -3
- linkml/generators/docgen.py +55 -10
- linkml/generators/erdiagramgen.py +9 -5
- linkml/generators/graphqlgen.py +32 -6
- linkml/generators/jsonldcontextgen.py +78 -12
- linkml/generators/jsonschemagen.py +29 -12
- linkml/generators/mermaidclassdiagramgen.py +21 -3
- linkml/generators/owlgen.py +4 -1
- linkml/generators/panderagen/dataframe_class.py +13 -0
- linkml/generators/panderagen/dataframe_field.py +50 -0
- linkml/generators/panderagen/linkml_pandera_validator.py +186 -0
- linkml/generators/panderagen/panderagen.py +22 -5
- linkml/generators/panderagen/panderagen_class_based/class.jinja2 +70 -13
- linkml/generators/panderagen/panderagen_class_based/custom_checks.jinja2 +27 -0
- linkml/generators/panderagen/panderagen_class_based/enums.jinja2 +3 -3
- linkml/generators/panderagen/panderagen_class_based/pandera.jinja2 +12 -2
- linkml/generators/panderagen/panderagen_class_based/slots.jinja2 +19 -17
- linkml/generators/panderagen/slot_generator_mixin.py +143 -16
- linkml/generators/panderagen/transforms/__init__.py +19 -0
- linkml/generators/panderagen/transforms/collection_dict_model_transform.py +62 -0
- linkml/generators/panderagen/transforms/list_dict_model_transform.py +66 -0
- linkml/generators/panderagen/transforms/model_transform.py +8 -0
- linkml/generators/panderagen/transforms/nested_struct_model_transform.py +27 -0
- linkml/generators/panderagen/transforms/simple_dict_model_transform.py +86 -0
- linkml/generators/plantumlgen.py +17 -11
- linkml/generators/pydanticgen/pydanticgen.py +53 -2
- linkml/generators/pydanticgen/template.py +45 -233
- linkml/generators/pydanticgen/templates/attribute.py.jinja +1 -0
- linkml/generators/pydanticgen/templates/base_model.py.jinja +16 -2
- linkml/generators/pydanticgen/templates/imports.py.jinja +1 -1
- linkml/generators/rdfgen.py +11 -2
- linkml/generators/rustgen/__init__.py +3 -0
- linkml/generators/rustgen/build.py +94 -0
- linkml/generators/rustgen/cli.py +65 -0
- linkml/generators/rustgen/rustgen.py +1038 -0
- linkml/generators/rustgen/template.py +865 -0
- linkml/generators/rustgen/templates/Cargo.toml.jinja +42 -0
- linkml/generators/rustgen/templates/anything.rs.jinja +142 -0
- linkml/generators/rustgen/templates/as_key_value.rs.jinja +56 -0
- linkml/generators/rustgen/templates/class_module.rs.jinja +8 -0
- linkml/generators/rustgen/templates/enum.rs.jinja +54 -0
- linkml/generators/rustgen/templates/file.rs.jinja +62 -0
- linkml/generators/rustgen/templates/import.rs.jinja +4 -0
- linkml/generators/rustgen/templates/imports.rs.jinja +8 -0
- linkml/generators/rustgen/templates/poly.rs.jinja +9 -0
- linkml/generators/rustgen/templates/poly_containers.rs.jinja +439 -0
- linkml/generators/rustgen/templates/poly_trait.rs.jinja +15 -0
- linkml/generators/rustgen/templates/poly_trait_impl.rs.jinja +5 -0
- linkml/generators/rustgen/templates/poly_trait_impl_orsubtype.rs.jinja +5 -0
- linkml/generators/rustgen/templates/poly_trait_property.rs.jinja +8 -0
- linkml/generators/rustgen/templates/poly_trait_property_impl.rs.jinja +132 -0
- linkml/generators/rustgen/templates/poly_trait_property_match.rs.jinja +10 -0
- linkml/generators/rustgen/templates/property.rs.jinja +19 -0
- linkml/generators/rustgen/templates/pyproject.toml.jinja +10 -0
- linkml/generators/rustgen/templates/serde_utils.rs.jinja +310 -0
- linkml/generators/rustgen/templates/slot_range_as_union.rs.jinja +61 -0
- linkml/generators/rustgen/templates/struct.rs.jinja +75 -0
- linkml/generators/rustgen/templates/struct_or_subtype_enum.rs.jinja +108 -0
- linkml/generators/rustgen/templates/typealias.rs.jinja +13 -0
- linkml/generators/sqltablegen.py +18 -16
- linkml/generators/yarrrmlgen.py +157 -0
- linkml/linter/config/datamodel/config.py +160 -293
- linkml/linter/config/datamodel/config.yaml +34 -26
- linkml/linter/config/default.yaml +4 -0
- linkml/linter/config/recommended.yaml +4 -0
- linkml/linter/linter.py +1 -2
- linkml/linter/rules.py +37 -0
- linkml/utils/schemaloader.py +55 -3
- {linkml-1.9.4rc1.dist-info → linkml-1.9.5rc1.dist-info}/METADATA +2 -2
- {linkml-1.9.4rc1.dist-info → linkml-1.9.5rc1.dist-info}/RECORD +76 -38
- {linkml-1.9.4rc1.dist-info → linkml-1.9.5rc1.dist-info}/entry_points.txt +1 -0
- linkml/generators/panderagen/panderagen_class_based/mixins.jinja2 +0 -26
- {linkml-1.9.4rc1.dist-info → linkml-1.9.5rc1.dist-info}/WHEEL +0 -0
- {linkml-1.9.4rc1.dist-info → linkml-1.9.5rc1.dist-info}/licenses/LICENSE +0 -0
|
@@ -14,9 +14,10 @@ from linkml_runtime.utils.formatutils import camelcase
|
|
|
14
14
|
from linkml_runtime.utils.schemaview import SchemaView
|
|
15
15
|
|
|
16
16
|
from linkml._version import __version__
|
|
17
|
-
from linkml.generators.oocodegen import
|
|
17
|
+
from linkml.generators.oocodegen import OOCodeGenerator, OODocument
|
|
18
18
|
|
|
19
19
|
from .class_generator_mixin import ClassGeneratorMixin
|
|
20
|
+
from .dataframe_class import DataframeClass
|
|
20
21
|
from .enum_generator_mixin import EnumGeneratorMixin
|
|
21
22
|
from .slot_generator_mixin import SlotGeneratorMixin
|
|
22
23
|
|
|
@@ -28,6 +29,7 @@ TYPEMAP = {
|
|
|
28
29
|
"panderagen_class_based": {
|
|
29
30
|
"xsd:string": "str",
|
|
30
31
|
"xsd:integer": "int",
|
|
32
|
+
"xsd:int": "int",
|
|
31
33
|
"xsd:float": "float",
|
|
32
34
|
"xsd:double": "float",
|
|
33
35
|
"xsd:boolean": "bool",
|
|
@@ -83,6 +85,8 @@ class PanderaGenerator(OOCodeGenerator, EnumGeneratorMixin, ClassGeneratorMixin,
|
|
|
83
85
|
|
|
84
86
|
@staticmethod
|
|
85
87
|
def make_multivalued(range: str) -> str:
|
|
88
|
+
if range == "Struct":
|
|
89
|
+
return "pl.List"
|
|
86
90
|
return f"List[{range}]"
|
|
87
91
|
|
|
88
92
|
def uri_type_map(self, xsd_uri: str, template: str = None):
|
|
@@ -92,15 +96,22 @@ class PanderaGenerator(OOCodeGenerator, EnumGeneratorMixin, ClassGeneratorMixin,
|
|
|
92
96
|
return TYPEMAP[template].get(xsd_uri)
|
|
93
97
|
|
|
94
98
|
def map_type(self, t: TypeDefinition) -> str:
    """Map a LinkML type definition to a target type string.

    The type's own ``uri`` is looked up first through ``uri_type_map``.
    When that yields nothing, the parent type named by ``typeof`` is
    resolved through the schema view and mapped recursively.

    :param t: the type definition to map
    :return: the mapped type string
    :raises ValueError: if neither the type nor any ancestor can be mapped
    """
    logger.info(f"type_map definition: {t}")

    typ = self.uri_type_map(t.uri) if t.uri else None

    # Only follow the parent link when there is one and it resolves;
    # otherwise fall through to the ValueError below instead of
    # recursing on a missing type.
    if typ is None and t.typeof:
        parent = self.schemaview.get_type(t.typeof)
        if parent is not None:
            typ = self.map_type(parent)

    if typ is None:
        raise ValueError(f"{t} cannot be mapped to a type")

    return typ
|
|
114
|
+
|
|
104
115
|
def load_template(self, template_filename):
|
|
105
116
|
jinja_env = Environment(loader=PackageLoader("linkml.generators.panderagen", self.template_path))
|
|
106
117
|
return jinja_env.get_template(template_filename)
|
|
@@ -138,6 +149,7 @@ class PanderaGenerator(OOCodeGenerator, EnumGeneratorMixin, ClassGeneratorMixin,
|
|
|
138
149
|
coerce=self.coerce,
|
|
139
150
|
type_map=TYPEMAP,
|
|
140
151
|
template_path=self.template_path,
|
|
152
|
+
pandera_validator_code=None,
|
|
141
153
|
)
|
|
142
154
|
return code
|
|
143
155
|
|
|
@@ -156,12 +168,17 @@ class PanderaGenerator(OOCodeGenerator, EnumGeneratorMixin, ClassGeneratorMixin,
|
|
|
156
168
|
for c in self.ordered_classes():
|
|
157
169
|
cn = c.name
|
|
158
170
|
safe_cn = camelcase(cn)
|
|
159
|
-
|
|
171
|
+
annotations = {}
|
|
172
|
+
identifier_or_key_slot = self.get_identifier_or_key_slot(cn)
|
|
173
|
+
if identifier_or_key_slot:
|
|
174
|
+
annotations["identifier_key_slot"] = identifier_or_key_slot.name
|
|
175
|
+
ooclass = DataframeClass(
|
|
160
176
|
name=safe_cn,
|
|
161
177
|
description=c.description,
|
|
162
178
|
package=self.package,
|
|
163
179
|
fields=[],
|
|
164
180
|
source_class=c,
|
|
181
|
+
annotations=annotations,
|
|
165
182
|
)
|
|
166
183
|
classes.append(ooclass)
|
|
167
184
|
if c.mixin:
|
|
@@ -3,24 +3,81 @@
|
|
|
3
3
|
Details at https://pandera.readthedocs.io/en/stable/dataframe_models.html
|
|
4
4
|
-#}
|
|
5
5
|
{%- import 'slots.jinja2' as slot_macros -%}
|
|
6
|
+
{%- import 'custom_checks.jinja2' as custom_checks -%}
|
|
6
7
|
|
|
7
|
-
{%- macro
|
|
8
|
-
|
|
9
|
-
{
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
8
|
+
{%- macro render_parent_classes(cls) -%}
|
|
9
|
+
{%- if cls.is_a -%}
|
|
10
|
+
{{ cls.is_a }}
|
|
11
|
+
{%- else -%}
|
|
12
|
+
pla.DataFrameModel, _LinkmlPanderaValidator
|
|
13
|
+
{%- endif -%}
|
|
14
|
+
{%- endmacro -%}
|
|
15
|
+
|
|
16
|
+
{%- macro render_nested_ranges(cls) -%}
|
|
17
|
+
{#-
|
|
18
|
+
Helper class member that references another Pandera class that
|
|
19
|
+
handles a nested association.
|
|
20
|
+
-#}
|
|
21
|
+
_NESTED_RANGES = {
|
|
22
|
+
{%- for field in cls.fields -%}
|
|
23
|
+
{%- if field.reference_class() %}
|
|
24
|
+
"{{ field.name }}": "{{ field.reference_class() }}",
|
|
25
|
+
{% endif -%}
|
|
26
|
+
{%- endfor %}
|
|
27
|
+
}
|
|
28
|
+
{%- endmacro -%}
|
|
29
|
+
|
|
30
|
+
{%- macro render_inline_form(cls) -%}
|
|
31
|
+
{#-
|
|
32
|
+
This is a meta object with information used by the various check methods
|
|
33
|
+
-#}
|
|
34
|
+
_INLINE_FORM = {
|
|
35
|
+
{%- for field in cls.fields -%}
|
|
36
|
+
{%- if field.inline_form() %}
|
|
37
|
+
"{{ field.name }}": "{{ field.inline_form() }}",
|
|
38
|
+
{% endif -%}
|
|
39
|
+
{%- endfor %}
|
|
40
|
+
}
|
|
41
|
+
{%- endmacro -%}
|
|
42
|
+
|
|
43
|
+
{%- macro render_inline_details(cls) -%}
|
|
44
|
+
{#-
|
|
45
|
+
This is a meta object with information used by the various check methods
|
|
46
|
+
-#}
|
|
47
|
+
_INLINE_DETAILS = {
|
|
48
|
+
{%- for field in cls.fields -%}
|
|
49
|
+
{%- if field.inline_details() %}
|
|
50
|
+
"{{ field.name }}": {{ field.inline_details() }},
|
|
51
|
+
{% endif -%}
|
|
52
|
+
{%- endfor %}
|
|
53
|
+
}
|
|
54
|
+
{%- endmacro -%}
|
|
55
|
+
|
|
56
|
+
{%- macro render_slots(cls) -%}
|
|
57
|
+
{{ slot_macros.id_slot_name_class_variable(cls) }}
|
|
20
58
|
{%- if (cls.fields | length) == 0 %}
|
|
21
59
|
pass
|
|
22
60
|
{% endif -%}
|
|
23
61
|
{%- for field in cls.fields -%}
|
|
24
62
|
{{ slot_macros.render_slot(field) }}
|
|
25
63
|
{%- endfor -%}
|
|
64
|
+
{%- endmacro -%}
|
|
65
|
+
|
|
66
|
+
{%- macro render_class(cls) %}
|
|
67
|
+
{#-
|
|
68
|
+
Generates the main structure of a Pandera class.
|
|
69
|
+
This includes the slots, custom checks,
|
|
70
|
+
and helper class members with information used by the custom checks.
|
|
71
|
+
-#}
|
|
72
|
+
class {{cls.name}}({{ render_parent_classes(cls) }}):
|
|
73
|
+
{%- if cls.source_class.description %}
|
|
74
|
+
"""
|
|
75
|
+
{{ cls.source_class.description }}
|
|
76
|
+
"""
|
|
77
|
+
{% endif -%}
|
|
78
|
+
{{ render_slots(cls) }}
|
|
79
|
+
{{ custom_checks.render_custom_checks(cls) }}
|
|
80
|
+
{{ render_nested_ranges(cls) }}
|
|
81
|
+
{{ render_inline_form(cls) }}
|
|
82
|
+
{{ render_inline_details(cls) }}
|
|
26
83
|
{% endmacro -%}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
|
|
2
|
+
{%- macro nested_checks(field) -%}
|
|
3
|
+
{#-
|
|
4
|
+
field: DataframeField
|
|
5
|
+
Render a class method that follows nested structures
|
|
6
|
+
other than the simple_dict form.
|
|
7
|
+
-#}
|
|
8
|
+
{%- if field.inline_form() %}
|
|
9
|
+
@pla.check("{{ field.name }}")
|
|
10
|
+
def check_nested_struct_{{ field.name }}(cls, data: PolarsData):
|
|
11
|
+
{% if field.inline_form() == 'inline_list_dict' -%}
|
|
12
|
+
return cls._check_nested_struct(data)
|
|
13
|
+
{% elif field.inline_form() == 'simple_dict' -%}
|
|
14
|
+
return cls._check_simple_dict(data)
|
|
15
|
+
{% elif field.inline_form() == 'inline_collection_dict' -%}
|
|
16
|
+
return cls._check_collection_struct(data)
|
|
17
|
+
{% else -%}
|
|
18
|
+
return cls._check_nested_list_struct(data)
|
|
19
|
+
{% endif -%}
|
|
20
|
+
{%- endif -%}
|
|
21
|
+
{%- endmacro -%}
|
|
22
|
+
|
|
23
|
+
{%- macro render_custom_checks(cls) -%}
|
|
24
|
+
{%- for field in cls.fields -%}
|
|
25
|
+
{{ nested_checks(field) }}
|
|
26
|
+
{%- endfor %}
|
|
27
|
+
{%- endmacro -%}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
{%- macro enum_parameter(
|
|
2
|
-
{%- if
|
|
1
|
+
{%- macro enum_parameter(field) -%}
|
|
2
|
+
{%- if field.permissible_values()|length > 0 -%}
|
|
3
3
|
dtype_kwargs={"categories":(
|
|
4
|
-
{%- for pv in
|
|
4
|
+
{%- for pv in field.permissible_values() -%}
|
|
5
5
|
'{{ pv }}',
|
|
6
6
|
{%- endfor -%}
|
|
7
7
|
)}
|
|
@@ -4,10 +4,20 @@
|
|
|
4
4
|
-#}
|
|
5
5
|
{%- import 'header.jinja2' as header -%}
|
|
6
6
|
{%- import 'class.jinja2' as class_macros -%}
|
|
7
|
-
{%- import 'mixins.jinja2' as mixins -%}
|
|
8
7
|
{{ header }}
|
|
9
8
|
|
|
10
|
-
{
|
|
9
|
+
{% if pandera_validator_code %}
|
|
10
|
+
{{ pandera_validator_code }}
|
|
11
|
+
{% else %}
|
|
12
|
+
from linkml.generators.panderagen.linkml_pandera_validator import LinkmlPanderaValidator as _LinkmlPanderaValidator
|
|
13
|
+
{% endif %}
|
|
14
|
+
|
|
15
|
+
# These are all str for now
|
|
16
|
+
ID_TYPES = {
|
|
17
|
+
{%- for cls in doc.classes %}
|
|
18
|
+
"{{ cls.name }}": "str",
|
|
19
|
+
{%- endfor %}
|
|
20
|
+
}
|
|
11
21
|
|
|
12
22
|
{% if metamodel_version %}# metamodel_version: {{metamodel_version}}{% endif %}
|
|
13
23
|
{% if model_version %}# version: {{model_version}}{% endif %}
|
|
@@ -4,33 +4,35 @@
|
|
|
4
4
|
-#}
|
|
5
5
|
{%- import 'enums.jinja2' as enum_macros -%}
|
|
6
6
|
|
|
7
|
-
{%- macro constraint_parameters(field
|
|
7
|
+
{%- macro constraint_parameters(field) -%}
|
|
8
8
|
{%- if coerce is true -%}coerce=True, {% endif -%}
|
|
9
9
|
{%- if field.default_value is not none -%}default={{ field.default_value }}, {% endif -%}
|
|
10
|
-
{%- if
|
|
11
|
-
{%- if
|
|
12
|
-
{%- if
|
|
13
|
-
{%- if (
|
|
10
|
+
{%- if field.minimum_value() is not none -%}ge={{ field.minimum_value() }}, {% endif -%}
|
|
11
|
+
{%- if field.maximum_value() is not none -%}le={{ field.maximum_value() }}, {% endif -%}
|
|
12
|
+
{%- if field.pattern() is not none -%}str_matches=r"{{ field.pattern() }}", {% endif -%}
|
|
13
|
+
{%- if (field.required() is none or field.required() is false) and field.identifier() is not true -%}nullable=True, {% endif -%}
|
|
14
14
|
{%- endmacro -%}
|
|
15
15
|
|
|
16
|
-
{%- macro
|
|
17
|
-
|
|
16
|
+
{%- macro id_slot_name_class_variable(cls) %}
|
|
17
|
+
_id_name : str = {% if cls.identifier_key_slot() %} '{{ cls.identifier_key_slot() }}' {% else %}None{% endif %}
|
|
18
|
+
{%- endmacro -%}
|
|
19
|
+
|
|
20
|
+
{%- macro render_slot(field) %}
|
|
18
21
|
{{ field.name }}:{{ ' ' }}
|
|
19
|
-
{%- if (
|
|
22
|
+
{%- if (field.required() is none or field.required() is false) and (field.identifier() is not true) -%}
|
|
20
23
|
Optional[
|
|
21
24
|
{%- endif -%}
|
|
22
25
|
{{field.range}}
|
|
23
|
-
{%- if (
|
|
24
|
-
{{ ']
|
|
26
|
+
{%- if (field.required() is none or field.required() is false) and (field.identifier() is not true) -%}
|
|
27
|
+
{{ ']' }}
|
|
25
28
|
{%- endif -%}
|
|
26
|
-
= pla.Field(
|
|
27
|
-
{{- constraint_parameters(field
|
|
28
|
-
{{- enum_macros.enum_parameter(
|
|
29
|
+
{{ ' =' }} pla.Field(
|
|
30
|
+
{{- constraint_parameters(field) -}}
|
|
31
|
+
{{- enum_macros.enum_parameter(field) -}}
|
|
29
32
|
)
|
|
30
|
-
{%- if
|
|
33
|
+
{%- if field.description() %}
|
|
31
34
|
"""
|
|
32
|
-
{{
|
|
35
|
+
{{ field.description() }}
|
|
33
36
|
"""
|
|
34
|
-
|
|
35
|
-
{%- endwith -%}
|
|
37
|
+
{% endif -%}
|
|
36
38
|
{%- endmacro %}
|
|
@@ -1,31 +1,156 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
from typing import Optional
|
|
2
3
|
|
|
3
|
-
from
|
|
4
|
+
from linkml_runtime.linkml_model.meta import ClassDefinitionName, SlotDefinition
|
|
5
|
+
|
|
6
|
+
from linkml.utils.helpers import get_range_associated_slots
|
|
7
|
+
|
|
8
|
+
from .dataframe_field import DataframeField
|
|
4
9
|
|
|
5
10
|
logger = logging.getLogger(__file__)
|
|
6
11
|
|
|
7
12
|
|
|
8
13
|
class SlotGeneratorMixin:
|
|
14
|
+
"""
|
|
15
|
+
Prior to rendering the dataframe schema, this class provides
|
|
16
|
+
and adapter between the LinkML model and schema view
|
|
17
|
+
and the rendering engine.
|
|
18
|
+
"""
|
|
19
|
+
|
|
9
20
|
LINKML_ANY_CURIE = "linkml:Any"
|
|
21
|
+
|
|
22
|
+
# constants used to render the schema
|
|
23
|
+
# these will be moved to a dialect-specific place
|
|
10
24
|
ANY_RANGE_STRING = "Object"
|
|
11
25
|
CLASS_RANGE_STRING = "Struct"
|
|
26
|
+
SIMPLE_DICT_RANGE_STRING = "Struct"
|
|
12
27
|
ENUM_RANGE_STRING = "Enum"
|
|
13
|
-
DEFAULT_RANGE_STRING = "str"
|
|
14
28
|
|
|
15
|
-
#
|
|
16
|
-
|
|
17
|
-
|
|
29
|
+
# association form flags used for rendering decisions
|
|
30
|
+
FORM_INLINED_DICT = "inlined_dict"
|
|
31
|
+
FORM_INLINED_LIST_DICT = "inlined_list_dict"
|
|
32
|
+
FORM_INLINED_COLLECTION_DICT = "inline_collection_dict"
|
|
33
|
+
FORM_INLINED_SIMPLE_DICT = "simple_dict"
|
|
34
|
+
FORM_MULTIVALUED_FOREIGN_KEY = "list_foreign_key"
|
|
35
|
+
FORM_FOREIGN_KEY = "foreign_key"
|
|
36
|
+
FORM_ERROR = "error"
|
|
37
|
+
|
|
38
|
+
# When nested inlining is done, the Pandera validator needs a specific range
|
|
39
|
+
INLINED_FORM_RANGE_PANDERA = {
|
|
40
|
+
FORM_INLINED_SIMPLE_DICT: SIMPLE_DICT_RANGE_STRING,
|
|
41
|
+
FORM_INLINED_LIST_DICT: CLASS_RANGE_STRING,
|
|
42
|
+
FORM_INLINED_COLLECTION_DICT: CLASS_RANGE_STRING,
|
|
43
|
+
FORM_INLINED_DICT: CLASS_RANGE_STRING,
|
|
44
|
+
FORM_ERROR: None,
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
def is_multivalued(self, slot):
    """Return True only when ``slot`` contains ``multivalued`` and it is exactly ``True``."""
    if "multivalued" not in slot:
        return False
    return slot.multivalued is True
|
|
49
|
+
|
|
50
|
+
_INTERNAL_INLINED_FORM = {
|
|
51
|
+
# INLINED, INLINED_AS_LIST, MULTIVALUED,
|
|
52
|
+
(False, False, False): FORM_FOREIGN_KEY,
|
|
53
|
+
(False, False, True): FORM_MULTIVALUED_FOREIGN_KEY,
|
|
54
|
+
(False, True, False): FORM_INLINED_LIST_DICT,
|
|
55
|
+
(False, True, True): FORM_INLINED_LIST_DICT,
|
|
56
|
+
(True, False, False): FORM_INLINED_DICT,
|
|
57
|
+
(True, False, True): FORM_INLINED_COLLECTION_DICT,
|
|
58
|
+
(True, None, True): FORM_INLINED_DICT,
|
|
59
|
+
(True, True, False): FORM_INLINED_DICT,
|
|
60
|
+
(True, True, True): FORM_INLINED_LIST_DICT,
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
def get_identifier_or_key_slot(self, cn: ClassDefinitionName) -> Optional[SlotDefinition]:
    """Return the identifier slot of class ``cn``, else its first key slot, else ``None``.

    The identifier slot reported by the schema view takes precedence;
    otherwise the induced slots of the class are scanned in order for
    one whose ``key`` flag is truthy.
    """
    view = self.schemaview
    identifier = view.get_identifier_slot(cn)
    if identifier:
        return identifier
    return next((candidate for candidate in view.class_induced_slots(cn) if candidate.key), None)
|
|
73
|
+
|
|
74
|
+
def calculate_inlined_form(self, slot: SlotDefinition) -> str:
    """Classify how a class-ranged slot is inlined and return the matching FORM_* flag.

    The (inlined, inlined_as_list, multivalued) triple is looked up in
    ``_INTERNAL_INLINED_FORM``; unknown triples yield ``FORM_ERROR``.
    A collection dict whose range class has no identifier or key slot is
    downgraded to the list-dict form. A range that qualifies as a
    simple dict overrides every other outcome.
    """
    mixin = SlotGeneratorMixin

    multivalued = self.is_multivalued(slot)
    flags = ((slot.inlined is True), (slot.inlined_as_list is True), multivalued)
    logger.info(f"Inlined form key: {flags}")
    form = self._INTERNAL_INLINED_FORM.get(flags, mixin.FORM_ERROR)

    # A keyed collection needs something to key on.
    if form == mixin.FORM_INLINED_COLLECTION_DICT and self.get_identifier_or_key_slot(slot.range) is None:
        form = mixin.FORM_INLINED_LIST_DICT

    if self.calculate_simple_dict(slot) is None:
        return form
    return mixin.FORM_INLINED_SIMPLE_DICT
|
|
90
|
+
|
|
91
|
+
def calculate_simple_dict(self, slot: SlotDefinition):
    """Return the simple-dict value slot of the range of ``slot``.

    ``slot`` is the container slot; the result is the middle element of
    the triple returned by ``get_range_associated_slots`` for its range.
    """
    _id_slot, value_slot, _required = get_range_associated_slots(self.schemaview, slot.range)
    return value_slot
|
|
18
97
|
|
|
19
|
-
def handle_none_slot(self, slot
|
|
98
|
+
def handle_none_slot(self, slot) -> str:
    """Return the range for a slot that declares none.

    Uses the schema's ``default_range`` and falls back to ``"str"`` when
    the schema does not define one. ``slot`` itself is not consulted.
    """
    fallback = self.schema.default_range  # need to figure this out, set at the beginning?
    return "str" if fallback is None else fallback
|
|
25
104
|
|
|
26
105
|
def handle_class_slot(self, slot, range: str) -> str:
    """Compute the rendered range for a slot whose range is a class.

    * A class whose ``class_uri`` is ``linkml:Any`` renders as the
      generic ``ANY_RANGE_STRING``.
    * Foreign-key forms (single or multivalued) are not implemented and
      render as an ``ID_TYPES[...]`` lookup.
    * Inlined forms record ``reference_class`` and ``inline_form``
      annotations on the slot, then render as the range registered in
      ``INLINED_FORM_RANGE_PANDERA``; the list-dict form is additionally
      wrapped by ``make_multivalued`` and the simple-dict form gets an
      ``inline_details`` annotation.

    :param slot: the slot being rendered; its annotations may be updated
    :param range: name of the class the slot ranges over
    :return: the range string to emit, as described above
    """
    mixin = SlotGeneratorMixin
    range_info = self.schemaview.all_classes().get(range)

    if range_info["class_uri"] == mixin.LINKML_ANY_CURIE:
        return mixin.ANY_RANGE_STRING

    inlined_form = self.calculate_inlined_form(slot)

    if inlined_form == mixin.FORM_INLINED_COLLECTION_DICT:
        # The space after the comma was missing, so the message read "form,which".
        logger.warning(
            f"Slot {slot.name} uses inlined dictionary form, "
            "which may be less efficient than inlined as list form with the current implementation."
        )
    elif inlined_form == mixin.FORM_INLINED_SIMPLE_DICT:
        logger.warning(
            f"Slot {slot.name} uses inlined simple dictionary form. Support is incomplete "
            "and performance is less efficient than inlined as list form with the current implementation."
        )

    if inlined_form in (mixin.FORM_MULTIVALUED_FOREIGN_KEY, mixin.FORM_FOREIGN_KEY):
        logger.warning(f"Foreign key not implemented for slot {slot.name}")
        return f"ID_TYPES['{self.get_class_name(range)}']"

    # TODO: make these setters
    slot.annotations["reference_class"] = self.get_class_name(range)
    slot.annotations["inline_form"] = inlined_form

    range = mixin.INLINED_FORM_RANGE_PANDERA[inlined_form]

    if inlined_form == mixin.FORM_INLINED_SIMPLE_DICT:
        self.set_simple_dict_inline_details_annotation(slot)
    elif inlined_form == mixin.FORM_INLINED_LIST_DICT:
        range = self.make_multivalued(range)

    return range
|
|
140
|
+
|
|
141
|
+
def set_simple_dict_inline_details_annotation(self, slot):
    """Annotate ``slot`` with the slot names needed for the simple-dict case.

    Stores ``{"id": <id slot name>, "other": <value slot name>}`` under
    the ``inline_details`` annotation, using the first two elements of
    the triple returned by ``get_range_associated_slots`` for the range.
    """
    key_slot, value_slot, _required = get_range_associated_slots(self.schemaview, slot.range)
    details = {"id": key_slot.name, "other": value_slot.name}
    slot.annotations["inline_details"] = details
|
|
150
|
+
|
|
151
|
+
def handle_non_inlined_class_slot(self, slot, range: str) -> str:
    """Render the ``ID_TYPES[...]`` lookup for a non-inlined class range.

    Non-inlined class slots have been temporarily removed, but this will
    be needed to support them. ``slot`` is not consulted.
    """
    class_name = self.get_class_name(range)
    return f"ID_TYPES['{class_name}']"
|
|
29
154
|
|
|
30
155
|
def handle_type_slot(self, slot, range: str) -> str:
|
|
31
156
|
del slot # unused for now
|
|
@@ -43,9 +168,10 @@ class SlotGeneratorMixin:
|
|
|
43
168
|
return range
|
|
44
169
|
|
|
45
170
|
def handle_multivalued_slot(self, slot, range: str) -> str:
    """Wrap ``range`` as a list when the slot is multivalued and inlined as a list.

    Any other slot keeps ``range`` unchanged.
    """
    # "(as_list and multi) or (inlined and as_list and multi)" reduces to
    # "as_list and multi": the second disjunct implies the first.
    if slot.inlined_as_list is True and self.is_multivalued(slot):
        return self.make_multivalued(range)

    return range
|
|
51
177
|
|
|
@@ -58,19 +184,20 @@ class SlotGeneratorMixin:
|
|
|
58
184
|
safe_sn = self.get_slot_name(slot.alias)
|
|
59
185
|
|
|
60
186
|
if range is None:
|
|
61
|
-
range = self.handle_none_slot(slot
|
|
187
|
+
range = self.handle_none_slot(slot)
|
|
62
188
|
elif range in self.schemaview.all_classes():
|
|
63
189
|
range = self.handle_class_slot(slot, range)
|
|
64
190
|
elif range in self.schemaview.all_types():
|
|
65
191
|
range = self.handle_type_slot(slot, range)
|
|
192
|
+
if self.is_multivalued(slot):
|
|
193
|
+
range = self.make_multivalued(range)
|
|
66
194
|
elif range in self.schemaview.all_enums():
|
|
67
195
|
range = self.handle_enum_slot(slot, range)
|
|
196
|
+
range = self.handle_multivalued_slot(slot, range)
|
|
68
197
|
else:
|
|
69
198
|
raise Exception(f"Unknown range {range}")
|
|
70
199
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
return OOField(
|
|
200
|
+
return DataframeField(
|
|
74
201
|
name=safe_sn,
|
|
75
202
|
source_slot=slot,
|
|
76
203
|
range=range,
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Transform classes for LinkML Pandera validation.
|
|
2
|
+
|
|
3
|
+
This module provides transform classes that convert LinkML inline formats
|
|
4
|
+
into forms suitable for Polars DataFrame validation with Pandera models.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .collection_dict_model_transform import CollectionDictModelTransform
|
|
8
|
+
from .list_dict_model_transform import ListDictModelTransform
|
|
9
|
+
from .model_transform import ModelTransform
|
|
10
|
+
from .nested_struct_model_transform import NestedStructModelTransform
|
|
11
|
+
from .simple_dict_model_transform import SimpleDictModelTransform
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"ModelTransform",
|
|
15
|
+
"SimpleDictModelTransform",
|
|
16
|
+
"CollectionDictModelTransform",
|
|
17
|
+
"ListDictModelTransform",
|
|
18
|
+
"NestedStructModelTransform",
|
|
19
|
+
]
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import polars as pl
|
|
2
|
+
|
|
3
|
+
from .model_transform import ModelTransform
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class CollectionDictModelTransform(ModelTransform):
    """Convert a LinkML 'collection dict' inline column to a list-of-structs form.

    The list form is better suited to representation in a Polars
    dataframe and to validation with a Pandera model.
    """

    def __init__(self, polars_schema, id_col):
        # A polars schema representing a collection dict column.
        self.polars_schema = polars_schema

        # The ID column in the sense of a LinkML inline collection dict.
        self.id_col = id_col

    def transform(self, linkml_collection_dict):
        """Convert a collection dict nested column to a list of dicts.

        { 'A': {...}, 'B': {...}, ... } -> [{'id': 'A', ...}, {'id': 'B', ...}, ...]
        """
        return self._collection_dict_to_list_of_structs(linkml_collection_dict)

    def _collection_dict_to_list_of_structs(self, linkml_collection_dict):
        """Convert a collection dict nested column to a list of dicts.

        { 'A': {...}, 'B': {...}, ... } -> [{'id': 'A', ...}, {'id': 'B', ...}, ...]

        This is a plain-Python conversion and is inefficient relative to
        native Polars operations.

        :param linkml_collection_dict: a single cell of a dataframe
            column, itself a dict whose values are dicts
        :return: a new list holding a shallow copy of each value dict
            with the key added under ``self.id_col``; the input dicts
            are left unmodified
        """
        structs = []
        for key, entry in linkml_collection_dict.items():
            # Copy so the id field is not written into the caller's data.
            entry = dict(entry)
            # NOTE(review): this tests the key *value* against the entry's
            # field names, not ``self.id_col``; kept as in the original --
            # confirm whether ``self.id_col not in entry`` was intended.
            if key not in entry:
                entry[self.id_col] = key
            structs.append(entry)
        return structs

    @classmethod
    def prepare_dataframe(cls, data, column_name, nested_cls):
        """Return just the collection dict column, transformed to an inlined list form.

        This collects the lazy frame and iterates its rows in Python, so
        it is very inefficient.

        :param data: object exposing a ``lazyframe`` attribute
        :param column_name: name of the collection dict column
        :param nested_cls: model class of the nested entries; supplies
            ``get_id_column_name()`` and ``to_schema()``
        :return: a one-column ``pl.DataFrame`` named ``column_name``
        """
        id_column = nested_cls.get_id_column_name()
        polars_schema = nested_cls.to_schema()

        collection_dict_transformer = cls(polars_schema, id_column)

        one_column_df = data.lazyframe.select(pl.col(column_name)).collect()

        list_of_structs = [collection_dict_transformer.transform(cell) for (cell,) in one_column_df.iter_rows()]

        return pl.DataFrame(pl.Series(list_of_structs).alias(column_name))

    def explode_unnest_dataframe(self, df, column_name):
        """Filter out empty lists, then explode and unnest the collection dict column."""
        return df.lazy().filter(pl.col(column_name).list.len() > 0).explode(column_name).unnest(column_name).collect()
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import polars as pl
|
|
2
|
+
|
|
3
|
+
from .model_transform import ModelTransform
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ListDictModelTransform(ModelTransform):
    """Helper for validating a LinkML 'list dict' inline column.

    Puts the column into a form suited to a Polars dataframe and to
    validation with a Pandera model.
    """

    def __init__(self, polars_schema):
        # A polars schema representing a list dict column.
        self.polars_schema = polars_schema

    def transform(self, linkml_list_dict):
        """Return the list dict cell unchanged.

        List dicts are already in the required format, so this is a
        pass-through.
        """
        return linkml_list_dict

    @classmethod
    def unnest_list_struct(cls, column_name: str, df):
        """Select ``column_name``, drop empty lists, then explode and unnest it.

        Intended for use in a custom check.
        """
        # Empty lists are filtered first; see https://github.com/pola-rs/polars/issues/14381
        non_empty = df.select(column_name).filter(pl.col(column_name).list.len() > 0)
        return non_empty.explode(column_name).unnest(column_name)

    @classmethod
    def prepare_dataframe(cls, data, column_name, nested_cls):
        """Return just the list dict column as a one-column dataframe.

        This collects the lazy frame and iterates its rows in Python, so
        it is very inefficient.
        """
        transformer = cls(nested_cls.to_schema())

        column_only = data.lazyframe.select(pl.col(column_name)).collect()

        cells = [transformer.transform(cell) for (cell,) in column_only.iter_rows()]

        return pl.DataFrame(pl.Series(cells).alias(column_name))

    def explode_unnest_dataframe(self, df, column_name, data=None):
        """Filter, explode and unnest the list dict column, with a struct fallback.

        If the list-based path fails and ``data`` is truthy, the column
        is retried as a nested struct using ``data.lazyframe``;
        otherwise the original error is re-raised.
        """
        try:
            non_empty = df.lazy().filter(pl.col(column_name).list.len() > 0)
            return non_empty.explode(column_name).unnest(column_name).collect()
        except (pl.exceptions.PanicException, Exception):
            if not data:
                raise

            from .nested_struct_model_transform import NestedStructModelTransform

            fallback = NestedStructModelTransform(self.polars_schema)
            return fallback.explode_unnest_dataframe(data.lazyframe, column_name)
|