arize-8.0.0b1-py3-none-any.whl → arize-8.0.0b2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arize/__init__.py +1 -1
- arize/_client_factory.py +50 -0
- arize/_flight/client.py +4 -4
- arize/_generated/api_client/api/datasets_api.py +6 -6
- arize/_generated/api_client/api/experiments_api.py +6 -6
- arize/_generated/api_client/api/projects_api.py +3 -3
- arize/_lazy.py +25 -9
- arize/client.py +6 -16
- arize/config.py +9 -36
- arize/constants/ml.py +9 -16
- arize/constants/spans.py +5 -10
- arize/datasets/client.py +13 -9
- arize/datasets/errors.py +1 -1
- arize/datasets/validation.py +2 -2
- arize/embeddings/auto_generator.py +2 -2
- arize/embeddings/errors.py +2 -2
- arize/embeddings/tabular_generators.py +1 -1
- arize/exceptions/base.py +0 -52
- arize/exceptions/parameters.py +0 -329
- arize/experiments/client.py +14 -7
- arize/experiments/evaluators/base.py +6 -6
- arize/experiments/evaluators/executors.py +10 -3
- arize/experiments/evaluators/types.py +2 -2
- arize/experiments/functions.py +18 -11
- arize/experiments/types.py +3 -5
- arize/logging.py +1 -1
- arize/ml/batch_validation/errors.py +10 -1004
- arize/ml/batch_validation/validator.py +273 -225
- arize/ml/casting.py +7 -7
- arize/ml/client.py +12 -11
- arize/ml/proto.py +6 -6
- arize/ml/stream_validation.py +2 -3
- arize/ml/surrogate_explainer/mimic.py +3 -3
- arize/ml/types.py +1 -55
- arize/pre_releases.py +6 -3
- arize/projects/client.py +9 -4
- arize/regions.py +2 -2
- arize/spans/client.py +13 -11
- arize/spans/columns.py +32 -36
- arize/spans/conversion.py +5 -6
- arize/spans/validation/common/argument_validation.py +3 -3
- arize/spans/validation/common/dataframe_form_validation.py +6 -6
- arize/spans/validation/common/value_validation.py +1 -1
- arize/spans/validation/evals/dataframe_form_validation.py +4 -4
- arize/spans/validation/evals/evals_validation.py +6 -6
- arize/spans/validation/metadata/dataframe_form_validation.py +1 -1
- arize/spans/validation/spans/dataframe_form_validation.py +2 -2
- arize/spans/validation/spans/spans_validation.py +6 -6
- arize/utils/arrow.py +2 -2
- arize/utils/cache.py +2 -2
- arize/utils/dataframe.py +4 -4
- arize/utils/online_tasks/dataframe_preprocessor.py +7 -7
- arize/utils/openinference_conversion.py +10 -10
- arize/utils/proto.py +1 -1
- arize/version.py +1 -1
- {arize-8.0.0b1.dist-info → arize-8.0.0b2.dist-info}/METADATA +23 -6
- {arize-8.0.0b1.dist-info → arize-8.0.0b2.dist-info}/RECORD +60 -59
- {arize-8.0.0b1.dist-info → arize-8.0.0b2.dist-info}/WHEEL +0 -0
- {arize-8.0.0b1.dist-info → arize-8.0.0b2.dist-info}/licenses/LICENSE +0 -0
- {arize-8.0.0b1.dist-info → arize-8.0.0b2.dist-info}/licenses/NOTICE +0 -0
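The bulk of the release is the validator rewrite reconstructed below. Two mechanical patterns repeat through every hunk: error classes are now imported explicitly from `arize.exceptions.base`, `arize.exceptions.types`, `arize.exceptions.values`, and `arize.ml.batch_validation.errors` (replacing the removed 329-line `arize/exceptions/parameters.py` and the old `err.`-prefixed module access), and each `_check_*` helper gains a concrete return annotation such as `list[ValidationError]` or `list[MissingColumns]`. As a rough sketch of what the typed error lists allow a caller to do (the import paths below appear verbatim in the diff, but the `report` helper is a hypothetical consumer, not package API):

# Sketch only: import paths are taken from the diff's added lines; report()
# is an illustrative, hypothetical consumer, not part of the arize package.
from arize.exceptions.base import ValidationError
from arize.ml.batch_validation.errors import MissingColumns

def report(errors: list[ValidationError]) -> None:
    # Concrete classes in the return annotations let callers branch on
    # error type instead of parsing message strings.
    for e in errors:
        if isinstance(e, MissingColumns):
            print(f"schema references columns missing from the dataframe: {e}")
        else:
            print(f"validation error: {e}")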
|
@@ -39,8 +39,70 @@ from arize.constants.ml import (
|
|
|
39
39
|
MIN_PREDICTION_ID_LEN,
|
|
40
40
|
MODEL_MAPPING_CONFIG,
|
|
41
41
|
)
|
|
42
|
+
from arize.exceptions.base import (
|
|
43
|
+
InvalidDataFrameIndex,
|
|
44
|
+
InvalidFieldTypeConversion,
|
|
45
|
+
ValidationError,
|
|
46
|
+
)
|
|
47
|
+
from arize.exceptions.types import (
|
|
48
|
+
InvalidFieldTypeLlmConfig,
|
|
49
|
+
InvalidFieldTypePromptTemplates,
|
|
50
|
+
InvalidType,
|
|
51
|
+
InvalidTypeColumns,
|
|
52
|
+
InvalidTypeFeatures,
|
|
53
|
+
InvalidTypeShapValues,
|
|
54
|
+
InvalidTypeTags,
|
|
55
|
+
InvalidValueEmbeddingRawDataTooLong,
|
|
56
|
+
InvalidValueEmbeddingVectorDimensionality,
|
|
57
|
+
)
|
|
58
|
+
from arize.exceptions.values import (
|
|
59
|
+
InvalidBoundingBoxesCategories,
|
|
60
|
+
InvalidBoundingBoxesCoordinates,
|
|
61
|
+
InvalidBoundingBoxesScores,
|
|
62
|
+
InvalidMultiClassActScoreValue,
|
|
63
|
+
InvalidMultiClassClassNameLength,
|
|
64
|
+
InvalidMultiClassPredScoreValue,
|
|
65
|
+
InvalidMultiClassThresholdClasses,
|
|
66
|
+
InvalidNumClassesMultiClassMap,
|
|
67
|
+
InvalidPolygonCategories,
|
|
68
|
+
InvalidPolygonCoordinates,
|
|
69
|
+
InvalidPolygonScores,
|
|
70
|
+
InvalidRankingCategoryValue,
|
|
71
|
+
InvalidRankValue,
|
|
72
|
+
InvalidRecord,
|
|
73
|
+
InvalidStringLengthInColumn,
|
|
74
|
+
InvalidTagLength,
|
|
75
|
+
InvalidValueMissingValue,
|
|
76
|
+
InvalidValueTimestamp,
|
|
77
|
+
)
|
|
42
78
|
from arize.logging import get_truncation_warning_message
|
|
43
|
-
from arize.ml.batch_validation import
|
|
79
|
+
from arize.ml.batch_validation.errors import (
|
|
80
|
+
DuplicateColumnsInDataframe,
|
|
81
|
+
InvalidBatchId,
|
|
82
|
+
InvalidColumnNameEmptyString,
|
|
83
|
+
InvalidEnvironment,
|
|
84
|
+
InvalidFieldTypeEmbeddingFeatures,
|
|
85
|
+
InvalidFieldTypePromptResponse,
|
|
86
|
+
InvalidModelId,
|
|
87
|
+
InvalidModelType,
|
|
88
|
+
InvalidModelTypeAndMetricsCombination,
|
|
89
|
+
InvalidModelVersion,
|
|
90
|
+
InvalidNumberOfEmbeddings,
|
|
91
|
+
InvalidPredActColumnNamesForModelType,
|
|
92
|
+
InvalidPredActCVColumnNamesForModelType,
|
|
93
|
+
InvalidSchemaType,
|
|
94
|
+
InvalidShapSuffix,
|
|
95
|
+
MissingColumns,
|
|
96
|
+
MissingCVPredAct,
|
|
97
|
+
MissingPredictionIdColumnForDelayedRecords,
|
|
98
|
+
MissingPreprodAct,
|
|
99
|
+
MissingPreprodPredActNumericAndCategorical,
|
|
100
|
+
MissingReqPredActColumnNamesForMultiClass,
|
|
101
|
+
MissingRequiredColumnsForRankingModel,
|
|
102
|
+
MissingRequiredColumnsMetricsValidation,
|
|
103
|
+
MultipleCVPredAct,
|
|
104
|
+
ReservedColumns,
|
|
105
|
+
)
|
|
44
106
|
from arize.ml.types import (
|
|
45
107
|
CATEGORICAL_MODEL_TYPES,
|
|
46
108
|
NUMERIC_MODEL_TYPES,
|
|
@@ -74,8 +136,8 @@ class Validator:
|
|
|
74
136
|
schema: BaseSchema,
|
|
75
137
|
model_version: str | None = None,
|
|
76
138
|
batch_id: str | None = None,
|
|
77
|
-
) -> list[
|
|
78
|
-
"""Validate required checks for schema, environment, and DataFrame structure."""
|
|
139
|
+
) -> list[ValidationError]:
|
|
140
|
+
"""Validate required checks for schema, environment, and :class:`pandas.DataFrame` structure."""
|
|
79
141
|
general_checks = chain(
|
|
80
142
|
Validator._check_valid_schema_type(schema, environment),
|
|
81
143
|
Validator._check_field_convertible_to_str(
|
|
@@ -115,7 +177,7 @@ class Validator:
|
|
|
115
177
|
metric_families: list[Metrics] | None = None,
|
|
116
178
|
model_version: str | None = None,
|
|
117
179
|
batch_id: str | None = None,
|
|
118
|
-
) -> list[
|
|
180
|
+
) -> list[ValidationError]:
|
|
119
181
|
"""Validate parameters including model type, environment, and schema consistency."""
|
|
120
182
|
# general checks
|
|
121
183
|
general_checks = chain(
|
|
@@ -223,7 +285,7 @@ class Validator:
|
|
|
223
285
|
model_type: ModelTypes,
|
|
224
286
|
schema: BaseSchema,
|
|
225
287
|
pyarrow_schema: pa.Schema,
|
|
226
|
-
) -> list[
|
|
288
|
+
) -> list[ValidationError]:
|
|
227
289
|
"""Validate column data types against expected types for the schema."""
|
|
228
290
|
column_types = dict(
|
|
229
291
|
zip(pyarrow_schema.names, pyarrow_schema.types, strict=True)
|
|
@@ -323,7 +385,7 @@ class Validator:
|
|
|
323
385
|
environment: Environments,
|
|
324
386
|
schema: BaseSchema,
|
|
325
387
|
model_type: ModelTypes,
|
|
326
|
-
) -> list[
|
|
388
|
+
) -> list[ValidationError]:
|
|
327
389
|
"""Validate data values including ranges, formats, and consistency checks."""
|
|
328
390
|
# ASSUMPTION: at this point the param and type checks should have passed.
|
|
329
391
|
# This function may crash if that is not true, e.g. if columns are missing
|
|
@@ -444,15 +506,15 @@ class Validator:
|
|
|
444
506
|
@staticmethod
|
|
445
507
|
def _check_column_names_for_empty_strings(
|
|
446
508
|
schema: BaseSchema,
|
|
447
|
-
) -> list[
|
|
509
|
+
) -> list[InvalidColumnNameEmptyString]:
|
|
448
510
|
if "" in schema.get_used_columns():
|
|
449
|
-
return [
|
|
511
|
+
return [InvalidColumnNameEmptyString()]
|
|
450
512
|
return []
|
|
451
513
|
|
|
452
514
|
@staticmethod
|
|
453
515
|
def _check_field_convertible_to_str(
|
|
454
516
|
model_id: object, model_version: object, batch_id: object
|
|
455
|
-
) -> list[
|
|
517
|
+
) -> list[InvalidFieldTypeConversion]:
|
|
456
518
|
# converting to a set first makes the checks run a lot faster
|
|
457
519
|
wrong_fields = []
|
|
458
520
|
if model_id is not None and not isinstance(model_id, str):
|
|
@@ -472,61 +534,59 @@ class Validator:
|
|
|
472
534
|
wrong_fields.append("batch_id")
|
|
473
535
|
|
|
474
536
|
if wrong_fields:
|
|
475
|
-
return [
|
|
537
|
+
return [InvalidFieldTypeConversion(wrong_fields, "string")]
|
|
476
538
|
return []
|
|
477
539
|
|
|
478
540
|
@staticmethod
|
|
479
541
|
def _check_field_type_embedding_features_column_names(
|
|
480
542
|
schema: Schema,
|
|
481
|
-
) -> list[
|
|
543
|
+
) -> list[InvalidFieldTypeEmbeddingFeatures]:
|
|
482
544
|
if schema.embedding_feature_column_names is not None:
|
|
483
545
|
if not isinstance(schema.embedding_feature_column_names, dict):
|
|
484
|
-
return [
|
|
546
|
+
return [InvalidFieldTypeEmbeddingFeatures()]
|
|
485
547
|
for k, v in schema.embedding_feature_column_names.items():
|
|
486
548
|
if not isinstance(k, str) or not isinstance(
|
|
487
549
|
v, EmbeddingColumnNames
|
|
488
550
|
):
|
|
489
|
-
return [
|
|
551
|
+
return [InvalidFieldTypeEmbeddingFeatures()]
|
|
490
552
|
return []
|
|
491
553
|
|
|
492
554
|
@staticmethod
|
|
493
555
|
def _check_field_type_prompt_response(
|
|
494
556
|
schema: Schema,
|
|
495
|
-
) -> list[
|
|
557
|
+
) -> list[InvalidFieldTypePromptResponse]:
|
|
496
558
|
errors = []
|
|
497
559
|
if schema.prompt_column_names is not None and not isinstance(
|
|
498
560
|
schema.prompt_column_names, (str, EmbeddingColumnNames)
|
|
499
561
|
):
|
|
500
|
-
errors.append(
|
|
501
|
-
err.InvalidFieldTypePromptResponse("prompt_column_names")
|
|
502
|
-
)
|
|
562
|
+
errors.append(InvalidFieldTypePromptResponse("prompt_column_names"))
|
|
503
563
|
if schema.response_column_names is not None and not isinstance(
|
|
504
564
|
schema.response_column_names, (str, EmbeddingColumnNames)
|
|
505
565
|
):
|
|
506
566
|
errors.append(
|
|
507
|
-
|
|
567
|
+
InvalidFieldTypePromptResponse("response_column_names")
|
|
508
568
|
)
|
|
509
569
|
return errors
|
|
510
570
|
|
|
511
571
|
@staticmethod
|
|
512
572
|
def _check_field_type_prompt_templates(
|
|
513
573
|
schema: Schema,
|
|
514
|
-
) -> list[
|
|
574
|
+
) -> list[InvalidFieldTypePromptTemplates]:
|
|
515
575
|
if schema.prompt_template_column_names is not None and not isinstance(
|
|
516
576
|
schema.prompt_template_column_names, PromptTemplateColumnNames
|
|
517
577
|
):
|
|
518
|
-
return [
|
|
578
|
+
return [InvalidFieldTypePromptTemplates()]
|
|
519
579
|
return []
|
|
520
580
|
|
|
521
581
|
@staticmethod
|
|
522
582
|
def _check_field_type_llm_config(
|
|
523
583
|
dataframe: pd.DataFrame,
|
|
524
584
|
schema: Schema,
|
|
525
|
-
) -> list[
|
|
585
|
+
) -> list[InvalidFieldTypeLlmConfig | InvalidTypeColumns]:
|
|
526
586
|
if schema.llm_config_column_names is None:
|
|
527
587
|
return []
|
|
528
588
|
if not isinstance(schema.llm_config_column_names, LLMConfigColumnNames):
|
|
529
|
-
return [
|
|
589
|
+
return [InvalidFieldTypeLlmConfig()]
|
|
530
590
|
col = schema.llm_config_column_names.params_column_name
|
|
531
591
|
# We check the types if the columns are in the dataframe.
|
|
532
592
|
# If the columns are reflected in the schema but not present
|
|
@@ -545,7 +605,7 @@ class Validator:
|
|
|
545
605
|
)
|
|
546
606
|
):
|
|
547
607
|
return [
|
|
548
|
-
|
|
608
|
+
InvalidTypeColumns(
|
|
549
609
|
wrong_type_columns=[col],
|
|
550
610
|
expected_types=[
|
|
551
611
|
"Dict[str, (bool, int, float, string or list[str])]"
|
|
@@ -557,9 +617,9 @@ class Validator:
|
|
|
557
617
|
@staticmethod
|
|
558
618
|
def _check_invalid_index(
|
|
559
619
|
dataframe: pd.DataFrame,
|
|
560
|
-
) -> list[
|
|
620
|
+
) -> list[InvalidDataFrameIndex]:
|
|
561
621
|
if (dataframe.index != dataframe.reset_index(drop=True).index).any():
|
|
562
|
-
return [
|
|
622
|
+
return [InvalidDataFrameIndex()]
|
|
563
623
|
return []
|
|
564
624
|
|
|
565
625
|
# ----------------
|
|
@@ -571,7 +631,7 @@ class Validator:
|
|
|
571
631
|
model_type: ModelTypes,
|
|
572
632
|
metric_families: list[Metrics] | None,
|
|
573
633
|
schema: Schema,
|
|
574
|
-
) -> list[
|
|
634
|
+
) -> list[ValidationError]:
|
|
575
635
|
if metric_families is None:
|
|
576
636
|
return []
|
|
577
637
|
|
|
@@ -597,7 +657,7 @@ class Validator:
|
|
|
597
657
|
if not valid_combination:
|
|
598
658
|
# Model type + metrics combination is not valid.
|
|
599
659
|
return [
|
|
600
|
-
|
|
660
|
+
InvalidModelTypeAndMetricsCombination(
|
|
601
661
|
model_type,
|
|
602
662
|
metric_families,
|
|
603
663
|
suggested_model_metric_combinations,
|
|
@@ -606,7 +666,7 @@ class Validator:
|
|
|
606
666
|
if missing_columns:
|
|
607
667
|
# For this model type, the schema is missing columns required for the requested metrics.
|
|
608
668
|
return [
|
|
609
|
-
|
|
669
|
+
MissingRequiredColumnsMetricsValidation(
|
|
610
670
|
model_type, metric_families, missing_columns
|
|
611
671
|
)
|
|
612
672
|
]
|
|
@@ -674,7 +734,7 @@ class Validator:
|
|
|
674
734
|
@staticmethod
|
|
675
735
|
def _check_existence_prediction_id_column_delayed_schema(
|
|
676
736
|
schema: Schema, model_type: ModelTypes
|
|
677
|
-
) -> list[
|
|
737
|
+
) -> list[MissingPredictionIdColumnForDelayedRecords]:
|
|
678
738
|
if schema.prediction_id_column_name is not None:
|
|
679
739
|
return []
|
|
680
740
|
# TODO: Revise logic once prediction_label column addition (for generative models)
|
|
@@ -683,7 +743,7 @@ class Validator:
|
|
|
683
743
|
# We skip GENERATIVE model types since they are assigned a default
|
|
684
744
|
# prediction label column with values equal 1
|
|
685
745
|
return [
|
|
686
|
-
|
|
746
|
+
MissingPredictionIdColumnForDelayedRecords(
|
|
687
747
|
schema.has_actual_columns(),
|
|
688
748
|
schema.has_feature_importance_columns(),
|
|
689
749
|
)
|
|
@@ -705,7 +765,7 @@ class Validator:
|
|
|
705
765
|
def _check_missing_columns(
|
|
706
766
|
dataframe: pd.DataFrame,
|
|
707
767
|
schema: BaseSchema,
|
|
708
|
-
) -> list[
|
|
768
|
+
) -> list[MissingColumns]:
|
|
709
769
|
if isinstance(schema, CorpusSchema):
|
|
710
770
|
return Validator._check_missing_columns_corpus_schema(
|
|
711
771
|
dataframe, schema
|
|
@@ -718,7 +778,7 @@ class Validator:
|
|
|
718
778
|
def _check_missing_columns_schema(
|
|
719
779
|
dataframe: pd.DataFrame,
|
|
720
780
|
schema: Schema,
|
|
721
|
-
) -> list[
|
|
781
|
+
) -> list[MissingColumns]:
|
|
722
782
|
# converting to a set first makes the checks run a lot faster
|
|
723
783
|
existing_columns = set(dataframe.columns)
|
|
724
784
|
missing_columns = []
|
|
@@ -901,14 +961,14 @@ class Validator:
|
|
|
901
961
|
)
|
|
902
962
|
|
|
903
963
|
if missing_columns:
|
|
904
|
-
return [
|
|
964
|
+
return [MissingColumns(missing_columns)]
|
|
905
965
|
return []
|
|
906
966
|
|
|
907
967
|
@staticmethod
|
|
908
968
|
def _check_missing_columns_corpus_schema(
|
|
909
969
|
dataframe: pd.DataFrame,
|
|
910
970
|
schema: CorpusSchema,
|
|
911
|
-
) -> list[
|
|
971
|
+
) -> list[MissingColumns]:
|
|
912
972
|
# converting to a set first makes the checks run a lot faster
|
|
913
973
|
existing_columns = set(dataframe.columns)
|
|
914
974
|
missing_columns = []
|
|
@@ -958,19 +1018,19 @@ class Validator:
|
|
|
958
1018
|
schema.document_text_embedding_column_names.link_to_data_column_name
|
|
959
1019
|
)
|
|
960
1020
|
if missing_columns:
|
|
961
|
-
return [
|
|
1021
|
+
return [MissingColumns(missing_columns)]
|
|
962
1022
|
return []
|
|
963
1023
|
|
|
964
1024
|
@staticmethod
|
|
965
1025
|
def _check_valid_schema_type(
|
|
966
1026
|
schema: BaseSchema,
|
|
967
1027
|
environment: Environments,
|
|
968
|
-
) -> list[
|
|
1028
|
+
) -> list[InvalidSchemaType]:
|
|
969
1029
|
if environment == Environments.CORPUS and not (
|
|
970
1030
|
isinstance(schema, CorpusSchema)
|
|
971
1031
|
):
|
|
972
1032
|
return [
|
|
973
|
-
|
|
1033
|
+
InvalidSchemaType(
|
|
974
1034
|
schema_type=str(type(schema)), environment=environment
|
|
975
1035
|
)
|
|
976
1036
|
]
|
|
@@ -978,7 +1038,7 @@ class Validator:
|
|
|
978
1038
|
schema, CorpusSchema
|
|
979
1039
|
):
|
|
980
1040
|
return [
|
|
981
|
-
|
|
1041
|
+
InvalidSchemaType(
|
|
982
1042
|
schema_type=str(type(schema)), environment=environment
|
|
983
1043
|
)
|
|
984
1044
|
]
|
|
@@ -987,7 +1047,7 @@ class Validator:
|
|
|
987
1047
|
@staticmethod
|
|
988
1048
|
def _check_invalid_shap_suffix(
|
|
989
1049
|
schema: Schema,
|
|
990
|
-
) -> list[
|
|
1050
|
+
) -> list[InvalidShapSuffix]:
|
|
991
1051
|
invalid_column_names = set()
|
|
992
1052
|
|
|
993
1053
|
if schema.feature_column_names is not None:
|
|
@@ -1016,14 +1076,14 @@ class Validator:
|
|
|
1016
1076
|
invalid_column_names.add(col)
|
|
1017
1077
|
|
|
1018
1078
|
if invalid_column_names:
|
|
1019
|
-
return [
|
|
1079
|
+
return [InvalidShapSuffix(invalid_column_names)]
|
|
1020
1080
|
return []
|
|
1021
1081
|
|
|
1022
1082
|
@staticmethod
|
|
1023
1083
|
def _check_reserved_columns(
|
|
1024
1084
|
schema: BaseSchema,
|
|
1025
1085
|
model_type: ModelTypes,
|
|
1026
|
-
) -> list[
|
|
1086
|
+
) -> list[ReservedColumns]:
|
|
1027
1087
|
if isinstance(schema, CorpusSchema):
|
|
1028
1088
|
return []
|
|
1029
1089
|
if isinstance(schema, Schema):
|
|
@@ -1127,29 +1187,29 @@ class Validator:
|
|
|
1127
1187
|
)
|
|
1128
1188
|
|
|
1129
1189
|
if reserved_columns:
|
|
1130
|
-
return [
|
|
1190
|
+
return [ReservedColumns(reserved_columns)]
|
|
1131
1191
|
return []
|
|
1132
1192
|
|
|
1133
1193
|
@staticmethod
|
|
1134
1194
|
def _check_invalid_model_id(
|
|
1135
1195
|
model_id: str | None,
|
|
1136
|
-
) -> list[
|
|
1196
|
+
) -> list[InvalidModelId]:
|
|
1137
1197
|
# assume it's been coerced to string beforehand
|
|
1138
1198
|
if (not isinstance(model_id, str)) or len(model_id.strip()) == 0:
|
|
1139
|
-
return [
|
|
1199
|
+
return [InvalidModelId()]
|
|
1140
1200
|
return []
|
|
1141
1201
|
|
|
1142
1202
|
@staticmethod
|
|
1143
1203
|
def _check_invalid_model_version(
|
|
1144
1204
|
model_version: str | None = None,
|
|
1145
|
-
) -> list[
|
|
1205
|
+
) -> list[InvalidModelVersion]:
|
|
1146
1206
|
if model_version is None:
|
|
1147
1207
|
return []
|
|
1148
1208
|
if (
|
|
1149
1209
|
not isinstance(model_version, str)
|
|
1150
1210
|
or len(model_version.strip()) == 0
|
|
1151
1211
|
):
|
|
1152
|
-
return [
|
|
1212
|
+
return [InvalidModelVersion()]
|
|
1153
1213
|
|
|
1154
1214
|
return []
|
|
1155
1215
|
|
|
@@ -1157,35 +1217,35 @@ class Validator:
|
|
|
1157
1217
|
def _check_invalid_batch_id(
|
|
1158
1218
|
batch_id: str | None,
|
|
1159
1219
|
environment: Environments,
|
|
1160
|
-
) -> list[
|
|
1220
|
+
) -> list[InvalidBatchId]:
|
|
1161
1221
|
# assume it's been coerced to string beforehand
|
|
1162
1222
|
if environment in (Environments.VALIDATION,) and (
|
|
1163
1223
|
(not isinstance(batch_id, str)) or len(batch_id.strip()) == 0
|
|
1164
1224
|
):
|
|
1165
|
-
return [
|
|
1225
|
+
return [InvalidBatchId()]
|
|
1166
1226
|
return []
|
|
1167
1227
|
|
|
1168
1228
|
@staticmethod
|
|
1169
1229
|
def _check_invalid_model_type(
|
|
1170
1230
|
model_type: ModelTypes,
|
|
1171
|
-
) -> list[
|
|
1231
|
+
) -> list[InvalidModelType]:
|
|
1172
1232
|
if model_type in (mt for mt in ModelTypes):
|
|
1173
1233
|
return []
|
|
1174
|
-
return [
|
|
1234
|
+
return [InvalidModelType()]
|
|
1175
1235
|
|
|
1176
1236
|
@staticmethod
|
|
1177
1237
|
def _check_invalid_environment(
|
|
1178
1238
|
environment: Environments,
|
|
1179
|
-
) -> list[
|
|
1239
|
+
) -> list[InvalidEnvironment]:
|
|
1180
1240
|
if environment in (env for env in Environments):
|
|
1181
1241
|
return []
|
|
1182
|
-
return [
|
|
1242
|
+
return [InvalidEnvironment()]
|
|
1183
1243
|
|
|
1184
1244
|
@staticmethod
|
|
1185
1245
|
def _check_existence_preprod_pred_act_score_or_label(
|
|
1186
1246
|
schema: Schema,
|
|
1187
1247
|
environment: Environments,
|
|
1188
|
-
) -> list[
|
|
1248
|
+
) -> list[MissingPreprodPredActNumericAndCategorical]:
|
|
1189
1249
|
if environment in (Environments.VALIDATION, Environments.TRAINING) and (
|
|
1190
1250
|
(
|
|
1191
1251
|
schema.prediction_label_column_name is None
|
|
@@ -1196,13 +1256,13 @@ class Validator:
|
|
|
1196
1256
|
and schema.actual_score_column_name is None
|
|
1197
1257
|
)
|
|
1198
1258
|
):
|
|
1199
|
-
return [
|
|
1259
|
+
return [MissingPreprodPredActNumericAndCategorical()]
|
|
1200
1260
|
return []
|
|
1201
1261
|
|
|
1202
1262
|
@staticmethod
|
|
1203
1263
|
def _check_exactly_one_cv_column_type(
|
|
1204
1264
|
schema: Schema, environment: Environments
|
|
1205
|
-
) -> list[
|
|
1265
|
+
) -> list[MultipleCVPredAct | MissingCVPredAct]:
|
|
1206
1266
|
# Checks that the required prediction/actual columns are given in the schema depending on
|
|
1207
1267
|
# the environment, for object detection models. There should be exactly one of
|
|
1208
1268
|
# object detection, semantic segmentation, or instance segmentation columns.
|
|
@@ -1232,9 +1292,9 @@ class Validator:
|
|
|
1232
1292
|
)
|
|
1233
1293
|
|
|
1234
1294
|
if cv_types_count == 0:
|
|
1235
|
-
return [
|
|
1295
|
+
return [MissingCVPredAct(environment)]
|
|
1236
1296
|
if cv_types_count > 1:
|
|
1237
|
-
return [
|
|
1297
|
+
return [MultipleCVPredAct(environment)]
|
|
1238
1298
|
|
|
1239
1299
|
elif environment in (
|
|
1240
1300
|
Environments.TRAINING,
|
|
@@ -1265,16 +1325,16 @@ class Validator:
|
|
|
1265
1325
|
)
|
|
1266
1326
|
|
|
1267
1327
|
if cv_types_count == 0:
|
|
1268
|
-
return [
|
|
1328
|
+
return [MissingCVPredAct(environment)]
|
|
1269
1329
|
if cv_types_count > 1:
|
|
1270
|
-
return [
|
|
1330
|
+
return [MultipleCVPredAct(environment)]
|
|
1271
1331
|
|
|
1272
1332
|
return []
|
|
1273
1333
|
|
|
1274
1334
|
@staticmethod
|
|
1275
1335
|
def _check_missing_object_detection_columns(
|
|
1276
1336
|
schema: Schema, model_type: ModelTypes
|
|
1277
|
-
) -> list[
|
|
1337
|
+
) -> list[InvalidPredActCVColumnNamesForModelType]:
|
|
1278
1338
|
# Checks that models that are not Object Detection models don't have, in the schema, the
|
|
1279
1339
|
# object detection, semantic segmentation, or instance segmentation dedicated prediction/actual
|
|
1280
1340
|
# column names
|
|
@@ -1286,13 +1346,13 @@ class Validator:
|
|
|
1286
1346
|
or schema.instance_segmentation_prediction_column_names is not None
|
|
1287
1347
|
or schema.instance_segmentation_actual_column_names is not None
|
|
1288
1348
|
):
|
|
1289
|
-
return [
|
|
1349
|
+
return [InvalidPredActCVColumnNamesForModelType(model_type)]
|
|
1290
1350
|
return []
|
|
1291
1351
|
|
|
1292
1352
|
@staticmethod
|
|
1293
1353
|
def _check_missing_non_object_detection_columns(
|
|
1294
1354
|
schema: Schema, model_type: ModelTypes
|
|
1295
|
-
) -> list[
|
|
1355
|
+
) -> list[InvalidPredActColumnNamesForModelType]:
|
|
1296
1356
|
# Checks that object detection models don't have, in the schema, the columns reserved for
|
|
1297
1357
|
# other model types
|
|
1298
1358
|
columns_to_check = (
|
|
@@ -1317,7 +1377,7 @@ class Validator:
|
|
|
1317
1377
|
"instance_segmentation_actual_column_names",
|
|
1318
1378
|
]
|
|
1319
1379
|
return [
|
|
1320
|
-
|
|
1380
|
+
InvalidPredActColumnNamesForModelType(
|
|
1321
1381
|
model_type, allowed_cols, wrong_cols
|
|
1322
1382
|
)
|
|
1323
1383
|
]
|
|
@@ -1326,7 +1386,7 @@ class Validator:
|
|
|
1326
1386
|
@staticmethod
|
|
1327
1387
|
def _check_missing_multi_class_columns(
|
|
1328
1388
|
schema: Schema, model_type: ModelTypes
|
|
1329
|
-
) -> list[
|
|
1389
|
+
) -> list[InvalidPredActColumnNamesForModelType]:
|
|
1330
1390
|
# Checks that models that are not Multi Class models don't have, in the schema, the
|
|
1331
1391
|
# multi class dedicated threshold column
|
|
1332
1392
|
if (
|
|
@@ -1334,7 +1394,7 @@ class Validator:
|
|
|
1334
1394
|
and schema.multi_class_threshold_scores_column_name is not None
|
|
1335
1395
|
):
|
|
1336
1396
|
return [
|
|
1337
|
-
|
|
1397
|
+
InvalidPredActColumnNamesForModelType(
|
|
1338
1398
|
model_type,
|
|
1339
1399
|
None,
|
|
1340
1400
|
[schema.multi_class_threshold_scores_column_name],
|
|
@@ -1345,7 +1405,7 @@ class Validator:
|
|
|
1345
1405
|
@staticmethod
|
|
1346
1406
|
def _check_existing_multi_class_columns(
|
|
1347
1407
|
schema: Schema,
|
|
1348
|
-
) -> list[
|
|
1408
|
+
) -> list[MissingReqPredActColumnNamesForMultiClass]:
|
|
1349
1409
|
# Checks that models that are Multi Class models have, in the schema, the
|
|
1350
1410
|
# required prediction score or actual score columns
|
|
1351
1411
|
if (
|
|
@@ -1355,13 +1415,13 @@ class Validator:
|
|
|
1355
1415
|
schema.multi_class_threshold_scores_column_name is not None
|
|
1356
1416
|
and schema.prediction_score_column_name is None
|
|
1357
1417
|
):
|
|
1358
|
-
return [
|
|
1418
|
+
return [MissingReqPredActColumnNamesForMultiClass()]
|
|
1359
1419
|
return []
|
|
1360
1420
|
|
|
1361
1421
|
@staticmethod
|
|
1362
1422
|
def _check_missing_non_multi_class_columns(
|
|
1363
1423
|
schema: Schema, model_type: ModelTypes
|
|
1364
|
-
) -> list[
|
|
1424
|
+
) -> list[InvalidPredActColumnNamesForModelType]:
|
|
1365
1425
|
# Checks that multi class models don't have, in the schema, the columns reserved for
|
|
1366
1426
|
# other model types
|
|
1367
1427
|
columns_to_check = (
|
|
@@ -1387,7 +1447,7 @@ class Validator:
|
|
|
1387
1447
|
"actual_score_column_name",
|
|
1388
1448
|
]
|
|
1389
1449
|
return [
|
|
1390
|
-
|
|
1450
|
+
InvalidPredActColumnNamesForModelType(
|
|
1391
1451
|
model_type, allowed_cols, wrong_cols
|
|
1392
1452
|
)
|
|
1393
1453
|
]
|
|
@@ -1397,17 +1457,17 @@ class Validator:
|
|
|
1397
1457
|
def _check_existence_preprod_act(
|
|
1398
1458
|
schema: Schema,
|
|
1399
1459
|
environment: Environments,
|
|
1400
|
-
) -> list[
|
|
1460
|
+
) -> list[MissingPreprodAct]:
|
|
1401
1461
|
if environment in (Environments.VALIDATION, Environments.TRAINING) and (
|
|
1402
1462
|
schema.actual_label_column_name is None
|
|
1403
1463
|
):
|
|
1404
|
-
return [
|
|
1464
|
+
return [MissingPreprodAct()]
|
|
1405
1465
|
return []
|
|
1406
1466
|
|
|
1407
1467
|
@staticmethod
|
|
1408
1468
|
def _check_existence_group_id_rank_category_relevance(
|
|
1409
1469
|
schema: Schema,
|
|
1410
|
-
) -> list[
|
|
1470
|
+
) -> list[MissingRequiredColumnsForRankingModel]:
|
|
1411
1471
|
# prediction_group_id and rank columns are required as ranking prediction columns.
|
|
1412
1472
|
ranking_prediction_cols = (
|
|
1413
1473
|
schema.prediction_label_column_name,
|
|
@@ -1425,13 +1485,13 @@ class Validator:
|
|
|
1425
1485
|
# If there is prediction information (not delayed actuals),
|
|
1426
1486
|
# there must exist a rank and prediction group id columns
|
|
1427
1487
|
if has_prediction_info and any(col is None for col in required):
|
|
1428
|
-
return [
|
|
1488
|
+
return [MissingRequiredColumnsForRankingModel()]
|
|
1429
1489
|
return []
|
|
1430
1490
|
|
|
1431
1491
|
@staticmethod
|
|
1432
1492
|
def _check_dataframe_for_duplicate_columns(
|
|
1433
1493
|
schema: BaseSchema, dataframe: pd.DataFrame
|
|
1434
|
-
) -> list[
|
|
1494
|
+
) -> list[DuplicateColumnsInDataframe]:
|
|
1435
1495
|
# Get the columns used in the schema
|
|
1436
1496
|
schema_col_used = schema.get_used_columns()
|
|
1437
1497
|
# Get the duplicated column names from the dataframe
|
|
@@ -1441,17 +1501,17 @@ class Validator:
|
|
|
1441
1501
|
col for col in duplicate_columns if col in schema_col_used
|
|
1442
1502
|
]
|
|
1443
1503
|
if schema_duplicate_cols:
|
|
1444
|
-
return [
|
|
1504
|
+
return [DuplicateColumnsInDataframe(schema_duplicate_cols)]
|
|
1445
1505
|
return []
|
|
1446
1506
|
|
|
1447
1507
|
@staticmethod
|
|
1448
1508
|
def _check_invalid_number_of_embeddings(
|
|
1449
1509
|
schema: Schema,
|
|
1450
|
-
) -> list[
|
|
1510
|
+
) -> list[InvalidNumberOfEmbeddings]:
|
|
1451
1511
|
if schema.embedding_feature_column_names is not None:
|
|
1452
1512
|
number_of_embeddings = len(schema.embedding_feature_column_names)
|
|
1453
1513
|
if number_of_embeddings > MAX_NUMBER_OF_EMBEDDINGS:
|
|
1454
|
-
return [
|
|
1514
|
+
return [InvalidNumberOfEmbeddings(number_of_embeddings)]
|
|
1455
1515
|
return []
|
|
1456
1516
|
|
|
1457
1517
|
# -----------
|
|
@@ -1461,7 +1521,7 @@ class Validator:
|
|
|
1461
1521
|
@staticmethod
|
|
1462
1522
|
def _check_type_prediction_id(
|
|
1463
1523
|
schema: Schema, column_types: dict[str, Any]
|
|
1464
|
-
) -> list[
|
|
1524
|
+
) -> list[InvalidType]:
|
|
1465
1525
|
col = schema.prediction_id_column_name
|
|
1466
1526
|
if col in column_types:
|
|
1467
1527
|
# should mirror server side
|
|
@@ -1474,7 +1534,7 @@ class Validator:
|
|
|
1474
1534
|
)
|
|
1475
1535
|
if column_types[col] not in allowed_datatypes:
|
|
1476
1536
|
return [
|
|
1477
|
-
|
|
1537
|
+
InvalidType(
|
|
1478
1538
|
"Prediction IDs",
|
|
1479
1539
|
expected_types=["str", "int"],
|
|
1480
1540
|
found_data_type=column_types[col],
|
|
@@ -1485,7 +1545,7 @@ class Validator:
|
|
|
1485
1545
|
@staticmethod
|
|
1486
1546
|
def _check_type_timestamp(
|
|
1487
1547
|
schema: Schema, column_types: dict[str, Any]
|
|
1488
|
-
) -> list[
|
|
1548
|
+
) -> list[InvalidType]:
|
|
1489
1549
|
col = schema.timestamp_column_name
|
|
1490
1550
|
if col in column_types:
|
|
1491
1551
|
# should mirror server side
|
|
@@ -1501,7 +1561,7 @@ class Validator:
|
|
|
1501
1561
|
and t not in allowed_datatypes
|
|
1502
1562
|
):
|
|
1503
1563
|
return [
|
|
1504
|
-
|
|
1564
|
+
InvalidType(
|
|
1505
1565
|
"Prediction timestamp",
|
|
1506
1566
|
expected_types=["Date", "Timestamp", "int", "float"],
|
|
1507
1567
|
found_data_type=t,
|
|
@@ -1512,7 +1572,7 @@ class Validator:
|
|
|
1512
1572
|
@staticmethod
|
|
1513
1573
|
def _check_type_features(
|
|
1514
1574
|
schema: Schema, column_types: dict[str, Any]
|
|
1515
|
-
) -> list[
|
|
1575
|
+
) -> list[InvalidTypeFeatures]:
|
|
1516
1576
|
if schema.feature_column_names is not None:
|
|
1517
1577
|
# should mirror server side
|
|
1518
1578
|
allowed_datatypes = (
|
|
@@ -1535,7 +1595,7 @@ class Validator:
|
|
|
1535
1595
|
]
|
|
1536
1596
|
if wrong_type_cols:
|
|
1537
1597
|
return [
|
|
1538
|
-
|
|
1598
|
+
InvalidTypeFeatures(
|
|
1539
1599
|
wrong_type_cols,
|
|
1540
1600
|
expected_types=[
|
|
1541
1601
|
"float",
|
|
@@ -1551,7 +1611,7 @@ class Validator:
|
|
|
1551
1611
|
@staticmethod
|
|
1552
1612
|
def _check_type_embedding_features(
|
|
1553
1613
|
schema: Schema, column_types: dict[str, Any]
|
|
1554
|
-
) -> list[
|
|
1614
|
+
) -> list[InvalidTypeFeatures]:
|
|
1555
1615
|
if schema.embedding_feature_column_names is not None:
|
|
1556
1616
|
# should mirror server side
|
|
1557
1617
|
allowed_vector_datatypes = (
|
|
@@ -1599,20 +1659,20 @@ class Validator:
|
|
|
1599
1659
|
wrong_type_embedding_errors = []
|
|
1600
1660
|
if wrong_type_vector_columns:
|
|
1601
1661
|
wrong_type_embedding_errors.append(
|
|
1602
|
-
|
|
1662
|
+
InvalidTypeFeatures(
|
|
1603
1663
|
wrong_type_vector_columns,
|
|
1604
1664
|
expected_types=["list[float], np.array[float]"],
|
|
1605
1665
|
)
|
|
1606
1666
|
)
|
|
1607
1667
|
if wrong_type_data_columns:
|
|
1608
1668
|
wrong_type_embedding_errors.append(
|
|
1609
|
-
|
|
1669
|
+
InvalidTypeFeatures(
|
|
1610
1670
|
wrong_type_data_columns, expected_types=["list[string]"]
|
|
1611
1671
|
)
|
|
1612
1672
|
)
|
|
1613
1673
|
if wrong_type_link_to_data_columns:
|
|
1614
1674
|
wrong_type_embedding_errors.append(
|
|
1615
|
-
|
|
1675
|
+
InvalidTypeFeatures(
|
|
1616
1676
|
wrong_type_link_to_data_columns,
|
|
1617
1677
|
expected_types=["string"],
|
|
1618
1678
|
)
|
|
@@ -1627,7 +1687,7 @@ class Validator:
|
|
|
1627
1687
|
@staticmethod
|
|
1628
1688
|
def _check_type_tags(
|
|
1629
1689
|
schema: Schema, column_types: dict[str, Any]
|
|
1630
|
-
) -> list[
|
|
1690
|
+
) -> list[InvalidTypeTags]:
|
|
1631
1691
|
if schema.tag_column_names is not None:
|
|
1632
1692
|
# should mirror server side
|
|
1633
1693
|
allowed_datatypes = (
|
|
@@ -1649,7 +1709,7 @@ class Validator:
|
|
|
1649
1709
|
]
|
|
1650
1710
|
if wrong_type_cols:
|
|
1651
1711
|
return [
|
|
1652
|
-
|
|
1712
|
+
InvalidTypeTags(
|
|
1653
1713
|
wrong_type_cols, ["float", "int", "bool", "str"]
|
|
1654
1714
|
)
|
|
1655
1715
|
]
|
|
@@ -1658,7 +1718,7 @@ class Validator:
|
|
|
1658
1718
|
@staticmethod
|
|
1659
1719
|
def _check_type_shap_values(
|
|
1660
1720
|
schema: Schema, column_types: dict[str, Any]
|
|
1661
|
-
) -> list[
|
|
1721
|
+
) -> list[InvalidTypeShapValues]:
|
|
1662
1722
|
if schema.shap_values_column_names is not None:
|
|
1663
1723
|
# should mirror server side
|
|
1664
1724
|
allowed_datatypes = (
|
|
@@ -1675,7 +1735,7 @@ class Validator:
|
|
|
1675
1735
|
]
|
|
1676
1736
|
if wrong_type_cols:
|
|
1677
1737
|
return [
|
|
1678
|
-
|
|
1738
|
+
InvalidTypeShapValues(
|
|
1679
1739
|
wrong_type_cols, expected_types=["float", "int"]
|
|
1680
1740
|
)
|
|
1681
1741
|
]
|
|
@@ -1684,7 +1744,7 @@ class Validator:
|
|
|
1684
1744
|
@staticmethod
|
|
1685
1745
|
def _check_type_pred_act_labels(
|
|
1686
1746
|
model_type: ModelTypes, schema: Schema, column_types: dict[str, Any]
|
|
1687
|
-
) -> list[
|
|
1747
|
+
) -> list[InvalidType]:
|
|
1688
1748
|
errors = []
|
|
1689
1749
|
columns = (
|
|
1690
1750
|
("Prediction labels", schema.prediction_label_column_name),
|
|
@@ -1713,7 +1773,7 @@ class Validator:
|
|
|
1713
1773
|
and column_types[col] not in allowed_datatypes
|
|
1714
1774
|
):
|
|
1715
1775
|
errors.append(
|
|
1716
|
-
|
|
1776
|
+
InvalidType(
|
|
1717
1777
|
name,
|
|
1718
1778
|
expected_types=["float", "int", "bool", "str"],
|
|
1719
1779
|
found_data_type=column_types[col],
|
|
@@ -1737,7 +1797,7 @@ class Validator:
|
|
|
1737
1797
|
and column_types[col] not in allowed_datatypes
|
|
1738
1798
|
):
|
|
1739
1799
|
errors.append(
|
|
1740
|
-
|
|
1800
|
+
InvalidType(
|
|
1741
1801
|
name,
|
|
1742
1802
|
expected_types=["float", "int"],
|
|
1743
1803
|
found_data_type=column_types[col],
|
|
@@ -1748,7 +1808,7 @@ class Validator:
|
|
|
1748
1808
|
@staticmethod
|
|
1749
1809
|
def _check_type_pred_act_scores(
|
|
1750
1810
|
model_type: ModelTypes, schema: Schema, column_types: dict[str, Any]
|
|
1751
|
-
) -> list[
|
|
1811
|
+
) -> list[InvalidType]:
|
|
1752
1812
|
errors = []
|
|
1753
1813
|
columns = (
|
|
1754
1814
|
("Prediction scores", schema.prediction_score_column_name),
|
|
@@ -1777,7 +1837,7 @@ class Validator:
|
|
|
1777
1837
|
and column_types[col] not in allowed_datatypes
|
|
1778
1838
|
):
|
|
1779
1839
|
errors.append(
|
|
1780
|
-
|
|
1840
|
+
InvalidType(
|
|
1781
1841
|
name,
|
|
1782
1842
|
expected_types=["float", "int"],
|
|
1783
1843
|
found_data_type=column_types[col],
|
|
@@ -1788,7 +1848,7 @@ class Validator:
|
|
|
1788
1848
|
@staticmethod
|
|
1789
1849
|
def _check_type_multi_class_pred_threshold_act_scores(
|
|
1790
1850
|
schema: Schema, column_types: dict[str, Any]
|
|
1791
|
-
) -> list[
|
|
1851
|
+
) -> list[InvalidType]:
|
|
1792
1852
|
"""Check type for prediction / threshold / actual scores for multiclass model.
|
|
1793
1853
|
|
|
1794
1854
|
Expect the scores to be a list of pyarrow structs that contains field
|
|
@@ -1834,7 +1894,7 @@ class Validator:
|
|
|
1834
1894
|
and column_types[col] not in allowed_class_score_map_datatypes
|
|
1835
1895
|
):
|
|
1836
1896
|
errors.append(
|
|
1837
|
-
|
|
1897
|
+
InvalidType(
|
|
1838
1898
|
name,
|
|
1839
1899
|
expected_types=[
|
|
1840
1900
|
"List[Dict{class_name: str, score: int}]",
|
|
@@ -1848,7 +1908,7 @@ class Validator:
|
|
|
1848
1908
|
@staticmethod
|
|
1849
1909
|
def _check_type_prompt_response(
|
|
1850
1910
|
schema: Schema, column_types: dict[str, Any]
|
|
1851
|
-
) -> list[
|
|
1911
|
+
) -> list[InvalidTypeColumns]:
|
|
1852
1912
|
fields_to_check = []
|
|
1853
1913
|
if schema.prompt_column_names is not None:
|
|
1854
1914
|
fields_to_check.append(schema.prompt_column_names)
|
|
@@ -1895,20 +1955,20 @@ class Validator:
|
|
|
1895
1955
|
wrong_type_col_errors = []
|
|
1896
1956
|
if wrong_type_vector_columns:
|
|
1897
1957
|
wrong_type_col_errors.append(
|
|
1898
|
-
|
|
1958
|
+
InvalidTypeColumns(
|
|
1899
1959
|
wrong_type_vector_columns,
|
|
1900
1960
|
expected_types=["list[float], np.array[float]"],
|
|
1901
1961
|
)
|
|
1902
1962
|
)
|
|
1903
1963
|
if wrong_type_data_columns:
|
|
1904
1964
|
wrong_type_col_errors.append(
|
|
1905
|
-
|
|
1965
|
+
InvalidTypeColumns(
|
|
1906
1966
|
wrong_type_data_columns, expected_types=["str, list[str]"]
|
|
1907
1967
|
)
|
|
1908
1968
|
)
|
|
1909
1969
|
if wrong_type_str_columns:
|
|
1910
1970
|
wrong_type_col_errors.append(
|
|
1911
|
-
|
|
1971
|
+
InvalidTypeColumns(
|
|
1912
1972
|
wrong_type_str_columns, expected_types=["str"]
|
|
1913
1973
|
)
|
|
1914
1974
|
)
|
|
@@ -1918,7 +1978,7 @@ class Validator:
|
|
|
1918
1978
|
@staticmethod
|
|
1919
1979
|
def _check_type_llm_prompt_templates(
|
|
1920
1980
|
schema: Schema, column_types: dict[str, Any]
|
|
1921
|
-
) -> list[
|
|
1981
|
+
) -> list[InvalidTypeColumns]:
|
|
1922
1982
|
if schema.prompt_template_column_names is None:
|
|
1923
1983
|
return []
|
|
1924
1984
|
|
|
@@ -1949,7 +2009,7 @@ class Validator:
|
|
|
1949
2009
|
# Return errors if any
|
|
1950
2010
|
if wrong_type_cols:
|
|
1951
2011
|
return [
|
|
1952
|
-
|
|
2012
|
+
InvalidTypeColumns(
|
|
1953
2013
|
wrong_type_columns=wrong_type_cols,
|
|
1954
2014
|
expected_types=["string"],
|
|
1955
2015
|
)
|
|
@@ -1959,7 +2019,7 @@ class Validator:
|
|
|
1959
2019
|
@staticmethod
|
|
1960
2020
|
def _check_type_llm_config(
|
|
1961
2021
|
schema: Schema, column_types: dict[str, Any]
|
|
1962
|
-
) -> list[
|
|
2022
|
+
) -> list[InvalidTypeColumns]:
|
|
1963
2023
|
if schema.llm_config_column_names is None:
|
|
1964
2024
|
return []
|
|
1965
2025
|
|
|
@@ -1986,7 +2046,7 @@ class Validator:
|
|
|
1986
2046
|
# Return errors if any
|
|
1987
2047
|
if wrong_type_cols:
|
|
1988
2048
|
return [
|
|
1989
|
-
|
|
2049
|
+
InvalidTypeColumns(
|
|
1990
2050
|
wrong_type_columns=wrong_type_cols,
|
|
1991
2051
|
expected_types=["string"],
|
|
1992
2052
|
)
|
|
@@ -1996,7 +2056,7 @@ class Validator:
|
|
|
1996
2056
|
@staticmethod
|
|
1997
2057
|
def _check_type_llm_run_metadata(
|
|
1998
2058
|
schema: Schema, column_types: dict[str, Any]
|
|
1999
|
-
) -> list[
|
|
2059
|
+
) -> list[InvalidTypeColumns]:
|
|
2000
2060
|
if schema.llm_run_metadata_column_names is None:
|
|
2001
2061
|
return []
|
|
2002
2062
|
|
|
@@ -2059,7 +2119,7 @@ class Validator:
|
|
|
2059
2119
|
# Return errors if there are any
|
|
2060
2120
|
if wrong_type_cols:
|
|
2061
2121
|
return [
|
|
2062
|
-
|
|
2122
|
+
InvalidTypeColumns(
|
|
2063
2123
|
wrong_type_columns=wrong_type_cols,
|
|
2064
2124
|
expected_types=["int", "float"],
|
|
2065
2125
|
)
|
|
@@ -2069,7 +2129,7 @@ class Validator:
|
|
|
2069
2129
|
@staticmethod
|
|
2070
2130
|
def _check_type_retrieved_document_ids(
|
|
2071
2131
|
schema: Schema, column_types: dict[str, Any]
|
|
2072
|
-
) -> list[
|
|
2132
|
+
) -> list[InvalidType]:
|
|
2073
2133
|
col = schema.retrieved_document_ids_column_name
|
|
2074
2134
|
if col in column_types:
|
|
2075
2135
|
# should mirror server side
|
|
@@ -2079,7 +2139,7 @@ class Validator:
|
|
|
2079
2139
|
)
|
|
2080
2140
|
if column_types[col] not in allowed_datatypes:
|
|
2081
2141
|
return [
|
|
2082
|
-
|
|
2142
|
+
InvalidType(
|
|
2083
2143
|
"Retrieved Document IDs",
|
|
2084
2144
|
expected_types=["List[str]"],
|
|
2085
2145
|
found_data_type=column_types[col],
|
|
@@ -2090,7 +2150,7 @@ class Validator:
|
|
|
2090
2150
|
@staticmethod
|
|
2091
2151
|
def _check_type_image_segment_coordinates(
|
|
2092
2152
|
schema: Schema, column_types: dict[str, Any]
|
|
2093
|
-
) -> list[
|
|
2153
|
+
) -> list[InvalidTypeColumns]:
|
|
2094
2154
|
# should mirror server side
|
|
2095
2155
|
allowed_coordinate_types = (
|
|
2096
2156
|
pa.list_(pa.list_(pa.float64())),
|
|
@@ -2173,7 +2233,7 @@ class Validator:
|
|
|
2173
2233
|
|
|
2174
2234
|
return (
|
|
2175
2235
|
[
|
|
2176
|
-
|
|
2236
|
+
InvalidTypeColumns(
|
|
2177
2237
|
wrong_type_columns=wrong_type_cols,
|
|
2178
2238
|
expected_types=["List[List[float]]"],
|
|
2179
2239
|
)
|
|
@@ -2185,7 +2245,7 @@ class Validator:
|
|
|
2185
2245
|
@staticmethod
|
|
2186
2246
|
def _check_type_image_segment_categories(
|
|
2187
2247
|
schema: Schema, column_types: dict[str, Any]
|
|
2188
|
-
) -> list[
|
|
2248
|
+
) -> list[InvalidTypeColumns]:
|
|
2189
2249
|
# should mirror server side
|
|
2190
2250
|
allowed_category_datatypes = (
|
|
2191
2251
|
pa.list_(pa.string()),
|
|
@@ -2242,7 +2302,7 @@ class Validator:
|
|
|
2242
2302
|
|
|
2243
2303
|
return (
|
|
2244
2304
|
[
|
|
2245
|
-
|
|
2305
|
+
InvalidTypeColumns(
|
|
2246
2306
|
wrong_type_columns=wrong_type_cols,
|
|
2247
2307
|
expected_types=["List[str]"],
|
|
2248
2308
|
)
|
|
@@ -2254,7 +2314,7 @@ class Validator:
|
|
|
2254
2314
|
@staticmethod
|
|
2255
2315
|
def _check_type_image_segment_scores(
|
|
2256
2316
|
schema: Schema, column_types: dict[str, Any]
|
|
2257
|
-
) -> list[
|
|
2317
|
+
) -> list[InvalidTypeColumns]:
|
|
2258
2318
|
# should mirror server side
|
|
2259
2319
|
allowed_score_datatypes = (
|
|
2260
2320
|
pa.list_(pa.float64()),
|
|
@@ -2297,7 +2357,7 @@ class Validator:
|
|
|
2297
2357
|
|
|
2298
2358
|
return (
|
|
2299
2359
|
[
|
|
2300
|
-
|
|
2360
|
+
InvalidTypeColumns(
|
|
2301
2361
|
wrong_type_columns=wrong_type_cols,
|
|
2302
2362
|
expected_types=["List[float]"],
|
|
2303
2363
|
)
|
|
@@ -2313,7 +2373,7 @@ class Validator:
|
|
|
2313
2373
|
@staticmethod
|
|
2314
2374
|
def _check_embedding_vectors_dimensionality(
|
|
2315
2375
|
dataframe: pd.DataFrame, schema: Schema
|
|
2316
|
-
) -> list[
|
|
2376
|
+
) -> list[ValidationError]:
|
|
2317
2377
|
if schema.embedding_feature_column_names is None:
|
|
2318
2378
|
return []
|
|
2319
2379
|
|
|
@@ -2331,7 +2391,7 @@ class Validator:
|
|
|
2331
2391
|
|
|
2332
2392
|
return (
|
|
2333
2393
|
[
|
|
2334
|
-
|
|
2394
|
+
InvalidValueEmbeddingVectorDimensionality(
|
|
2335
2395
|
invalid_low_dim_vector_cols,
|
|
2336
2396
|
invalid_high_dim_vector_cols,
|
|
2337
2397
|
),
|
|
@@ -2343,7 +2403,7 @@ class Validator:
|
|
|
2343
2403
|
@staticmethod
|
|
2344
2404
|
def _check_embedding_raw_data_characters(
|
|
2345
2405
|
dataframe: pd.DataFrame, schema: Schema
|
|
2346
|
-
) -> list[
|
|
2406
|
+
) -> list[ValidationError]:
|
|
2347
2407
|
if schema.embedding_feature_column_names is None:
|
|
2348
2408
|
return []
|
|
2349
2409
|
|
|
@@ -2361,7 +2421,7 @@ class Validator:
|
|
|
2361
2421
|
|
|
2362
2422
|
if invalid_long_string_data_cols:
|
|
2363
2423
|
return [
|
|
2364
|
-
|
|
2424
|
+
InvalidValueEmbeddingRawDataTooLong(
|
|
2365
2425
|
invalid_long_string_data_cols
|
|
2366
2426
|
)
|
|
2367
2427
|
]
|
|
@@ -2377,20 +2437,20 @@ class Validator:
|
|
|
2377
2437
|
@staticmethod
|
|
2378
2438
|
def _check_value_rank(
|
|
2379
2439
|
dataframe: pd.DataFrame, schema: Schema
|
|
2380
|
-
) -> list[
|
|
2440
|
+
) -> list[InvalidRankValue]:
|
|
2381
2441
|
col = schema.rank_column_name
|
|
2382
2442
|
lbound, ubound = (1, 100)
|
|
2383
2443
|
|
|
2384
2444
|
if col is not None and col in dataframe.columns:
|
|
2385
2445
|
rank_min_max = dataframe[col].agg(["min", "max"])
|
|
2386
2446
|
if rank_min_max["min"] < lbound or rank_min_max["max"] > ubound:
|
|
2387
|
-
return [
|
|
2447
|
+
return [InvalidRankValue(col, "1-100")]
|
|
2388
2448
|
return []
|
|
2389
2449
|
|
|
2390
2450
|
@staticmethod
|
|
2391
2451
|
def _check_id_field_str_length(
|
|
2392
2452
|
dataframe: pd.DataFrame, schema_name: str, id_col_name: str | None
|
|
2393
|
-
) -> list[
|
|
2453
|
+
) -> list[ValidationError]:
|
|
2394
2454
|
"""Require prediction_id to be a string of length between MIN and MAX.
|
|
2395
2455
|
|
|
2396
2456
|
Between MIN_PREDICTION_ID_LEN and MAX_PREDICTION_ID_LEN.
|
|
@@ -2412,7 +2472,7 @@ class Validator:
|
|
|
2412
2472
|
.all()
|
|
2413
2473
|
):
|
|
2414
2474
|
return [
|
|
2415
|
-
|
|
2475
|
+
InvalidStringLengthInColumn(
|
|
2416
2476
|
schema_name=schema_name,
|
|
2417
2477
|
col_name=id_col_name,
|
|
2418
2478
|
min_length=MIN_PREDICTION_ID_LEN,
|
|
@@ -2424,7 +2484,7 @@ class Validator:
|
|
|
2424
2484
|
@staticmethod
|
|
2425
2485
|
def _check_document_id_field_str_length(
|
|
2426
2486
|
dataframe: pd.DataFrame, schema_name: str, id_col_name: str | None
|
|
2427
|
-
) -> list[
|
|
2487
|
+
) -> list[ValidationError]:
|
|
2428
2488
|
"""Require document id to be a string of length between MIN and MAX.
|
|
2429
2489
|
|
|
2430
2490
|
Between MIN_DOCUMENT_ID_LEN and MAX_DOCUMENT_ID_LEN.
|
|
@@ -2446,7 +2506,7 @@ class Validator:
|
|
|
2446
2506
|
.all()
|
|
2447
2507
|
):
|
|
2448
2508
|
return [
|
|
2449
|
-
|
|
2509
|
+
InvalidStringLengthInColumn(
|
|
2450
2510
|
schema_name=schema_name,
|
|
2451
2511
|
col_name=id_col_name,
|
|
2452
2512
|
min_length=MIN_DOCUMENT_ID_LEN,
|
|
@@ -2476,7 +2536,7 @@ class Validator:
|
|
|
2476
2536
|
@staticmethod
|
|
2477
2537
|
def _check_value_tag(
|
|
2478
2538
|
dataframe: pd.DataFrame, schema: Schema
|
|
2479
|
-
) -> list[
|
|
2539
|
+
) -> list[InvalidTagLength]:
|
|
2480
2540
|
if schema.tag_column_names is None:
|
|
2481
2541
|
return []
|
|
2482
2542
|
|
|
@@ -2501,7 +2561,7 @@ class Validator:
|
|
|
2501
2561
|
elif max_tag_len > MAX_TAG_LENGTH_TRUNCATION:
|
|
2502
2562
|
truncated_tag_cols.append(col)
|
|
2503
2563
|
if wrong_tag_cols:
|
|
2504
|
-
return [
|
|
2564
|
+
return [InvalidTagLength(wrong_tag_cols)]
|
|
2505
2565
|
if truncated_tag_cols:
|
|
2506
2566
|
logger.warning(
|
|
2507
2567
|
get_truncation_warning_message(
|
|
@@ -2513,7 +2573,7 @@ class Validator:
|
|
|
2513
2573
|
@staticmethod
|
|
2514
2574
|
def _check_value_ranking_category(
|
|
2515
2575
|
dataframe: pd.DataFrame, schema: Schema
|
|
2516
|
-
) -> list[
|
|
2576
|
+
) -> list[InvalidValueMissingValue | InvalidRankingCategoryValue]:
|
|
2517
2577
|
if schema.relevance_labels_column_name is not None:
|
|
2518
2578
|
col = schema.relevance_labels_column_name
|
|
2519
2579
|
elif schema.attributions_column_name is not None:
|
|
@@ -2526,11 +2586,11 @@ class Validator:
|
|
|
2526
2586
|
# which would be caught by _check_value_missing
|
|
2527
2587
|
return []
|
|
2528
2588
|
if dataframe[col].astype(str).str.len().min() == 0:
|
|
2529
|
-
return [
|
|
2589
|
+
return [InvalidRankingCategoryValue(col)]
|
|
2530
2590
|
# empty list
|
|
2531
2591
|
not_null_filter = dataframe[col].notnull()
|
|
2532
2592
|
if dataframe[not_null_filter][col].map(len).min() == 0:
|
|
2533
|
-
return [
|
|
2593
|
+
return [InvalidValueMissingValue(col, "empty list")]
|
|
2534
2594
|
# no empty string in list
|
|
2535
2595
|
if (
|
|
2536
2596
|
dataframe[not_null_filter][col]
|
|
@@ -2538,13 +2598,13 @@ class Validator:
|
|
|
2538
2598
|
.min()
|
|
2539
2599
|
== 0
|
|
2540
2600
|
):
|
|
2541
|
-
return [
|
|
2601
|
+
return [InvalidRankingCategoryValue(col)]
|
|
2542
2602
|
return []
|
|
2543
2603
|
|
|
2544
2604
|
@staticmethod
|
|
2545
2605
|
def _check_length_multi_class_maps(
|
|
2546
2606
|
dataframe: pd.DataFrame, schema: Schema
|
|
2547
|
-
) -> list[
|
|
2607
|
+
) -> list[InvalidNumClassesMultiClassMap]:
|
|
2548
2608
|
# each entry in column is a list of dictionaries mapping class names and scores
|
|
2549
2609
|
# validate length of list of dictionaries for each column
|
|
2550
2610
|
invalid_cols = {}
|
|
@@ -2575,16 +2635,16 @@ class Validator:
|
|
|
2575
2635
|
if invalid_num_classes:
|
|
2576
2636
|
invalid_cols[col] = invalid_num_classes
|
|
2577
2637
|
if invalid_cols:
|
|
2578
|
-
return [
|
|
2638
|
+
return [InvalidNumClassesMultiClassMap(invalid_cols)]
|
|
2579
2639
|
return []
|
|
2580
2640
|
|
|
2581
2641
|
@staticmethod
|
|
2582
2642
|
def _check_classes_and_scores_values_in_multi_class_maps(
|
|
2583
2643
|
dataframe: pd.DataFrame, schema: Schema
|
|
2584
2644
|
) -> list[
|
|
2585
|
-
|
|
2586
|
-
|
|
|
2587
|
-
|
|
|
2645
|
+
InvalidMultiClassClassNameLength
|
|
2646
|
+
| InvalidMultiClassActScoreValue
|
|
2647
|
+
| InvalidMultiClassPredScoreValue
|
|
2588
2648
|
]:
|
|
2589
2649
|
"""Validate the class names and score values of dictionaries.
|
|
2590
2650
|
|
|
@@ -2649,21 +2709,17 @@ class Validator:
|
|
|
2649
2709
|
if invalid_scores_for_col:
|
|
2650
2710
|
invalid_pred_scores[col] = invalid_scores_for_col
|
|
2651
2711
|
if invalid_class_names:
|
|
2652
|
-
errors.append(
|
|
2653
|
-
err.InvalidMultiClassClassNameLength(invalid_class_names)
|
|
2654
|
-
)
|
|
2712
|
+
errors.append(InvalidMultiClassClassNameLength(invalid_class_names))
|
|
2655
2713
|
if invalid_pred_scores:
|
|
2656
|
-
errors.append(
|
|
2657
|
-
err.InvalidMultiClassPredScoreValue(invalid_pred_scores)
|
|
2658
|
-
)
|
|
2714
|
+
errors.append(InvalidMultiClassPredScoreValue(invalid_pred_scores))
|
|
2659
2715
|
if invalid_actual_scores:
|
|
2660
|
-
errors.append(
|
|
2716
|
+
errors.append(InvalidMultiClassActScoreValue(col))
|
|
2661
2717
|
return errors
|
|
2662
2718
|
|
|
2663
2719
|
@staticmethod
|
|
2664
2720
|
def _check_each_multi_class_pred_has_threshold(
|
|
2665
2721
|
dataframe: pd.DataFrame, schema: Schema
|
|
2666
|
-
) -> list[
|
|
2722
|
+
) -> list[InvalidMultiClassThresholdClasses]:
|
|
2667
2723
|
"""Validate threshold scores for Multi Class models.
|
|
2668
2724
|
|
|
2669
2725
|
If threshold scores column is included in schema and dataframe, validate that
|
|
@@ -2687,7 +2743,7 @@ class Validator:
|
|
|
2687
2743
|
pred_class_set = set(pred_classes)
|
|
2688
2744
|
if pred_class_set != thresh_class_set:
|
|
2689
2745
|
return [
|
|
2690
|
-
|
|
2746
|
+
InvalidMultiClassThresholdClasses(
|
|
2691
2747
|
threshold_col, pred_class_set, thresh_class_set
|
|
2692
2748
|
)
|
|
2693
2749
|
]
|
|
@@ -2697,7 +2753,7 @@ class Validator:
|
|
|
2697
2753
|
def _check_value_timestamp(
|
|
2698
2754
|
dataframe: pd.DataFrame,
|
|
2699
2755
|
schema: Schema,
|
|
2700
|
-
) -> list[
|
|
2756
|
+
) -> list[InvalidValueMissingValue | InvalidValueTimestamp]:
|
|
2701
2757
|
# Due to the timing difference between checking this here and the data finally
|
|
2702
2758
|
# hitting the same check on server side, there's a some chance for a false
|
|
2703
2759
|
# result, i.e. the check here succeeds but the same check on server side fails.
|
|
@@ -2708,9 +2764,7 @@ class Validator:
|
|
|
2708
2764
|
# missing value first.
|
|
2709
2765
|
if dataframe[col].isnull().values.any(): # type: ignore
|
|
2710
2766
|
return [
|
|
2711
|
-
|
|
2712
|
-
"Prediction timestamp", "missing"
|
|
2713
|
-
)
|
|
2767
|
+
InvalidValueMissingValue("Prediction timestamp", "missing")
|
|
2714
2768
|
]
|
|
2715
2769
|
|
|
2716
2770
|
now_t = datetime.now(tz=timezone.utc)
|
|
@@ -2794,7 +2848,7 @@ class Validator:
|
|
|
2794
2848
|
)
|
|
2795
2849
|
)
|
|
2796
2850
|
):
|
|
2797
|
-
return [
|
|
2851
|
+
return [InvalidValueTimestamp(timestamp_col_name=col)]
|
|
2798
2852
|
|
|
2799
2853
|
return []
|
|
2800
2854
|
|
|
@@ -2803,7 +2857,7 @@ class Validator:
|
|
|
2803
2857
|
@staticmethod
|
|
2804
2858
|
def _check_invalid_missing_values(
|
|
2805
2859
|
dataframe: pd.DataFrame, schema: BaseSchema, model_type: ModelTypes
|
|
2806
|
-
) -> list[
|
|
2860
|
+
) -> list[InvalidValueMissingValue]:
|
|
2807
2861
|
errors = []
|
|
2808
2862
|
columns = ()
|
|
2809
2863
|
if isinstance(schema, CorpusSchema):
|
|
@@ -2824,7 +2878,7 @@ class Validator:
|
|
|
2824
2878
|
if col is not None and col in dataframe.columns:
|
|
2825
2879
|
if dataframe[col].isnull().any():
|
|
2826
2880
|
errors.append(
|
|
2827
|
-
|
|
2881
|
+
InvalidValueMissingValue(
|
|
2828
2882
|
name, wrong_values="missing", column=col
|
|
2829
2883
|
)
|
|
2830
2884
|
)
|
|
@@ -2834,7 +2888,7 @@ class Validator:
|
|
|
2834
2888
|
and np.isinf(dataframe[col]).any()
|
|
2835
2889
|
):
|
|
2836
2890
|
errors.append(
|
|
2837
|
-
|
|
2891
|
+
InvalidValueMissingValue(
|
|
2838
2892
|
name, wrong_values="infinite", column=col
|
|
2839
2893
|
)
|
|
2840
2894
|
)
|
|
@@ -2850,7 +2904,7 @@ class Validator:
|
|
|
2850
2904
|
environment: Environments,
|
|
2851
2905
|
schema: Schema,
|
|
2852
2906
|
model_type: ModelTypes,
|
|
2853
|
-
) -> list[
|
|
2907
|
+
) -> list[InvalidRecord]:
|
|
2854
2908
|
if environment in (Environments.VALIDATION, Environments.TRAINING):
|
|
2855
2909
|
return []
|
|
2856
2910
|
|
|
@@ -2894,7 +2948,7 @@ class Validator:
|
|
|
2894
2948
|
environment: Environments,
|
|
2895
2949
|
schema: Schema,
|
|
2896
2950
|
model_type: ModelTypes,
|
|
2897
|
-
) -> list[
|
|
2951
|
+
) -> list[InvalidRecord]:
|
|
2898
2952
|
"""Validates there's not a single row in the dataframe with all nulls.
|
|
2899
2953
|
|
|
2900
2954
|
Returns errors if any row has all of pred_label and pred_score evaluating to
|
|
@@ -2942,7 +2996,7 @@ class Validator:
|
|
|
2942
2996
|
@staticmethod
|
|
2943
2997
|
def _check_invalid_record_helper(
|
|
2944
2998
|
dataframe: pd.DataFrame, column_names: list[str | None]
|
|
2945
|
-
) -> list[
|
|
2999
|
+
) -> list[InvalidRecord]:
|
|
2946
3000
|
"""Check that there are no null values in a subset of columns.
|
|
2947
3001
|
|
|
2948
3002
|
The column subset is computed from the input list of columns `column_names`
|
|
@@ -2950,7 +3004,7 @@ class Validator:
|
|
|
2950
3004
|
null values are found.
|
|
2951
3005
|
|
|
2952
3006
|
Returns:
|
|
2953
|
-
List[
|
|
3007
|
+
List[InvalidRecord]: An error expressing the rows that are problematic
|
|
2954
3008
|
|
|
2955
3009
|
"""
|
|
2956
3010
|
columns_subset = [
|
|
@@ -2964,12 +3018,12 @@ class Validator:
|
|
|
2964
3018
|
null_index = null_filter[null_filter].index.values
|
|
2965
3019
|
if len(null_index) == 0:
|
|
2966
3020
|
return []
|
|
2967
|
-
return [
|
|
3021
|
+
return [InvalidRecord(columns_subset, null_index)] # type: ignore
|
|
2968
3022
|
|
|
2969
3023
|
@staticmethod
|
|
2970
3024
|
def _check_type_prediction_group_id(
|
|
2971
3025
|
schema: Schema, column_types: dict[str, Any]
|
|
2972
|
-
) -> list[
|
|
3026
|
+
) -> list[InvalidType]:
|
|
2973
3027
|
col = schema.prediction_group_id_column_name
|
|
2974
3028
|
if col in column_types:
|
|
2975
3029
|
# should mirror server side
|
|
@@ -2982,7 +3036,7 @@ class Validator:
|
|
|
2982
3036
|
)
|
|
2983
3037
|
if column_types[col] not in allowed_datatypes:
|
|
2984
3038
|
return [
|
|
2985
|
-
|
|
3039
|
+
InvalidType(
|
|
2986
3040
|
"prediction_group_ids",
|
|
2987
3041
|
expected_types=["str", "int"],
|
|
2988
3042
|
found_data_type=column_types[col],
|
|
@@ -2993,7 +3047,7 @@ class Validator:
|
|
|
2993
3047
|
@staticmethod
|
|
2994
3048
|
def _check_type_rank(
|
|
2995
3049
|
schema: Schema, column_types: dict[str, Any]
|
|
2996
|
-
) -> list[
|
|
3050
|
+
) -> list[InvalidType]:
|
|
2997
3051
|
col = schema.rank_column_name
|
|
2998
3052
|
if col in column_types:
|
|
2999
3053
|
allowed_datatypes = (
|
|
@@ -3004,7 +3058,7 @@ class Validator:
|
|
|
3004
3058
|
)
|
|
3005
3059
|
if column_types[col] not in allowed_datatypes:
|
|
3006
3060
|
return [
|
|
3007
|
-
|
|
3061
|
+
InvalidType(
|
|
3008
3062
|
"rank",
|
|
3009
3063
|
expected_types=["int"],
|
|
3010
3064
|
found_data_type=column_types[col],
|
|
@@ -3015,7 +3069,7 @@ class Validator:
|
|
|
3015
3069
|
@staticmethod
|
|
3016
3070
|
def _check_type_ranking_category(
|
|
3017
3071
|
schema: Schema, column_types: dict[str, Any]
|
|
3018
|
-
) -> list[
|
|
3072
|
+
) -> list[InvalidType]:
|
|
3019
3073
|
if schema.relevance_labels_column_name is not None:
|
|
3020
3074
|
col = schema.relevance_labels_column_name
|
|
3021
3075
|
elif schema.attributions_column_name is not None:
|
|
@@ -3026,7 +3080,7 @@ class Validator:
|
|
|
3026
3080
|
allowed_datatypes = (pa.list_(pa.string()), pa.string(), pa.null())
|
|
3027
3081
|
if column_types[col] not in allowed_datatypes:
|
|
3028
3082
|
return [
|
|
3029
|
-
|
|
3083
|
+
InvalidType(
|
|
3030
3084
|
"relevance labels column for ranking models",
|
|
3031
3085
|
expected_types=["list of string", "string"],
|
|
3032
3086
|
found_data_type=column_types[col],
|
|
@@ -3037,7 +3091,7 @@ class Validator:
     @staticmethod
     def _check_value_bounding_boxes_coordinates(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[
+    ) -> list[InvalidBoundingBoxesCoordinates]:
         errors = []
         if schema.object_detection_prediction_column_names is not None:
             coords_col_name = schema.object_detection_prediction_column_names.bounding_boxes_coordinates_column_name  # noqa: E501
@@ -3058,7 +3112,7 @@ class Validator:
     @staticmethod
     def _check_value_bounding_boxes_categories(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[
+    ) -> list[InvalidBoundingBoxesCategories]:
         errors = []
         if schema.object_detection_prediction_column_names is not None:
             cat_col_name = schema.object_detection_prediction_column_names.categories_column_name
@@ -3079,7 +3133,7 @@ class Validator:
     @staticmethod
     def _check_value_bounding_boxes_scores(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[
+    ) -> list[InvalidBoundingBoxesScores]:
         errors = []
         if schema.object_detection_prediction_column_names is not None:
             sc_col_name = schema.object_detection_prediction_column_names.scores_column_name
@@ -3104,7 +3158,7 @@ class Validator:
     @staticmethod
     def _check_value_semantic_segmentation_polygon_coordinates(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[
+    ) -> list[InvalidPolygonCoordinates]:
         errors = []
         if schema.semantic_segmentation_prediction_column_names is not None:
             coords_col_name = schema.semantic_segmentation_prediction_column_names.polygon_coordinates_column_name  # noqa: E501
@@ -3125,7 +3179,7 @@ class Validator:
     @staticmethod
     def _check_value_semantic_segmentation_polygon_categories(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[
+    ) -> list[InvalidPolygonCategories]:
         errors = []
         if schema.semantic_segmentation_prediction_column_names is not None:
             cat_col_name = schema.semantic_segmentation_prediction_column_names.categories_column_name
@@ -3146,7 +3200,7 @@ class Validator:
     @staticmethod
     def _check_value_instance_segmentation_polygon_coordinates(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[
+    ) -> list[InvalidPolygonCoordinates]:
         errors = []
         if schema.instance_segmentation_prediction_column_names is not None:
             coords_col_name = schema.instance_segmentation_prediction_column_names.polygon_coordinates_column_name  # noqa: E501
@@ -3167,7 +3221,7 @@ class Validator:
     @staticmethod
     def _check_value_instance_segmentation_polygon_categories(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[
+    ) -> list[InvalidPolygonCategories]:
         errors = []
         if schema.instance_segmentation_prediction_column_names is not None:
             cat_col_name = schema.instance_segmentation_prediction_column_names.categories_column_name
@@ -3188,7 +3242,7 @@ class Validator:
     @staticmethod
     def _check_value_instance_segmentation_polygon_scores(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[
+    ) -> list[InvalidPolygonScores]:
         errors = []
         if schema.instance_segmentation_prediction_column_names is not None:
             sc_col_name = schema.instance_segmentation_prediction_column_names.scores_column_name
@@ -3203,7 +3257,7 @@ class Validator:
     @staticmethod
     def _check_value_instance_segmentation_bbox_coordinates(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[
+    ) -> list[InvalidBoundingBoxesCoordinates]:
         errors = []
         if schema.instance_segmentation_prediction_column_names is not None:
             coords_col_name = schema.instance_segmentation_prediction_column_names.bounding_boxes_coordinates_column_name  # noqa: E501
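The nine `_check_value_*` methods in the hunks above share one dispatch shape: when the relevant schema section is present, look up its column name, hand that `pd.Series` to a helper, and collect any error the helper returns. A condensed sketch of that dispatch with hypothetical names:

from dataclasses import dataclass

import pandas as pd

@dataclass
class BoxColumnNames:
    coordinates_column_name: str | None = None

def bbox_coordinates_helper(col: pd.Series) -> Exception | None:
    # Stand-in for the real helper: reject rows whose value is missing.
    if col.isnull().any():
        return ValueError("none_boxes")
    return None

def check_bbox_coordinates(df: pd.DataFrame, names: BoxColumnNames | None) -> list[Exception]:
    errors: list[Exception] = []
    if names is not None and names.coordinates_column_name is not None:
        error = bbox_coordinates_helper(df[names.coordinates_column_name])
        if error is not None:
            errors.append(error)
    return errors

print(check_bbox_coordinates(pd.DataFrame({"boxes": [None]}), BoxColumnNames("boxes")))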
@@ -3226,7 +3280,7 @@ class Validator:
     @staticmethod
     def _check_value_prompt_response(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[
+    ) -> list[ValidationError]:
         vector_cols_to_check = []
         text_cols_to_check = []
         if isinstance(schema.prompt_column_names, str):
@@ -3265,13 +3319,13 @@ class Validator:
         errors = []
         if invalid_long_string_data_cols:
             errors.append(
-
+                InvalidValueEmbeddingRawDataTooLong(
                     invalid_long_string_data_cols
                 )
             )
         if invalid_low_dim_vector_cols or invalid_high_dim_vector_cols:
             errors.append(
-
+                InvalidValueEmbeddingVectorDimensionality(
                     invalid_low_dim_vector_cols,
                     invalid_high_dim_vector_cols,
                 )
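Here the validator now appends `InvalidValueEmbeddingRawDataTooLong` and `InvalidValueEmbeddingVectorDimensionality` instances directly. A sketch of how embedding columns might be split into low- and high-dimensional offenders; the bounds below are illustrative, not the package's constants:

import pandas as pd

MIN_DIM, MAX_DIM = 2, 20_000  # illustrative bounds, not the package's constants

def split_bad_vector_cols(df: pd.DataFrame, vector_cols: list[str]):
    low, high = [], []
    for col in vector_cols:
        # Length of each embedding vector, ignoring missing rows.
        lengths = df[col].dropna().map(len)
        if (lengths < MIN_DIM).any():
            low.append(col)
        if (lengths > MAX_DIM).any():
            high.append(col)
    return low, high

df = pd.DataFrame({"emb": [[0.1], [0.2, 0.3]]})
print(split_bad_vector_cols(df, ["emb"]))  # (['emb'], [])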
@@ -3291,7 +3345,7 @@ class Validator:
     @staticmethod
     def _check_value_llm_model_name(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[
+    ) -> list[InvalidStringLengthInColumn]:
         if schema.llm_config_column_names is None:
             return []
         col = schema.llm_config_column_names.model_column_name
@@ -3301,7 +3355,7 @@ class Validator:
         )
         if max_len > MAX_LLM_MODEL_NAME_LENGTH:
             return [
-
+                InvalidStringLengthInColumn(
                     schema_name="llm_config_column_names.model_column_name",
                     col_name=col,
                     min_length=0,
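`_check_value_llm_model_name` flags the column when its longest string exceeds `MAX_LLM_MODEL_NAME_LENGTH`. A minimal sketch of that length check; the threshold value below is illustrative, not the package's constant:

import pandas as pd

MAX_LLM_MODEL_NAME_LENGTH = 50  # illustrative stand-in for the real constant

def model_name_too_long(col: pd.Series) -> bool:
    # Longest string in the column, ignoring missing values.
    max_len = col.dropna().str.len().max()
    return bool(max_len is not None and max_len > MAX_LLM_MODEL_NAME_LENGTH)

print(model_name_too_long(pd.Series(["gpt-4o", "x" * 100])))  # True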
@@ -3319,7 +3373,7 @@ class Validator:
     @staticmethod
     def _check_value_llm_prompt_template(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[
+    ) -> list[InvalidStringLengthInColumn]:
         if schema.prompt_template_column_names is None:
             return []
         col = schema.prompt_template_column_names.template_column_name
@@ -3329,7 +3383,7 @@ class Validator:
         )
         if max_len > MAX_PROMPT_TEMPLATE_LENGTH:
             return [
-
+                InvalidStringLengthInColumn(
                     schema_name="prompt_template_column_names.template_column_name",
                     col_name=col,
                     min_length=0,
@@ -3348,7 +3402,7 @@ class Validator:
     @staticmethod
     def _check_value_llm_prompt_template_version(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[
+    ) -> list[InvalidStringLengthInColumn]:
         if schema.prompt_template_column_names is None:
             return []
         col = schema.prompt_template_column_names.template_version_column_name
@@ -3358,7 +3412,7 @@ class Validator:
         )
         if max_len > MAX_PROMPT_TEMPLATE_VERSION_LENGTH:
             return [
-
+                InvalidStringLengthInColumn(
                     schema_name="prompt_template_column_names.template_version_column_name",
                     col_name=col,
                     min_length=0,
@@ -3377,7 +3431,7 @@ class Validator:
     @staticmethod
     def _check_type_document_columns(
         schema: CorpusSchema, column_types: dict[str, Any]
-    ) -> list[
+    ) -> list[InvalidTypeColumns]:
         invalid_types = []
         # Check document id
         col = schema.document_id_column_name
@@ -3391,7 +3445,7 @@ class Validator:
             )
             if column_types[col] not in allowed_datatypes:
                 invalid_types += [
-
+                    InvalidTypeColumns(
                         wrong_type_columns=[col],
                         expected_types=["str", "int"],
                     )
@@ -3403,7 +3457,7 @@ class Validator:
             allowed_datatype = pa.string()
             if column_types[col] != allowed_datatype:
                 invalid_types += [
-
+                    InvalidTypeColumns(
                         wrong_type_columns=[col],
                         expected_types=["str"],
                     )
@@ -3421,7 +3475,7 @@ class Validator:
             )
             if column_types[col] not in allowed_datatypes:
                 invalid_types += [
-
+                    InvalidTypeColumns(
                         wrong_type_columns=[col],
                         expected_types=["list[float], np.array[float]"],
                     )
@@ -3436,7 +3490,7 @@ class Validator:
             )
             if column_types[col] not in allowed_datatypes:
                 invalid_types += [
-
+                    InvalidTypeColumns(
                         wrong_type_columns=[col],
                         expected_types=["list[str]"],
                     )
@@ -3450,7 +3504,7 @@ class Validator:
         allowed_datatypes = (pa.string(),)
        if column_types[col] not in allowed_datatypes:
            invalid_types += [
-
+                InvalidTypeColumns(
                    wrong_type_columns=[col],
                    expected_types=["str"],
                )
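Note that `_check_type_document_columns` accumulates mismatches with `invalid_types += [...]` instead of returning on the first failure, so one call reports every bad corpus column at once. A sketch of that accumulation style, with hypothetical expectations rather than the package's real allowed-type tuples:

import pyarrow as pa

def check_document_columns(column_types: dict[str, pa.DataType]) -> list[str]:
    errors: list[str] = []
    expectations = {
        "document_id": (pa.string(), pa.int64()),
        "document_text": (pa.string(),),
    }
    for col, allowed in expectations.items():
        if col in column_types and column_types[col] not in allowed:
            errors.append(f"{col}: found {column_types[col]}")
    return errors  # every offending column, not just the first

print(check_document_columns({"document_id": pa.float64(), "document_text": pa.string()}))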
@@ -3517,15 +3571,15 @@ def _check_value_raw_data_length_helper(

 def _check_value_bounding_boxes_coordinates_helper(
     coordinates_col: pd.Series,
-) ->
+) -> InvalidBoundingBoxesCoordinates | None:
     def check(boxes: object) -> None:
         # We allow for zero boxes. None coordinates list is not allowed (will break following tests:
         # 'NoneType is not iterable')
         if boxes is None:
-            raise
+            raise InvalidBoundingBoxesCoordinates(reason="none_boxes")
         for box in boxes:
             if box is None or len(box) == 0:
-                raise
+                raise InvalidBoundingBoxesCoordinates(
                     reason="none_or_empty_box"
                 )
             error = _box_coordinates_wrong_format(box)
@@ -3534,14 +3588,14 @@ def _check_value_bounding_boxes_coordinates_helper(

     try:
         coordinates_col.apply(check)
-    except
+    except InvalidBoundingBoxesCoordinates as e:
         return e
     return None


 def _box_coordinates_wrong_format(
     box_coords: object,
-) ->
+) -> InvalidBoundingBoxesCoordinates | None:
     if (
         # Coordinates should be a collection of 4 floats
         len(box_coords) != 4
@@ -3552,7 +3606,7 @@ def _box_coordinates_wrong_format(
         # Coordinates represent the top-left & bottom-right corners of a box: y1 < y2
         or box_coords[1] >= box_coords[3]
     ):
-        return
+        return InvalidBoundingBoxesCoordinates(
             reason="boxes_coordinates_wrong_format"
         )
     return None
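`_box_coordinates_wrong_format` encodes the rule the inline comments state: a box is four floats `[x1, y1, x2, y2]` whose top-left corner lies strictly above and to the left of its bottom-right corner. A standalone sketch of just that predicate:

def box_format_ok(box: list[float]) -> bool:
    # [x1, y1, x2, y2]: top-left corner strictly above/left of bottom-right.
    return len(box) == 4 and box[0] < box[2] and box[1] < box[3]

print(box_format_ok([0.0, 0.0, 10.0, 5.0]))  # True
print(box_format_ok([10.0, 0.0, 0.0, 5.0]))  # False: x1 >= x2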
@@ -3560,51 +3614,47 @@ def _box_coordinates_wrong_format(

 def _check_value_bounding_boxes_categories_helper(
     categories_col: pd.Series,
-) ->
+) -> InvalidBoundingBoxesCategories | None:
     def check(categories: object) -> None:
         # We allow for zero boxes. None category list is not allowed (will break following tests:
         # 'NoneType is not iterable')
         if categories is None:
-            raise
-                reason="none_category_list"
-            )
+            raise InvalidBoundingBoxesCategories(reason="none_category_list")
         for category in categories:
             # Allow for empty string category, no None values
             if category is None:
-                raise
+                raise InvalidBoundingBoxesCategories(reason="none_category")

     try:
         categories_col.apply(check)
-    except
+    except InvalidBoundingBoxesCategories as e:
         return e
     return None


 def _check_value_bounding_boxes_scores_helper(
     scores_col: pd.Series,
-) ->
+) -> InvalidBoundingBoxesScores | None:
     def check(scores: object) -> None:
         # We allow for zero boxes. None confidence score list is not allowed (will break following tests:
         # 'NoneType is not iterable')
         if scores is None:
-            raise
+            raise InvalidBoundingBoxesScores(reason="none_score_list")
         for score in scores:
             # Confidence scores are between 0 and 1
             if score < 0 or score > 1:
-                raise
-                    reason="scores_out_of_bounds"
-                )
+                raise InvalidBoundingBoxesScores(reason="scores_out_of_bounds")

     try:
         scores_col.apply(check)
-    except
+    except InvalidBoundingBoxesScores as e:
         return e
     return None


 def _polygon_coordinates_wrong_format(
     polygon_coords: object,
-) ->
+) -> InvalidPolygonCoordinates | None:
     """Check if polygon coordinates are valid.

     Validates:
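Each of these helpers routes a per-row `check` through `pd.Series.apply` inside a `try`, so the first offending row raises a typed error that the helper returns rather than propagates, short-circuiting validation on the first bad row. A self-contained sketch of the pattern with a stand-in exception class:

import pandas as pd

class ScoresOutOfBounds(Exception):
    pass

def scores_helper(scores_col: pd.Series) -> ScoresOutOfBounds | None:
    def check(scores: list[float]) -> None:
        for score in scores:
            if score < 0 or score > 1:
                raise ScoresOutOfBounds("scores_out_of_bounds")

    try:
        scores_col.apply(check)  # raises on the first bad row
    except ScoresOutOfBounds as e:
        return e
    return None

print(scores_helper(pd.Series([[0.5], [1.2]])))  # scores_out_of_bounds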
@@ -3629,7 +3679,7 @@ def _polygon_coordinates_wrong_format(
         # Coordinates should be a collection of pairs of floats
         or len(polygon_coords) % 2 != 0
     ):
-        return
+        return InvalidPolygonCoordinates(
             reason="polygon_coordinates_wrong_format",
             coordinates=polygon_coords,
         )
@@ -3644,7 +3694,7 @@ def _polygon_coordinates_wrong_format(
     for i in range(len(points)):
         for j in range(i + 1, len(points)):
             if points[i] == points[j]:
-                return
+                return InvalidPolygonCoordinates(
                     reason="polygon_coordinates_repeated_vertices",
                     coordinates=polygon_coords,
                 )
@@ -3665,7 +3715,7 @@ def _polygon_coordinates_wrong_format(
             if segments_intersect(
                 edges[i][0], edges[i][1], edges[j][0], edges[j][1]
             ):
-                return
+                return InvalidPolygonCoordinates(
                     reason="polygon_coordinates_self_intersecting_vertices",
                     coordinates=polygon_coords,
                 )
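The self-intersection hunk leans on `segments_intersect`, applied pairwise over the polygon's edges. A common way to implement such a predicate is the orientation (CCW) test; the sketch below is one such implementation (it ignores collinear overlaps) and is not necessarily what the package uses:

def ccw(a: tuple[float, float], b: tuple[float, float], c: tuple[float, float]) -> bool:
    # True if a, b, c make a counter-clockwise turn.
    return (c[1] - a[1]) * (b[0] - a[0]) > (b[1] - a[1]) * (c[0] - a[0])

def segments_intersect(p1, p2, p3, p4) -> bool:
    # Proper intersection: the endpoints of each segment straddle the other.
    return ccw(p1, p3, p4) != ccw(p2, p3, p4) and ccw(p1, p2, p3) != ccw(p1, p2, p4)

print(segments_intersect((0, 0), (2, 2), (0, 2), (2, 0)))  # True: diagonals cross
print(segments_intersect((0, 0), (1, 0), (0, 1), (1, 1)))  # False: parallel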
@@ -3675,64 +3725,62 @@ def _polygon_coordinates_wrong_format(

 def _check_value_polygon_coordinates_helper(
     coordinates_col: pd.Series,
-) ->
+) -> InvalidPolygonCoordinates | None:
     def check(polygons: object) -> None:
         # We allow for zero polygons. None coordinates list is not allowed (will break following tests:
         # 'NoneType is not iterable')
         if polygons is None:
-            raise
+            raise InvalidPolygonCoordinates(reason="none_polygons")
         for polygon in polygons:
             if polygon is None or len(polygon) == 0:
-                raise
-                    reason="none_or_empty_polygon"
-                )
+                raise InvalidPolygonCoordinates(reason="none_or_empty_polygon")
             error = _polygon_coordinates_wrong_format(polygon)
             if error is not None:
                 raise error

     try:
         coordinates_col.apply(check)
-    except
+    except InvalidPolygonCoordinates as e:
         return e
     return None


 def _check_value_polygon_categories_helper(
     categories_col: pd.Series,
-) ->
+) -> InvalidPolygonCategories | None:
     def check(categories: object) -> None:
         # We allow for zero boxes. None category list is not allowed (will break following tests:
         # 'NoneType is not iterable')
         if categories is None:
-            raise
+            raise InvalidPolygonCategories(reason="none_category_list")
         for category in categories:
             # Allow for empty string category, no None values
             if category is None:
-                raise
+                raise InvalidPolygonCategories(reason="none_category")

     try:
         categories_col.apply(check)
-    except
+    except InvalidPolygonCategories as e:
         return e
     return None


 def _check_value_polygon_scores_helper(
     scores_col: pd.Series,
-) ->
+) -> InvalidPolygonScores | None:
     def check(scores: object) -> None:
         # We allow for zero boxes. None confidence score list is not allowed (will break following tests:
         # 'NoneType is not iterable')
         if scores is None:
-            raise
+            raise InvalidPolygonScores(reason="none_score_list")
         for score in scores:
             # Confidence scores are between 0 and 1
             if score < 0 or score > 1:
-                raise
+                raise InvalidPolygonScores(reason="scores_out_of_bounds")

     try:
         scores_col.apply(check)
-    except
+    except InvalidPolygonScores as e:
         return e
     return None
