arize 8.0.0b1__py3-none-any.whl → 8.0.0b4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arize/__init__.py +9 -2
- arize/_client_factory.py +50 -0
- arize/_exporter/client.py +18 -17
- arize/_exporter/parsers/tracing_data_parser.py +9 -4
- arize/_exporter/validation.py +1 -1
- arize/_flight/client.py +37 -17
- arize/_generated/api_client/api/datasets_api.py +6 -6
- arize/_generated/api_client/api/experiments_api.py +6 -6
- arize/_generated/api_client/api/projects_api.py +3 -3
- arize/_lazy.py +61 -10
- arize/client.py +66 -50
- arize/config.py +175 -48
- arize/constants/config.py +1 -0
- arize/constants/ml.py +9 -16
- arize/constants/spans.py +5 -10
- arize/datasets/client.py +45 -28
- arize/datasets/errors.py +1 -1
- arize/datasets/validation.py +2 -2
- arize/embeddings/auto_generator.py +16 -9
- arize/embeddings/base_generators.py +15 -9
- arize/embeddings/cv_generators.py +2 -2
- arize/embeddings/errors.py +2 -2
- arize/embeddings/nlp_generators.py +8 -8
- arize/embeddings/tabular_generators.py +6 -6
- arize/exceptions/base.py +0 -52
- arize/exceptions/config.py +22 -0
- arize/exceptions/parameters.py +1 -330
- arize/exceptions/values.py +8 -5
- arize/experiments/__init__.py +4 -0
- arize/experiments/client.py +31 -18
- arize/experiments/evaluators/base.py +12 -9
- arize/experiments/evaluators/executors.py +16 -7
- arize/experiments/evaluators/rate_limiters.py +3 -1
- arize/experiments/evaluators/types.py +9 -7
- arize/experiments/evaluators/utils.py +7 -5
- arize/experiments/functions.py +128 -58
- arize/experiments/tracing.py +4 -1
- arize/experiments/types.py +34 -31
- arize/logging.py +54 -33
- arize/ml/batch_validation/errors.py +10 -1004
- arize/ml/batch_validation/validator.py +351 -291
- arize/ml/bounded_executor.py +25 -6
- arize/ml/casting.py +51 -33
- arize/ml/client.py +43 -35
- arize/ml/proto.py +21 -22
- arize/ml/stream_validation.py +64 -27
- arize/ml/surrogate_explainer/mimic.py +18 -10
- arize/ml/types.py +27 -67
- arize/pre_releases.py +10 -6
- arize/projects/client.py +9 -4
- arize/py.typed +0 -0
- arize/regions.py +11 -11
- arize/spans/client.py +125 -31
- arize/spans/columns.py +32 -36
- arize/spans/conversion.py +12 -11
- arize/spans/validation/annotations/dataframe_form_validation.py +1 -1
- arize/spans/validation/annotations/value_validation.py +11 -14
- arize/spans/validation/common/argument_validation.py +3 -3
- arize/spans/validation/common/dataframe_form_validation.py +7 -7
- arize/spans/validation/common/value_validation.py +11 -14
- arize/spans/validation/evals/dataframe_form_validation.py +4 -4
- arize/spans/validation/evals/evals_validation.py +6 -6
- arize/spans/validation/evals/value_validation.py +1 -1
- arize/spans/validation/metadata/argument_validation.py +1 -1
- arize/spans/validation/metadata/dataframe_form_validation.py +2 -2
- arize/spans/validation/metadata/value_validation.py +23 -1
- arize/spans/validation/spans/dataframe_form_validation.py +2 -2
- arize/spans/validation/spans/spans_validation.py +6 -6
- arize/utils/arrow.py +38 -2
- arize/utils/cache.py +2 -2
- arize/utils/dataframe.py +4 -4
- arize/utils/online_tasks/dataframe_preprocessor.py +15 -11
- arize/utils/openinference_conversion.py +10 -10
- arize/utils/proto.py +0 -1
- arize/utils/types.py +6 -6
- arize/version.py +1 -1
- {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/METADATA +32 -7
- {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/RECORD +81 -78
- {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/WHEEL +0 -0
- {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/licenses/LICENSE +0 -0
- {arize-8.0.0b1.dist-info → arize-8.0.0b4.dist-info}/licenses/NOTICE +0 -0
|
@@ -6,7 +6,10 @@ import logging
|
|
|
6
6
|
import math
|
|
7
7
|
from datetime import datetime, timedelta, timezone
|
|
8
8
|
from itertools import chain
|
|
9
|
-
from typing import Any
|
|
9
|
+
from typing import TYPE_CHECKING, Any, cast
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from collections.abc import Sequence
|
|
10
13
|
|
|
11
14
|
import numpy as np
|
|
12
15
|
import pandas as pd
|
|
@@ -39,8 +42,70 @@ from arize.constants.ml import (
|
|
|
39
42
|
MIN_PREDICTION_ID_LEN,
|
|
40
43
|
MODEL_MAPPING_CONFIG,
|
|
41
44
|
)
|
|
45
|
+
from arize.exceptions.base import (
|
|
46
|
+
InvalidDataFrameIndex,
|
|
47
|
+
InvalidFieldTypeConversion,
|
|
48
|
+
ValidationError,
|
|
49
|
+
)
|
|
50
|
+
from arize.exceptions.types import (
|
|
51
|
+
InvalidFieldTypeLlmConfig,
|
|
52
|
+
InvalidFieldTypePromptTemplates,
|
|
53
|
+
InvalidType,
|
|
54
|
+
InvalidTypeColumns,
|
|
55
|
+
InvalidTypeFeatures,
|
|
56
|
+
InvalidTypeShapValues,
|
|
57
|
+
InvalidTypeTags,
|
|
58
|
+
InvalidValueEmbeddingRawDataTooLong,
|
|
59
|
+
InvalidValueEmbeddingVectorDimensionality,
|
|
60
|
+
)
|
|
61
|
+
from arize.exceptions.values import (
|
|
62
|
+
InvalidBoundingBoxesCategories,
|
|
63
|
+
InvalidBoundingBoxesCoordinates,
|
|
64
|
+
InvalidBoundingBoxesScores,
|
|
65
|
+
InvalidMultiClassActScoreValue,
|
|
66
|
+
InvalidMultiClassClassNameLength,
|
|
67
|
+
InvalidMultiClassPredScoreValue,
|
|
68
|
+
InvalidMultiClassThresholdClasses,
|
|
69
|
+
InvalidNumClassesMultiClassMap,
|
|
70
|
+
InvalidPolygonCategories,
|
|
71
|
+
InvalidPolygonCoordinates,
|
|
72
|
+
InvalidPolygonScores,
|
|
73
|
+
InvalidRankingCategoryValue,
|
|
74
|
+
InvalidRankValue,
|
|
75
|
+
InvalidRecord,
|
|
76
|
+
InvalidStringLengthInColumn,
|
|
77
|
+
InvalidTagLength,
|
|
78
|
+
InvalidValueMissingValue,
|
|
79
|
+
InvalidValueTimestamp,
|
|
80
|
+
)
|
|
42
81
|
from arize.logging import get_truncation_warning_message
|
|
43
|
-
from arize.ml.batch_validation import
|
|
82
|
+
from arize.ml.batch_validation.errors import (
|
|
83
|
+
DuplicateColumnsInDataframe,
|
|
84
|
+
InvalidBatchId,
|
|
85
|
+
InvalidColumnNameEmptyString,
|
|
86
|
+
InvalidEnvironment,
|
|
87
|
+
InvalidFieldTypeEmbeddingFeatures,
|
|
88
|
+
InvalidFieldTypePromptResponse,
|
|
89
|
+
InvalidModelId,
|
|
90
|
+
InvalidModelType,
|
|
91
|
+
InvalidModelTypeAndMetricsCombination,
|
|
92
|
+
InvalidModelVersion,
|
|
93
|
+
InvalidNumberOfEmbeddings,
|
|
94
|
+
InvalidPredActColumnNamesForModelType,
|
|
95
|
+
InvalidPredActCVColumnNamesForModelType,
|
|
96
|
+
InvalidSchemaType,
|
|
97
|
+
InvalidShapSuffix,
|
|
98
|
+
MissingColumns,
|
|
99
|
+
MissingCVPredAct,
|
|
100
|
+
MissingPredictionIdColumnForDelayedRecords,
|
|
101
|
+
MissingPreprodAct,
|
|
102
|
+
MissingPreprodPredActNumericAndCategorical,
|
|
103
|
+
MissingReqPredActColumnNamesForMultiClass,
|
|
104
|
+
MissingRequiredColumnsForRankingModel,
|
|
105
|
+
MissingRequiredColumnsMetricsValidation,
|
|
106
|
+
MultipleCVPredAct,
|
|
107
|
+
ReservedColumns,
|
|
108
|
+
)
|
|
44
109
|
from arize.ml.types import (
|
|
45
110
|
CATEGORICAL_MODEL_TYPES,
|
|
46
111
|
NUMERIC_MODEL_TYPES,
|
|
@@ -53,6 +118,7 @@ from arize.ml.types import (
|
|
|
53
118
|
ModelTypes,
|
|
54
119
|
PromptTemplateColumnNames,
|
|
55
120
|
Schema,
|
|
121
|
+
_normalize_column_names,
|
|
56
122
|
segments_intersect,
|
|
57
123
|
)
|
|
58
124
|
from arize.utils.types import (
|
|
@@ -74,8 +140,8 @@ class Validator:
|
|
|
74
140
|
schema: BaseSchema,
|
|
75
141
|
model_version: str | None = None,
|
|
76
142
|
batch_id: str | None = None,
|
|
77
|
-
) -> list[
|
|
78
|
-
"""Validate required checks for schema, environment, and DataFrame structure."""
|
|
143
|
+
) -> list[ValidationError]:
|
|
144
|
+
"""Validate required checks for schema, environment, and :class:`pandas.DataFrame` structure."""
|
|
79
145
|
general_checks = chain(
|
|
80
146
|
Validator._check_valid_schema_type(schema, environment),
|
|
81
147
|
Validator._check_field_convertible_to_str(
|
|
@@ -115,7 +181,7 @@ class Validator:
|
|
|
115
181
|
metric_families: list[Metrics] | None = None,
|
|
116
182
|
model_version: str | None = None,
|
|
117
183
|
batch_id: str | None = None,
|
|
118
|
-
) -> list[
|
|
184
|
+
) -> list[ValidationError]:
|
|
119
185
|
"""Validate parameters including model type, environment, and schema consistency."""
|
|
120
186
|
# general checks
|
|
121
187
|
general_checks = chain(
|
|
@@ -223,7 +289,7 @@ class Validator:
|
|
|
223
289
|
model_type: ModelTypes,
|
|
224
290
|
schema: BaseSchema,
|
|
225
291
|
pyarrow_schema: pa.Schema,
|
|
226
|
-
) -> list[
|
|
292
|
+
) -> list[ValidationError]:
|
|
227
293
|
"""Validate column data types against expected types for the schema."""
|
|
228
294
|
column_types = dict(
|
|
229
295
|
zip(pyarrow_schema.names, pyarrow_schema.types, strict=True)
|
|
@@ -323,7 +389,7 @@ class Validator:
|
|
|
323
389
|
environment: Environments,
|
|
324
390
|
schema: BaseSchema,
|
|
325
391
|
model_type: ModelTypes,
|
|
326
|
-
) -> list[
|
|
392
|
+
) -> list[ValidationError]:
|
|
327
393
|
"""Validate data values including ranges, formats, and consistency checks."""
|
|
328
394
|
# ASSUMPTION: at this point the param and type checks should have passed.
|
|
329
395
|
# This function may crash if that is not true, e.g. if columns are missing
|
|
@@ -350,25 +416,25 @@ class Validator:
|
|
|
350
416
|
if isinstance(schema, Schema):
|
|
351
417
|
general_checks = chain(
|
|
352
418
|
general_checks,
|
|
353
|
-
Validator._check_value_timestamp(dataframe, schema),
|
|
354
|
-
Validator._check_id_field_str_length(
|
|
419
|
+
Validator._check_value_timestamp(dataframe, schema), # type: ignore[arg-type]
|
|
420
|
+
Validator._check_id_field_str_length( # type: ignore[arg-type]
|
|
355
421
|
dataframe,
|
|
356
422
|
"prediction_id_column_name",
|
|
357
423
|
schema.prediction_id_column_name,
|
|
358
424
|
),
|
|
359
|
-
Validator._check_embedding_vectors_dimensionality(
|
|
425
|
+
Validator._check_embedding_vectors_dimensionality( # type: ignore[arg-type]
|
|
360
426
|
dataframe, schema
|
|
361
427
|
),
|
|
362
|
-
Validator._check_embedding_raw_data_characters(
|
|
428
|
+
Validator._check_embedding_raw_data_characters( # type: ignore[arg-type]
|
|
363
429
|
dataframe, schema
|
|
364
430
|
),
|
|
365
|
-
Validator._check_invalid_record_prod(
|
|
431
|
+
Validator._check_invalid_record_prod( # type: ignore[arg-type]
|
|
366
432
|
dataframe, environment, schema, model_type
|
|
367
433
|
),
|
|
368
|
-
Validator._check_invalid_record_preprod(
|
|
434
|
+
Validator._check_invalid_record_preprod( # type: ignore[arg-type]
|
|
369
435
|
dataframe, environment, schema, model_type
|
|
370
436
|
),
|
|
371
|
-
Validator._check_value_tag(dataframe, schema),
|
|
437
|
+
Validator._check_value_tag(dataframe, schema), # type: ignore[arg-type]
|
|
372
438
|
)
|
|
373
439
|
if model_type == ModelTypes.RANKING:
|
|
374
440
|
r_checks = chain(
|
|
@@ -444,15 +510,15 @@ class Validator:
|
|
|
444
510
|
@staticmethod
|
|
445
511
|
def _check_column_names_for_empty_strings(
|
|
446
512
|
schema: BaseSchema,
|
|
447
|
-
) -> list[
|
|
513
|
+
) -> list[InvalidColumnNameEmptyString]:
|
|
448
514
|
if "" in schema.get_used_columns():
|
|
449
|
-
return [
|
|
515
|
+
return [InvalidColumnNameEmptyString()]
|
|
450
516
|
return []
|
|
451
517
|
|
|
452
518
|
@staticmethod
|
|
453
519
|
def _check_field_convertible_to_str(
|
|
454
520
|
model_id: object, model_version: object, batch_id: object
|
|
455
|
-
) -> list[
|
|
521
|
+
) -> list[InvalidFieldTypeConversion]:
|
|
456
522
|
# converting to a set first makes the checks run a lot faster
|
|
457
523
|
wrong_fields = []
|
|
458
524
|
if model_id is not None and not isinstance(model_id, str):
|
|
@@ -472,61 +538,59 @@ class Validator:
|
|
|
472
538
|
wrong_fields.append("batch_id")
|
|
473
539
|
|
|
474
540
|
if wrong_fields:
|
|
475
|
-
return [
|
|
541
|
+
return [InvalidFieldTypeConversion(wrong_fields, "string")]
|
|
476
542
|
return []
|
|
477
543
|
|
|
478
544
|
@staticmethod
|
|
479
545
|
def _check_field_type_embedding_features_column_names(
|
|
480
546
|
schema: Schema,
|
|
481
|
-
) -> list[
|
|
547
|
+
) -> list[InvalidFieldTypeEmbeddingFeatures]:
|
|
482
548
|
if schema.embedding_feature_column_names is not None:
|
|
483
549
|
if not isinstance(schema.embedding_feature_column_names, dict):
|
|
484
|
-
return [
|
|
550
|
+
return [InvalidFieldTypeEmbeddingFeatures()]
|
|
485
551
|
for k, v in schema.embedding_feature_column_names.items():
|
|
486
552
|
if not isinstance(k, str) or not isinstance(
|
|
487
553
|
v, EmbeddingColumnNames
|
|
488
554
|
):
|
|
489
|
-
return [
|
|
555
|
+
return [InvalidFieldTypeEmbeddingFeatures()]
|
|
490
556
|
return []
|
|
491
557
|
|
|
492
558
|
@staticmethod
|
|
493
559
|
def _check_field_type_prompt_response(
|
|
494
560
|
schema: Schema,
|
|
495
|
-
) -> list[
|
|
496
|
-
errors = []
|
|
561
|
+
) -> list[InvalidFieldTypePromptResponse]:
|
|
562
|
+
errors: list[InvalidFieldTypePromptResponse] = []
|
|
497
563
|
if schema.prompt_column_names is not None and not isinstance(
|
|
498
564
|
schema.prompt_column_names, (str, EmbeddingColumnNames)
|
|
499
565
|
):
|
|
500
|
-
errors.append(
|
|
501
|
-
err.InvalidFieldTypePromptResponse("prompt_column_names")
|
|
502
|
-
)
|
|
566
|
+
errors.append(InvalidFieldTypePromptResponse("prompt_column_names"))
|
|
503
567
|
if schema.response_column_names is not None and not isinstance(
|
|
504
568
|
schema.response_column_names, (str, EmbeddingColumnNames)
|
|
505
569
|
):
|
|
506
570
|
errors.append(
|
|
507
|
-
|
|
571
|
+
InvalidFieldTypePromptResponse("response_column_names")
|
|
508
572
|
)
|
|
509
573
|
return errors
|
|
510
574
|
|
|
511
575
|
@staticmethod
|
|
512
576
|
def _check_field_type_prompt_templates(
|
|
513
577
|
schema: Schema,
|
|
514
|
-
) -> list[
|
|
578
|
+
) -> list[InvalidFieldTypePromptTemplates]:
|
|
515
579
|
if schema.prompt_template_column_names is not None and not isinstance(
|
|
516
580
|
schema.prompt_template_column_names, PromptTemplateColumnNames
|
|
517
581
|
):
|
|
518
|
-
return [
|
|
582
|
+
return [InvalidFieldTypePromptTemplates()]
|
|
519
583
|
return []
|
|
520
584
|
|
|
521
585
|
@staticmethod
|
|
522
586
|
def _check_field_type_llm_config(
|
|
523
587
|
dataframe: pd.DataFrame,
|
|
524
588
|
schema: Schema,
|
|
525
|
-
) -> list[
|
|
589
|
+
) -> list[InvalidFieldTypeLlmConfig | InvalidTypeColumns]:
|
|
526
590
|
if schema.llm_config_column_names is None:
|
|
527
591
|
return []
|
|
528
592
|
if not isinstance(schema.llm_config_column_names, LLMConfigColumnNames):
|
|
529
|
-
return [
|
|
593
|
+
return [InvalidFieldTypeLlmConfig()]
|
|
530
594
|
col = schema.llm_config_column_names.params_column_name
|
|
531
595
|
# We check the types if the columns are in the dataframe.
|
|
532
596
|
# If the columns are reflected in the schema but not present
|
|
@@ -545,7 +609,7 @@ class Validator:
|
|
|
545
609
|
)
|
|
546
610
|
):
|
|
547
611
|
return [
|
|
548
|
-
|
|
612
|
+
InvalidTypeColumns(
|
|
549
613
|
wrong_type_columns=[col],
|
|
550
614
|
expected_types=[
|
|
551
615
|
"Dict[str, (bool, int, float, string or list[str])]"
|
|
@@ -557,9 +621,9 @@ class Validator:
|
|
|
557
621
|
@staticmethod
|
|
558
622
|
def _check_invalid_index(
|
|
559
623
|
dataframe: pd.DataFrame,
|
|
560
|
-
) -> list[
|
|
624
|
+
) -> list[InvalidDataFrameIndex]:
|
|
561
625
|
if (dataframe.index != dataframe.reset_index(drop=True).index).any():
|
|
562
|
-
return [
|
|
626
|
+
return [InvalidDataFrameIndex()]
|
|
563
627
|
return []
|
|
564
628
|
|
|
565
629
|
# ----------------
|
|
@@ -571,7 +635,7 @@ class Validator:
|
|
|
571
635
|
model_type: ModelTypes,
|
|
572
636
|
metric_families: list[Metrics] | None,
|
|
573
637
|
schema: Schema,
|
|
574
|
-
) -> list[
|
|
638
|
+
) -> list[ValidationError]:
|
|
575
639
|
if metric_families is None:
|
|
576
640
|
return []
|
|
577
641
|
|
|
@@ -597,7 +661,7 @@ class Validator:
|
|
|
597
661
|
if not valid_combination:
|
|
598
662
|
# Model type + metrics combination is not valid.
|
|
599
663
|
return [
|
|
600
|
-
|
|
664
|
+
InvalidModelTypeAndMetricsCombination(
|
|
601
665
|
model_type,
|
|
602
666
|
metric_families,
|
|
603
667
|
suggested_model_metric_combinations,
|
|
@@ -606,7 +670,7 @@ class Validator:
|
|
|
606
670
|
if missing_columns:
|
|
607
671
|
# For this model type, the schema is missing columns required for the requested metrics.
|
|
608
672
|
return [
|
|
609
|
-
|
|
673
|
+
MissingRequiredColumnsMetricsValidation(
|
|
610
674
|
model_type, metric_families, missing_columns
|
|
611
675
|
)
|
|
612
676
|
]
|
|
@@ -619,7 +683,7 @@ class Validator:
|
|
|
619
683
|
schema: Schema,
|
|
620
684
|
required_columns_map: list[dict[str, Any]],
|
|
621
685
|
) -> tuple[bool, list[str], list[list[str]]]:
|
|
622
|
-
missing_columns = []
|
|
686
|
+
missing_columns: list[str] = []
|
|
623
687
|
for item in required_columns_map:
|
|
624
688
|
if model_type.name.lower() == item.get("external_model_type"):
|
|
625
689
|
is_valid_combination = False
|
|
@@ -674,7 +738,7 @@ class Validator:
|
|
|
674
738
|
@staticmethod
|
|
675
739
|
def _check_existence_prediction_id_column_delayed_schema(
|
|
676
740
|
schema: Schema, model_type: ModelTypes
|
|
677
|
-
) -> list[
|
|
741
|
+
) -> list[MissingPredictionIdColumnForDelayedRecords]:
|
|
678
742
|
if schema.prediction_id_column_name is not None:
|
|
679
743
|
return []
|
|
680
744
|
# TODO: Revise logic once prediction_label column addition (for generative models)
|
|
@@ -683,7 +747,7 @@ class Validator:
|
|
|
683
747
|
# We skip GENERATIVE model types since they are assigned a default
|
|
684
748
|
# prediction label column with values equal 1
|
|
685
749
|
return [
|
|
686
|
-
|
|
750
|
+
MissingPredictionIdColumnForDelayedRecords(
|
|
687
751
|
schema.has_actual_columns(),
|
|
688
752
|
schema.has_feature_importance_columns(),
|
|
689
753
|
)
|
|
@@ -705,7 +769,7 @@ class Validator:
|
|
|
705
769
|
def _check_missing_columns(
|
|
706
770
|
dataframe: pd.DataFrame,
|
|
707
771
|
schema: BaseSchema,
|
|
708
|
-
) -> list[
|
|
772
|
+
) -> list[MissingColumns]:
|
|
709
773
|
if isinstance(schema, CorpusSchema):
|
|
710
774
|
return Validator._check_missing_columns_corpus_schema(
|
|
711
775
|
dataframe, schema
|
|
@@ -718,7 +782,7 @@ class Validator:
|
|
|
718
782
|
def _check_missing_columns_schema(
|
|
719
783
|
dataframe: pd.DataFrame,
|
|
720
784
|
schema: Schema,
|
|
721
|
-
) -> list[
|
|
785
|
+
) -> list[MissingColumns]:
|
|
722
786
|
# converting to a set first makes the checks run a lot faster
|
|
723
787
|
existing_columns = set(dataframe.columns)
|
|
724
788
|
missing_columns = []
|
|
@@ -733,7 +797,9 @@ class Validator:
|
|
|
733
797
|
missing_columns.extend(
|
|
734
798
|
[
|
|
735
799
|
col
|
|
736
|
-
for col in
|
|
800
|
+
for col in _normalize_column_names(
|
|
801
|
+
schema.feature_column_names
|
|
802
|
+
)
|
|
737
803
|
if col not in existing_columns
|
|
738
804
|
]
|
|
739
805
|
)
|
|
@@ -768,7 +834,7 @@ class Validator:
|
|
|
768
834
|
missing_columns.extend(
|
|
769
835
|
[
|
|
770
836
|
col
|
|
771
|
-
for col in schema.tag_column_names
|
|
837
|
+
for col in _normalize_column_names(schema.tag_column_names)
|
|
772
838
|
if col not in existing_columns
|
|
773
839
|
]
|
|
774
840
|
)
|
|
@@ -901,14 +967,14 @@ class Validator:
|
|
|
901
967
|
)
|
|
902
968
|
|
|
903
969
|
if missing_columns:
|
|
904
|
-
return [
|
|
970
|
+
return [MissingColumns(missing_columns)]
|
|
905
971
|
return []
|
|
906
972
|
|
|
907
973
|
@staticmethod
|
|
908
974
|
def _check_missing_columns_corpus_schema(
|
|
909
975
|
dataframe: pd.DataFrame,
|
|
910
976
|
schema: CorpusSchema,
|
|
911
|
-
) -> list[
|
|
977
|
+
) -> list[MissingColumns]:
|
|
912
978
|
# converting to a set first makes the checks run a lot faster
|
|
913
979
|
existing_columns = set(dataframe.columns)
|
|
914
980
|
missing_columns = []
|
|
@@ -958,19 +1024,19 @@ class Validator:
|
|
|
958
1024
|
schema.document_text_embedding_column_names.link_to_data_column_name
|
|
959
1025
|
)
|
|
960
1026
|
if missing_columns:
|
|
961
|
-
return [
|
|
1027
|
+
return [MissingColumns(missing_columns)]
|
|
962
1028
|
return []
|
|
963
1029
|
|
|
964
1030
|
@staticmethod
|
|
965
1031
|
def _check_valid_schema_type(
|
|
966
1032
|
schema: BaseSchema,
|
|
967
1033
|
environment: Environments,
|
|
968
|
-
) -> list[
|
|
1034
|
+
) -> list[InvalidSchemaType]:
|
|
969
1035
|
if environment == Environments.CORPUS and not (
|
|
970
1036
|
isinstance(schema, CorpusSchema)
|
|
971
1037
|
):
|
|
972
1038
|
return [
|
|
973
|
-
|
|
1039
|
+
InvalidSchemaType(
|
|
974
1040
|
schema_type=str(type(schema)), environment=environment
|
|
975
1041
|
)
|
|
976
1042
|
]
|
|
@@ -978,7 +1044,7 @@ class Validator:
|
|
|
978
1044
|
schema, CorpusSchema
|
|
979
1045
|
):
|
|
980
1046
|
return [
|
|
981
|
-
|
|
1047
|
+
InvalidSchemaType(
|
|
982
1048
|
schema_type=str(type(schema)), environment=environment
|
|
983
1049
|
)
|
|
984
1050
|
]
|
|
@@ -987,26 +1053,23 @@ class Validator:
|
|
|
987
1053
|
@staticmethod
|
|
988
1054
|
def _check_invalid_shap_suffix(
|
|
989
1055
|
schema: Schema,
|
|
990
|
-
) -> list[
|
|
1056
|
+
) -> list[InvalidShapSuffix]:
|
|
991
1057
|
invalid_column_names = set()
|
|
992
1058
|
|
|
993
1059
|
if schema.feature_column_names is not None:
|
|
994
|
-
for col in schema.feature_column_names:
|
|
1060
|
+
for col in _normalize_column_names(schema.feature_column_names):
|
|
995
1061
|
if isinstance(col, str) and col.endswith("_shap"):
|
|
996
1062
|
invalid_column_names.add(col)
|
|
997
1063
|
|
|
998
1064
|
if schema.embedding_feature_column_names is not None:
|
|
999
1065
|
for emb_col_names in schema.embedding_feature_column_names.values():
|
|
1000
|
-
for
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
and isinstance(col, str)
|
|
1004
|
-
and col.endswith("_shap")
|
|
1005
|
-
):
|
|
1066
|
+
cols_list = [c for c in emb_col_names if c is not None]
|
|
1067
|
+
for col in cols_list:
|
|
1068
|
+
if col.endswith("_shap"):
|
|
1006
1069
|
invalid_column_names.add(col)
|
|
1007
1070
|
|
|
1008
1071
|
if schema.tag_column_names is not None:
|
|
1009
|
-
for col in schema.tag_column_names:
|
|
1072
|
+
for col in _normalize_column_names(schema.tag_column_names):
|
|
1010
1073
|
if isinstance(col, str) and col.endswith("_shap"):
|
|
1011
1074
|
invalid_column_names.add(col)
|
|
1012
1075
|
|
|
@@ -1016,14 +1079,14 @@ class Validator:
|
|
|
1016
1079
|
invalid_column_names.add(col)
|
|
1017
1080
|
|
|
1018
1081
|
if invalid_column_names:
|
|
1019
|
-
return [
|
|
1082
|
+
return [InvalidShapSuffix(invalid_column_names)]
|
|
1020
1083
|
return []
|
|
1021
1084
|
|
|
1022
1085
|
@staticmethod
|
|
1023
1086
|
def _check_reserved_columns(
|
|
1024
1087
|
schema: BaseSchema,
|
|
1025
1088
|
model_type: ModelTypes,
|
|
1026
|
-
) -> list[
|
|
1089
|
+
) -> list[ReservedColumns]:
|
|
1027
1090
|
if isinstance(schema, CorpusSchema):
|
|
1028
1091
|
return []
|
|
1029
1092
|
if isinstance(schema, Schema):
|
|
@@ -1127,29 +1190,29 @@ class Validator:
|
|
|
1127
1190
|
)
|
|
1128
1191
|
|
|
1129
1192
|
if reserved_columns:
|
|
1130
|
-
return [
|
|
1193
|
+
return [ReservedColumns(reserved_columns)]
|
|
1131
1194
|
return []
|
|
1132
1195
|
|
|
1133
1196
|
@staticmethod
|
|
1134
1197
|
def _check_invalid_model_id(
|
|
1135
1198
|
model_id: str | None,
|
|
1136
|
-
) -> list[
|
|
1199
|
+
) -> list[InvalidModelId]:
|
|
1137
1200
|
# assume it's been coerced to string beforehand
|
|
1138
1201
|
if (not isinstance(model_id, str)) or len(model_id.strip()) == 0:
|
|
1139
|
-
return [
|
|
1202
|
+
return [InvalidModelId()]
|
|
1140
1203
|
return []
|
|
1141
1204
|
|
|
1142
1205
|
@staticmethod
|
|
1143
1206
|
def _check_invalid_model_version(
|
|
1144
1207
|
model_version: str | None = None,
|
|
1145
|
-
) -> list[
|
|
1208
|
+
) -> list[InvalidModelVersion]:
|
|
1146
1209
|
if model_version is None:
|
|
1147
1210
|
return []
|
|
1148
1211
|
if (
|
|
1149
1212
|
not isinstance(model_version, str)
|
|
1150
1213
|
or len(model_version.strip()) == 0
|
|
1151
1214
|
):
|
|
1152
|
-
return [
|
|
1215
|
+
return [InvalidModelVersion()]
|
|
1153
1216
|
|
|
1154
1217
|
return []
|
|
1155
1218
|
|
|
@@ -1157,35 +1220,35 @@ class Validator:
|
|
|
1157
1220
|
def _check_invalid_batch_id(
|
|
1158
1221
|
batch_id: str | None,
|
|
1159
1222
|
environment: Environments,
|
|
1160
|
-
) -> list[
|
|
1223
|
+
) -> list[InvalidBatchId]:
|
|
1161
1224
|
# assume it's been coerced to string beforehand
|
|
1162
1225
|
if environment in (Environments.VALIDATION,) and (
|
|
1163
1226
|
(not isinstance(batch_id, str)) or len(batch_id.strip()) == 0
|
|
1164
1227
|
):
|
|
1165
|
-
return [
|
|
1228
|
+
return [InvalidBatchId()]
|
|
1166
1229
|
return []
|
|
1167
1230
|
|
|
1168
1231
|
@staticmethod
|
|
1169
1232
|
def _check_invalid_model_type(
|
|
1170
1233
|
model_type: ModelTypes,
|
|
1171
|
-
) -> list[
|
|
1234
|
+
) -> list[InvalidModelType]:
|
|
1172
1235
|
if model_type in (mt for mt in ModelTypes):
|
|
1173
1236
|
return []
|
|
1174
|
-
return [
|
|
1237
|
+
return [InvalidModelType()]
|
|
1175
1238
|
|
|
1176
1239
|
@staticmethod
|
|
1177
1240
|
def _check_invalid_environment(
|
|
1178
1241
|
environment: Environments,
|
|
1179
|
-
) -> list[
|
|
1242
|
+
) -> list[InvalidEnvironment]:
|
|
1180
1243
|
if environment in (env for env in Environments):
|
|
1181
1244
|
return []
|
|
1182
|
-
return [
|
|
1245
|
+
return [InvalidEnvironment()]
|
|
1183
1246
|
|
|
1184
1247
|
@staticmethod
|
|
1185
1248
|
def _check_existence_preprod_pred_act_score_or_label(
|
|
1186
1249
|
schema: Schema,
|
|
1187
1250
|
environment: Environments,
|
|
1188
|
-
) -> list[
|
|
1251
|
+
) -> list[MissingPreprodPredActNumericAndCategorical]:
|
|
1189
1252
|
if environment in (Environments.VALIDATION, Environments.TRAINING) and (
|
|
1190
1253
|
(
|
|
1191
1254
|
schema.prediction_label_column_name is None
|
|
@@ -1196,13 +1259,13 @@ class Validator:
|
|
|
1196
1259
|
and schema.actual_score_column_name is None
|
|
1197
1260
|
)
|
|
1198
1261
|
):
|
|
1199
|
-
return [
|
|
1262
|
+
return [MissingPreprodPredActNumericAndCategorical()]
|
|
1200
1263
|
return []
|
|
1201
1264
|
|
|
1202
1265
|
@staticmethod
|
|
1203
1266
|
def _check_exactly_one_cv_column_type(
|
|
1204
1267
|
schema: Schema, environment: Environments
|
|
1205
|
-
) -> list[
|
|
1268
|
+
) -> list[MultipleCVPredAct | MissingCVPredAct]:
|
|
1206
1269
|
# Checks that the required prediction/actual columns are given in the schema depending on
|
|
1207
1270
|
# the environment, for object detection models. There should be exactly one of
|
|
1208
1271
|
# object detection, semantic segmentation, or instance segmentation columns.
|
|
@@ -1232,9 +1295,9 @@ class Validator:
|
|
|
1232
1295
|
)
|
|
1233
1296
|
|
|
1234
1297
|
if cv_types_count == 0:
|
|
1235
|
-
return [
|
|
1298
|
+
return [MissingCVPredAct(environment)]
|
|
1236
1299
|
if cv_types_count > 1:
|
|
1237
|
-
return [
|
|
1300
|
+
return [MultipleCVPredAct(environment)]
|
|
1238
1301
|
|
|
1239
1302
|
elif environment in (
|
|
1240
1303
|
Environments.TRAINING,
|
|
@@ -1265,16 +1328,16 @@ class Validator:
|
|
|
1265
1328
|
)
|
|
1266
1329
|
|
|
1267
1330
|
if cv_types_count == 0:
|
|
1268
|
-
return [
|
|
1331
|
+
return [MissingCVPredAct(environment)]
|
|
1269
1332
|
if cv_types_count > 1:
|
|
1270
|
-
return [
|
|
1333
|
+
return [MultipleCVPredAct(environment)]
|
|
1271
1334
|
|
|
1272
1335
|
return []
|
|
1273
1336
|
|
|
1274
1337
|
@staticmethod
|
|
1275
1338
|
def _check_missing_object_detection_columns(
|
|
1276
1339
|
schema: Schema, model_type: ModelTypes
|
|
1277
|
-
) -> list[
|
|
1340
|
+
) -> list[InvalidPredActCVColumnNamesForModelType]:
|
|
1278
1341
|
# Checks that models that are not Object Detection models don't have, in the schema, the
|
|
1279
1342
|
# object detection, semantic segmentation, or instance segmentation dedicated prediction/actual
|
|
1280
1343
|
# column names
|
|
@@ -1286,13 +1349,13 @@ class Validator:
|
|
|
1286
1349
|
or schema.instance_segmentation_prediction_column_names is not None
|
|
1287
1350
|
or schema.instance_segmentation_actual_column_names is not None
|
|
1288
1351
|
):
|
|
1289
|
-
return [
|
|
1352
|
+
return [InvalidPredActCVColumnNamesForModelType(model_type)]
|
|
1290
1353
|
return []
|
|
1291
1354
|
|
|
1292
1355
|
@staticmethod
|
|
1293
1356
|
def _check_missing_non_object_detection_columns(
|
|
1294
1357
|
schema: Schema, model_type: ModelTypes
|
|
1295
|
-
) -> list[
|
|
1358
|
+
) -> list[InvalidPredActColumnNamesForModelType]:
|
|
1296
1359
|
# Checks that object detection models don't have, in the schema, the columns reserved for
|
|
1297
1360
|
# other model types
|
|
1298
1361
|
columns_to_check = (
|
|
@@ -1317,7 +1380,7 @@ class Validator:
|
|
|
1317
1380
|
"instance_segmentation_actual_column_names",
|
|
1318
1381
|
]
|
|
1319
1382
|
return [
|
|
1320
|
-
|
|
1383
|
+
InvalidPredActColumnNamesForModelType(
|
|
1321
1384
|
model_type, allowed_cols, wrong_cols
|
|
1322
1385
|
)
|
|
1323
1386
|
]
|
|
@@ -1326,7 +1389,7 @@ class Validator:
|
|
|
1326
1389
|
@staticmethod
|
|
1327
1390
|
def _check_missing_multi_class_columns(
|
|
1328
1391
|
schema: Schema, model_type: ModelTypes
|
|
1329
|
-
) -> list[
|
|
1392
|
+
) -> list[InvalidPredActColumnNamesForModelType]:
|
|
1330
1393
|
# Checks that models that are not Multi Class models don't have, in the schema, the
|
|
1331
1394
|
# multi class dedicated threshold column
|
|
1332
1395
|
if (
|
|
@@ -1334,9 +1397,9 @@ class Validator:
|
|
|
1334
1397
|
and schema.multi_class_threshold_scores_column_name is not None
|
|
1335
1398
|
):
|
|
1336
1399
|
return [
|
|
1337
|
-
|
|
1400
|
+
InvalidPredActColumnNamesForModelType(
|
|
1338
1401
|
model_type,
|
|
1339
|
-
None,
|
|
1402
|
+
None, # type: ignore[arg-type]
|
|
1340
1403
|
[schema.multi_class_threshold_scores_column_name],
|
|
1341
1404
|
)
|
|
1342
1405
|
]
|
|
@@ -1345,7 +1408,7 @@ class Validator:
|
|
|
1345
1408
|
@staticmethod
|
|
1346
1409
|
def _check_existing_multi_class_columns(
|
|
1347
1410
|
schema: Schema,
|
|
1348
|
-
) -> list[
|
|
1411
|
+
) -> list[MissingReqPredActColumnNamesForMultiClass]:
|
|
1349
1412
|
# Checks that models that are Multi Class models have, in the schema, the
|
|
1350
1413
|
# required prediction score or actual score columns
|
|
1351
1414
|
if (
|
|
@@ -1355,13 +1418,13 @@ class Validator:
|
|
|
1355
1418
|
schema.multi_class_threshold_scores_column_name is not None
|
|
1356
1419
|
and schema.prediction_score_column_name is None
|
|
1357
1420
|
):
|
|
1358
|
-
return [
|
|
1421
|
+
return [MissingReqPredActColumnNamesForMultiClass()]
|
|
1359
1422
|
return []
|
|
1360
1423
|
|
|
1361
1424
|
@staticmethod
|
|
1362
1425
|
def _check_missing_non_multi_class_columns(
|
|
1363
1426
|
schema: Schema, model_type: ModelTypes
|
|
1364
|
-
) -> list[
|
|
1427
|
+
) -> list[InvalidPredActColumnNamesForModelType]:
|
|
1365
1428
|
# Checks that multi class models don't have, in the schema, the columns reserved for
|
|
1366
1429
|
# other model types
|
|
1367
1430
|
columns_to_check = (
|
|
@@ -1387,8 +1450,10 @@ class Validator:
|
|
|
1387
1450
|
"actual_score_column_name",
|
|
1388
1451
|
]
|
|
1389
1452
|
return [
|
|
1390
|
-
|
|
1391
|
-
model_type,
|
|
1453
|
+
InvalidPredActColumnNamesForModelType(
|
|
1454
|
+
model_type,
|
|
1455
|
+
allowed_cols,
|
|
1456
|
+
wrong_cols, # type: ignore[arg-type]
|
|
1392
1457
|
)
|
|
1393
1458
|
]
|
|
1394
1459
|
return []
|
|
@@ -1397,17 +1462,17 @@ class Validator:
|
|
|
1397
1462
|
def _check_existence_preprod_act(
|
|
1398
1463
|
schema: Schema,
|
|
1399
1464
|
environment: Environments,
|
|
1400
|
-
) -> list[
|
|
1465
|
+
) -> list[MissingPreprodAct]:
|
|
1401
1466
|
if environment in (Environments.VALIDATION, Environments.TRAINING) and (
|
|
1402
1467
|
schema.actual_label_column_name is None
|
|
1403
1468
|
):
|
|
1404
|
-
return [
|
|
1469
|
+
return [MissingPreprodAct()]
|
|
1405
1470
|
return []
|
|
1406
1471
|
|
|
1407
1472
|
@staticmethod
|
|
1408
1473
|
def _check_existence_group_id_rank_category_relevance(
|
|
1409
1474
|
schema: Schema,
|
|
1410
|
-
) -> list[
|
|
1475
|
+
) -> list[MissingRequiredColumnsForRankingModel]:
|
|
1411
1476
|
# prediction_group_id and rank columns are required as ranking prediction columns.
|
|
1412
1477
|
ranking_prediction_cols = (
|
|
1413
1478
|
schema.prediction_label_column_name,
|
|
@@ -1425,13 +1490,13 @@ class Validator:
|
|
|
1425
1490
|
# If there is prediction information (not delayed actuals),
|
|
1426
1491
|
# there must exist a rank and prediction group id columns
|
|
1427
1492
|
if has_prediction_info and any(col is None for col in required):
|
|
1428
|
-
return [
|
|
1493
|
+
return [MissingRequiredColumnsForRankingModel()]
|
|
1429
1494
|
return []
|
|
1430
1495
|
|
|
1431
1496
|
@staticmethod
|
|
1432
1497
|
def _check_dataframe_for_duplicate_columns(
|
|
1433
1498
|
schema: BaseSchema, dataframe: pd.DataFrame
|
|
1434
|
-
) -> list[
|
|
1499
|
+
) -> list[DuplicateColumnsInDataframe]:
|
|
1435
1500
|
# Get the columns used in the schema
|
|
1436
1501
|
schema_col_used = schema.get_used_columns()
|
|
1437
1502
|
# Get the duplicated column names from the dataframe
|
|
@@ -1441,17 +1506,17 @@ class Validator:
|
|
|
1441
1506
|
col for col in duplicate_columns if col in schema_col_used
|
|
1442
1507
|
]
|
|
1443
1508
|
if schema_duplicate_cols:
|
|
1444
|
-
return [
|
|
1509
|
+
return [DuplicateColumnsInDataframe(schema_duplicate_cols)]
|
|
1445
1510
|
return []
|
|
1446
1511
|
|
|
1447
1512
|
@staticmethod
|
|
1448
1513
|
def _check_invalid_number_of_embeddings(
|
|
1449
1514
|
schema: Schema,
|
|
1450
|
-
) -> list[
|
|
1515
|
+
) -> list[InvalidNumberOfEmbeddings]:
|
|
1451
1516
|
if schema.embedding_feature_column_names is not None:
|
|
1452
1517
|
number_of_embeddings = len(schema.embedding_feature_column_names)
|
|
1453
1518
|
if number_of_embeddings > MAX_NUMBER_OF_EMBEDDINGS:
|
|
1454
|
-
return [
|
|
1519
|
+
return [InvalidNumberOfEmbeddings(number_of_embeddings)]
|
|
1455
1520
|
return []
|
|
1456
1521
|
|
|
1457
1522
|
# -----------
|
|
@@ -1461,7 +1526,7 @@ class Validator:
|
|
|
1461
1526
|
@staticmethod
|
|
1462
1527
|
def _check_type_prediction_id(
|
|
1463
1528
|
schema: Schema, column_types: dict[str, Any]
|
|
1464
|
-
) -> list[
|
|
1529
|
+
) -> list[InvalidType]:
|
|
1465
1530
|
col = schema.prediction_id_column_name
|
|
1466
1531
|
if col in column_types:
|
|
1467
1532
|
# should mirror server side
|
|
@@ -1474,7 +1539,7 @@ class Validator:
|
|
|
1474
1539
|
)
|
|
1475
1540
|
if column_types[col] not in allowed_datatypes:
|
|
1476
1541
|
return [
|
|
1477
|
-
|
|
1542
|
+
InvalidType(
|
|
1478
1543
|
"Prediction IDs",
|
|
1479
1544
|
expected_types=["str", "int"],
|
|
1480
1545
|
found_data_type=column_types[col],
|
|
@@ -1485,7 +1550,7 @@ class Validator:
|
|
|
1485
1550
|
@staticmethod
|
|
1486
1551
|
def _check_type_timestamp(
|
|
1487
1552
|
schema: Schema, column_types: dict[str, Any]
|
|
1488
|
-
) -> list[
|
|
1553
|
+
) -> list[InvalidType]:
|
|
1489
1554
|
col = schema.timestamp_column_name
|
|
1490
1555
|
if col in column_types:
|
|
1491
1556
|
# should mirror server side
|
|
@@ -1501,7 +1566,7 @@ class Validator:
|
|
|
1501
1566
|
and t not in allowed_datatypes
|
|
1502
1567
|
):
|
|
1503
1568
|
return [
|
|
1504
|
-
|
|
1569
|
+
InvalidType(
|
|
1505
1570
|
"Prediction timestamp",
|
|
1506
1571
|
expected_types=["Date", "Timestamp", "int", "float"],
|
|
1507
1572
|
found_data_type=t,
|
|
@@ -1512,7 +1577,7 @@ class Validator:
|
|
|
1512
1577
|
@staticmethod
|
|
1513
1578
|
def _check_type_features(
|
|
1514
1579
|
schema: Schema, column_types: dict[str, Any]
|
|
1515
|
-
) -> list[
|
|
1580
|
+
) -> list[InvalidTypeFeatures]:
|
|
1516
1581
|
if schema.feature_column_names is not None:
|
|
1517
1582
|
# should mirror server side
|
|
1518
1583
|
allowed_datatypes = (
|
|
@@ -1529,13 +1594,13 @@ class Validator:
|
|
|
1529
1594
|
)
|
|
1530
1595
|
wrong_type_cols = [
|
|
1531
1596
|
col
|
|
1532
|
-
for col in schema.feature_column_names
|
|
1597
|
+
for col in _normalize_column_names(schema.feature_column_names)
|
|
1533
1598
|
if col in column_types
|
|
1534
1599
|
and column_types[col] not in allowed_datatypes
|
|
1535
1600
|
]
|
|
1536
1601
|
if wrong_type_cols:
|
|
1537
1602
|
return [
|
|
1538
|
-
|
|
1603
|
+
InvalidTypeFeatures(
|
|
1539
1604
|
wrong_type_cols,
|
|
1540
1605
|
expected_types=[
|
|
1541
1606
|
"float",
|
|
@@ -1551,7 +1616,7 @@ class Validator:
|
|
|
1551
1616
|
@staticmethod
|
|
1552
1617
|
def _check_type_embedding_features(
|
|
1553
1618
|
schema: Schema, column_types: dict[str, Any]
|
|
1554
|
-
) -> list[
|
|
1619
|
+
) -> list[InvalidTypeFeatures]:
|
|
1555
1620
|
if schema.embedding_feature_column_names is not None:
|
|
1556
1621
|
# should mirror server side
|
|
1557
1622
|
allowed_vector_datatypes = (
|
|
@@ -1599,20 +1664,20 @@ class Validator:
|
|
|
1599
1664
|
wrong_type_embedding_errors = []
|
|
1600
1665
|
if wrong_type_vector_columns:
|
|
1601
1666
|
wrong_type_embedding_errors.append(
|
|
1602
|
-
|
|
1667
|
+
InvalidTypeFeatures(
|
|
1603
1668
|
wrong_type_vector_columns,
|
|
1604
1669
|
expected_types=["list[float], np.array[float]"],
|
|
1605
1670
|
)
|
|
1606
1671
|
)
|
|
1607
1672
|
if wrong_type_data_columns:
|
|
1608
1673
|
wrong_type_embedding_errors.append(
|
|
1609
|
-
|
|
1674
|
+
InvalidTypeFeatures(
|
|
1610
1675
|
wrong_type_data_columns, expected_types=["list[string]"]
|
|
1611
1676
|
)
|
|
1612
1677
|
)
|
|
1613
1678
|
if wrong_type_link_to_data_columns:
|
|
1614
1679
|
wrong_type_embedding_errors.append(
|
|
1615
|
-
|
|
1680
|
+
InvalidTypeFeatures(
|
|
1616
1681
|
wrong_type_link_to_data_columns,
|
|
1617
1682
|
expected_types=["string"],
|
|
1618
1683
|
)
|
|
@@ -1627,7 +1692,7 @@ class Validator:
|
|
|
1627
1692
|
@staticmethod
|
|
1628
1693
|
def _check_type_tags(
|
|
1629
1694
|
schema: Schema, column_types: dict[str, Any]
|
|
1630
|
-
) -> list[
|
|
1695
|
+
) -> list[InvalidTypeTags]:
|
|
1631
1696
|
if schema.tag_column_names is not None:
|
|
1632
1697
|
# should mirror server side
|
|
1633
1698
|
allowed_datatypes = (
|
|
@@ -1643,13 +1708,13 @@ class Validator:
|
|
|
1643
1708
|
)
|
|
1644
1709
|
wrong_type_cols = [
|
|
1645
1710
|
col
|
|
1646
|
-
for col in schema.tag_column_names
|
|
1711
|
+
for col in _normalize_column_names(schema.tag_column_names)
|
|
1647
1712
|
if col in column_types
|
|
1648
1713
|
and column_types[col] not in allowed_datatypes
|
|
1649
1714
|
]
|
|
1650
1715
|
if wrong_type_cols:
|
|
1651
1716
|
return [
|
|
1652
|
-
|
|
1717
|
+
InvalidTypeTags(
|
|
1653
1718
|
wrong_type_cols, ["float", "int", "bool", "str"]
|
|
1654
1719
|
)
|
|
1655
1720
|
]
|
|
@@ -1658,7 +1723,7 @@ class Validator:
|
|
|
1658
1723
|
@staticmethod
|
|
1659
1724
|
def _check_type_shap_values(
|
|
1660
1725
|
schema: Schema, column_types: dict[str, Any]
|
|
1661
|
-
) -> list[
|
|
1726
|
+
) -> list[InvalidTypeShapValues]:
|
|
1662
1727
|
if schema.shap_values_column_names is not None:
|
|
1663
1728
|
# should mirror server side
|
|
1664
1729
|
allowed_datatypes = (
|
|
@@ -1675,7 +1740,7 @@ class Validator:
|
|
|
1675
1740
|
]
|
|
1676
1741
|
if wrong_type_cols:
|
|
1677
1742
|
return [
|
|
1678
|
-
|
|
1743
|
+
InvalidTypeShapValues(
|
|
1679
1744
|
wrong_type_cols, expected_types=["float", "int"]
|
|
1680
1745
|
)
|
|
1681
1746
|
]
|
|
@@ -1684,12 +1749,13 @@ class Validator:
|
|
|
1684
1749
|
@staticmethod
|
|
1685
1750
|
def _check_type_pred_act_labels(
|
|
1686
1751
|
model_type: ModelTypes, schema: Schema, column_types: dict[str, Any]
|
|
1687
|
-
) -> list[
|
|
1752
|
+
) -> list[InvalidType]:
|
|
1688
1753
|
errors = []
|
|
1689
1754
|
columns = (
|
|
1690
1755
|
("Prediction labels", schema.prediction_label_column_name),
|
|
1691
1756
|
("Actual labels", schema.actual_label_column_name),
|
|
1692
1757
|
)
|
|
1758
|
+
allowed_datatypes: tuple[Any, ...]
|
|
1693
1759
|
if (
|
|
1694
1760
|
model_type in CATEGORICAL_MODEL_TYPES
|
|
1695
1761
|
or model_type == ModelTypes.GENERATIVE_LLM
|
|
@@ -1713,7 +1779,7 @@ class Validator:
|
|
|
1713
1779
|
and column_types[col] not in allowed_datatypes
|
|
1714
1780
|
):
|
|
1715
1781
|
errors.append(
|
|
1716
|
-
|
|
1782
|
+
InvalidType(
|
|
1717
1783
|
name,
|
|
1718
1784
|
expected_types=["float", "int", "bool", "str"],
|
|
1719
1785
|
found_data_type=column_types[col],
|
|
@@ -1737,7 +1803,7 @@ class Validator:
|
|
|
1737
1803
|
and column_types[col] not in allowed_datatypes
|
|
1738
1804
|
):
|
|
1739
1805
|
errors.append(
|
|
1740
|
-
|
|
1806
|
+
InvalidType(
|
|
1741
1807
|
name,
|
|
1742
1808
|
expected_types=["float", "int"],
|
|
1743
1809
|
found_data_type=column_types[col],
|
|
@@ -1748,7 +1814,7 @@ class Validator:
|
|
|
1748
1814
|
@staticmethod
|
|
1749
1815
|
def _check_type_pred_act_scores(
|
|
1750
1816
|
model_type: ModelTypes, schema: Schema, column_types: dict[str, Any]
|
|
1751
|
-
) -> list[
|
|
1817
|
+
) -> list[InvalidType]:
|
|
1752
1818
|
errors = []
|
|
1753
1819
|
columns = (
|
|
1754
1820
|
("Prediction scores", schema.prediction_score_column_name),
|
|
@@ -1777,7 +1843,7 @@ class Validator:
|
|
|
1777
1843
|
and column_types[col] not in allowed_datatypes
|
|
1778
1844
|
):
|
|
1779
1845
|
errors.append(
|
|
1780
|
-
|
|
1846
|
+
InvalidType(
|
|
1781
1847
|
name,
|
|
1782
1848
|
expected_types=["float", "int"],
|
|
1783
1849
|
found_data_type=column_types[col],
|
|
@@ -1788,7 +1854,7 @@ class Validator:
|
|
|
1788
1854
|
@staticmethod
|
|
1789
1855
|
def _check_type_multi_class_pred_threshold_act_scores(
|
|
1790
1856
|
schema: Schema, column_types: dict[str, Any]
|
|
1791
|
-
) -> list[
|
|
1857
|
+
) -> list[InvalidType]:
|
|
1792
1858
|
"""Check type for prediction / threshold / actual scores for multiclass model.
|
|
1793
1859
|
|
|
1794
1860
|
Expect the scores to be a list of pyarrow structs that contains field
|
|
@@ -1834,7 +1900,7 @@ class Validator:
|
|
|
1834
1900
|
and column_types[col] not in allowed_class_score_map_datatypes
|
|
1835
1901
|
):
|
|
1836
1902
|
errors.append(
|
|
1837
|
-
|
|
1903
|
+
InvalidType(
|
|
1838
1904
|
name,
|
|
1839
1905
|
expected_types=[
|
|
1840
1906
|
"List[Dict{class_name: str, score: int}]",
|
|
@@ -1848,7 +1914,7 @@ class Validator:
|
|
|
1848
1914
|
@staticmethod
|
|
1849
1915
|
def _check_type_prompt_response(
|
|
1850
1916
|
schema: Schema, column_types: dict[str, Any]
|
|
1851
|
-
) -> list[
|
|
1917
|
+
) -> list[InvalidTypeColumns]:
|
|
1852
1918
|
fields_to_check = []
|
|
1853
1919
|
if schema.prompt_column_names is not None:
|
|
1854
1920
|
fields_to_check.append(schema.prompt_column_names)
|
|
@@ -1895,20 +1961,20 @@ class Validator:
|
|
|
1895
1961
|
wrong_type_col_errors = []
|
|
1896
1962
|
if wrong_type_vector_columns:
|
|
1897
1963
|
wrong_type_col_errors.append(
|
|
1898
|
-
|
|
1964
|
+
InvalidTypeColumns(
|
|
1899
1965
|
wrong_type_vector_columns,
|
|
1900
1966
|
expected_types=["list[float], np.array[float]"],
|
|
1901
1967
|
)
|
|
1902
1968
|
)
|
|
1903
1969
|
if wrong_type_data_columns:
|
|
1904
1970
|
wrong_type_col_errors.append(
|
|
1905
|
-
|
|
1971
|
+
InvalidTypeColumns(
|
|
1906
1972
|
wrong_type_data_columns, expected_types=["str, list[str]"]
|
|
1907
1973
|
)
|
|
1908
1974
|
)
|
|
1909
1975
|
if wrong_type_str_columns:
|
|
1910
1976
|
wrong_type_col_errors.append(
|
|
1911
|
-
|
|
1977
|
+
InvalidTypeColumns(
|
|
1912
1978
|
wrong_type_str_columns, expected_types=["str"]
|
|
1913
1979
|
)
|
|
1914
1980
|
)
|
|
@@ -1918,7 +1984,7 @@ class Validator:
|
|
|
1918
1984
|
@staticmethod
|
|
1919
1985
|
def _check_type_llm_prompt_templates(
|
|
1920
1986
|
schema: Schema, column_types: dict[str, Any]
|
|
1921
|
-
) -> list[
|
|
1987
|
+
) -> list[InvalidTypeColumns]:
|
|
1922
1988
|
if schema.prompt_template_column_names is None:
|
|
1923
1989
|
return []
|
|
1924
1990
|
|
|
@@ -1949,7 +2015,7 @@ class Validator:
|
|
|
1949
2015
|
# Return errors if any
|
|
1950
2016
|
if wrong_type_cols:
|
|
1951
2017
|
return [
|
|
1952
|
-
|
|
2018
|
+
InvalidTypeColumns(
|
|
1953
2019
|
wrong_type_columns=wrong_type_cols,
|
|
1954
2020
|
expected_types=["string"],
|
|
1955
2021
|
)
|
|
@@ -1959,7 +2025,7 @@ class Validator:
|
|
|
1959
2025
|
@staticmethod
|
|
1960
2026
|
def _check_type_llm_config(
|
|
1961
2027
|
schema: Schema, column_types: dict[str, Any]
|
|
1962
|
-
) -> list[
|
|
2028
|
+
) -> list[InvalidTypeColumns]:
|
|
1963
2029
|
if schema.llm_config_column_names is None:
|
|
1964
2030
|
return []
|
|
1965
2031
|
|
|
@@ -1986,7 +2052,7 @@ class Validator:
|
|
|
1986
2052
|
# Return errors if any
|
|
1987
2053
|
if wrong_type_cols:
|
|
1988
2054
|
return [
|
|
1989
|
-
|
|
2055
|
+
InvalidTypeColumns(
|
|
1990
2056
|
wrong_type_columns=wrong_type_cols,
|
|
1991
2057
|
expected_types=["string"],
|
|
1992
2058
|
)
|
|
@@ -1996,7 +2062,7 @@ class Validator:
|
|
|
1996
2062
|
@staticmethod
|
|
1997
2063
|
def _check_type_llm_run_metadata(
|
|
1998
2064
|
schema: Schema, column_types: dict[str, Any]
|
|
1999
|
-
) -> list[
|
|
2065
|
+
) -> list[InvalidTypeColumns]:
|
|
2000
2066
|
if schema.llm_run_metadata_column_names is None:
|
|
2001
2067
|
return []
|
|
2002
2068
|
|
|
@@ -2011,10 +2077,8 @@ class Validator:
|
|
|
2011
2077
|
)
|
|
2012
2078
|
wrong_type_cols = []
|
|
2013
2079
|
if schema.tag_column_names:
|
|
2014
|
-
|
|
2015
|
-
|
|
2016
|
-
in schema.tag_column_names
|
|
2017
|
-
) and (
|
|
2080
|
+
tag_cols = _normalize_column_names(schema.tag_column_names)
|
|
2081
|
+
if (LLM_RUN_METADATA_TOTAL_TOKEN_COUNT_TAG_NAME in tag_cols) and (
|
|
2018
2082
|
LLM_RUN_METADATA_TOTAL_TOKEN_COUNT_TAG_NAME in column_types
|
|
2019
2083
|
and column_types[LLM_RUN_METADATA_TOTAL_TOKEN_COUNT_TAG_NAME]
|
|
2020
2084
|
not in allowed_datatypes
|
|
@@ -2022,10 +2086,7 @@ class Validator:
|
|
|
2022
2086
|
wrong_type_cols.append(
|
|
2023
2087
|
schema.llm_run_metadata_column_names.total_token_count_column_name
|
|
2024
2088
|
)
|
|
2025
|
-
if (
|
|
2026
|
-
LLM_RUN_METADATA_PROMPT_TOKEN_COUNT_TAG_NAME
|
|
2027
|
-
in schema.tag_column_names
|
|
2028
|
-
) and (
|
|
2089
|
+
if (LLM_RUN_METADATA_PROMPT_TOKEN_COUNT_TAG_NAME in tag_cols) and (
|
|
2029
2090
|
LLM_RUN_METADATA_PROMPT_TOKEN_COUNT_TAG_NAME in column_types
|
|
2030
2091
|
and column_types[LLM_RUN_METADATA_PROMPT_TOKEN_COUNT_TAG_NAME]
|
|
2031
2092
|
not in allowed_datatypes
|
|
@@ -2034,8 +2095,7 @@ class Validator:
|
|
|
2034
2095
|
schema.llm_run_metadata_column_names.prompt_token_count_column_name
|
|
2035
2096
|
)
|
|
2036
2097
|
if (
|
|
2037
|
-
LLM_RUN_METADATA_RESPONSE_TOKEN_COUNT_TAG_NAME
|
|
2038
|
-
in schema.tag_column_names
|
|
2098
|
+
LLM_RUN_METADATA_RESPONSE_TOKEN_COUNT_TAG_NAME in tag_cols
|
|
2039
2099
|
) and (
|
|
2040
2100
|
LLM_RUN_METADATA_RESPONSE_TOKEN_COUNT_TAG_NAME in column_types
|
|
2041
2101
|
and column_types[LLM_RUN_METADATA_RESPONSE_TOKEN_COUNT_TAG_NAME]
|
|
@@ -2044,10 +2104,7 @@ class Validator:
|
|
|
2044
2104
|
wrong_type_cols.append(
|
|
2045
2105
|
schema.llm_run_metadata_column_names.response_token_count_column_name
|
|
2046
2106
|
)
|
|
2047
|
-
if (
|
|
2048
|
-
LLM_RUN_METADATA_RESPONSE_LATENCY_MS_TAG_NAME
|
|
2049
|
-
in schema.tag_column_names
|
|
2050
|
-
) and (
|
|
2107
|
+
if (LLM_RUN_METADATA_RESPONSE_LATENCY_MS_TAG_NAME in tag_cols) and (
|
|
2051
2108
|
LLM_RUN_METADATA_RESPONSE_LATENCY_MS_TAG_NAME in column_types
|
|
2052
2109
|
and column_types[LLM_RUN_METADATA_RESPONSE_LATENCY_MS_TAG_NAME]
|
|
2053
2110
|
not in allowed_datatypes
|
|
@@ -2059,8 +2116,8 @@ class Validator:
|
|
|
2059
2116
|
# Return errors if there are any
|
|
2060
2117
|
if wrong_type_cols:
|
|
2061
2118
|
return [
|
|
2062
|
-
|
|
2063
|
-
wrong_type_columns=wrong_type_cols,
|
|
2119
|
+
InvalidTypeColumns(
|
|
2120
|
+
wrong_type_columns=wrong_type_cols, # type: ignore[arg-type]
|
|
2064
2121
|
expected_types=["int", "float"],
|
|
2065
2122
|
)
|
|
2066
2123
|
]
|
|
@@ -2069,7 +2126,7 @@ class Validator:
|
|
|
2069
2126
|
@staticmethod
|
|
2070
2127
|
def _check_type_retrieved_document_ids(
|
|
2071
2128
|
schema: Schema, column_types: dict[str, Any]
|
|
2072
|
-
) -> list[
|
|
2129
|
+
) -> list[InvalidType]:
|
|
2073
2130
|
col = schema.retrieved_document_ids_column_name
|
|
2074
2131
|
if col in column_types:
|
|
2075
2132
|
# should mirror server side
|
|
@@ -2079,7 +2136,7 @@ class Validator:
|
|
|
2079
2136
|
)
|
|
2080
2137
|
if column_types[col] not in allowed_datatypes:
|
|
2081
2138
|
return [
|
|
2082
|
-
|
|
2139
|
+
InvalidType(
|
|
2083
2140
|
"Retrieved Document IDs",
|
|
2084
2141
|
expected_types=["List[str]"],
|
|
2085
2142
|
found_data_type=column_types[col],
|
|
@@ -2090,7 +2147,7 @@ class Validator:
|
|
|
2090
2147
|
@staticmethod
|
|
2091
2148
|
def _check_type_image_segment_coordinates(
|
|
2092
2149
|
schema: Schema, column_types: dict[str, Any]
|
|
2093
|
-
) -> list[
|
|
2150
|
+
) -> list[InvalidTypeColumns]:
|
|
2094
2151
|
# should mirror server side
|
|
2095
2152
|
allowed_coordinate_types = (
|
|
2096
2153
|
pa.list_(pa.list_(pa.float64())),
|
|
@@ -2173,7 +2230,7 @@ class Validator:
|
|
|
2173
2230
|
|
|
2174
2231
|
return (
|
|
2175
2232
|
[
|
|
2176
|
-
|
|
2233
|
+
InvalidTypeColumns(
|
|
2177
2234
|
wrong_type_columns=wrong_type_cols,
|
|
2178
2235
|
expected_types=["List[List[float]]"],
|
|
2179
2236
|
)
|
|
@@ -2185,7 +2242,7 @@ class Validator:
|
|
|
2185
2242
|
@staticmethod
|
|
2186
2243
|
def _check_type_image_segment_categories(
|
|
2187
2244
|
schema: Schema, column_types: dict[str, Any]
|
|
2188
|
-
) -> list[
|
|
2245
|
+
) -> list[InvalidTypeColumns]:
|
|
2189
2246
|
# should mirror server side
|
|
2190
2247
|
allowed_category_datatypes = (
|
|
2191
2248
|
pa.list_(pa.string()),
|
|
@@ -2242,7 +2299,7 @@ class Validator:
|
|
|
2242
2299
|
|
|
2243
2300
|
return (
|
|
2244
2301
|
[
|
|
2245
|
-
|
|
2302
|
+
InvalidTypeColumns(
|
|
2246
2303
|
wrong_type_columns=wrong_type_cols,
|
|
2247
2304
|
expected_types=["List[str]"],
|
|
2248
2305
|
)
|
|
@@ -2254,7 +2311,7 @@ class Validator:
|
|
|
2254
2311
|
@staticmethod
|
|
2255
2312
|
def _check_type_image_segment_scores(
|
|
2256
2313
|
schema: Schema, column_types: dict[str, Any]
|
|
2257
|
-
) -> list[
|
|
2314
|
+
) -> list[InvalidTypeColumns]:
|
|
2258
2315
|
# should mirror server side
|
|
2259
2316
|
allowed_score_datatypes = (
|
|
2260
2317
|
pa.list_(pa.float64()),
|
|
@@ -2297,7 +2354,7 @@ class Validator:
|
|
|
2297
2354
|
|
|
2298
2355
|
return (
|
|
2299
2356
|
[
|
|
2300
|
-
|
|
2357
|
+
InvalidTypeColumns(
|
|
2301
2358
|
wrong_type_columns=wrong_type_cols,
|
|
2302
2359
|
expected_types=["List[float]"],
|
|
2303
2360
|
)
|
|
@@ -2313,7 +2370,7 @@ class Validator:
|
|
|
2313
2370
|
@staticmethod
|
|
2314
2371
|
def _check_embedding_vectors_dimensionality(
|
|
2315
2372
|
dataframe: pd.DataFrame, schema: Schema
|
|
2316
|
-
) -> list[
|
|
2373
|
+
) -> list[ValidationError]:
|
|
2317
2374
|
if schema.embedding_feature_column_names is None:
|
|
2318
2375
|
return []
|
|
2319
2376
|
|
|
@@ -2331,7 +2388,7 @@ class Validator:
|
|
|
2331
2388
|
|
|
2332
2389
|
return (
|
|
2333
2390
|
[
|
|
2334
|
-
|
|
2391
|
+
InvalidValueEmbeddingVectorDimensionality(
|
|
2335
2392
|
invalid_low_dim_vector_cols,
|
|
2336
2393
|
invalid_high_dim_vector_cols,
|
|
2337
2394
|
),
|
|
@@ -2343,7 +2400,7 @@ class Validator:
|
|
|
2343
2400
|
@staticmethod
|
|
2344
2401
|
def _check_embedding_raw_data_characters(
|
|
2345
2402
|
dataframe: pd.DataFrame, schema: Schema
|
|
2346
|
-
) -> list[
|
|
2403
|
+
) -> list[ValidationError]:
|
|
2347
2404
|
if schema.embedding_feature_column_names is None:
|
|
2348
2405
|
return []
|
|
2349
2406
|
|
|
@@ -2361,7 +2418,7 @@ class Validator:
|
|
|
2361
2418
|
|
|
2362
2419
|
if invalid_long_string_data_cols:
|
|
2363
2420
|
return [
|
|
2364
|
-
|
|
2421
|
+
InvalidValueEmbeddingRawDataTooLong(
|
|
2365
2422
|
invalid_long_string_data_cols
|
|
2366
2423
|
)
|
|
2367
2424
|
]
|
|
@@ -2377,20 +2434,20 @@ class Validator:
|
|
|
2377
2434
|
@staticmethod
|
|
2378
2435
|
def _check_value_rank(
|
|
2379
2436
|
dataframe: pd.DataFrame, schema: Schema
|
|
2380
|
-
) -> list[
|
|
2437
|
+
) -> list[InvalidRankValue]:
|
|
2381
2438
|
col = schema.rank_column_name
|
|
2382
2439
|
lbound, ubound = (1, 100)
|
|
2383
2440
|
|
|
2384
2441
|
if col is not None and col in dataframe.columns:
|
|
2385
2442
|
rank_min_max = dataframe[col].agg(["min", "max"])
|
|
2386
2443
|
if rank_min_max["min"] < lbound or rank_min_max["max"] > ubound:
|
|
2387
|
-
return [
|
|
2444
|
+
return [InvalidRankValue(col, "1-100")]
|
|
2388
2445
|
return []
|
|
2389
2446
|
|
|
2390
2447
|
@staticmethod
|
|
2391
2448
|
def _check_id_field_str_length(
|
|
2392
2449
|
dataframe: pd.DataFrame, schema_name: str, id_col_name: str | None
|
|
2393
|
-
) -> list[
|
|
2450
|
+
) -> list[ValidationError]:
|
|
2394
2451
|
"""Require prediction_id to be a string of length between MIN and MAX.
|
|
2395
2452
|
|
|
2396
2453
|
Between MIN_PREDICTION_ID_LEN and MAX_PREDICTION_ID_LEN.
|
|
@@ -2412,7 +2469,7 @@ class Validator:
|
|
|
2412
2469
|
.all()
|
|
2413
2470
|
):
|
|
2414
2471
|
return [
|
|
2415
|
-
|
|
2472
|
+
InvalidStringLengthInColumn(
|
|
2416
2473
|
schema_name=schema_name,
|
|
2417
2474
|
col_name=id_col_name,
|
|
2418
2475
|
min_length=MIN_PREDICTION_ID_LEN,
|
|
@@ -2424,7 +2481,7 @@ class Validator:
|
|
|
2424
2481
|
@staticmethod
|
|
2425
2482
|
def _check_document_id_field_str_length(
|
|
2426
2483
|
dataframe: pd.DataFrame, schema_name: str, id_col_name: str | None
|
|
2427
|
-
) -> list[
|
|
2484
|
+
) -> list[ValidationError]:
|
|
2428
2485
|
"""Require document id to be a string of length between MIN and MAX.
|
|
2429
2486
|
|
|
2430
2487
|
Between MIN_DOCUMENT_ID_LEN and MAX_DOCUMENT_ID_LEN.
|
|
@@ -2446,7 +2503,7 @@ class Validator:
|
|
|
2446
2503
|
.all()
|
|
2447
2504
|
):
|
|
2448
2505
|
return [
|
|
2449
|
-
|
|
2506
|
+
InvalidStringLengthInColumn(
|
|
2450
2507
|
schema_name=schema_name,
|
|
2451
2508
|
col_name=id_col_name,
|
|
2452
2509
|
min_length=MIN_DOCUMENT_ID_LEN,
|
|
@@ -2465,7 +2522,7 @@ class Validator:
|
|
|
2465
2522
|
and len(dataframe)
|
|
2466
2523
|
):
|
|
2467
2524
|
return True
|
|
2468
|
-
return (
|
|
2525
|
+
return bool(
|
|
2469
2526
|
dataframe[col_name]
|
|
2470
2527
|
.astype(str)
|
|
2471
2528
|
.str.len()
|
|
@@ -2476,21 +2533,21 @@ class Validator:
|
|
|
2476
2533
|
@staticmethod
|
|
2477
2534
|
def _check_value_tag(
|
|
2478
2535
|
dataframe: pd.DataFrame, schema: Schema
|
|
2479
|
-
) -> list[
|
|
2536
|
+
) -> list[InvalidTagLength]:
|
|
2480
2537
|
if schema.tag_column_names is None:
|
|
2481
2538
|
return []
|
|
2482
2539
|
|
|
2483
2540
|
wrong_tag_cols = []
|
|
2484
2541
|
truncated_tag_cols = []
|
|
2485
|
-
for col in schema.tag_column_names:
|
|
2542
|
+
for col in _normalize_column_names(schema.tag_column_names):
|
|
2486
2543
|
# This is to be defensive, validate_params should guarantee that this column is in
|
|
2487
2544
|
# the dataframe, via _check_missing_columns, and return an error before reaching this
|
|
2488
2545
|
# block if not
|
|
2489
2546
|
# Checks max tag length when any values in a column are strings
|
|
2490
2547
|
if (
|
|
2491
2548
|
col in dataframe.columns
|
|
2492
|
-
and dataframe[col].map(type).eq(str).any()
|
|
2493
|
-
):
|
|
2549
|
+
and dataframe[col].map(type).eq(str).any() # type: ignore[arg-type]
|
|
2550
|
+
):
|
|
2494
2551
|
max_tag_len = (
|
|
2495
2552
|
dataframe[col]
|
|
2496
2553
|
.apply(_check_value_string_length_helper)
|
|
@@ -2501,7 +2558,7 @@ class Validator:
|
|
|
2501
2558
|
elif max_tag_len > MAX_TAG_LENGTH_TRUNCATION:
|
|
2502
2559
|
truncated_tag_cols.append(col)
|
|
2503
2560
|
if wrong_tag_cols:
|
|
2504
|
-
return [
|
|
2561
|
+
return [InvalidTagLength(wrong_tag_cols)]
|
|
2505
2562
|
if truncated_tag_cols:
|
|
2506
2563
|
logger.warning(
|
|
2507
2564
|
get_truncation_warning_message(
|
|
@@ -2513,7 +2570,8 @@ class Validator:
     @staticmethod
     def _check_value_ranking_category(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[err.InvalidValueMissingValue | err.InvalidRankingCategoryValue]:
+    ) -> list[InvalidValueMissingValue | InvalidRankingCategoryValue]:
+        col: str | None
         if schema.relevance_labels_column_name is not None:
             col = schema.relevance_labels_column_name
         elif schema.attributions_column_name is not None:
@@ -2521,16 +2579,16 @@ class Validator:
         else:
             col = schema.actual_label_column_name
         if col is not None and col in dataframe.columns:
-            if dataframe[col].isnull().values.any():
+            if dataframe[col].isnull().any():
                 # do not attach duplicated missing value error
                 # which would be caught by _check_value_missing
                 return []
             if dataframe[col].astype(str).str.len().min() == 0:
-                return [err.InvalidRankingCategoryValue(col)]
+                return [InvalidRankingCategoryValue(col)]
             # empty list
             not_null_filter = dataframe[col].notnull()
             if dataframe[not_null_filter][col].map(len).min() == 0:
-                return [err.InvalidValueMissingValue(col, "empty list")]
+                return [InvalidValueMissingValue(col, "empty list")]
             # no empty string in list
             if (
                 dataframe[not_null_filter][col]
@@ -2538,13 +2596,13 @@ class Validator:
                 .min()
                 == 0
             ):
-                return [err.InvalidRankingCategoryValue(col)]
+                return [InvalidRankingCategoryValue(col)]
         return []

     @staticmethod
     def _check_length_multi_class_maps(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[err.InvalidNumClassesMultiClassMap]:
+    ) -> list[InvalidNumClassesMultiClassMap]:
         # each entry in column is a list of dictionaries mapping class names and scores
         # validate length of list of dictionaries for each column
         invalid_cols = {}
@@ -2575,16 +2633,16 @@ class Validator:
                 if invalid_num_classes:
                     invalid_cols[col] = invalid_num_classes
         if invalid_cols:
-            return [err.InvalidNumClassesMultiClassMap(invalid_cols)]
+            return [InvalidNumClassesMultiClassMap(invalid_cols)]
         return []

     @staticmethod
     def _check_classes_and_scores_values_in_multi_class_maps(
         dataframe: pd.DataFrame, schema: Schema
     ) -> list[
-        err.InvalidMultiClassClassNameLength
-        | err.InvalidMultiClassActScoreValue
-        | err.InvalidMultiClassPredScoreValue
+        InvalidMultiClassClassNameLength
+        | InvalidMultiClassActScoreValue
+        | InvalidMultiClassPredScoreValue
     ]:
         """Validate the class names and score values of dictionaries.

@@ -2601,7 +2659,11 @@ class Validator:
         invalid_pred_scores = {}
         lbound, ubound = (0, 1)
         invalid_actual_scores = False
-        errors = []
+        errors: list[
+            InvalidMultiClassClassNameLength
+            | InvalidMultiClassActScoreValue
+            | InvalidMultiClassPredScoreValue
+        ] = []
         for col in cols:
             if (
                 col is None
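Annotating the accumulator with the exact union it may hold is what lets the type checker verify each `append` later in the method. A compact way to express the same contract is a type alias; a sketch using hypothetical stand-ins for the error classes named in the diff:

# Hypothetical stand-ins, not the package's definitions.
class InvalidMultiClassClassNameLength(Exception): ...
class InvalidMultiClassActScoreValue(Exception): ...
class InvalidMultiClassPredScoreValue(Exception): ...

MultiClassError = (
    InvalidMultiClassClassNameLength
    | InvalidMultiClassActScoreValue
    | InvalidMultiClassPredScoreValue
)

errors: list[MultiClassError] = []  # same contract as the inline annotation
errors.append(InvalidMultiClassClassNameLength("too long"))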
@@ -2649,21 +2711,17 @@ class Validator:
             if invalid_scores_for_col:
                 invalid_pred_scores[col] = invalid_scores_for_col
         if invalid_class_names:
-            errors.append(
-                err.InvalidMultiClassClassNameLength(invalid_class_names)
-            )
+            errors.append(InvalidMultiClassClassNameLength(invalid_class_names))
         if invalid_pred_scores:
-            errors.append(
-                err.InvalidMultiClassPredScoreValue(invalid_pred_scores)
-            )
+            errors.append(InvalidMultiClassPredScoreValue(invalid_pred_scores))  # type: ignore[arg-type]
         if invalid_actual_scores:
-            errors.append(err.InvalidMultiClassActScoreValue(col))
+            errors.append(InvalidMultiClassActScoreValue(col))  # type: ignore[arg-type, arg-type]
         return errors

     @staticmethod
     def _check_each_multi_class_pred_has_threshold(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[err.InvalidMultiClassThresholdClasses]:
+    ) -> list[InvalidMultiClassThresholdClasses]:
         """Validate threshold scores for Multi Class models.

         If threshold scores column is included in schema and dataframe, validate that
@@ -2687,7 +2745,7 @@ class Validator:
             pred_class_set = set(pred_classes)
             if pred_class_set != thresh_class_set:
                 return [
-                    err.InvalidMultiClassThresholdClasses(
+                    InvalidMultiClassThresholdClasses(
                         threshold_col, pred_class_set, thresh_class_set
                     )
                 ]
@@ -2697,7 +2755,7 @@ class Validator:
     def _check_value_timestamp(
         dataframe: pd.DataFrame,
         schema: Schema,
-    ) -> list[err.InvalidValueMissingValue | err.InvalidValueTimestamp]:
+    ) -> list[InvalidValueMissingValue | InvalidValueTimestamp]:
         # Due to the timing difference between checking this here and the data finally
         # hitting the same check on server side, there's a some chance for a false
         # result, i.e. the check here succeeds but the same check on server side fails.
@@ -2706,11 +2764,9 @@ class Validator:
             # When a timestamp column has Date and NaN, pyarrow will be fine, but
             # pandas min/max will fail due to type incompatibility. So we check for
             # missing value first.
-            if dataframe[col].isnull().values.any():
+            if dataframe[col].isnull().any():
                 return [
-                    err.InvalidValueMissingValue(
-                        "Prediction timestamp", "missing"
-                    )
+                    InvalidValueMissingValue("Prediction timestamp", "missing")
                 ]

             now_t = datetime.now(tz=timezone.utc)
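The ordering here is the point: checking for nulls before taking `min`/`max` avoids the pandas failure mode the comment describes, where reducing a column that mixes dates and missing values can raise a type error. A standalone illustration of the guarded path, not the package's code:

import datetime
import pandas as pd

col = pd.Series([datetime.date(2024, 1, 1), None])
if col.isnull().any():
    print("report missing values first")  # the safe path the validator takes
else:
    print(col.min(), col.max())           # only sound once nulls are ruled out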
@@ -2794,7 +2850,7 @@ class Validator:
                     )
                 )
             ):
-                return [err.InvalidValueTimestamp(timestamp_col_name=col)]
+                return [InvalidValueTimestamp(timestamp_col_name=col)]

         return []

@@ -2803,9 +2859,9 @@ class Validator:
     @staticmethod
     def _check_invalid_missing_values(
         dataframe: pd.DataFrame, schema: BaseSchema, model_type: ModelTypes
-    ) -> list[err.InvalidValueMissingValue]:
+    ) -> list[InvalidValueMissingValue]:
         errors = []
-        columns = ()
+        columns: tuple[tuple[str, str | None], ...] = ()
         if isinstance(schema, CorpusSchema):
             columns = (("Document ID", schema.document_id_column_name),)
         elif isinstance(schema, Schema):
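A bare `columns = ()` gives the checker nothing to widen from, so the branch assignments of `(name, column)` pairs that follow can be flagged; pinning the element type at the empty initializer resolves that. A minimal sketch with a hypothetical column name:

columns: tuple[tuple[str, str | None], ...] = ()

is_corpus = True  # stand-in for the isinstance(schema, CorpusSchema) branch
if is_corpus:
    columns = (("Document ID", "document_id"),)  # hypothetical column name
for name, col in columns:
    print(name, col)  # both elements are typed, no checker complaint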
@@ -2824,7 +2880,7 @@ class Validator:
             if col is not None and col in dataframe.columns:
                 if dataframe[col].isnull().any():
                     errors.append(
-                        err.InvalidValueMissingValue(
+                        InvalidValueMissingValue(
                             name, wrong_values="missing", column=col
                         )
                     )
@@ -2834,7 +2890,7 @@ class Validator:
                     and np.isinf(dataframe[col]).any()
                 ):
                     errors.append(
-                        err.InvalidValueMissingValue(
+                        InvalidValueMissingValue(
                             name, wrong_values="infinite", column=col
                         )
                     )
@@ -2850,7 +2906,7 @@ class Validator:
         environment: Environments,
         schema: Schema,
         model_type: ModelTypes,
-    ) -> list[err.InvalidRecord]:
+    ) -> list[InvalidRecord]:
         if environment in (Environments.VALIDATION, Environments.TRAINING):
             return []

@@ -2894,7 +2950,7 @@ class Validator:
         environment: Environments,
         schema: Schema,
         model_type: ModelTypes,
-    ) -> list[err.InvalidRecord]:
+    ) -> list[InvalidRecord]:
         """Validates there's not a single row in the dataframe with all nulls.

         Returns errors if any row has all of pred_label and pred_score evaluating to
@@ -2942,7 +2998,7 @@ class Validator:
     @staticmethod
     def _check_invalid_record_helper(
         dataframe: pd.DataFrame, column_names: list[str | None]
-    ) -> list[err.InvalidRecord]:
+    ) -> list[InvalidRecord]:
         """Check that there are no null values in a subset of columns.

         The column subset is computed from the input list of columns `column_names`
@@ -2950,7 +3006,7 @@ class Validator:
         null values are found.

         Returns:
-            List[err.InvalidRecord]: An error expressing the rows that are problematic
+            List[InvalidRecord]: An error expressing the rows that are problematic

         """
         columns_subset = [
@@ -2964,12 +3020,12 @@ class Validator:
         null_index = null_filter[null_filter].index.values
         if len(null_index) == 0:
             return []
-        return [err.InvalidRecord(columns_subset, null_index)]
+        return [InvalidRecord(columns_subset, null_index)]  # type: ignore[arg-type]

     @staticmethod
     def _check_type_prediction_group_id(
         schema: Schema, column_types: dict[str, Any]
-    ) -> list[err.InvalidType]:
+    ) -> list[InvalidType]:
         col = schema.prediction_group_id_column_name
         if col in column_types:
             # should mirror server side
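The record helper's mask-then-index pattern is worth seeing in isolation: build a boolean filter of rows where every column in the subset is null, then surface the offending row indices. A standalone sketch with made-up column names:

import pandas as pd

df = pd.DataFrame({"pred_label": [None, "a"], "pred_score": [None, 0.9]})
null_filter = df[["pred_label", "pred_score"]].isnull().all(axis=1)
null_index = null_filter[null_filter].index.values
print(null_index)  # [0]: row 0 has every prediction field null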
@@ -2982,7 +3038,7 @@ class Validator:
             )
             if column_types[col] not in allowed_datatypes:
                 return [
-                    err.InvalidType(
+                    InvalidType(
                         "prediction_group_ids",
                         expected_types=["str", "int"],
                         found_data_type=column_types[col],
@@ -2993,7 +3049,7 @@ class Validator:
     @staticmethod
     def _check_type_rank(
         schema: Schema, column_types: dict[str, Any]
-    ) -> list[err.InvalidType]:
+    ) -> list[InvalidType]:
         col = schema.rank_column_name
         if col in column_types:
             allowed_datatypes = (
@@ -3004,7 +3060,7 @@ class Validator:
             )
             if column_types[col] not in allowed_datatypes:
                 return [
-                    err.InvalidType(
+                    InvalidType(
                         "rank",
                         expected_types=["int"],
                         found_data_type=column_types[col],
@@ -3015,7 +3071,8 @@ class Validator:
     @staticmethod
     def _check_type_ranking_category(
         schema: Schema, column_types: dict[str, Any]
-    ) -> list[err.InvalidType]:
+    ) -> list[InvalidType]:
+        col: str | None
         if schema.relevance_labels_column_name is not None:
             col = schema.relevance_labels_column_name
         elif schema.attributions_column_name is not None:
@@ -3026,7 +3083,7 @@ class Validator:
         allowed_datatypes = (pa.list_(pa.string()), pa.string(), pa.null())
         if column_types[col] not in allowed_datatypes:
             return [
-                err.InvalidType(
+                InvalidType(
                     "relevance labels column for ranking models",
                     expected_types=["list of string", "string"],
                     found_data_type=column_types[col],
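These type checks compare a column's inferred Arrow type against an allow-list; what makes the tuple membership test work is that pyarrow types compare equal structurally, so a freshly constructed `pa.list_(pa.string())` matches the type inferred from data. A standalone sketch:

import pyarrow as pa

allowed = (pa.list_(pa.string()), pa.string(), pa.null())
col_type = pa.array([["relevant", "not relevant"]]).type  # list<item: string>
print(col_type in allowed)  # True: Arrow types compare by structure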
@@ -3037,7 +3094,7 @@ class Validator:
     @staticmethod
     def _check_value_bounding_boxes_coordinates(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[err.InvalidBoundingBoxesCoordinates]:
+    ) -> list[InvalidBoundingBoxesCoordinates]:
         errors = []
         if schema.object_detection_prediction_column_names is not None:
             coords_col_name = schema.object_detection_prediction_column_names.bounding_boxes_coordinates_column_name  # noqa: E501
@@ -3058,7 +3115,7 @@ class Validator:
     @staticmethod
     def _check_value_bounding_boxes_categories(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[err.InvalidBoundingBoxesCategories]:
+    ) -> list[InvalidBoundingBoxesCategories]:
         errors = []
         if schema.object_detection_prediction_column_names is not None:
             cat_col_name = schema.object_detection_prediction_column_names.categories_column_name
@@ -3079,7 +3136,7 @@ class Validator:
     @staticmethod
     def _check_value_bounding_boxes_scores(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[err.InvalidBoundingBoxesScores]:
+    ) -> list[InvalidBoundingBoxesScores]:
         errors = []
         if schema.object_detection_prediction_column_names is not None:
             sc_col_name = schema.object_detection_prediction_column_names.scores_column_name
@@ -3104,7 +3161,7 @@ class Validator:
     @staticmethod
     def _check_value_semantic_segmentation_polygon_coordinates(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[err.InvalidPolygonCoordinates]:
+    ) -> list[InvalidPolygonCoordinates]:
         errors = []
         if schema.semantic_segmentation_prediction_column_names is not None:
             coords_col_name = schema.semantic_segmentation_prediction_column_names.polygon_coordinates_column_name  # noqa: E501
@@ -3125,7 +3182,7 @@ class Validator:
     @staticmethod
     def _check_value_semantic_segmentation_polygon_categories(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[err.InvalidPolygonCategories]:
+    ) -> list[InvalidPolygonCategories]:
         errors = []
         if schema.semantic_segmentation_prediction_column_names is not None:
             cat_col_name = schema.semantic_segmentation_prediction_column_names.categories_column_name
@@ -3146,7 +3203,7 @@ class Validator:
     @staticmethod
     def _check_value_instance_segmentation_polygon_coordinates(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[err.InvalidPolygonCoordinates]:
+    ) -> list[InvalidPolygonCoordinates]:
         errors = []
         if schema.instance_segmentation_prediction_column_names is not None:
             coords_col_name = schema.instance_segmentation_prediction_column_names.polygon_coordinates_column_name  # noqa: E501
@@ -3167,7 +3224,7 @@ class Validator:
     @staticmethod
     def _check_value_instance_segmentation_polygon_categories(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[err.InvalidPolygonCategories]:
+    ) -> list[InvalidPolygonCategories]:
         errors = []
         if schema.instance_segmentation_prediction_column_names is not None:
             cat_col_name = schema.instance_segmentation_prediction_column_names.categories_column_name
@@ -3188,7 +3245,7 @@ class Validator:
     @staticmethod
     def _check_value_instance_segmentation_polygon_scores(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[err.InvalidPolygonScores]:
+    ) -> list[InvalidPolygonScores]:
         errors = []
         if schema.instance_segmentation_prediction_column_names is not None:
             sc_col_name = schema.instance_segmentation_prediction_column_names.scores_column_name
@@ -3203,7 +3260,7 @@ class Validator:
     @staticmethod
     def _check_value_instance_segmentation_bbox_coordinates(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[err.InvalidBoundingBoxesCoordinates]:
+    ) -> list[InvalidBoundingBoxesCoordinates]:
         errors = []
         if schema.instance_segmentation_prediction_column_names is not None:
             coords_col_name = schema.instance_segmentation_prediction_column_names.bounding_boxes_coordinates_column_name  # noqa: E501
@@ -3226,7 +3283,7 @@ class Validator:
     @staticmethod
     def _check_value_prompt_response(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[err.ValidationError]:
+    ) -> list[ValidationError]:
         vector_cols_to_check = []
         text_cols_to_check = []
         if isinstance(schema.prompt_column_names, str):
@@ -3262,16 +3319,16 @@ class Validator:
             dataframe, vector_cols_to_check
         )

-        errors = []
+        errors: list[ValidationError] = []
         if invalid_long_string_data_cols:
             errors.append(
-                err.InvalidValueEmbeddingRawDataTooLong(
+                InvalidValueEmbeddingRawDataTooLong(
                     invalid_long_string_data_cols
                 )
             )
         if invalid_low_dim_vector_cols or invalid_high_dim_vector_cols:
             errors.append(
-                err.InvalidValueEmbeddingVectorDimensionality(
+                InvalidValueEmbeddingVectorDimensionality(  # type: ignore[arg-type]
                     invalid_low_dim_vector_cols,
                     invalid_high_dim_vector_cols,
                 )
@@ -3291,7 +3348,7 @@ class Validator:
     @staticmethod
     def _check_value_llm_model_name(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[err.InvalidStringLengthInColumn]:
+    ) -> list[InvalidStringLengthInColumn]:
         if schema.llm_config_column_names is None:
             return []
         col = schema.llm_config_column_names.model_column_name
@@ -3301,7 +3358,7 @@ class Validator:
         )
         if max_len > MAX_LLM_MODEL_NAME_LENGTH:
             return [
-                err.InvalidStringLengthInColumn(
+                InvalidStringLengthInColumn(
                     schema_name="llm_config_column_names.model_column_name",
                     col_name=col,
                     min_length=0,
@@ -3319,7 +3376,7 @@ class Validator:
     @staticmethod
     def _check_value_llm_prompt_template(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[err.InvalidStringLengthInColumn]:
+    ) -> list[InvalidStringLengthInColumn]:
         if schema.prompt_template_column_names is None:
             return []
         col = schema.prompt_template_column_names.template_column_name
@@ -3329,7 +3386,7 @@ class Validator:
         )
         if max_len > MAX_PROMPT_TEMPLATE_LENGTH:
             return [
-                err.InvalidStringLengthInColumn(
+                InvalidStringLengthInColumn(
                     schema_name="prompt_template_column_names.template_column_name",
                     col_name=col,
                     min_length=0,
@@ -3348,7 +3405,7 @@ class Validator:
     @staticmethod
     def _check_value_llm_prompt_template_version(
         dataframe: pd.DataFrame, schema: Schema
-    ) -> list[err.InvalidStringLengthInColumn]:
+    ) -> list[InvalidStringLengthInColumn]:
         if schema.prompt_template_column_names is None:
             return []
         col = schema.prompt_template_column_names.template_version_column_name
@@ -3358,7 +3415,7 @@ class Validator:
         )
         if max_len > MAX_PROMPT_TEMPLATE_VERSION_LENGTH:
             return [
-                err.InvalidStringLengthInColumn(
+                InvalidStringLengthInColumn(
                     schema_name="prompt_template_column_names.template_version_column_name",
                     col_name=col,
                     min_length=0,
@@ -3377,8 +3434,9 @@ class Validator:
     @staticmethod
     def _check_type_document_columns(
         schema: CorpusSchema, column_types: dict[str, Any]
-    ) -> list[err.InvalidTypeColumns]:
+    ) -> list[InvalidTypeColumns]:
         invalid_types = []
+        allowed_datatypes: tuple[Any, ...]
         # Check document id
         col = schema.document_id_column_name
         if col in column_types:
@@ -3391,7 +3449,7 @@ class Validator:
             )
             if column_types[col] not in allowed_datatypes:
                 invalid_types += [
-                    err.InvalidTypeColumns(
+                    InvalidTypeColumns(
                         wrong_type_columns=[col],
                         expected_types=["str", "int"],
                     )
@@ -3403,7 +3461,7 @@ class Validator:
             allowed_datatype = pa.string()
             if column_types[col] != allowed_datatype:
                 invalid_types += [
-                    err.InvalidTypeColumns(
+                    InvalidTypeColumns(
                         wrong_type_columns=[col],
                         expected_types=["str"],
                     )
@@ -3421,7 +3479,7 @@ class Validator:
             )
             if column_types[col] not in allowed_datatypes:
                 invalid_types += [
-                    err.InvalidTypeColumns(
+                    InvalidTypeColumns(
                         wrong_type_columns=[col],
                         expected_types=["list[float], np.array[float]"],
                     )
@@ -3436,7 +3494,7 @@ class Validator:
             )
             if column_types[col] not in allowed_datatypes:
                 invalid_types += [
-                    err.InvalidTypeColumns(
+                    InvalidTypeColumns(
                         wrong_type_columns=[col],
                         expected_types=["list[str]"],
                     )
@@ -3450,7 +3508,7 @@ class Validator:
             allowed_datatypes = (pa.string(),)
             if column_types[col] not in allowed_datatypes:
                 invalid_types += [
-                    err.InvalidTypeColumns(
+                    InvalidTypeColumns(
                         wrong_type_columns=[col],
                         expected_types=["str"],
                     )
@@ -3517,15 +3575,16 @@ def _check_value_raw_data_length_helper(

 def _check_value_bounding_boxes_coordinates_helper(
     coordinates_col: pd.Series,
-) -> err.InvalidBoundingBoxesCoordinates | None:
+) -> InvalidBoundingBoxesCoordinates | None:
     def check(boxes: object) -> None:
         # We allow for zero boxes. None coordinates list is not allowed (will break following tests:
         # 'NoneType is not iterable')
         if boxes is None:
-            raise err.InvalidBoundingBoxesCoordinates(reason="none_boxes")
-        for box in boxes:
+            raise InvalidBoundingBoxesCoordinates(reason="none_boxes")
+        # Type ignore: boxes comes from pandas Series, validated at runtime to be iterable
+        for box in boxes:  # type: ignore[attr-defined]
             if box is None or len(box) == 0:
-                raise err.InvalidBoundingBoxesCoordinates(
+                raise InvalidBoundingBoxesCoordinates(
                     reason="none_or_empty_box"
                 )
             error = _box_coordinates_wrong_format(box)
@@ -3534,25 +3593,26 @@ def _check_value_bounding_boxes_coordinates_helper(

     try:
         coordinates_col.apply(check)
-    except err.InvalidBoundingBoxesCoordinates as e:
+    except InvalidBoundingBoxesCoordinates as e:
         return e
     return None


 def _box_coordinates_wrong_format(
     box_coords: object,
-) -> err.InvalidBoundingBoxesCoordinates | None:
+) -> InvalidBoundingBoxesCoordinates | None:
     if (
         # Coordinates should be a collection of 4 floats
-        len(box_coords) != 4
+        len(box_coords) != 4  # type: ignore[arg-type]
         # Coordinates should be positive
-        or any(k < 0 for k in box_coords)
+        # Type ignore: box_coords validated at runtime to be iterable/indexable
+        or any(k < 0 for k in box_coords)  # type: ignore[attr-defined]
         # Coordinates represent the top-left & bottom-right corners of a box: x1 < x2
-        or box_coords[0] >= box_coords[2]
+        or box_coords[0] >= box_coords[2]  # type: ignore[index]
         # Coordinates represent the top-left & bottom-right corners of a box: y1 < y2
-        or box_coords[1] >= box_coords[3]
+        or box_coords[1] >= box_coords[3]  # type: ignore[index]
     ):
-        return err.InvalidBoundingBoxesCoordinates(
+        return InvalidBoundingBoxesCoordinates(
             reason="boxes_coordinates_wrong_format"
         )
     return None
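This helper family uses an exception-as-return pattern: the nested `check` raises on the first bad row, `Series.apply` propagates it, and the caller converts the exception into a returned error value. A standalone sketch of that control flow with a generic error class; the names are mine, not the package's:

import pandas as pd

class BoxError(Exception):
    def __init__(self, reason: str) -> None:
        self.reason = reason

def first_box_error(col: pd.Series) -> BoxError | None:
    def check(boxes: object) -> None:
        if boxes is None:
            raise BoxError("none_boxes")
        for box in boxes:  # each box is [x1, y1, x2, y2]
            if len(box) != 4 or box[0] >= box[2] or box[1] >= box[3]:
                raise BoxError("boxes_coordinates_wrong_format")

    try:
        col.apply(check)   # stops at the first offending row
    except BoxError as e:
        return e           # surfaced as a value, not an exception
    return None

print(first_box_error(pd.Series([[[0, 0, 10, 10]], [[5, 5, 1, 1]]])).reason)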
@@ -3560,51 +3620,49 @@ def _box_coordinates_wrong_format(

 def _check_value_bounding_boxes_categories_helper(
     categories_col: pd.Series,
-) -> err.InvalidBoundingBoxesCategories | None:
+) -> InvalidBoundingBoxesCategories | None:
     def check(categories: object) -> None:
         # We allow for zero boxes. None category list is not allowed (will break following tests:
         # 'NoneType is not iterable')
         if categories is None:
-            raise err.InvalidBoundingBoxesCategories(
-                reason="none_category_list"
-            )
-        for category in categories:
+            raise InvalidBoundingBoxesCategories(reason="none_category_list")
+        # Type ignore: categories validated at runtime to be iterable
+        for category in categories:  # type: ignore[attr-defined]
             # Allow for empty string category, no None values
             if category is None:
-                raise err.InvalidBoundingBoxesCategories(reason="none_category")
+                raise InvalidBoundingBoxesCategories(reason="none_category")

     try:
         categories_col.apply(check)
-    except err.InvalidBoundingBoxesCategories as e:
+    except InvalidBoundingBoxesCategories as e:
         return e
     return None


 def _check_value_bounding_boxes_scores_helper(
     scores_col: pd.Series,
-) -> err.InvalidBoundingBoxesScores | None:
+) -> InvalidBoundingBoxesScores | None:
     def check(scores: object) -> None:
         # We allow for zero boxes. None confidence score list is not allowed (will break following tests:
         # 'NoneType is not iterable')
         if scores is None:
-            raise err.InvalidBoundingBoxesScores(reason="none_score_list")
-        for score in scores:
+            raise InvalidBoundingBoxesScores(reason="none_score_list")
+        # Type ignore: scores validated at runtime to be iterable
+        for score in scores:  # type: ignore[attr-defined]
             # Confidence scores are between 0 and 1
             if score < 0 or score > 1:
-                raise err.InvalidBoundingBoxesScores(
-                    reason="scores_out_of_bounds"
-                )
+                raise InvalidBoundingBoxesScores(reason="scores_out_of_bounds")

     try:
         scores_col.apply(check)
-    except err.InvalidBoundingBoxesScores as e:
+    except InvalidBoundingBoxesScores as e:
         return e
     return None


 def _polygon_coordinates_wrong_format(
     polygon_coords: object,
-) -> err.InvalidPolygonCoordinates | None:
+) -> InvalidPolygonCoordinates | None:
     """Check if polygon coordinates are valid.

     Validates:
@@ -3623,30 +3681,31 @@ def _polygon_coordinates_wrong_format(
     # Basic validations
     if (
         # Coordinates should be a collection of more than 6 floats (3 pairs of x,y coordinates)
-        len(polygon_coords) < 6
+        len(polygon_coords) < 6  # type: ignore[arg-type]
         # Coordinates should be positive
-        or any(k < 0 for k in polygon_coords)
+        # Type ignore: polygon_coords validated at runtime to be iterable
+        or any(k < 0 for k in polygon_coords)  # type: ignore[arg-type, attr-defined]
         # Coordinates should be a collection of pairs of floats
-        or len(polygon_coords) % 2 != 0
+        or len(polygon_coords) % 2 != 0  # type: ignore[arg-type]
     ):
-        return err.InvalidPolygonCoordinates(
+        return InvalidPolygonCoordinates(
             reason="polygon_coordinates_wrong_format",
-            coordinates=polygon_coords,
+            coordinates=polygon_coords,  # type: ignore[arg-type]
         )

     # Convert flat list to list of points [(x1,y1), (x2,y2), ...]
+    coords_seq = cast("Sequence[float]", polygon_coords)
     points = [
-        (polygon_coords[i], polygon_coords[i + 1])
-        for i in range(0, len(polygon_coords), 2)
+        (coords_seq[i], coords_seq[i + 1]) for i in range(0, len(coords_seq), 2)
     ]

     # Check for repeated vertices
     for i in range(len(points)):
         for j in range(i + 1, len(points)):
             if points[i] == points[j]:
-                return err.InvalidPolygonCoordinates(
+                return InvalidPolygonCoordinates(
                     reason="polygon_coordinates_repeated_vertices",
-                    coordinates=polygon_coords,
+                    coordinates=polygon_coords,  # type: ignore[arg-type]
                 )

     # Check for self-intersections
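The `cast` plus pairing comprehension turns a flat `[x1, y1, x2, y2, ...]` list into vertex tuples so the repeated-vertex scan can compare points directly. The same step in isolation, on made-up data:

flat = [0.0, 0.0, 4.0, 0.0, 4.0, 3.0, 0.0, 0.0]  # last vertex repeats the first
points = [(flat[i], flat[i + 1]) for i in range(0, len(flat), 2)]
repeated = any(
    points[i] == points[j]
    for i in range(len(points))
    for j in range(i + 1, len(points))
)
print(points)    # [(0.0, 0.0), (4.0, 0.0), (4.0, 3.0), (0.0, 0.0)]
print(repeated)  # True, flagged as polygon_coordinates_repeated_vertices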
@@ -3665,9 +3724,9 @@ def _polygon_coordinates_wrong_format(
                 if segments_intersect(
                     edges[i][0], edges[i][1], edges[j][0], edges[j][1]
                 ):
-                    return err.InvalidPolygonCoordinates(
+                    return InvalidPolygonCoordinates(
                         reason="polygon_coordinates_self_intersecting_vertices",
-                        coordinates=polygon_coords,
+                        coordinates=polygon_coords,  # type: ignore[arg-type]
                     )

     return None
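`segments_intersect` itself is not shown in this hunk; a common way to implement such a test is the counter-clockwise orientation trick, sketched here as an assumption about the general technique rather than the package's exact code:

Point = tuple[float, float]

def ccw(a: Point, b: Point, c: Point) -> bool:
    # True if a -> b -> c makes a counter-clockwise turn
    return (c[1] - a[1]) * (b[0] - a[0]) > (b[1] - a[1]) * (c[0] - a[0])

def segments_intersect(p1: Point, p2: Point, q1: Point, q2: Point) -> bool:
    # Proper intersection: the endpoints of each segment straddle the other
    return ccw(p1, q1, q2) != ccw(p2, q1, q2) and ccw(p1, p2, q1) != ccw(p1, p2, q2)

print(segments_intersect((0, 0), (4, 4), (0, 4), (4, 0)))  # True
print(segments_intersect((0, 0), (1, 1), (2, 2), (3, 3)))  # False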
@@ -3675,64 +3734,65 @@ def _polygon_coordinates_wrong_format(

 def _check_value_polygon_coordinates_helper(
     coordinates_col: pd.Series,
-) -> err.InvalidPolygonCoordinates | None:
+) -> InvalidPolygonCoordinates | None:
     def check(polygons: object) -> None:
         # We allow for zero polygons. None coordinates list is not allowed (will break following tests:
         # 'NoneType is not iterable')
         if polygons is None:
-            raise err.InvalidPolygonCoordinates(reason="none_polygons")
-        for polygon in polygons:
+            raise InvalidPolygonCoordinates(reason="none_polygons")
+        # Type ignore: polygons validated at runtime to be iterable
+        for polygon in polygons:  # type: ignore[attr-defined]
             if polygon is None or len(polygon) == 0:
-                raise err.InvalidPolygonCoordinates(
-                    reason="none_or_empty_polygon"
-                )
+                raise InvalidPolygonCoordinates(reason="none_or_empty_polygon")
             error = _polygon_coordinates_wrong_format(polygon)
             if error is not None:
                 raise error

     try:
         coordinates_col.apply(check)
-    except err.InvalidPolygonCoordinates as e:
+    except InvalidPolygonCoordinates as e:
         return e
     return None


 def _check_value_polygon_categories_helper(
     categories_col: pd.Series,
-) -> err.InvalidPolygonCategories | None:
+) -> InvalidPolygonCategories | None:
     def check(categories: object) -> None:
         # We allow for zero boxes. None category list is not allowed (will break following tests:
         # 'NoneType is not iterable')
         if categories is None:
-            raise err.InvalidPolygonCategories(reason="none_category_list")
-        for category in categories:
+            raise InvalidPolygonCategories(reason="none_category_list")
+        # Type ignore: categories validated at runtime to be iterable
+        for category in categories:  # type: ignore[attr-defined]
             # Allow for empty string category, no None values
             if category is None:
-                raise err.InvalidPolygonCategories(reason="none_category")
+                raise InvalidPolygonCategories(reason="none_category")

     try:
         categories_col.apply(check)
-    except err.InvalidPolygonCategories as e:
+    except InvalidPolygonCategories as e:
         return e
     return None


 def _check_value_polygon_scores_helper(
     scores_col: pd.Series,
-) -> err.InvalidPolygonScores | None:
+) -> InvalidPolygonScores | None:
     def check(scores: object) -> None:
         # We allow for zero boxes. None confidence score list is not allowed (will break following tests:
         # 'NoneType is not iterable')
         if scores is None:
-            raise err.InvalidPolygonScores(reason="none_score_list")
-        for score in scores:
+            raise InvalidPolygonScores(reason="none_score_list")
+        # Type ignore: scores validated at runtime to be iterable
+        for score in scores:  # type: ignore[attr-defined]
             # Confidence scores are between 0 and 1
             if score < 0 or score > 1:
-                raise err.InvalidPolygonScores(reason="scores_out_of_bounds")
+                raise InvalidPolygonScores(reason="scores_out_of_bounds")

     try:
         scores_col.apply(check)
-    except err.InvalidPolygonScores as e:
+    except InvalidPolygonScores as e:
         return e
     return None