validmind 2.8.22__tar.gz → 2.8.27__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {validmind-2.8.22 → validmind-2.8.27}/PKG-INFO +4 -3
- {validmind-2.8.22 → validmind-2.8.27}/pyproject.toml +4 -3
- {validmind-2.8.22 → validmind-2.8.27}/validmind/__init__.py +3 -0
- validmind-2.8.27/validmind/__version__.py +1 -0
- validmind-2.8.27/validmind/ai/utils.py +219 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/api_client.py +4 -0
- validmind-2.8.27/validmind/experimental/agents.py +65 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/MutualInformation.py +14 -2
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -1
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ragas/AspectCritic.py +5 -1
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ragas/ContextEntityRecall.py +5 -1
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ragas/ContextPrecision.py +5 -1
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +5 -1
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ragas/ContextRecall.py +5 -1
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ragas/Faithfulness.py +5 -1
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ragas/NoiseSensitivity.py +3 -1
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ragas/ResponseRelevancy.py +6 -4
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ragas/SemanticSimilarity.py +5 -1
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ragas/utils.py +4 -24
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +11 -1
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +13 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/prompt_validation/Bias.py +2 -1
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/prompt_validation/Clarity.py +2 -1
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/prompt_validation/Conciseness.py +2 -1
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/prompt_validation/Delimitation.py +2 -1
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/prompt_validation/NegativeInstruction.py +2 -1
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/prompt_validation/Robustness.py +3 -2
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/prompt_validation/Specificity.py +2 -1
- validmind-2.8.27/validmind/tests/prompt_validation/__init__.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/prompt_validation/ai_powered_test.py +18 -17
- validmind-2.8.27/validmind/vm_models/result/__init__.py +21 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/vm_models/result/result.py +127 -14
- validmind-2.8.22/validmind/__version__.py +0 -1
- validmind-2.8.22/validmind/ai/utils.py +0 -130
- validmind-2.8.22/validmind/vm_models/result/__init__.py +0 -7
- {validmind-2.8.22 → validmind-2.8.27}/LICENSE +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/README.pypi.md +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/ai/test_descriptions.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/client.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/client_config.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/__init__.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/classification/__init__.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/classification/customer_churn.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/classification/datasets/bank_customer_churn.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/classification/datasets/taiwan_credit.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/classification/taiwan_credit.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/cluster/digits.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/credit_risk/__init__.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/credit_risk/datasets/lending_club_biased.csv.gz +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/credit_risk/datasets/lending_club_loan_data_2007_2014_clean.csv.gz +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/credit_risk/lending_club.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/credit_risk/lending_club_bias.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/llm/rag/__init__.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_1.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_2.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_3.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_4.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/llm/rag/datasets/rfp_existing_questions_client_5.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/llm/rag/rfp.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/nlp/__init__.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/nlp/cnn_dailymail.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/nlp/datasets/Covid_19.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/nlp/datasets/cnn_dailymail_100_with_predictions.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/nlp/datasets/cnn_dailymail_500_with_predictions.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/nlp/datasets/sentiments_with_predictions.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/nlp/twitter_covid_19.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/__init__.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/california_housing.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/datasets/fred/CPIAUCSL.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/datasets/fred/CSUSHPISA.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/datasets/fred/DRSFRMACBS.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/datasets/fred/FEDFUNDS.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/datasets/fred/GDP.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/datasets/fred/GDPC1.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/datasets/fred/GS10.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/datasets/fred/GS3.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/datasets/fred/GS5.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/datasets/fred/MORTGAGE30US.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/datasets/fred/UNRATE.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/datasets/fred_loan_rates.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/datasets/fred_loan_rates_test_1.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/datasets/fred_loan_rates_test_2.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/datasets/fred_loan_rates_test_3.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/datasets/fred_loan_rates_test_4.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/datasets/fred_loan_rates_test_5.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/datasets/leanding_club_loan_rates.csv +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/fred.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/fred_timeseries.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/lending_club.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/models/fred_loan_rates_model_1.pkl +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/models/fred_loan_rates_model_2.pkl +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/models/fred_loan_rates_model_3.pkl +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/models/fred_loan_rates_model_4.pkl +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/datasets/regression/models/fred_loan_rates_model_5.pkl +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/errors.py +0 -0
- {validmind-2.8.22/validmind/html_templates → validmind-2.8.27/validmind/experimental}/__init__.py +0 -0
- {validmind-2.8.22/validmind/tests/data_validation → validmind-2.8.27/validmind/html_templates}/__init__.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/html_templates/content_blocks.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/input_registry.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/logging.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/models/__init__.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/models/foundation.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/models/function.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/models/huggingface.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/models/metadata.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/models/pipeline.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/models/pytorch.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/models/r_model.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/models/sklearn.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/template.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/test_suites/__init__.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/test_suites/classifier.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/test_suites/cluster.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/test_suites/embeddings.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/test_suites/llm.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/test_suites/nlp.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/test_suites/parameters_optimization.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/test_suites/regression.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/test_suites/statsmodels_timeseries.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/test_suites/summarization.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/test_suites/tabular_datasets.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/test_suites/text_data.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/test_suites/time_series.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/__init__.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/__types__.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/_store.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/comparison.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/ACFandPACFPlot.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/ADF.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/AutoAR.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/AutoMA.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/AutoStationarity.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/BivariateScatterPlots.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/BoxPierce.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/ChiSquaredFeaturesTable.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/ClassImbalance.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/DatasetDescription.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/DatasetSplit.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/DescriptiveStatistics.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/DickeyFullerGLS.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/Duplicates.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/EngleGrangerCoint.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/FeatureTargetCorrelationPlot.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/HighCardinality.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/HighPearsonCorrelation.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/IQROutliersBarPlot.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/IQROutliersTable.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/IsolationForestOutliers.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/JarqueBera.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/KPSS.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/LJungBox.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/LaggedCorrelationHeatmap.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/MissingValues.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/MissingValuesBarPlot.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/PearsonCorrelationMatrix.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/PhillipsPerronArch.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/ProtectedClassesCombination.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/ProtectedClassesDescription.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/ProtectedClassesDisparity.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/ProtectedClassesThresholdOptimizer.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/RollingStatsPlot.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/RunsTest.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/ScatterPlot.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/ScoreBandDefaultRates.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/SeasonalDecompose.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/ShapiroWilk.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/Skewness.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/SpreadPlot.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/TabularCategoricalBarPlots.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/TabularDateTimeHistograms.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/TabularDescriptionTables.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/TabularNumericalHistograms.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/TargetRateBarPlots.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/TimeSeriesDescription.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/TimeSeriesDescriptiveStatistics.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/TimeSeriesFrequency.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/TimeSeriesHistogram.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/TimeSeriesLinePlot.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/TimeSeriesMissingValues.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/TimeSeriesOutliers.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/TooManyZeroValues.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/UniqueRows.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/WOEBinPlots.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/WOEBinTable.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/ZivotAndrewsArch.py +0 -0
- {validmind-2.8.22/validmind/tests/data_validation/nlp → validmind-2.8.27/validmind/tests/data_validation}/__init__.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/nlp/CommonWords.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/nlp/Hashtags.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/nlp/LanguageDetection.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/nlp/Mentions.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/nlp/PolarityAndSubjectivity.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/nlp/Punctuations.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/nlp/Sentiment.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/nlp/StopWords.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/nlp/TextDescription.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/data_validation/nlp/Toxicity.py +0 -0
- {validmind-2.8.22/validmind/tests/model_validation → validmind-2.8.27/validmind/tests/data_validation/nlp}/__init__.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/decorator.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/load.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/BertScore.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/BleuScore.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ClusterSizeDistribution.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ContextualRecall.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/FeaturesAUC.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/MeteorScore.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ModelMetadata.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ModelPredictionResiduals.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/RegardScore.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/RegressionResidualsPlot.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/RougeScore.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/TimeSeriesPredictionWithCI.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/TimeSeriesPredictionsPlot.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/TimeSeriesR2SquareBySegments.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/TokenDisparity.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ToxicityScore.py +0 -0
- {validmind-2.8.22/validmind/tests/model_validation/sklearn → validmind-2.8.27/validmind/tests/model_validation}/__init__.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/embeddings/ClusterDistribution.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/embeddings/CosineSimilarityComparison.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/embeddings/CosineSimilarityHeatmap.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/embeddings/EuclideanDistanceComparison.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/embeddings/EuclideanDistanceHeatmap.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/embeddings/PCAComponentsPairwisePlots.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/embeddings/StabilityAnalysisKeyword.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/embeddings/StabilityAnalysisRandomNoise.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/embeddings/StabilityAnalysisSynonyms.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/embeddings/utils.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/AdjustedMutualInformation.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/AdjustedRandIndex.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/CalibrationCurve.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/ClassifierPerformance.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/ClassifierThresholdOptimization.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/ClusterCosineSimilarity.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/ClusterPerformanceMetrics.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/CompletenessScore.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/ConfusionMatrix.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/FeatureImportance.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/FowlkesMallowsScore.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/HomogeneityScore.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/HyperParametersTuning.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/KMeansClustersOptimization.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/MinimumAccuracy.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/MinimumF1Score.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/MinimumROCAUCScore.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/ModelParameters.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/ModelsPerformanceComparison.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/PermutationFeatureImportance.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/PopulationStabilityIndex.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/PrecisionRecallCurve.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/ROCCurve.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/RegressionErrors.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/RegressionErrorsComparison.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/RegressionPerformance.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/RegressionR2Square.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/RegressionR2SquareComparison.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/RobustnessDiagnosis.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/SHAPGlobalImportance.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/ScoreProbabilityAlignment.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/SilhouettePlot.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/TrainingTestDegradation.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/sklearn/VMeasure.py +0 -0
- {validmind-2.8.22/validmind/tests/model_validation/statsmodels → validmind-2.8.27/validmind/tests/model_validation/sklearn}/__init__.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/statsmodels/AutoARIMA.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/statsmodels/CumulativePredictionProbabilities.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/statsmodels/DurbinWatsonTest.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/statsmodels/GINITable.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/statsmodels/KolmogorovSmirnov.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/statsmodels/Lilliefors.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/statsmodels/PredictionProbabilitiesHistogram.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/statsmodels/RegressionCoeffs.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/statsmodels/RegressionFeatureSignificance.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/statsmodels/RegressionModelForecastPlot.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/statsmodels/RegressionModelForecastPlotLevels.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/statsmodels/RegressionModelSensitivityPlot.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/statsmodels/RegressionModelSummary.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/statsmodels/RegressionPermutationFeatureImportance.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/statsmodels/ScorecardHistogram.py +0 -0
- {validmind-2.8.22/validmind/tests/prompt_validation → validmind-2.8.27/validmind/tests/model_validation/statsmodels}/__init__.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/statsmodels/statsutils.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/ongoing_monitoring/CalibrationCurveDrift.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/ongoing_monitoring/ClassDiscriminationDrift.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/ongoing_monitoring/ClassImbalanceDrift.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/ongoing_monitoring/ClassificationAccuracyDrift.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/ongoing_monitoring/ConfusionMatrixDrift.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/ongoing_monitoring/CumulativePredictionProbabilitiesDrift.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/ongoing_monitoring/FeatureDrift.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/ongoing_monitoring/PredictionAcrossEachFeature.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/ongoing_monitoring/PredictionCorrelation.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/ongoing_monitoring/PredictionProbabilitiesHistogramDrift.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/ongoing_monitoring/PredictionQuantilesAcrossFeatures.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/ongoing_monitoring/ROCCurveDrift.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/ongoing_monitoring/ScoreBandsDrift.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/ongoing_monitoring/ScorecardHistogramDrift.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/ongoing_monitoring/TargetPredictionDistributionPlot.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/output.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/run.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/test_providers.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/tests/utils.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/unit_metrics/__init__.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/unit_metrics/classification/Accuracy.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/unit_metrics/classification/F1.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/unit_metrics/classification/Precision.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/unit_metrics/classification/ROC_AUC.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/unit_metrics/classification/Recall.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/unit_metrics/regression/AdjustedRSquaredScore.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/unit_metrics/regression/GiniCoefficient.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/unit_metrics/regression/HuberLoss.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/unit_metrics/regression/KolmogorovSmirnovStatistic.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/unit_metrics/regression/MeanAbsoluteError.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/unit_metrics/regression/MeanAbsolutePercentageError.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/unit_metrics/regression/MeanBiasDeviation.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/unit_metrics/regression/MeanSquaredError.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/unit_metrics/regression/QuantileLoss.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/unit_metrics/regression/RSquaredScore.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/unit_metrics/regression/RootMeanSquaredError.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/utils.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/vm_models/__init__.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/vm_models/dataset/__init__.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/vm_models/dataset/dataset.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/vm_models/dataset/utils.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/vm_models/figure.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/vm_models/input.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/vm_models/model.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/vm_models/result/result.jinja +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/vm_models/result/utils.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/vm_models/test_suite/__init__.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/vm_models/test_suite/runner.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/vm_models/test_suite/summary.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/vm_models/test_suite/test.py +0 -0
- {validmind-2.8.22 → validmind-2.8.27}/validmind/vm_models/test_suite/test_suite.py +0 -0
@@ -1,11 +1,11 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: validmind
|
3
|
-
Version: 2.8.22
|
3
|
+
Version: 2.8.27
|
4
4
|
Summary: ValidMind Library
|
5
5
|
License: Commercial License
|
6
6
|
Author: Andres Rodriguez
|
7
7
|
Author-email: andres@validmind.ai
|
8
|
-
Requires-Python: >=3.
|
8
|
+
Requires-Python: >=3.9.0,<3.12
|
9
9
|
Classifier: License :: Other/Proprietary License
|
10
10
|
Classifier: Programming Language :: Python :: 3
|
11
11
|
Classifier: Programming Language :: Python :: 3.9
|
@@ -22,6 +22,7 @@ Requires-Dist: bert-score (>=0.3.13)
|
|
22
22
|
Requires-Dist: catboost
|
23
23
|
Requires-Dist: datasets (>=2.10.0,<3.0.0)
|
24
24
|
Requires-Dist: evaluate
|
25
|
+
Requires-Dist: h11 (>=0.16.0)
|
25
26
|
Requires-Dist: ipywidgets
|
26
27
|
Requires-Dist: kaleido (>=0.2.1,!=0.2.1.post1)
|
27
28
|
Requires-Dist: langchain-openai (>=0.1.8) ; extra == "all" or extra == "llm"
|
@@ -53,7 +54,7 @@ Requires-Dist: statsmodels
|
|
53
54
|
Requires-Dist: tabulate (>=0.8.9,<0.9.0)
|
54
55
|
Requires-Dist: textblob (>=0.18.0.post0,<0.19.0)
|
55
56
|
Requires-Dist: tiktoken
|
56
|
-
Requires-Dist: torch (
|
57
|
+
Requires-Dist: torch (==2.7.0) ; extra == "all" or extra == "llm" or extra == "pytorch"
|
57
58
|
Requires-Dist: tqdm
|
58
59
|
Requires-Dist: transformers (>=4.32.0,<5.0.0) ; extra == "all" or extra == "huggingface" or extra == "llm"
|
59
60
|
Requires-Dist: xgboost (>=1.5.2,<3)
|
@@ -10,7 +10,7 @@ description = "ValidMind Library"
|
|
10
10
|
license = "Commercial License"
|
11
11
|
name = "validmind"
|
12
12
|
readme = "README.pypi.md"
|
13
|
-
version = "2.8.22"
|
13
|
+
version = "2.8.27"
|
14
14
|
|
15
15
|
[tool.poetry.dependencies]
|
16
16
|
aiohttp = {extras = ["speedups"], version = "*"}
|
@@ -20,6 +20,7 @@ bert-score = ">=0.3.13"
|
|
20
20
|
catboost = "*"
|
21
21
|
datasets = "^2.10.0"
|
22
22
|
evaluate = "*"
|
23
|
+
h11 = ">=0.16.0"
|
23
24
|
ipywidgets = "*"
|
24
25
|
kaleido = ">=0.2.1,!=0.2.1.post1"
|
25
26
|
langchain-openai = {version = ">=0.1.8", optional = true}
|
@@ -37,7 +38,7 @@ plotly = "<6.0.0"
|
|
37
38
|
plotly-express = "*"
|
38
39
|
polars = "*"
|
39
40
|
pycocoevalcap = {version = "^1.2", optional = true}
|
40
|
-
python = ">=3.
|
41
|
+
python = ">=3.9.0,<3.12"
|
41
42
|
python-dotenv = "*"
|
42
43
|
ragas = {version = ">=0.2.3,<=0.2.7", optional = true}
|
43
44
|
rouge = ">=1"
|
@@ -52,7 +53,7 @@ statsmodels = "*"
|
|
52
53
|
tabulate = "^0.8.9"
|
53
54
|
textblob = "^0.18.0.post0"
|
54
55
|
tiktoken = "*"
|
55
|
-
torch = {version = "
|
56
|
+
torch = {version = "2.7.0", optional = true}
|
56
57
|
tqdm = "*"
|
57
58
|
transformers = {version = "^4.32.0", optional = true}
|
58
59
|
xgboost = ">=1.5.2,<3"
|
@@ -53,6 +53,7 @@ from .client import ( # noqa: E402
|
|
53
53
|
run_documentation_tests,
|
54
54
|
run_test_suite,
|
55
55
|
)
|
56
|
+
from .experimental import agents as experimental_agent
|
56
57
|
from .tests.decorator import tags, tasks, test
|
57
58
|
from .tests.run import print_env
|
58
59
|
from .utils import is_notebook, parse_version
|
@@ -126,4 +127,6 @@ __all__ = [ # noqa
|
|
126
127
|
"unit_metrics",
|
127
128
|
"test_suites",
|
128
129
|
"log_text",
|
130
|
+
# experimental features
|
131
|
+
"experimental_agent",
|
129
132
|
]
|
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = "2.8.27"
|
@@ -0,0 +1,219 @@
|
|
1
|
+
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
|
2
|
+
# See the LICENSE file in the root of this repository for details.
|
3
|
+
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
|
+
|
5
|
+
import os
|
6
|
+
from urllib.parse import urljoin
|
7
|
+
|
8
|
+
from openai import AzureOpenAI, Client, OpenAI
|
9
|
+
|
10
|
+
from ..logging import get_logger
|
11
|
+
from ..utils import md_to_html
|
12
|
+
|
13
|
+
logger = get_logger(__name__)

# Cached client and model name; both are filled in by the first successful
# call to get_client_and_model() and reused afterwards.
__client = __model = None

# Default judge LLM and embeddings; written by set_judge_config() or created
# by get_judge_config() when it has to fall back to the defaults.
__judge_llm = __judge_embeddings = None

# Embeddings model name used when the default judge embeddings are created.
EMBEDDINGS_MODEL = "text-embedding-3-small"

# Tri-state flag written by is_configured():
# None = not checked yet, True = ping succeeded, False = ping failed.
__ack = None
|
24
|
+
|
25
|
+
|
26
|
+
class DescriptionFuture:
    """Deferred handle for a generated description.

    An instance is returned immediately from generate_description so that the
    tests can keep running in parallel while the description is retrieved
    asynchronously. The value is fetched later through ``get_description``,
    which blocks if the description is not ready yet.
    """

    def __init__(self, future):
        # Either a plain ``str`` (description already available) or an object
        # exposing a blocking ``result()`` method.
        self._future = future

    def get_description(self):
        """Return the description converted to HTML by ``md_to_html``.

        A ``str`` is used as is; anything else is treated as a future and its
        ``result()`` is awaited (blocking until the future has completed).
        """
        pending = self._future
        markdown = pending if isinstance(pending, str) else pending.result()

        return md_to_html(markdown, mathml=True)
|
46
|
+
|
47
|
+
|
48
|
+
def get_client_and_model():
    """Return the ``(client, model)`` pair used for generating interpretations.

    The pair is resolved on the first call and cached in module-level globals;
    later calls return the cached values. Resolution order:

    1. ``OPENAI_API_KEY`` is set: an ``OpenAI`` client, with the model taken
       from ``VM_OPENAI_MODEL`` (default ``"gpt-4o"``).
    2. ``AZURE_OPENAI_KEY`` is set: an ``AzureOpenAI`` client. Requires
       ``AZURE_OPENAI_ENDPOINT`` and ``AZURE_OPENAI_MODEL``; the API version
       comes from ``AZURE_OPENAI_VERSION`` (default ``"2023-05-15"``).
    3. Otherwise: a ``Client`` pointed at the ValidMind ``/genai`` endpoint,
       authenticated with the key returned by ``get_ai_key()``.

    Returns:
        tuple: ``(client, model_name)``.

    Raises:
        ValueError: If the Azure configuration is incomplete, or if no key is
            set in the environment and the ValidMind key/client could not be
            obtained (the underlying error is attached as ``__cause__``).
    """
    global __client, __model

    if __client and __model:
        return __client, __model

    if "OPENAI_API_KEY" in os.environ:
        __client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        __model = os.getenv("VM_OPENAI_MODEL", "gpt-4o")

        logger.debug(f"Using OpenAI {__model} for generating descriptions")

    elif "AZURE_OPENAI_KEY" in os.environ:
        if "AZURE_OPENAI_ENDPOINT" not in os.environ:
            raise ValueError(
                "AZURE_OPENAI_ENDPOINT must be set to run LLM tests with Azure"
            )

        if "AZURE_OPENAI_MODEL" not in os.environ:
            raise ValueError(
                "AZURE_OPENAI_MODEL must be set to run LLM tests with Azure"
            )

        __client = AzureOpenAI(
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
            api_key=os.getenv("AZURE_OPENAI_KEY"),
            api_version=os.getenv("AZURE_OPENAI_VERSION", "2023-05-15"),
        )
        __model = os.getenv("AZURE_OPENAI_MODEL")

        logger.debug(f"Using Azure OpenAI {__model} for generating descriptions")

    else:
        try:
            # TODO: fix circular import
            from ..api_client import get_ai_key, get_api_host

            response = get_ai_key()
            # Look the host up once and reuse it for both the check and the URL.
            api_host = get_api_host()
            __client = Client(
                base_url=(
                    # TODO: improve this to be a bit more dynamic
                    "http://localhost:4000/genai"
                    if "localhost" in api_host
                    else urljoin(api_host, "/genai")
                ),
                api_key=response["key"],
            )
            __model = "gpt-4o"  # TODO: backend should tell us which model to use
            logger.debug(f"Using ValidMind {__model} for generating descriptions")
        except Exception as e:
            logger.debug(f"Failed to get API key: {e}")
            # Chain the original error so the root cause stays visible in the
            # traceback instead of only appearing in the debug log.
            raise ValueError(
                "OPENAI_API_KEY, AZURE_OPENAI_KEY must be set, or your account "
                "must be setup to use ValidMind's LLM in order to use LLM features"
            ) from e

    return __client, __model
|
110
|
+
|
111
|
+
|
112
|
+
def get_judge_config(judge_llm=None, judge_embeddings=None):
    """Resolve the judge LLM and embeddings model to use for LLM-based tests.

    If either argument is provided, the provided values are validated and
    returned as a pair (an argument left as ``None`` is returned as ``None``).
    A ``FunctionModel`` argument is unwrapped to its ``.model`` attribute,
    which must be a langchain ``BaseChatModel`` (for ``judge_llm``) or a
    langchain ``Embeddings`` (for ``judge_embeddings``).

    If neither argument is provided, the module-level defaults are returned.
    They are created on first use from ``get_client_and_model()`` and cached;
    creating them also sets the ``OPENAI_API_BASE`` environment variable to
    the client's base URL.

    Args:
        judge_llm: ``BaseChatModel``, ``FunctionModel`` wrapping one, or None.
        judge_embeddings: ``Embeddings``, ``FunctionModel`` wrapping one, or
            None.

    Returns:
        tuple: ``(judge_llm, judge_embeddings)``.

    Raises:
        ImportError: If the optional LLM dependencies are not installed.
        ValueError: If a provided judge is not langchain compatible.
    """
    global __judge_llm, __judge_embeddings

    try:
        from langchain_core.embeddings import Embeddings
        from langchain_core.language_models.chat_models import BaseChatModel
        from langchain_openai import ChatOpenAI, OpenAIEmbeddings

        from validmind.models.function import FunctionModel
    except ImportError as e:
        raise ImportError(
            "Please run `pip install validmind[llm]` to use LLM tests"
        ) from e

    if judge_llm is not None or judge_embeddings is not None:
        if isinstance(judge_llm, FunctionModel):
            if not isinstance(judge_llm.model, BaseChatModel):
                raise ValueError(
                    "The ValidMind Functional model provided does not have a langchain compatible LLM as a model attribute. "
                    "To use default ValidMind LLM, do not set judge_llm/judge_embedding parameter, "
                    "ensure that you are connected to the ValidMind API and confirm ValidMind AI is enabled for your account."
                )
            judge_llm = judge_llm.model

        if isinstance(judge_embeddings, FunctionModel):
            # Validate the embeddings wrapper against Embeddings. The previous
            # code inspected judge_llm.model here, which failed whenever
            # judge_llm was None or had already been unwrapped above.
            if not isinstance(judge_embeddings.model, Embeddings):
                raise ValueError(
                    "The ValidMind Functional model provided does not have a langchain compatible embeddings model as a model attribute. "
                    "To use default ValidMind LLM, do not set judge_embedding parameter, "
                    "ensure that you are connected to the ValidMind API and confirm ValidMind AI is enabled for your account."
                )
            judge_embeddings = judge_embeddings.model

        llm_ok = judge_llm is None or isinstance(judge_llm, BaseChatModel)
        embeddings_ok = judge_embeddings is None or isinstance(
            judge_embeddings, Embeddings
        )
        if llm_ok and embeddings_ok:
            return judge_llm, judge_embeddings

        raise ValueError(
            "Provided Judge LLM/Embeddings are not Langchain compatible. Ensure the judge LLM/embedding provided are an instance of "
            "Langchain BaseChatModel and LangchainEmbeddings. To use default ValidMind LLM, do not set judge_llm/judge_embedding parameter, "
            "ensure that you are connected to the ValidMind API and confirm ValidMind AI is enabled for your account."
        )

    # grab default values if not passed at run time
    if __judge_llm and __judge_embeddings:
        return __judge_llm, __judge_embeddings

    client, model = get_client_and_model()
    # Set the base URL via the environment so the langchain OpenAI wrappers
    # created below target the same endpoint as the client.
    os.environ["OPENAI_API_BASE"] = str(client.base_url)

    __judge_llm = ChatOpenAI(api_key=client.api_key, model=model)
    __judge_embeddings = OpenAIEmbeddings(
        api_key=client.api_key, model=EMBEDDINGS_MODEL
    )

    return __judge_llm, __judge_embeddings
|
167
|
+
|
168
|
+
|
169
|
+
def set_judge_config(judge_llm, judge_embeddings):
    """Set the module-level default judge LLM and embeddings model.

    Each argument is resolved independently, so a ``FunctionModel`` may be
    combined with a native langchain object:

    - a ``FunctionModel`` is unwrapped to its ``.model`` attribute;
    - otherwise ``judge_llm`` must be a langchain ``BaseChatModel`` and
      ``judge_embeddings`` a langchain ``Embeddings``.

    Nothing is stored unless both arguments are accepted.

    Args:
        judge_llm: ``BaseChatModel`` or ``FunctionModel`` wrapping the judge.
        judge_embeddings: ``Embeddings`` or ``FunctionModel`` wrapping the
            embeddings model.

    Raises:
        ImportError: If the optional LLM dependencies are not installed.
        ValueError: If either argument is of an unsupported type.
    """
    global __judge_llm, __judge_embeddings

    try:
        from langchain_core.embeddings import Embeddings
        from langchain_core.language_models.chat_models import BaseChatModel

        from validmind.models.function import FunctionModel
    except ImportError as e:
        raise ImportError(
            "Please run `pip install validmind[llm]` to use LLM tests"
        ) from e

    incompatible_message = (
        "Provided Judge LLM/Embeddings are not Langchain compatible. Ensure the judge LLM/embedding provided are an instance of "
        "Langchain BaseChatModel and LangchainEmbeddings. To use default ValidMind LLM, do not set judge_llm/judge_embedding parameter, "
        "ensure that you are connected to the ValidMind API and confirm ValidMind AI is enabled for your account."
    )

    # NOTE(review): the wrapped ``.model`` of a FunctionModel is stored without
    # a type check, as before; get_judge_config() does validate it. Confirm
    # whether the same validation is wanted here.
    if isinstance(judge_llm, FunctionModel):
        resolved_llm = judge_llm.model
    elif isinstance(judge_llm, BaseChatModel):
        resolved_llm = judge_llm
    else:
        raise ValueError(incompatible_message)

    if isinstance(judge_embeddings, FunctionModel):
        resolved_embeddings = judge_embeddings.model
    elif isinstance(judge_embeddings, Embeddings):
        resolved_embeddings = judge_embeddings
    else:
        raise ValueError(incompatible_message)

    # Assign only after both inputs were accepted so a failure never leaves
    # the defaults half-updated.
    __judge_llm = resolved_llm
    __judge_embeddings = resolved_embeddings
|
195
|
+
|
196
|
+
|
197
|
+
def is_configured():
    """Report whether the LLM client can be reached.

    A successful check is remembered in the module-level ``__ack`` flag and
    short-circuits later calls. A failed check stores ``False`` but is retried
    on the next call.

    Returns:
        bool: True if the ping request succeeded, False otherwise.
    """
    global __ack

    if __ack:
        return True

    try:
        llm_client, model_name = get_client_and_model()
        # send an empty message with max_tokens=1 to "ping" the API
        ping = llm_client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": ""}],
            max_tokens=1,
        )
        logger.debug(
            f"Received response from OpenAI: {ping.choices[0].message.content}"
        )
    except Exception as e:
        logger.debug(f"Failed to connect to OpenAI: {e}")
        __ack = False
    else:
        __ack = True

    return __ack
|
@@ -448,6 +448,7 @@ async def alog_metric(
|
|
448
448
|
params: Optional[Dict[str, Any]] = None,
|
449
449
|
recorded_at: Optional[str] = None,
|
450
450
|
thresholds: Optional[Dict[str, Any]] = None,
|
451
|
+
passed: Optional[bool] = None,
|
451
452
|
):
|
452
453
|
"""See log_metric for details."""
|
453
454
|
if not key or not isinstance(key, str):
|
@@ -476,6 +477,7 @@ async def alog_metric(
|
|
476
477
|
"params": params or {},
|
477
478
|
"recorded_at": recorded_at,
|
478
479
|
"thresholds": thresholds or {},
|
480
|
+
"passed": passed if passed is not None else None,
|
479
481
|
},
|
480
482
|
cls=NumpyEncoder,
|
481
483
|
allow_nan=False,
|
@@ -493,6 +495,7 @@ def log_metric(
|
|
493
495
|
params: Optional[Dict[str, Any]] = None,
|
494
496
|
recorded_at: Optional[str] = None,
|
495
497
|
thresholds: Optional[Dict[str, Any]] = None,
|
498
|
+
passed: Optional[bool] = None,
|
496
499
|
):
|
497
500
|
"""Logs a unit metric.
|
498
501
|
|
@@ -518,6 +521,7 @@ def log_metric(
|
|
518
521
|
params=params,
|
519
522
|
recorded_at=recorded_at,
|
520
523
|
thresholds=thresholds,
|
524
|
+
passed=passed,
|
521
525
|
)
|
522
526
|
|
523
527
|
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# Copyright © 2023-2024 ValidMind Inc. All rights reserved.
|
2
|
+
# See the LICENSE file in the root of this repository for details.
|
3
|
+
# SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
|
4
|
+
|
5
|
+
"""
|
6
|
+
Agent interface for all text generation tasks
|
7
|
+
"""
|
8
|
+
|
9
|
+
import requests
|
10
|
+
|
11
|
+
from validmind.api_client import _get_api_headers, _get_url, raise_api_error
|
12
|
+
from validmind.utils import is_html, md_to_html
|
13
|
+
from validmind.vm_models.result import TextGenerationResult
|
14
|
+
|
15
|
+
|
16
|
+
def run_task(
    task: str,
    input: dict,
    show: bool = True,
) -> TextGenerationResult:
    """
    Run a text generation task through the ValidMind API.

    Args:
        task (str): Type of text generation task to run. Accepted values:
            - 'code_explainer': Generates natural language explanations of code
            - 'qualitative_text_generation'
        input (dict): Payload sent as the JSON body of the request:
            - For code_explainer: Must contain 'source_code' and optional parameters
        show (bool): Whether to display the generated result. Defaults to True.

    Returns:
        TextGenerationResult: Result object whose description holds the
            generated text as HTML.

    Raises:
        ValueError: If an unsupported task is provided
        requests.exceptions.RequestException: If the API request fails
    """
    # Reject unknown tasks up front; no request is made for them.
    if task not in ("code_explainer", "qualitative_text_generation"):
        raise ValueError(f"Unsupported task: {task}")

    response = requests.post(
        url=_get_url(f"ai/generate/{task}"),
        headers=_get_api_headers(),
        json=input,
    )

    # Non-200 responses are handed to the shared API error helper.
    if response.status_code != 200:
        raise_api_error(response.text)

    content = response.json()["content"]
    # Text that is not already HTML is converted from markdown.
    html = content if is_html(content) else md_to_html(content, mathml=True)

    # Wrap the generated text in a result object
    result = TextGenerationResult(
        result_type=f"{task}",
        description=html,
        title=f"Text Generation: {task}",
        doc=f"Generated {task}",
    )
    if show:
        result.show()

    return result
|
@@ -68,8 +68,20 @@ def MutualInformation(
|
|
68
68
|
if task not in ["classification", "regression"]:
|
69
69
|
raise ValueError("task must be either 'classification' or 'regression'")
|
70
70
|
|
71
|
-
|
72
|
-
|
71
|
+
# Check if numeric features exist
|
72
|
+
if not dataset.feature_columns_numeric:
|
73
|
+
raise ValueError(
|
74
|
+
"No numeric features found in dataset. Mutual Information test requires numeric features."
|
75
|
+
)
|
76
|
+
|
77
|
+
# Check if target column exists
|
78
|
+
if not dataset.target_column:
|
79
|
+
raise ValueError(
|
80
|
+
"Target column is required for Mutual Information calculation but was not provided."
|
81
|
+
)
|
82
|
+
|
83
|
+
X = dataset._df[dataset.feature_columns_numeric]
|
84
|
+
y = dataset._df[dataset.target_column]
|
73
85
|
|
74
86
|
# Select appropriate MI function based on task type
|
75
87
|
if task == "classification":
|
{validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ragas/AnswerCorrectness.py
RENAMED
@@ -34,6 +34,8 @@ def AnswerCorrectness(
|
|
34
34
|
user_input_column="user_input",
|
35
35
|
response_column="response",
|
36
36
|
reference_column="reference",
|
37
|
+
judge_llm=None,
|
38
|
+
judge_embeddings=None,
|
37
39
|
):
|
38
40
|
"""
|
39
41
|
Evaluates the correctness of answers in a dataset with respect to the provided ground
|
@@ -118,7 +120,9 @@ def AnswerCorrectness(
|
|
118
120
|
df = get_renamed_columns(dataset._df, required_columns)
|
119
121
|
|
120
122
|
result_df = evaluate(
|
121
|
-
Dataset.from_pandas(df),
|
123
|
+
Dataset.from_pandas(df),
|
124
|
+
metrics=[answer_correctness()],
|
125
|
+
**get_ragas_config(judge_llm, judge_embeddings)
|
122
126
|
).to_pandas()
|
123
127
|
|
124
128
|
score_column = "answer_correctness"
|
{validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ragas/AspectCritic.py
RENAMED
@@ -51,6 +51,8 @@ def AspectCritic(
|
|
51
51
|
"maliciousness",
|
52
52
|
],
|
53
53
|
additional_aspects: list = None,
|
54
|
+
judge_llm=None,
|
55
|
+
judge_embeddings=None,
|
54
56
|
):
|
55
57
|
"""
|
56
58
|
Evaluates generations against the following aspects: harmfulness, maliciousness,
|
@@ -158,7 +160,9 @@ def AspectCritic(
|
|
158
160
|
all_aspects = [built_in_aspects[aspect] for aspect in aspects] + custom_aspects
|
159
161
|
|
160
162
|
result_df = evaluate(
|
161
|
-
Dataset.from_pandas(df),
|
163
|
+
Dataset.from_pandas(df),
|
164
|
+
metrics=all_aspects,
|
165
|
+
**get_ragas_config(judge_llm, judge_embeddings)
|
162
166
|
).to_pandas()
|
163
167
|
|
164
168
|
# reverse the score for aspects where lower is better
|
{validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ragas/ContextEntityRecall.py
RENAMED
@@ -33,6 +33,8 @@ def ContextEntityRecall(
|
|
33
33
|
dataset,
|
34
34
|
retrieved_contexts_column: str = "retrieved_contexts",
|
35
35
|
reference_column: str = "reference",
|
36
|
+
judge_llm=None,
|
37
|
+
judge_embeddings=None,
|
36
38
|
):
|
37
39
|
"""
|
38
40
|
Evaluates the context entity recall for dataset entries and visualizes the results.
|
@@ -113,7 +115,9 @@ def ContextEntityRecall(
|
|
113
115
|
df = get_renamed_columns(dataset._df, required_columns)
|
114
116
|
|
115
117
|
result_df = evaluate(
|
116
|
-
Dataset.from_pandas(df),
|
118
|
+
Dataset.from_pandas(df),
|
119
|
+
metrics=[context_entity_recall()],
|
120
|
+
**get_ragas_config(judge_llm, judge_embeddings)
|
117
121
|
).to_pandas()
|
118
122
|
|
119
123
|
score_column = "context_entity_recall"
|
{validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ragas/ContextPrecision.py
RENAMED
@@ -34,6 +34,8 @@ def ContextPrecision(
|
|
34
34
|
user_input_column: str = "user_input",
|
35
35
|
retrieved_contexts_column: str = "retrieved_contexts",
|
36
36
|
reference_column: str = "reference",
|
37
|
+
judge_llm=None,
|
38
|
+
judge_embeddings=None,
|
37
39
|
): # noqa: B950
|
38
40
|
"""
|
39
41
|
Context Precision is a metric that evaluates whether all of the ground-truth
|
@@ -109,7 +111,9 @@ def ContextPrecision(
|
|
109
111
|
df = get_renamed_columns(dataset._df, required_columns)
|
110
112
|
|
111
113
|
result_df = evaluate(
|
112
|
-
Dataset.from_pandas(df),
|
114
|
+
Dataset.from_pandas(df),
|
115
|
+
metrics=[context_precision()],
|
116
|
+
**get_ragas_config(judge_llm, judge_embeddings)
|
113
117
|
).to_pandas()
|
114
118
|
|
115
119
|
score_column = "llm_context_precision_with_reference"
|
@@ -34,6 +34,8 @@ def ContextPrecisionWithoutReference(
|
|
34
34
|
user_input_column: str = "user_input",
|
35
35
|
retrieved_contexts_column: str = "retrieved_contexts",
|
36
36
|
response_column: str = "response",
|
37
|
+
judge_llm=None,
|
38
|
+
judge_embeddings=None,
|
37
39
|
): # noqa: B950
|
38
40
|
"""
|
39
41
|
Context Precision Without Reference is a metric used to evaluate the relevance of
|
@@ -104,7 +106,9 @@ def ContextPrecisionWithoutReference(
|
|
104
106
|
df = get_renamed_columns(dataset._df, required_columns)
|
105
107
|
|
106
108
|
result_df = evaluate(
|
107
|
-
Dataset.from_pandas(df),
|
109
|
+
Dataset.from_pandas(df),
|
110
|
+
metrics=[context_precision()],
|
111
|
+
**get_ragas_config(judge_llm, judge_embeddings)
|
108
112
|
).to_pandas()
|
109
113
|
|
110
114
|
score_column = "llm_context_precision_without_reference"
|
{validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ragas/ContextRecall.py
RENAMED
@@ -34,6 +34,8 @@ def ContextRecall(
|
|
34
34
|
user_input_column: str = "user_input",
|
35
35
|
retrieved_contexts_column: str = "retrieved_contexts",
|
36
36
|
reference_column: str = "reference",
|
37
|
+
judge_llm=None,
|
38
|
+
judge_embeddings=None,
|
37
39
|
):
|
38
40
|
"""
|
39
41
|
Context recall measures the extent to which the retrieved context aligns with the
|
@@ -109,7 +111,9 @@ def ContextRecall(
|
|
109
111
|
df = get_renamed_columns(dataset._df, required_columns)
|
110
112
|
|
111
113
|
result_df = evaluate(
|
112
|
-
Dataset.from_pandas(df),
|
114
|
+
Dataset.from_pandas(df),
|
115
|
+
metrics=[context_recall()],
|
116
|
+
**get_ragas_config(judge_llm, judge_embeddings)
|
113
117
|
).to_pandas()
|
114
118
|
|
115
119
|
score_column = "context_recall"
|
{validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ragas/Faithfulness.py
RENAMED
@@ -34,6 +34,8 @@ def Faithfulness(
|
|
34
34
|
user_input_column="user_input",
|
35
35
|
response_column="response",
|
36
36
|
retrieved_contexts_column="retrieved_contexts",
|
37
|
+
judge_llm=None,
|
38
|
+
judge_embeddings=None,
|
37
39
|
): # noqa
|
38
40
|
"""
|
39
41
|
Evaluates the faithfulness of the generated answers with respect to retrieved contexts.
|
@@ -114,7 +116,9 @@ def Faithfulness(
|
|
114
116
|
df = get_renamed_columns(dataset._df, required_columns)
|
115
117
|
|
116
118
|
result_df = evaluate(
|
117
|
-
Dataset.from_pandas(df),
|
119
|
+
Dataset.from_pandas(df),
|
120
|
+
metrics=[faithfulness()],
|
121
|
+
**get_ragas_config(judge_llm, judge_embeddings)
|
118
122
|
).to_pandas()
|
119
123
|
|
120
124
|
score_column = "faithfulness"
|
{validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ragas/NoiseSensitivity.py
RENAMED
@@ -38,6 +38,8 @@ def NoiseSensitivity(
|
|
38
38
|
reference_column="reference",
|
39
39
|
focus="relevant",
|
40
40
|
user_input_column="user_input",
|
41
|
+
judge_llm=None,
|
42
|
+
judge_embeddings=None,
|
41
43
|
):
|
42
44
|
"""
|
43
45
|
Assesses the sensitivity of a Large Language Model (LLM) to noise in retrieved context by measuring how often it
|
@@ -149,7 +151,7 @@ def NoiseSensitivity(
|
|
149
151
|
result_df = evaluate(
|
150
152
|
Dataset.from_pandas(df),
|
151
153
|
metrics=[noise_sensitivity(focus=focus)],
|
152
|
-
**get_ragas_config(),
|
154
|
+
**get_ragas_config(judge_llm, judge_embeddings),
|
153
155
|
).to_pandas()
|
154
156
|
|
155
157
|
score_column = f"noise_sensitivity_{focus}"
|
{validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ragas/ResponseRelevancy.py
RENAMED
@@ -34,6 +34,8 @@ def ResponseRelevancy(
|
|
34
34
|
user_input_column="user_input",
|
35
35
|
retrieved_contexts_column=None,
|
36
36
|
response_column="response",
|
37
|
+
judge_llm=None,
|
38
|
+
judge_embeddings=None,
|
37
39
|
):
|
38
40
|
"""
|
39
41
|
Assesses how pertinent the generated answer is to the given prompt.
|
@@ -44,8 +46,8 @@ def ResponseRelevancy(
|
|
44
46
|
relevancy. This metric is computed using the `user_input`, the `retrieved_contexts`
|
45
47
|
and the `response`.
|
46
48
|
|
47
|
-
The Response Relevancy is defined as the mean cosine
|
48
|
-
`user_input` to a number of
|
49
|
+
The Response Relevancy is defined as the mean cosine similarity of the original
|
50
|
+
`user_input` to a number of artificial questions, which are generated (reverse-engineered)
|
49
51
|
based on the `response`:
|
50
52
|
|
51
53
|
$$
|
@@ -62,7 +64,7 @@ def ResponseRelevancy(
|
|
62
64
|
|
63
65
|
**Note**: *This is a reference-free metric, meaning that it does not require a
|
64
66
|
`ground_truth` answer to compare against. A similar metric that does evaluate the
|
65
|
-
correctness of a generated
|
67
|
+
correctness of a generated answers with respect to a `ground_truth` answer is
|
66
68
|
`validmind.model_validation.ragas.AnswerCorrectness`.*
|
67
69
|
|
68
70
|
### Configuring Columns
|
@@ -128,7 +130,7 @@ def ResponseRelevancy(
|
|
128
130
|
result_df = evaluate(
|
129
131
|
Dataset.from_pandas(df),
|
130
132
|
metrics=metrics,
|
131
|
-
**get_ragas_config(),
|
133
|
+
**get_ragas_config(judge_llm, judge_embeddings),
|
132
134
|
).to_pandas()
|
133
135
|
|
134
136
|
score_column = "answer_relevancy"
|
{validmind-2.8.22 → validmind-2.8.27}/validmind/tests/model_validation/ragas/SemanticSimilarity.py
RENAMED
@@ -33,6 +33,8 @@ def SemanticSimilarity(
|
|
33
33
|
dataset,
|
34
34
|
response_column="response",
|
35
35
|
reference_column="reference",
|
36
|
+
judge_llm=None,
|
37
|
+
judge_embeddings=None,
|
36
38
|
):
|
37
39
|
"""
|
38
40
|
Calculates the semantic similarity between generated responses and ground truths
|
@@ -107,7 +109,9 @@ def SemanticSimilarity(
|
|
107
109
|
df = get_renamed_columns(dataset._df, required_columns)
|
108
110
|
|
109
111
|
result_df = evaluate(
|
110
|
-
Dataset.from_pandas(df),
|
112
|
+
Dataset.from_pandas(df),
|
113
|
+
metrics=[semantic_similarity()],
|
114
|
+
**get_ragas_config(judge_llm, judge_embeddings)
|
111
115
|
).to_pandas()
|
112
116
|
|
113
117
|
score_column = "semantic_similarity"
|