langchain 0.3.26__py3-none-any.whl → 0.4.0.dev0__py3-none-any.whl
This diff compares the publicly released contents of the two package versions as they appear in their respective public registries. It is provided for informational purposes only.
- langchain/__init__.py +110 -96
- langchain/_api/__init__.py +2 -2
- langchain/_api/deprecation.py +3 -3
- langchain/_api/module_import.py +51 -46
- langchain/_api/path.py +1 -1
- langchain/adapters/openai.py +8 -8
- langchain/agents/__init__.py +15 -12
- langchain/agents/agent.py +174 -151
- langchain/agents/agent_iterator.py +50 -26
- langchain/agents/agent_toolkits/__init__.py +7 -6
- langchain/agents/agent_toolkits/ainetwork/toolkit.py +1 -1
- langchain/agents/agent_toolkits/amadeus/toolkit.py +1 -1
- langchain/agents/agent_toolkits/azure_cognitive_services.py +1 -1
- langchain/agents/agent_toolkits/clickup/toolkit.py +1 -1
- langchain/agents/agent_toolkits/conversational_retrieval/openai_functions.py +6 -4
- langchain/agents/agent_toolkits/csv/__init__.py +4 -2
- langchain/agents/agent_toolkits/file_management/__init__.py +1 -1
- langchain/agents/agent_toolkits/file_management/toolkit.py +1 -1
- langchain/agents/agent_toolkits/github/toolkit.py +9 -9
- langchain/agents/agent_toolkits/gitlab/toolkit.py +1 -1
- langchain/agents/agent_toolkits/json/base.py +1 -1
- langchain/agents/agent_toolkits/multion/toolkit.py +1 -1
- langchain/agents/agent_toolkits/office365/toolkit.py +1 -1
- langchain/agents/agent_toolkits/openapi/base.py +1 -1
- langchain/agents/agent_toolkits/openapi/planner.py +2 -2
- langchain/agents/agent_toolkits/openapi/planner_prompt.py +10 -10
- langchain/agents/agent_toolkits/openapi/prompt.py +1 -1
- langchain/agents/agent_toolkits/openapi/toolkit.py +1 -1
- langchain/agents/agent_toolkits/pandas/__init__.py +4 -2
- langchain/agents/agent_toolkits/playwright/__init__.py +1 -1
- langchain/agents/agent_toolkits/playwright/toolkit.py +1 -1
- langchain/agents/agent_toolkits/powerbi/base.py +1 -1
- langchain/agents/agent_toolkits/powerbi/chat_base.py +1 -1
- langchain/agents/agent_toolkits/powerbi/prompt.py +2 -2
- langchain/agents/agent_toolkits/powerbi/toolkit.py +1 -1
- langchain/agents/agent_toolkits/python/__init__.py +4 -2
- langchain/agents/agent_toolkits/spark/__init__.py +4 -2
- langchain/agents/agent_toolkits/spark_sql/base.py +1 -1
- langchain/agents/agent_toolkits/spark_sql/toolkit.py +1 -1
- langchain/agents/agent_toolkits/sql/prompt.py +1 -1
- langchain/agents/agent_toolkits/sql/toolkit.py +1 -1
- langchain/agents/agent_toolkits/vectorstore/base.py +4 -2
- langchain/agents/agent_toolkits/vectorstore/prompt.py +2 -4
- langchain/agents/agent_toolkits/vectorstore/toolkit.py +12 -11
- langchain/agents/agent_toolkits/xorbits/__init__.py +4 -2
- langchain/agents/agent_toolkits/zapier/toolkit.py +1 -1
- langchain/agents/agent_types.py +6 -6
- langchain/agents/chat/base.py +8 -12
- langchain/agents/chat/output_parser.py +9 -6
- langchain/agents/chat/prompt.py +3 -4
- langchain/agents/conversational/base.py +11 -5
- langchain/agents/conversational/output_parser.py +4 -2
- langchain/agents/conversational/prompt.py +2 -3
- langchain/agents/conversational_chat/base.py +9 -5
- langchain/agents/conversational_chat/output_parser.py +9 -11
- langchain/agents/conversational_chat/prompt.py +5 -6
- langchain/agents/format_scratchpad/__init__.py +3 -3
- langchain/agents/format_scratchpad/log_to_messages.py +1 -1
- langchain/agents/format_scratchpad/openai_functions.py +8 -6
- langchain/agents/format_scratchpad/tools.py +5 -3
- langchain/agents/format_scratchpad/xml.py +33 -2
- langchain/agents/initialize.py +17 -9
- langchain/agents/json_chat/base.py +19 -18
- langchain/agents/json_chat/prompt.py +2 -3
- langchain/agents/load_tools.py +2 -1
- langchain/agents/loading.py +28 -18
- langchain/agents/mrkl/base.py +11 -4
- langchain/agents/mrkl/output_parser.py +17 -13
- langchain/agents/mrkl/prompt.py +1 -2
- langchain/agents/openai_assistant/base.py +81 -71
- langchain/agents/openai_functions_agent/agent_token_buffer_memory.py +2 -0
- langchain/agents/openai_functions_agent/base.py +47 -37
- langchain/agents/openai_functions_multi_agent/base.py +40 -27
- langchain/agents/openai_tools/base.py +9 -8
- langchain/agents/output_parsers/__init__.py +3 -3
- langchain/agents/output_parsers/json.py +8 -6
- langchain/agents/output_parsers/openai_functions.py +24 -9
- langchain/agents/output_parsers/openai_tools.py +16 -4
- langchain/agents/output_parsers/react_json_single_input.py +13 -5
- langchain/agents/output_parsers/react_single_input.py +18 -11
- langchain/agents/output_parsers/self_ask.py +5 -2
- langchain/agents/output_parsers/tools.py +32 -13
- langchain/agents/output_parsers/xml.py +102 -28
- langchain/agents/react/agent.py +5 -4
- langchain/agents/react/base.py +26 -17
- langchain/agents/react/output_parser.py +7 -6
- langchain/agents/react/textworld_prompt.py +0 -1
- langchain/agents/react/wiki_prompt.py +14 -15
- langchain/agents/schema.py +5 -2
- langchain/agents/self_ask_with_search/base.py +23 -15
- langchain/agents/self_ask_with_search/prompt.py +0 -1
- langchain/agents/structured_chat/base.py +19 -11
- langchain/agents/structured_chat/output_parser.py +29 -18
- langchain/agents/structured_chat/prompt.py +3 -4
- langchain/agents/tool_calling_agent/base.py +8 -6
- langchain/agents/tools.py +5 -2
- langchain/agents/utils.py +2 -3
- langchain/agents/xml/base.py +12 -6
- langchain/agents/xml/prompt.py +1 -2
- langchain/cache.py +12 -12
- langchain/callbacks/__init__.py +11 -11
- langchain/callbacks/aim_callback.py +2 -2
- langchain/callbacks/argilla_callback.py +1 -1
- langchain/callbacks/arize_callback.py +1 -1
- langchain/callbacks/arthur_callback.py +1 -1
- langchain/callbacks/base.py +7 -7
- langchain/callbacks/clearml_callback.py +1 -1
- langchain/callbacks/comet_ml_callback.py +1 -1
- langchain/callbacks/confident_callback.py +1 -1
- langchain/callbacks/context_callback.py +1 -1
- langchain/callbacks/flyte_callback.py +1 -1
- langchain/callbacks/human.py +2 -2
- langchain/callbacks/infino_callback.py +1 -1
- langchain/callbacks/labelstudio_callback.py +1 -1
- langchain/callbacks/llmonitor_callback.py +1 -1
- langchain/callbacks/manager.py +5 -5
- langchain/callbacks/mlflow_callback.py +2 -2
- langchain/callbacks/openai_info.py +1 -1
- langchain/callbacks/promptlayer_callback.py +1 -1
- langchain/callbacks/sagemaker_callback.py +1 -1
- langchain/callbacks/streaming_aiter.py +17 -3
- langchain/callbacks/streaming_aiter_final_only.py +16 -5
- langchain/callbacks/streaming_stdout_final_only.py +10 -3
- langchain/callbacks/streamlit/__init__.py +3 -2
- langchain/callbacks/streamlit/mutable_expander.py +1 -1
- langchain/callbacks/streamlit/streamlit_callback_handler.py +3 -3
- langchain/callbacks/tracers/__init__.py +1 -1
- langchain/callbacks/tracers/comet.py +1 -1
- langchain/callbacks/tracers/evaluation.py +1 -1
- langchain/callbacks/tracers/log_stream.py +1 -1
- langchain/callbacks/tracers/logging.py +12 -1
- langchain/callbacks/tracers/stdout.py +1 -1
- langchain/callbacks/trubrics_callback.py +1 -1
- langchain/callbacks/utils.py +4 -4
- langchain/callbacks/wandb_callback.py +1 -1
- langchain/callbacks/whylabs_callback.py +1 -1
- langchain/chains/api/base.py +41 -23
- langchain/chains/api/news_docs.py +1 -2
- langchain/chains/api/open_meteo_docs.py +1 -2
- langchain/chains/api/openapi/requests_chain.py +1 -1
- langchain/chains/api/openapi/response_chain.py +1 -1
- langchain/chains/api/podcast_docs.py +1 -2
- langchain/chains/api/prompt.py +1 -2
- langchain/chains/api/tmdb_docs.py +1 -2
- langchain/chains/base.py +96 -56
- langchain/chains/chat_vector_db/prompts.py +2 -3
- langchain/chains/combine_documents/__init__.py +1 -1
- langchain/chains/combine_documents/base.py +30 -11
- langchain/chains/combine_documents/map_reduce.py +41 -30
- langchain/chains/combine_documents/map_rerank.py +39 -24
- langchain/chains/combine_documents/reduce.py +48 -26
- langchain/chains/combine_documents/refine.py +27 -17
- langchain/chains/combine_documents/stuff.py +24 -13
- langchain/chains/constitutional_ai/base.py +11 -4
- langchain/chains/constitutional_ai/principles.py +22 -25
- langchain/chains/constitutional_ai/prompts.py +25 -28
- langchain/chains/conversation/base.py +9 -4
- langchain/chains/conversation/memory.py +5 -5
- langchain/chains/conversation/prompt.py +5 -5
- langchain/chains/conversational_retrieval/base.py +108 -79
- langchain/chains/conversational_retrieval/prompts.py +2 -3
- langchain/chains/elasticsearch_database/base.py +10 -10
- langchain/chains/elasticsearch_database/prompts.py +2 -3
- langchain/chains/ernie_functions/__init__.py +2 -2
- langchain/chains/example_generator.py +3 -1
- langchain/chains/flare/base.py +28 -12
- langchain/chains/flare/prompts.py +2 -0
- langchain/chains/graph_qa/cypher.py +2 -2
- langchain/chains/graph_qa/falkordb.py +1 -1
- langchain/chains/graph_qa/gremlin.py +1 -1
- langchain/chains/graph_qa/neptune_sparql.py +1 -1
- langchain/chains/graph_qa/prompts.py +2 -2
- langchain/chains/history_aware_retriever.py +2 -1
- langchain/chains/hyde/base.py +6 -5
- langchain/chains/hyde/prompts.py +5 -6
- langchain/chains/llm.py +82 -61
- langchain/chains/llm_bash/__init__.py +3 -2
- langchain/chains/llm_checker/base.py +19 -6
- langchain/chains/llm_checker/prompt.py +3 -4
- langchain/chains/llm_math/base.py +25 -10
- langchain/chains/llm_math/prompt.py +1 -2
- langchain/chains/llm_summarization_checker/base.py +22 -7
- langchain/chains/llm_symbolic_math/__init__.py +3 -2
- langchain/chains/loading.py +155 -97
- langchain/chains/mapreduce.py +4 -3
- langchain/chains/moderation.py +11 -9
- langchain/chains/natbot/base.py +11 -9
- langchain/chains/natbot/crawler.py +102 -76
- langchain/chains/natbot/prompt.py +2 -3
- langchain/chains/openai_functions/__init__.py +7 -7
- langchain/chains/openai_functions/base.py +15 -10
- langchain/chains/openai_functions/citation_fuzzy_match.py +21 -11
- langchain/chains/openai_functions/extraction.py +19 -19
- langchain/chains/openai_functions/openapi.py +39 -35
- langchain/chains/openai_functions/qa_with_structure.py +22 -15
- langchain/chains/openai_functions/tagging.py +4 -4
- langchain/chains/openai_tools/extraction.py +7 -8
- langchain/chains/qa_generation/base.py +8 -3
- langchain/chains/qa_generation/prompt.py +5 -5
- langchain/chains/qa_with_sources/base.py +17 -6
- langchain/chains/qa_with_sources/loading.py +16 -8
- langchain/chains/qa_with_sources/map_reduce_prompt.py +8 -9
- langchain/chains/qa_with_sources/refine_prompts.py +0 -1
- langchain/chains/qa_with_sources/retrieval.py +15 -6
- langchain/chains/qa_with_sources/stuff_prompt.py +6 -7
- langchain/chains/qa_with_sources/vector_db.py +21 -8
- langchain/chains/query_constructor/base.py +37 -34
- langchain/chains/query_constructor/ir.py +4 -4
- langchain/chains/query_constructor/parser.py +101 -34
- langchain/chains/query_constructor/prompt.py +5 -6
- langchain/chains/question_answering/chain.py +21 -10
- langchain/chains/question_answering/map_reduce_prompt.py +14 -14
- langchain/chains/question_answering/map_rerank_prompt.py +3 -3
- langchain/chains/question_answering/refine_prompts.py +2 -5
- langchain/chains/question_answering/stuff_prompt.py +5 -5
- langchain/chains/retrieval.py +1 -3
- langchain/chains/retrieval_qa/base.py +38 -27
- langchain/chains/retrieval_qa/prompt.py +1 -2
- langchain/chains/router/__init__.py +3 -3
- langchain/chains/router/base.py +38 -22
- langchain/chains/router/embedding_router.py +15 -8
- langchain/chains/router/llm_router.py +23 -20
- langchain/chains/router/multi_prompt.py +5 -2
- langchain/chains/router/multi_retrieval_qa.py +28 -5
- langchain/chains/sequential.py +30 -18
- langchain/chains/sql_database/prompt.py +14 -16
- langchain/chains/sql_database/query.py +7 -5
- langchain/chains/structured_output/__init__.py +1 -1
- langchain/chains/structured_output/base.py +77 -67
- langchain/chains/summarize/chain.py +11 -5
- langchain/chains/summarize/map_reduce_prompt.py +0 -1
- langchain/chains/summarize/stuff_prompt.py +0 -1
- langchain/chains/transform.py +9 -6
- langchain/chat_loaders/facebook_messenger.py +1 -1
- langchain/chat_loaders/langsmith.py +1 -1
- langchain/chat_loaders/utils.py +3 -3
- langchain/chat_models/__init__.py +20 -19
- langchain/chat_models/anthropic.py +1 -1
- langchain/chat_models/azureml_endpoint.py +1 -1
- langchain/chat_models/baidu_qianfan_endpoint.py +1 -1
- langchain/chat_models/base.py +213 -139
- langchain/chat_models/bedrock.py +1 -1
- langchain/chat_models/fake.py +1 -1
- langchain/chat_models/meta.py +1 -1
- langchain/chat_models/pai_eas_endpoint.py +1 -1
- langchain/chat_models/promptlayer_openai.py +1 -1
- langchain/chat_models/volcengine_maas.py +1 -1
- langchain/docstore/base.py +1 -1
- langchain/document_loaders/__init__.py +9 -9
- langchain/document_loaders/airbyte.py +3 -3
- langchain/document_loaders/assemblyai.py +1 -1
- langchain/document_loaders/azure_blob_storage_container.py +1 -1
- langchain/document_loaders/azure_blob_storage_file.py +1 -1
- langchain/document_loaders/baiducloud_bos_file.py +1 -1
- langchain/document_loaders/base.py +1 -1
- langchain/document_loaders/blob_loaders/__init__.py +1 -1
- langchain/document_loaders/blob_loaders/schema.py +1 -4
- langchain/document_loaders/blockchain.py +1 -1
- langchain/document_loaders/chatgpt.py +1 -1
- langchain/document_loaders/college_confidential.py +1 -1
- langchain/document_loaders/confluence.py +1 -1
- langchain/document_loaders/email.py +1 -1
- langchain/document_loaders/facebook_chat.py +1 -1
- langchain/document_loaders/markdown.py +1 -1
- langchain/document_loaders/notebook.py +1 -1
- langchain/document_loaders/org_mode.py +1 -1
- langchain/document_loaders/parsers/__init__.py +1 -1
- langchain/document_loaders/parsers/docai.py +1 -1
- langchain/document_loaders/parsers/generic.py +1 -1
- langchain/document_loaders/parsers/html/__init__.py +1 -1
- langchain/document_loaders/parsers/html/bs4.py +1 -1
- langchain/document_loaders/parsers/language/cobol.py +1 -1
- langchain/document_loaders/parsers/language/python.py +1 -1
- langchain/document_loaders/parsers/msword.py +1 -1
- langchain/document_loaders/parsers/pdf.py +5 -5
- langchain/document_loaders/parsers/registry.py +1 -1
- langchain/document_loaders/pdf.py +8 -8
- langchain/document_loaders/powerpoint.py +1 -1
- langchain/document_loaders/pyspark_dataframe.py +1 -1
- langchain/document_loaders/telegram.py +2 -2
- langchain/document_loaders/tencent_cos_directory.py +1 -1
- langchain/document_loaders/unstructured.py +5 -5
- langchain/document_loaders/url_playwright.py +1 -1
- langchain/document_loaders/whatsapp_chat.py +1 -1
- langchain/document_loaders/youtube.py +2 -2
- langchain/document_transformers/__init__.py +3 -3
- langchain/document_transformers/beautiful_soup_transformer.py +1 -1
- langchain/document_transformers/doctran_text_extract.py +1 -1
- langchain/document_transformers/doctran_text_qa.py +1 -1
- langchain/document_transformers/doctran_text_translate.py +1 -1
- langchain/document_transformers/embeddings_redundant_filter.py +3 -3
- langchain/document_transformers/google_translate.py +1 -1
- langchain/document_transformers/html2text.py +1 -1
- langchain/document_transformers/nuclia_text_transform.py +1 -1
- langchain/embeddings/__init__.py +5 -5
- langchain/embeddings/base.py +35 -24
- langchain/embeddings/cache.py +37 -32
- langchain/embeddings/fake.py +1 -1
- langchain/embeddings/huggingface.py +2 -2
- langchain/evaluation/__init__.py +22 -22
- langchain/evaluation/agents/trajectory_eval_chain.py +26 -25
- langchain/evaluation/agents/trajectory_eval_prompt.py +6 -9
- langchain/evaluation/comparison/__init__.py +1 -1
- langchain/evaluation/comparison/eval_chain.py +21 -13
- langchain/evaluation/comparison/prompt.py +1 -2
- langchain/evaluation/criteria/__init__.py +1 -1
- langchain/evaluation/criteria/eval_chain.py +23 -11
- langchain/evaluation/criteria/prompt.py +2 -3
- langchain/evaluation/embedding_distance/base.py +34 -20
- langchain/evaluation/exact_match/base.py +14 -1
- langchain/evaluation/loading.py +16 -11
- langchain/evaluation/parsing/base.py +20 -4
- langchain/evaluation/parsing/json_distance.py +24 -10
- langchain/evaluation/parsing/json_schema.py +13 -12
- langchain/evaluation/qa/__init__.py +1 -1
- langchain/evaluation/qa/eval_chain.py +20 -5
- langchain/evaluation/qa/eval_prompt.py +7 -8
- langchain/evaluation/qa/generate_chain.py +4 -1
- langchain/evaluation/qa/generate_prompt.py +2 -4
- langchain/evaluation/regex_match/base.py +9 -1
- langchain/evaluation/schema.py +38 -30
- langchain/evaluation/scoring/__init__.py +1 -1
- langchain/evaluation/scoring/eval_chain.py +23 -15
- langchain/evaluation/scoring/prompt.py +0 -1
- langchain/evaluation/string_distance/base.py +20 -9
- langchain/globals.py +12 -11
- langchain/graphs/__init__.py +6 -6
- langchain/graphs/graph_document.py +1 -1
- langchain/graphs/networkx_graph.py +2 -2
- langchain/hub.py +9 -11
- langchain/indexes/__init__.py +3 -3
- langchain/indexes/_sql_record_manager.py +63 -46
- langchain/indexes/prompts/entity_extraction.py +1 -2
- langchain/indexes/prompts/entity_summarization.py +1 -2
- langchain/indexes/prompts/knowledge_triplet_extraction.py +1 -3
- langchain/indexes/vectorstore.py +35 -19
- langchain/llms/__init__.py +13 -13
- langchain/llms/ai21.py +1 -1
- langchain/llms/azureml_endpoint.py +4 -4
- langchain/llms/base.py +15 -7
- langchain/llms/bedrock.py +1 -1
- langchain/llms/cloudflare_workersai.py +1 -1
- langchain/llms/gradient_ai.py +1 -1
- langchain/llms/loading.py +1 -1
- langchain/llms/openai.py +1 -1
- langchain/llms/sagemaker_endpoint.py +1 -1
- langchain/load/dump.py +1 -1
- langchain/load/load.py +1 -1
- langchain/load/serializable.py +3 -3
- langchain/memory/__init__.py +3 -3
- langchain/memory/buffer.py +14 -7
- langchain/memory/buffer_window.py +2 -0
- langchain/memory/chat_memory.py +14 -8
- langchain/memory/chat_message_histories/__init__.py +1 -1
- langchain/memory/chat_message_histories/astradb.py +1 -1
- langchain/memory/chat_message_histories/cassandra.py +1 -1
- langchain/memory/chat_message_histories/cosmos_db.py +1 -1
- langchain/memory/chat_message_histories/dynamodb.py +1 -1
- langchain/memory/chat_message_histories/elasticsearch.py +1 -1
- langchain/memory/chat_message_histories/file.py +1 -1
- langchain/memory/chat_message_histories/firestore.py +1 -1
- langchain/memory/chat_message_histories/momento.py +1 -1
- langchain/memory/chat_message_histories/mongodb.py +1 -1
- langchain/memory/chat_message_histories/neo4j.py +1 -1
- langchain/memory/chat_message_histories/postgres.py +1 -1
- langchain/memory/chat_message_histories/redis.py +1 -1
- langchain/memory/chat_message_histories/rocksetdb.py +1 -1
- langchain/memory/chat_message_histories/singlestoredb.py +1 -1
- langchain/memory/chat_message_histories/streamlit.py +1 -1
- langchain/memory/chat_message_histories/upstash_redis.py +1 -1
- langchain/memory/chat_message_histories/xata.py +1 -1
- langchain/memory/chat_message_histories/zep.py +1 -1
- langchain/memory/combined.py +14 -13
- langchain/memory/entity.py +131 -61
- langchain/memory/prompt.py +10 -11
- langchain/memory/readonly.py +0 -2
- langchain/memory/simple.py +4 -3
- langchain/memory/summary.py +43 -11
- langchain/memory/summary_buffer.py +20 -8
- langchain/memory/token_buffer.py +2 -0
- langchain/memory/utils.py +3 -2
- langchain/memory/vectorstore.py +12 -5
- langchain/memory/vectorstore_token_buffer_memory.py +5 -5
- langchain/model_laboratory.py +12 -11
- langchain/output_parsers/__init__.py +4 -4
- langchain/output_parsers/boolean.py +7 -4
- langchain/output_parsers/combining.py +14 -7
- langchain/output_parsers/datetime.py +32 -31
- langchain/output_parsers/enum.py +10 -4
- langchain/output_parsers/fix.py +60 -53
- langchain/output_parsers/format_instructions.py +6 -8
- langchain/output_parsers/json.py +2 -2
- langchain/output_parsers/list.py +2 -2
- langchain/output_parsers/loading.py +9 -9
- langchain/output_parsers/openai_functions.py +3 -3
- langchain/output_parsers/openai_tools.py +1 -1
- langchain/output_parsers/pandas_dataframe.py +59 -48
- langchain/output_parsers/prompts.py +1 -2
- langchain/output_parsers/rail_parser.py +1 -1
- langchain/output_parsers/regex.py +9 -8
- langchain/output_parsers/regex_dict.py +7 -10
- langchain/output_parsers/retry.py +99 -80
- langchain/output_parsers/structured.py +21 -6
- langchain/output_parsers/yaml.py +19 -11
- langchain/prompts/__init__.py +5 -3
- langchain/prompts/base.py +5 -5
- langchain/prompts/chat.py +8 -8
- langchain/prompts/example_selector/__init__.py +3 -1
- langchain/prompts/example_selector/semantic_similarity.py +2 -2
- langchain/prompts/few_shot.py +1 -1
- langchain/prompts/loading.py +3 -3
- langchain/prompts/prompt.py +1 -1
- langchain/pydantic_v1/__init__.py +1 -1
- langchain/retrievers/__init__.py +5 -5
- langchain/retrievers/bedrock.py +2 -2
- langchain/retrievers/bm25.py +1 -1
- langchain/retrievers/contextual_compression.py +14 -8
- langchain/retrievers/docarray.py +1 -1
- langchain/retrievers/document_compressors/__init__.py +5 -4
- langchain/retrievers/document_compressors/base.py +12 -6
- langchain/retrievers/document_compressors/chain_extract.py +5 -3
- langchain/retrievers/document_compressors/chain_extract_prompt.py +2 -3
- langchain/retrievers/document_compressors/chain_filter.py +9 -9
- langchain/retrievers/document_compressors/chain_filter_prompt.py +1 -2
- langchain/retrievers/document_compressors/cohere_rerank.py +17 -15
- langchain/retrievers/document_compressors/cross_encoder_rerank.py +2 -0
- langchain/retrievers/document_compressors/embeddings_filter.py +24 -17
- langchain/retrievers/document_compressors/flashrank_rerank.py +1 -1
- langchain/retrievers/document_compressors/listwise_rerank.py +8 -5
- langchain/retrievers/ensemble.py +30 -27
- langchain/retrievers/google_cloud_documentai_warehouse.py +1 -1
- langchain/retrievers/google_vertex_ai_search.py +2 -2
- langchain/retrievers/kendra.py +10 -10
- langchain/retrievers/llama_index.py +1 -1
- langchain/retrievers/merger_retriever.py +11 -11
- langchain/retrievers/milvus.py +1 -1
- langchain/retrievers/multi_query.py +35 -27
- langchain/retrievers/multi_vector.py +24 -9
- langchain/retrievers/parent_document_retriever.py +33 -9
- langchain/retrievers/re_phraser.py +6 -5
- langchain/retrievers/self_query/base.py +157 -127
- langchain/retrievers/time_weighted_retriever.py +21 -7
- langchain/retrievers/zilliz.py +1 -1
- langchain/runnables/hub.py +12 -0
- langchain/runnables/openai_functions.py +12 -2
- langchain/schema/__init__.py +23 -23
- langchain/schema/cache.py +1 -1
- langchain/schema/callbacks/base.py +7 -7
- langchain/schema/callbacks/manager.py +19 -19
- langchain/schema/callbacks/tracers/base.py +1 -1
- langchain/schema/callbacks/tracers/evaluation.py +1 -1
- langchain/schema/callbacks/tracers/langchain.py +1 -1
- langchain/schema/callbacks/tracers/langchain_v1.py +1 -1
- langchain/schema/callbacks/tracers/log_stream.py +1 -1
- langchain/schema/callbacks/tracers/schemas.py +8 -8
- langchain/schema/callbacks/tracers/stdout.py +3 -3
- langchain/schema/document.py +1 -1
- langchain/schema/language_model.py +2 -2
- langchain/schema/messages.py +12 -12
- langchain/schema/output.py +3 -3
- langchain/schema/output_parser.py +3 -3
- langchain/schema/runnable/__init__.py +3 -3
- langchain/schema/runnable/base.py +9 -9
- langchain/schema/runnable/config.py +5 -5
- langchain/schema/runnable/configurable.py +1 -1
- langchain/schema/runnable/history.py +1 -1
- langchain/schema/runnable/passthrough.py +1 -1
- langchain/schema/runnable/utils.py +16 -16
- langchain/schema/vectorstore.py +1 -1
- langchain/smith/__init__.py +2 -1
- langchain/smith/evaluation/__init__.py +2 -2
- langchain/smith/evaluation/config.py +9 -23
- langchain/smith/evaluation/name_generation.py +3 -3
- langchain/smith/evaluation/progress.py +22 -4
- langchain/smith/evaluation/runner_utils.py +416 -247
- langchain/smith/evaluation/string_run_evaluator.py +102 -68
- langchain/storage/__init__.py +2 -2
- langchain/storage/_lc_store.py +4 -2
- langchain/storage/encoder_backed.py +7 -2
- langchain/storage/file_system.py +19 -16
- langchain/storage/in_memory.py +1 -1
- langchain/storage/upstash_redis.py +1 -1
- langchain/text_splitter.py +15 -15
- langchain/tools/__init__.py +28 -26
- langchain/tools/ainetwork/app.py +1 -1
- langchain/tools/ainetwork/base.py +1 -1
- langchain/tools/ainetwork/owner.py +1 -1
- langchain/tools/ainetwork/rule.py +1 -1
- langchain/tools/ainetwork/transfer.py +1 -1
- langchain/tools/ainetwork/value.py +1 -1
- langchain/tools/amadeus/closest_airport.py +1 -1
- langchain/tools/amadeus/flight_search.py +1 -1
- langchain/tools/azure_cognitive_services/__init__.py +1 -1
- langchain/tools/base.py +4 -4
- langchain/tools/bearly/tool.py +1 -1
- langchain/tools/bing_search/__init__.py +1 -1
- langchain/tools/bing_search/tool.py +1 -1
- langchain/tools/dataforseo_api_search/__init__.py +1 -1
- langchain/tools/dataforseo_api_search/tool.py +1 -1
- langchain/tools/ddg_search/tool.py +1 -1
- langchain/tools/e2b_data_analysis/tool.py +2 -2
- langchain/tools/edenai/__init__.py +1 -1
- langchain/tools/file_management/__init__.py +1 -1
- langchain/tools/file_management/copy.py +1 -1
- langchain/tools/file_management/delete.py +1 -1
- langchain/tools/gmail/__init__.py +2 -2
- langchain/tools/gmail/get_message.py +1 -1
- langchain/tools/gmail/search.py +1 -1
- langchain/tools/gmail/send_message.py +1 -1
- langchain/tools/google_finance/__init__.py +1 -1
- langchain/tools/google_finance/tool.py +1 -1
- langchain/tools/google_scholar/__init__.py +1 -1
- langchain/tools/google_scholar/tool.py +1 -1
- langchain/tools/google_search/__init__.py +1 -1
- langchain/tools/google_search/tool.py +1 -1
- langchain/tools/google_serper/__init__.py +1 -1
- langchain/tools/google_serper/tool.py +1 -1
- langchain/tools/google_trends/__init__.py +1 -1
- langchain/tools/google_trends/tool.py +1 -1
- langchain/tools/jira/tool.py +20 -1
- langchain/tools/json/tool.py +25 -3
- langchain/tools/memorize/tool.py +1 -1
- langchain/tools/multion/__init__.py +1 -1
- langchain/tools/multion/update_session.py +1 -1
- langchain/tools/office365/__init__.py +2 -2
- langchain/tools/office365/events_search.py +1 -1
- langchain/tools/office365/messages_search.py +1 -1
- langchain/tools/office365/send_event.py +1 -1
- langchain/tools/office365/send_message.py +1 -1
- langchain/tools/openapi/utils/api_models.py +6 -6
- langchain/tools/playwright/__init__.py +5 -5
- langchain/tools/playwright/click.py +1 -1
- langchain/tools/playwright/extract_hyperlinks.py +1 -1
- langchain/tools/playwright/get_elements.py +1 -1
- langchain/tools/playwright/navigate.py +1 -1
- langchain/tools/plugin.py +2 -2
- langchain/tools/powerbi/tool.py +1 -1
- langchain/tools/python/__init__.py +3 -2
- langchain/tools/reddit_search/tool.py +1 -1
- langchain/tools/render.py +2 -2
- langchain/tools/requests/tool.py +2 -2
- langchain/tools/searchapi/tool.py +1 -1
- langchain/tools/searx_search/tool.py +1 -1
- langchain/tools/slack/get_message.py +1 -1
- langchain/tools/spark_sql/tool.py +1 -1
- langchain/tools/sql_database/tool.py +1 -1
- langchain/tools/tavily_search/__init__.py +1 -1
- langchain/tools/tavily_search/tool.py +1 -1
- langchain/tools/zapier/__init__.py +1 -1
- langchain/tools/zapier/tool.py +24 -2
- langchain/utilities/__init__.py +4 -4
- langchain/utilities/arcee.py +4 -4
- langchain/utilities/clickup.py +4 -4
- langchain/utilities/dalle_image_generator.py +1 -1
- langchain/utilities/dataforseo_api_search.py +1 -1
- langchain/utilities/opaqueprompts.py +1 -1
- langchain/utilities/reddit_search.py +1 -1
- langchain/utilities/sql_database.py +1 -1
- langchain/utilities/tavily_search.py +1 -1
- langchain/utilities/vertexai.py +2 -2
- langchain/utils/__init__.py +1 -1
- langchain/utils/aiter.py +1 -1
- langchain/utils/html.py +3 -3
- langchain/utils/input.py +1 -1
- langchain/utils/iter.py +1 -1
- langchain/utils/json_schema.py +1 -3
- langchain/utils/strings.py +1 -1
- langchain/utils/utils.py +6 -6
- langchain/vectorstores/__init__.py +5 -5
- langchain/vectorstores/alibabacloud_opensearch.py +1 -1
- langchain/vectorstores/azure_cosmos_db.py +1 -1
- langchain/vectorstores/clickhouse.py +1 -1
- langchain/vectorstores/elastic_vector_search.py +1 -1
- langchain/vectorstores/elasticsearch.py +2 -2
- langchain/vectorstores/myscale.py +1 -1
- langchain/vectorstores/neo4j_vector.py +1 -1
- langchain/vectorstores/pgembedding.py +1 -1
- langchain/vectorstores/qdrant.py +1 -1
- langchain/vectorstores/redis/__init__.py +1 -1
- langchain/vectorstores/redis/base.py +1 -1
- langchain/vectorstores/redis/filters.py +4 -4
- langchain/vectorstores/redis/schema.py +6 -6
- langchain/vectorstores/sklearn.py +2 -2
- langchain/vectorstores/starrocks.py +1 -1
- langchain/vectorstores/utils.py +1 -1
- {langchain-0.3.26.dist-info → langchain-0.4.0.dev0.dist-info}/METADATA +4 -14
- {langchain-0.3.26.dist-info → langchain-0.4.0.dev0.dist-info}/RECORD +590 -591
- {langchain-0.3.26.dist-info → langchain-0.4.0.dev0.dist-info}/WHEEL +1 -1
- langchain/smith/evaluation/utils.py +0 -0
- {langchain-0.3.26.dist-info → langchain-0.4.0.dev0.dist-info}/entry_points.txt +0 -0
- {langchain-0.3.26.dist-info → langchain-0.4.0.dev0.dist-info}/licenses/LICENSE +0 -0
langchain/smith/evaluation/runner_utils.py

@@ -98,10 +98,8 @@ class TestResult(dict):
         to_drop = [
             col
             for col in df.columns
-            if col.startswith("inputs.")
-            or col.startswith("outputs.")
+            if col.startswith(("inputs.", "outputs.", "reference"))
             or col in {"input", "output"}
-            or col.startswith("reference")
         ]
         return df.describe(include="all").drop(to_drop, axis=1)
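The first hunk collapses three prefix checks into one: `str.startswith` accepts a tuple of prefixes. A minimal sketch with made-up column names:

```python
# str.startswith accepts a tuple, so one call covers several prefixes.
cols = ["inputs.question", "outputs.answer", "reference", "execution_time"]
to_drop = [c for c in cols if c.startswith(("inputs.", "outputs.", "reference"))]
assert to_drop == ["inputs.question", "outputs.answer", "reference"]
```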
@@ -110,10 +108,11 @@ class TestResult(dict):
         try:
             import pandas as pd
         except ImportError as e:
-            raise ImportError(
+            msg = (
                 "Pandas is required to convert the results to a dataframe."
                 " to install pandas, run `pip install pandas`."
-            ) from e
+            )
+            raise ImportError(msg) from e

         indices = []
         records = []
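This hunk introduces the error-handling shape that recurs throughout the release: build the message in a local `msg`, then raise, chaining the original exception with `from`. A self-contained sketch of the pattern:

```python
try:
    import pandas as pd  # noqa: F401
except ImportError as e:
    # Assigning the message first keeps the raise statement short and
    # satisfies lint rules (e.g. ruff EM101/TRY003) that discourage
    # long literals inside raise calls; `from e` preserves the cause.
    msg = "Pandas is required to convert the results to a dataframe."
    raise ImportError(msg) from e
```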
@@ -134,7 +133,7 @@ class TestResult(dict):
             if "reference" in result:
                 if isinstance(result["reference"], dict):
                     r.update(
-                        {f"reference.{k}": v for k, v in result["reference"].items()}
+                        {f"reference.{k}": v for k, v in result["reference"].items()},
                     )
                 else:
                     r["reference"] = result["reference"]
@@ -144,7 +143,7 @@ class TestResult(dict):
                     "error": result.get("Error"),
                     "execution_time": result["execution_time"],
                     "run_id": result.get("run_id"),
-                }
+                },
             )
             records.append(r)
             indices.append(example_id)
@@ -156,13 +155,29 @@ class EvalError(dict):
     """Your architecture raised an error."""

     def __init__(self, Error: BaseException, **kwargs: Any) -> None:
+        """Initialize the EvalError with an error and additional attributes.
+
+        Args:
+            Error: The error that occurred.
+            **kwargs: Additional attributes to include in the error.
+        """
         super().__init__(Error=Error, **kwargs)

     def __getattr__(self, name: str) -> Any:
+        """Get an attribute from the EvalError.
+
+        Args:
+            name: The name of the attribute to get.
+        Returns:
+            The value of the attribute.
+        Raises:
+            AttributeError: If the attribute does not exist.
+        """
         try:
             return self[name]
-        except KeyError:
-            raise AttributeError(f"'EvalError' object has no attribute '{name}'")
+        except KeyError as e:
+            msg = f"'EvalError' object has no attribute '{name}'"
+            raise AttributeError(msg) from e


 def _wrap_in_chain_factory(
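`EvalError` is a `dict` subclass whose keys double as attributes, which is what the new docstrings describe. A sketch of the same idiom with an illustrative class name:

```python
from typing import Any


class AttrDict(dict):
    """Dict whose keys are also readable as attributes."""

    def __getattr__(self, name: str) -> Any:
        try:
            return self[name]
        except KeyError as e:
            # Raise AttributeError so hasattr() and friends behave normally.
            msg = f"{type(self).__name__!r} object has no attribute {name!r}"
            raise AttributeError(msg) from e


row = AttrDict(Error=ValueError("boom"), execution_time=1.2)
assert isinstance(row.Error, ValueError)
```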
@@ -176,7 +191,7 @@ def _wrap_in_chain_factory(
         chain_class = chain.__class__.__name__
         if llm_or_chain_factory.memory is not None:
             memory_class = chain.memory.__class__.__name__
-            raise ValueError(
+            msg = (
                 "Cannot directly evaluate a chain with stateful memory."
                 " To evaluate this chain, pass in a chain constructor"
                 " that initializes fresh memory each time it is called."
@@ -189,40 +204,40 @@ def _wrap_in_chain_factory(
                 "(memory=new_memory, ...)\n\n"
                 f'run_on_dataset("{dataset_name}", chain_constructor, ...)'
             )
+            raise ValueError(msg)
         return lambda: chain
-    elif isinstance(llm_or_chain_factory, BaseLanguageModel):
+    if isinstance(llm_or_chain_factory, BaseLanguageModel):
         return llm_or_chain_factory
-    elif isinstance(llm_or_chain_factory, Runnable):
+    if isinstance(llm_or_chain_factory, Runnable):
         # Memory may exist here, but it's not elegant to check all those cases.
         lcf = llm_or_chain_factory
         return lambda: lcf
-    elif callable(llm_or_chain_factory):
+    if callable(llm_or_chain_factory):
         if is_traceable_function(llm_or_chain_factory):
-            runnable_ = as_runnable(cast(Callable, llm_or_chain_factory))
+            runnable_ = as_runnable(cast("Callable", llm_or_chain_factory))
             return lambda: runnable_
         try:
             _model = llm_or_chain_factory()  # type: ignore[call-arg]
         except TypeError:
             # It's an arbitrary function, wrap it in a RunnableLambda
-            user_func = cast(Callable, llm_or_chain_factory)
+            user_func = cast("Callable", llm_or_chain_factory)
             sig = inspect.signature(user_func)
-            logger.info(f"Wrapping function {sig} as RunnableLambda.")
+            logger.info("Wrapping function %s as RunnableLambda.", sig)
             wrapped = RunnableLambda(user_func)
             return lambda: wrapped
-        constructor = cast(Callable, llm_or_chain_factory)
+        constructor = cast("Callable", llm_or_chain_factory)
         if isinstance(_model, BaseLanguageModel):
             # It's not uncommon to do an LLM constructor instead of raw LLM,
             # so we'll unpack it for the user.
             return _model
-        elif is_traceable_function(cast(Callable, _model)):
-            runnable_ = as_runnable(cast(Callable, _model))
+        if is_traceable_function(cast("Callable", _model)):
+            runnable_ = as_runnable(cast("Callable", _model))
             return lambda: runnable_
-        elif not isinstance(_model, Runnable):
+        if not isinstance(_model, Runnable):
             # This is unlikely to happen - a constructor for a model function
             return lambda: RunnableLambda(constructor)
-        else:
-            # Typical correct case
-            return constructor
+        # Typical correct case
+        return constructor
     return llm_or_chain_factory
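Several hunks in this file change `cast(Callable, ...)` to `cast("Callable", ...)`. `typing.cast` returns its second argument unchanged at runtime, so a string forward reference works even when the name is only imported under `TYPE_CHECKING`; a small sketch:

```python
from typing import TYPE_CHECKING, cast

if TYPE_CHECKING:
    from collections.abc import Callable  # needed only by the type checker


def as_callable(obj: object) -> "Callable":
    # cast() does no runtime check, so the quoted type never has to
    # exist as a runtime name.
    return cast("Callable", obj)


assert as_callable(print) is print
```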
@@ -238,23 +253,24 @@ def _get_prompt(inputs: dict[str, Any]) -> str:
         InputFormatError: If the input format is invalid.
     """
     if not inputs:
-        raise InputFormatError("Inputs should not be empty.")
+        msg = "Inputs should not be empty."
+        raise InputFormatError(msg)

     prompts = []
     if "prompt" in inputs:
         if not isinstance(inputs["prompt"], str):
-            raise InputFormatError(
-                f"Expected string for 'prompt', got {type(inputs['prompt']).__name__}"
-            )
+            msg = f"Expected string for 'prompt', got {type(inputs['prompt']).__name__}"
+            raise InputFormatError(msg)
         prompts = [inputs["prompt"]]
     elif "prompts" in inputs:
         if not isinstance(inputs["prompts"], list) or not all(
             isinstance(i, str) for i in inputs["prompts"]
         ):
-            raise InputFormatError(
+            msg = (
                 "Expected list of strings for 'prompts',"
                 f" got {type(inputs['prompts']).__name__}"
             )
+            raise InputFormatError(msg)
         prompts = inputs["prompts"]
     elif len(inputs) == 1:
         prompt_ = next(iter(inputs.values()))
@@ -263,17 +279,15 @@ def _get_prompt(inputs: dict[str, Any]) -> str:
         elif isinstance(prompt_, list) and all(isinstance(i, str) for i in prompt_):
             prompts = prompt_
         else:
-            raise InputFormatError(f"LLM Run expects string prompt input. Got {inputs}")
+            msg = f"LLM Run expects string prompt input. Got {inputs}"
+            raise InputFormatError(msg)
     else:
-        raise InputFormatError(
-            f"LLM Run expects 'prompt' or 'prompts' in inputs. Got {inputs}"
-        )
+        msg = f"LLM Run expects 'prompt' or 'prompts' in inputs. Got {inputs}"
+        raise InputFormatError(msg)
     if len(prompts) == 1:
         return prompts[0]
-    else:
-        raise InputFormatError(
-            f"LLM Run expects single prompt input. Got {len(prompts)} prompts."
-        )
+    msg = f"LLM Run expects single prompt input. Got {len(prompts)} prompts."
+    raise InputFormatError(msg)


 class ChatModelInput(TypedDict):
@@ -298,7 +312,8 @@ def _get_messages(inputs: dict[str, Any]) -> dict:
         InputFormatError: If the input format is invalid.
     """
     if not inputs:
-        raise InputFormatError("Inputs should not be empty.")
+        msg = "Inputs should not be empty."
+        raise InputFormatError(msg)
     input_copy = inputs.copy()
     if "messages" in inputs:
         input_copy["input"] = input_copy.pop("messages")
@@ -313,16 +328,17 @@ def _get_messages(inputs: dict[str, Any]) -> dict:
         if len(raw_messages) == 1:
             input_copy["input"] = messages_from_dict(raw_messages[0])
         else:
-            raise InputFormatError(
+            msg = (
                 "Batch messages not supported. Please provide a"
                 " single list of messages."
             )
+            raise InputFormatError(msg)
         return input_copy
-    else:
-        raise InputFormatError(
-            f"Chat Run expects single List[dict] or List[List[dict]] 'messages'"
-            f" input. Got {inputs}"
-        )
+    msg = (
+        f"Chat Run expects single List[dict] or List[List[dict]] 'messages'"
+        f" input. Got {inputs}"
+    )
+    raise InputFormatError(msg)


 ## Shared data validation utilities
@@ -336,20 +352,21 @@ def _validate_example_inputs_for_language_model(
             isinstance(prompt_input, list)
             and all(isinstance(msg, BaseMessage) for msg in prompt_input)
         ):
-            raise InputFormatError(
+            msg = (
                 "When using an input_mapper to prepare dataset example inputs"
                 " for an LLM or chat model, the output must a single string or"
                 " a list of chat messages."
                 f"\nGot: {prompt_input} of type {type(prompt_input)}."
             )
+            raise InputFormatError(msg)
     else:
         try:
             _get_prompt(first_example.inputs or {})
         except InputFormatError:
             try:
                 _get_messages(first_example.inputs or {})
-            except InputFormatError:
-                raise InputFormatError(
+            except InputFormatError as err2:
+                msg = (
                     "Example inputs do not match language model input format. "
                     "Expected a dictionary with messages or a single prompt."
                     f" Got: {first_example.inputs}"
@@ -357,6 +374,7 @@ def _validate_example_inputs_for_language_model(
                     " to convert the example.inputs to a compatible format"
                     " for the llm or chat model you wish to evaluate."
                 )
+                raise InputFormatError(msg) from err2


 def _validate_example_inputs_for_chain(
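The `from err2` added above makes the exception chain explicit: the inner parsing failure is preserved as `__cause__` of the re-raised error instead of appearing as an implicit "during handling" traceback. A sketch with an illustrative function:

```python
def parse_count(raw: str) -> int:
    try:
        return int(raw)
    except ValueError as err2:
        msg = f"Example inputs do not match expected format: {raw!r}"
        # The traceback now shows the original ValueError as the cause.
        raise ValueError(msg) from err2
```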
@@ -369,16 +387,18 @@ def _validate_example_inputs_for_chain(
         first_inputs = input_mapper(first_example.inputs or {})
         missing_keys = set(chain.input_keys).difference(first_inputs)
         if not isinstance(first_inputs, dict):
-            raise InputFormatError(
+            msg = (
                 "When using an input_mapper to prepare dataset example"
                 " inputs for a chain, the mapped value must be a dictionary."
                 f"\nGot: {first_inputs} of type {type(first_inputs)}."
             )
+            raise InputFormatError(msg)
         if missing_keys:
-            raise InputFormatError(
+            msg = (
                 "Missing keys after loading example using input_mapper."
                 f"\nExpected: {chain.input_keys}. Got: {first_inputs.keys()}"
             )
+            raise InputFormatError(msg)
     else:
         first_inputs = first_example.inputs
         missing_keys = set(chain.input_keys).difference(first_inputs)
@@ -387,13 +407,14 @@ def _validate_example_inputs_for_chain(
         # Refrain from calling to validate.
         pass
     elif missing_keys:
-        raise InputFormatError(
+        msg = (
             "Example inputs missing expected chain input keys."
             " Please provide an input_mapper to convert the example.inputs"
             " to a compatible format for the chain you wish to evaluate."
             f"Expected: {chain.input_keys}. "
             f"Got: {first_inputs.keys()}"
         )
+        raise InputFormatError(msg)


 def _validate_example_inputs(
@@ -410,7 +431,7 @@ def _validate_example_inputs(
         # Otherwise it's a runnable
         _validate_example_inputs_for_chain(example, chain, input_mapper)
     elif isinstance(chain, Runnable):
-        logger.debug(f"Skipping input validation for {chain}")
+        logger.debug("Skipping input validation for %s", chain)


 ## Shared Evaluator Setup Utilities
@@ -455,16 +476,19 @@ def _determine_input_key(
         input_key = config.input_key
         if run_inputs and input_key not in run_inputs:
             logger.warning(
-                f"Input key {input_key} not in chain's specified"
-                f" input keys {run_inputs}. Evaluation behavior may be undefined."
+                "Input key %s not in chain's specified input keys %s. "
+                "Evaluation behavior may be undefined.",
+                input_key,
+                run_inputs,
             )
     elif run_inputs and len(run_inputs) == 1:
         input_key = run_inputs[0]
     elif run_inputs is not None and len(run_inputs) > 1:
         logger.warning(
-            f"Chain expects multiple input keys: {run_inputs},"
-            f" Evaluator is likely to fail. Evaluation behavior may be undefined."
-            " Specify an input_key in the RunEvalConfig to avoid this warning."
+            "Chain expects multiple input keys: %s,"
+            " Evaluator is likely to fail. Evaluation behavior may be undefined."
+            " Specify an input_key in the RunEvalConfig to avoid this warning.",
+            run_inputs,
         )

     return input_key
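The logging hunks swap f-strings for `%s` placeholders passed as separate arguments. The `logging` module defers interpolation until a handler actually emits the record, so filtered-out messages cost almost nothing. For example:

```python
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

# Format string and arguments stay separate; logging joins them
# lazily, only if the record passes the level check.
logger.warning("Input key %s not in chain's input keys %s.", "prompt", ["messages"])
```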
@@ -479,16 +503,19 @@ def _determine_prediction_key(
         prediction_key = config.prediction_key
         if run_outputs and prediction_key not in run_outputs:
             logger.warning(
-                f"Prediction key {prediction_key} not in chain's specified"
-                f" output keys {run_outputs}. Evaluation behavior may be undefined."
+                "Prediction key %s not in chain's specified output keys %s. "
+                "Evaluation behavior may be undefined.",
+                prediction_key,
+                run_outputs,
             )
     elif run_outputs and len(run_outputs) == 1:
         prediction_key = run_outputs[0]
     elif run_outputs is not None and len(run_outputs) > 1:
         logger.warning(
-            f"Chain expects multiple output keys: {run_outputs},"
-            f" Evaluation behavior may be undefined. Specify a prediction_key"
-            " in the RunEvalConfig to avoid this warning."
+            "Chain expects multiple output keys: %s,"
+            " Evaluation behavior may be undefined. Specify a prediction_key"
+            " in the RunEvalConfig to avoid this warning.",
+            run_outputs,
         )
     return prediction_key
@@ -500,12 +527,13 @@ def _determine_reference_key(
     if config.reference_key:
         reference_key = config.reference_key
         if example_outputs and reference_key not in example_outputs:
-            raise ValueError(
+            msg = (
                 f"Reference key {reference_key} not in Dataset"
                 f" example outputs: {example_outputs}"
             )
+            raise ValueError(msg)
     elif example_outputs and len(example_outputs) == 1:
-        reference_key = list(example_outputs)[0]
+        reference_key = next(iter(example_outputs))
     else:
         reference_key = None
     return reference_key
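Per the reconstruction above, the old line built a list just to take its first element; `next(iter(...))` fetches the first key directly. For a one-element mapping the two are equivalent:

```python
example_outputs = {"answer": "42"}
assert next(iter(example_outputs)) == list(example_outputs)[0] == "answer"
```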
@@ -544,15 +572,17 @@ def _construct_run_evaluator(
             # Assume we can decorate
             return run_evaluator_dec(eval_config)
         else:
-            raise ValueError(f"Unknown evaluator type: {type(eval_config)}")
+            msg = f"Unknown evaluator type: {type(eval_config)}"
+            raise ValueError(msg)  # noqa: TRY004

     if isinstance(evaluator_, StringEvaluator):
         if evaluator_.requires_reference and reference_key is None:
-            raise ValueError(
+            msg = (
                 f"Must specify reference_key in smith_eval.RunEvalConfig to use"
                 f" evaluator of type {eval_type_tag} with"
                 f" dataset with multiple output keys: {example_outputs}."
             )
+            raise ValueError(msg)
         run_evaluator = smith_eval.StringRunEvaluatorChain.from_run_and_data_type(
             evaluator_,
             run_type,
@@ -563,18 +593,18 @@ def _construct_run_evaluator(
             tags=[eval_type_tag],
         )
     elif isinstance(evaluator_, PairwiseStringEvaluator):
-        raise NotImplementedError(
+        msg = (
             f"Run evaluator for {eval_type_tag} is not implemented."
             " PairwiseStringEvaluators compare the outputs of two different models"
             " rather than the output of a single model."
             " Did you mean to use a StringEvaluator instead?"
             "\nSee: https://python.langchain.com/docs/guides/evaluation/string/"
         )
+        raise NotImplementedError(msg)

     else:
-        raise NotImplementedError(
-            f"Run evaluator for {eval_type_tag} is not implemented"
-        )
+        msg = f"Run evaluator for {eval_type_tag} is not implemented"
+        raise NotImplementedError(msg)
     return run_evaluator
@@ -611,10 +641,13 @@ def _load_run_evaluators(
     input_key, prediction_key, reference_key = None, None, None
     if config.evaluators or (
         config.custom_evaluators
-        and any([isinstance(e, StringEvaluator) for e in config.custom_evaluators])
+        and any(isinstance(e, StringEvaluator) for e in config.custom_evaluators)
     ):
         input_key, prediction_key, reference_key = _get_keys(
-            config, run_inputs, run_outputs, example_outputs
+            config,
+            run_inputs,
+            run_outputs,
+            example_outputs,
         )
     for eval_config in config.evaluators:
         run_evaluator = _construct_run_evaluator(
@@ -641,15 +674,16 @@ def _load_run_evaluators(
                     input_key=input_key,
                     prediction_key=prediction_key,
                     reference_key=reference_key,
-                )
+                ),
             )
         elif callable(custom_evaluator):
             run_evaluators.append(run_evaluator_dec(custom_evaluator))
         else:
-            raise ValueError(
+            msg = (
                 f"Unsupported custom evaluator: {custom_evaluator}."
                 f" Expected RunEvaluator or StringEvaluator."
             )
+            raise ValueError(msg)  # noqa: TRY004

     return run_evaluators
@@ -683,41 +717,45 @@ async def _arun_llm(
     """
     if input_mapper is not None:
         prompt_or_messages = input_mapper(inputs)
-        if (
-            isinstance(prompt_or_messages, str)
-            or isinstance(prompt_or_messages, list)
+        if isinstance(prompt_or_messages, str) or (
+            isinstance(prompt_or_messages, list)
             and all(isinstance(msg, BaseMessage) for msg in prompt_or_messages)
         ):
             return await llm.ainvoke(
                 prompt_or_messages,
                 config=RunnableConfig(
-                    callbacks=callbacks, tags=tags or [], metadata=metadata or {}
+                    callbacks=callbacks,
+                    tags=tags or [],
+                    metadata=metadata or {},
                 ),
             )
-        else:
-            raise InputFormatError(
-                "Input mapper returned invalid format"
-                f" {prompt_or_messages}"
-                "\nExpected a single string or list of chat messages."
-            )
+        msg = (
+            "Input mapper returned invalid format"
+            f" {prompt_or_messages}"
+            "\nExpected a single string or list of chat messages."
+        )
+        raise InputFormatError(msg)

-    else:
-        try:
-            prompt = _get_prompt(inputs)
-            llm_output: Union[str, BaseMessage] = await llm.ainvoke(
-                prompt,
-                config=RunnableConfig(
-                    callbacks=callbacks, tags=tags or [], metadata=metadata or {}
-                ),
-            )
-        except InputFormatError:
-            llm_inputs = _get_messages(inputs)
-            llm_output = await llm.ainvoke(
-                **llm_inputs,
-                config=RunnableConfig(
-                    callbacks=callbacks, tags=tags or [], metadata=metadata or {}
-                ),
-            )
+    try:
+        prompt = _get_prompt(inputs)
+        llm_output: Union[str, BaseMessage] = await llm.ainvoke(
+            prompt,
+            config=RunnableConfig(
+                callbacks=callbacks,
+                tags=tags or [],
+                metadata=metadata or {},
+            ),
+        )
+    except InputFormatError:
+        llm_inputs = _get_messages(inputs)
+        llm_output = await llm.ainvoke(
+            **llm_inputs,
+            config=RunnableConfig(
+                callbacks=callbacks,
+                tags=tags or [],
+                metadata=metadata or {},
+            ),
+        )
     return llm_output
@@ -742,12 +780,16 @@ async def _arun_chain(
         output = await chain.ainvoke(
             val,
             config=RunnableConfig(
-                callbacks=callbacks, tags=tags or [], metadata=metadata or {}
+                callbacks=callbacks,
+                tags=tags or [],
+                metadata=metadata or {},
             ),
         )
     else:
         runnable_config = RunnableConfig(
-            tags=tags or [], callbacks=callbacks, metadata=metadata or {}
+            tags=tags or [],
+            callbacks=callbacks,
+            metadata=metadata or {},
         )
         output = await chain.ainvoke(inputs_, config=runnable_config)
     return output
@@ -799,9 +841,11 @@ async def _arun_llm_or_chain(
         result = output
     except Exception as e:
         logger.warning(
-            f"{chain_or_llm} failed for example {example.id} "
-            f"with inputs {example.inputs}"
-            f"\n{e}"
+            "%s failed for example %s with inputs %s\n%s",
+            chain_or_llm,
+            example.id,
+            example.inputs,
+            e,
         )
         result = EvalError(Error=e)
     return result
@@ -837,30 +881,34 @@ def _run_llm(
     # Most of this is legacy code; we could probably remove a lot of it.
     if input_mapper is not None:
         prompt_or_messages = input_mapper(inputs)
-        if (
-            isinstance(prompt_or_messages, str)
-            or isinstance(prompt_or_messages, list)
+        if isinstance(prompt_or_messages, str) or (
+            isinstance(prompt_or_messages, list)
             and all(isinstance(msg, BaseMessage) for msg in prompt_or_messages)
         ):
             llm_output: Union[str, BaseMessage] = llm.invoke(
                 prompt_or_messages,
                 config=RunnableConfig(
-                    callbacks=callbacks, tags=tags or [], metadata=metadata or {}
+                    callbacks=callbacks,
+                    tags=tags or [],
+                    metadata=metadata or {},
                 ),
             )
         else:
-            raise InputFormatError(
+            msg = (
                 "Input mapper returned invalid format: "
                 f" {prompt_or_messages}"
                 "\nExpected a single string or list of chat messages."
             )
+            raise InputFormatError(msg)
     else:
         try:
             llm_prompts = _get_prompt(inputs)
             llm_output = llm.invoke(
                 llm_prompts,
                 config=RunnableConfig(
-                    callbacks=callbacks, tags=tags or [], metadata=metadata or {}
+                    callbacks=callbacks,
+                    tags=tags or [],
+                    metadata=metadata or {},
                 ),
             )
         except InputFormatError:
@@ -893,12 +941,16 @@ def _run_chain(
         output = chain.invoke(
             val,
             config=RunnableConfig(
-                callbacks=callbacks, tags=tags or [], metadata=metadata or {}
+                callbacks=callbacks,
+                tags=tags or [],
+                metadata=metadata or {},
            ),
         )
     else:
         runnable_config = RunnableConfig(
-            tags=tags or [], callbacks=callbacks, metadata=metadata or {}
+            tags=tags or [],
+            callbacks=callbacks,
+            metadata=metadata or {},
         )
         output = chain.invoke(inputs_, config=runnable_config)
     return output
@@ -952,9 +1004,12 @@ def _run_llm_or_chain(
     except Exception as e:
         error_type = type(e).__name__
         logger.warning(
-            f"{chain_or_llm} failed for example {example.id} "
-            f"with inputs {example.inputs}"
-            f"\nError Type: {error_type}, Message: {e}"
+            "%s failed for example %s with inputs %s\nError Type: %s, Message: %s",
+            chain_or_llm,
+            example.id,
+            example.inputs,
+            error_type,
+            e,
         )
         result = EvalError(Error=e)
     return result
@@ -974,7 +1029,8 @@ def _prepare_eval_run(

     examples = list(client.list_examples(dataset_id=dataset.id, as_of=dataset_version))
     if not examples:
-        raise ValueError(f"Dataset {dataset_name} has no example rows.")
+        msg = f"Dataset {dataset_name} has no example rows."
+        raise ValueError(msg)
     modified_at = [ex.modified_at for ex in examples if ex.modified_at]
     # Should always be defined in practice when fetched,
     # but the typing permits None
@@ -999,7 +1055,7 @@ def _prepare_eval_run(
         )
     except (HTTPError, ValueError, LangSmithError) as e:
         if "already exists " not in str(e):
-            raise e
+            raise
         uid = uuid.uuid4()
         example_msg = f"""
 run_on_dataset(
@@ -1007,10 +1063,11 @@
     project_name="{project_name} - {uid}", # Update since {project_name} already exists
 )
 """
-        raise ValueError(
+        msg = (
             f"Test project {project_name} already exists. Please use a different name:"
             f"\n\n{example_msg}"
         )
+        raise ValueError(msg) from e
     comparison_url = dataset.url + f"/compare?selectedSessions={project.id}"
     print(  # noqa: T201
         f"View the evaluation results for project '{project_name}'"
@@ -1047,7 +1104,7 @@ class _DatasetRunContainer:
     ) -> dict:
         results: dict = {}
         for example, output in zip(self.examples, batch_results):
-            row_result = cast(_RowResult, all_eval_results.get(str(example.id), {}))
+            row_result = cast("_RowResult", all_eval_results.get(str(example.id), {}))
             results[str(example.id)] = {
                 "input": example.inputs,
                 "feedback": row_result.get("feedback", []),
@@ -1074,16 +1131,16 @@ class _DatasetRunContainer:
                 result = evaluator(runs_list, self.examples)
                 if isinstance(result, EvaluationResult):
                     result = result.dict()
-                aggregate_feedback.append(cast(dict, result))
+                aggregate_feedback.append(cast("dict", result))
                 executor.submit(
                     self.client.create_feedback,
                     **result,
                     run_id=None,
                     project_id=self.project.id,
                 )
-            except Exception as e:
-                logger.error(
-                    f"Error running batch evaluator {repr(evaluator)}: {e}"
+            except Exception:
+                logger.exception(
+                    "Error running batch evaluator %s", repr(evaluator)
                 )
         return aggregate_feedback
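Replacing `logger.error(f"... {e}")` under `except Exception as e:` with a bare `except Exception:` plus `logger.exception(...)` logs at ERROR level and appends the full traceback automatically, so the exception no longer needs manual interpolation:

```python
import logging

logger = logging.getLogger(__name__)


def batch_evaluator() -> None:  # stand-in for a failing evaluator
    raise RuntimeError("boom")


try:
    batch_evaluator()
except Exception:
    # Equivalent to logger.error(..., exc_info=True).
    logger.exception("Error running batch evaluator %s", "batch_evaluator")
```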
@@ -1091,12 +1148,12 @@ class _DatasetRunContainer:
         all_eval_results: dict = {}
         all_runs: dict = {}
         for c in self.configs:
-            for callback in cast(list, c["callbacks"]):
+            for callback in cast("list", c["callbacks"]):
                 if isinstance(callback, EvaluatorCallbackHandler):
                     eval_results = callback.logged_eval_results
                     for (_, example_id), v in eval_results.items():
                         all_eval_results.setdefault(str(example_id), {}).update(
-                            {"feedback": v}
+                            {"feedback": v},
                         )
                 elif isinstance(callback, LangChainTracer):
                     run = callback.latest_run
@@ -1111,10 +1168,10 @@ class _DatasetRunContainer:
                             "execution_time": execution_time,
                             "run_id": run_id,
                             "run": run,
-                        }
+                        },
                     )
                     all_runs[str(callback.example_id)] = run
-        return cast(dict[str, _RowResult], all_eval_results), all_runs
+        return cast("dict[str, _RowResult]", all_eval_results), all_runs

     def _collect_test_results(
         self,
@@ -1134,21 +1191,26 @@ class _DatasetRunContainer:
             aggregate_metrics=aggregate_feedback,
         )

-    def finish(self, batch_results: list, verbose: bool = False) -> TestResult:
+    def finish(
+        self,
+        batch_results: list,
+        verbose: bool = False,  # noqa: FBT001,FBT002
+    ) -> TestResult:
         results = self._collect_test_results(batch_results)
         if verbose:
             try:
                 agg_feedback = results.get_aggregate_feedback()
                 _display_aggregate_results(agg_feedback)
             except Exception as e:
-                logger.debug(f"Failed to print aggregate feedback: {repr(e)}")
+                logger.debug("Failed to print aggregate feedback: %s", e, exc_info=True)
         try:
             # Closing the project permits name changing and metric optimizations
             self.client.update_project(
-                self.project.id, end_time=datetime.now(timezone.utc)
+                self.project.id,
+                end_time=datetime.now(timezone.utc),
             )
         except Exception as e:
-            logger.debug(f"Failed to close project: {repr(e)}")
+            logger.debug("Failed to close project: %s", e, exc_info=True)
         return results

     @classmethod
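The reflowed `finish` signature carries `# noqa: FBT001,FBT002`, suppressing Ruff's "boolean trap" rules rather than changing the public API. Those rules flag positional boolean parameters because a call like `finish(results, True)` is unreadable at the call site; the usual fix, sketched below with hypothetical names, is to make the flag keyword-only:

```python
def summarize(results: list, *, verbose: bool = False) -> int:
    # The bare `*` makes `verbose` keyword-only, so callers must write
    # summarize(results, verbose=True) rather than summarize(results, True).
    if verbose:
        print(f"summarizing {len(results)} results")
    return len(results)


summarize([1, 2, 3], verbose=True)
```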
@@ -1188,7 +1250,10 @@ class _DatasetRunContainer:
         run_metadata["revision_id"] = revision_id
         wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory)
         run_evaluators = _setup_evaluation(
-            wrapped_model, examples, evaluation, dataset.data_type or DataType.kv
+            wrapped_model,
+            examples,
+            evaluation,
+            dataset.data_type or DataType.kv,
         )
         _validate_example_inputs(examples[0], wrapped_model, input_mapper)
         progress_bar = progress.ProgressBarCallback(len(examples))
@@ -1242,7 +1307,8 @@ def _display_aggregate_results(aggregate_results: pd.DataFrame) -> None:
         display(aggregate_results)
     else:
         formatted_string = aggregate_results.to_string(
-            float_format=lambda x: f"{x:.2f}", justify="right"
+            float_format=lambda x: f"{x:.2f}",
+            justify="right",
         )
         print("\n Experiment Results:")  # noqa: T201
         print(formatted_string)  # noqa: T201
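Several of the hunks above (the `update_project`, `_setup_evaluation`, and `to_string` calls) only split arguments one per line and add a trailing comma. With Black- or Ruff-formatted code, that "magic trailing comma" pins the call to one-argument-per-line, so adding a parameter later shows up as a one-line diff. For example, with the same pandas call:

```python
import pandas as pd

df = pd.DataFrame({"score": [0.512, 0.75]})
# The comma after the last argument tells the formatter to keep this call
# expanded; without it, the call may be collapsed back onto a single line.
formatted = df.to_string(
    float_format=lambda x: f"{x:.2f}",
    justify="right",
)
print(formatted)
```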
@@ -1279,6 +1345,114 @@ async def arun_on_dataset(
     revision_id: Optional[str] = None,
     **kwargs: Any,
 ) -> dict[str, Any]:
+    """Run on dataset.
+
+    Run the Chain or language model on a dataset and store traces
+    to the specified project name.
+
+    For the synchronous version of this function,
+    see :func:`run_on_dataset`.
+
+    Args:
+        dataset_name: Name of the dataset to run the chain on.
+        llm_or_chain_factory: Language model or Chain constructor to run
+            over the dataset. The Chain constructor is used to permit
+            independent calls on each example without carrying over state.
+        evaluation: Configuration for evaluators to run on the
+            results of the chain.
+        concurrency_level: The number of async tasks to run concurrently.
+        project_name: Name of the project to store the traces in.
+            Defaults to {dataset_name}-{chain class name}-{datetime}.
+        project_metadata: Optional metadata to add to the project.
+            Useful for storing information about the test variant
+            (prompt version, model version, etc.).
+        client: LangSmith client to use to access the dataset and to
+            log feedback and run traces.
+        verbose: Whether to print progress.
+        tags: Tags to add to each run in the project.
+        revision_id: Optional revision identifier to assign this test run to
+            track the performance of different versions of your system.
+    Returns:
+        A dictionary containing the run's project name and the resulting model outputs.
+
+    Examples:
+
+    .. code-block:: python
+
+        from langsmith import Client
+        from langchain_openai import ChatOpenAI
+        from langchain.chains import LLMChain
+        from langchain.smith import RunEvalConfig, arun_on_dataset
+
+        # Chains may have memory. Passing in a constructor function lets the
+        # evaluation framework avoid cross-contamination between runs.
+        def construct_chain():
+            llm = ChatOpenAI(temperature=0)
+            chain = LLMChain.from_string(
+                llm,
+                "What's the answer to {your_input_key}"
+            )
+            return chain
+
+        # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
+        evaluation_config = RunEvalConfig(
+            evaluators=[
+                "qa",  # "Correctness" against a reference answer
+                "embedding_distance",
+                RunEvalConfig.Criteria("helpfulness"),
+                RunEvalConfig.Criteria({
+                    "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
+                }),
+            ]
+        )
+
+        client = Client()
+        await arun_on_dataset(
+            client,
+            dataset_name="<my_dataset_name>",
+            llm_or_chain_factory=construct_chain,
+            evaluation=evaluation_config,
+        )
+
+    You can also create custom evaluators by subclassing the
+    :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
+    or LangSmith's `RunEvaluator` classes.
+
+    .. code-block:: python
+
+        from typing import Optional
+        from langchain.evaluation import StringEvaluator
+
+        class MyStringEvaluator(StringEvaluator):
+
+            @property
+            def requires_input(self) -> bool:
+                return False
+
+            @property
+            def requires_reference(self) -> bool:
+                return True
+
+            @property
+            def evaluation_name(self) -> str:
+                return "exact_match"
+
+            def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
+                return {"score": prediction == reference}
+
+
+        evaluation_config = RunEvalConfig(
+            custom_evaluators=[MyStringEvaluator()],
+        )
+
+        await arun_on_dataset(
+            client,
+            dataset_name="<my_dataset_name>",
+            llm_or_chain_factory=construct_chain,
+            evaluation=evaluation_config,
+        )
+
+    """  # noqa: E501
     input_mapper = kwargs.pop("input_mapper", None)
     if input_mapper:
         warn_deprecated("0.0.305", message=_INPUT_MAPPER_DEP_WARNING, pending=True)
@@ -1344,6 +1518,114 @@ def run_on_dataset(
     revision_id: Optional[str] = None,
     **kwargs: Any,
 ) -> dict[str, Any]:
+    """Run on dataset.
+
+    Run the Chain or language model on a dataset and store traces
+    to the specified project name.
+
+    For the (usually faster) async version of this function,
+    see :func:`arun_on_dataset`.
+
+    Args:
+        dataset_name: Name of the dataset to run the chain on.
+        llm_or_chain_factory: Language model or Chain constructor to run
+            over the dataset. The Chain constructor is used to permit
+            independent calls on each example without carrying over state.
+        evaluation: Configuration for evaluators to run on the
+            results of the chain.
+        concurrency_level: The number of async tasks to run concurrently.
+        project_name: Name of the project to store the traces in.
+            Defaults to {dataset_name}-{chain class name}-{datetime}.
+        project_metadata: Optional metadata to add to the project.
+            Useful for storing information about the test variant
+            (prompt version, model version, etc.).
+        client: LangSmith client to use to access the dataset and to
+            log feedback and run traces.
+        verbose: Whether to print progress.
+        tags: Tags to add to each run in the project.
+        revision_id: Optional revision identifier to assign this test run to
+            track the performance of different versions of your system.
+    Returns:
+        A dictionary containing the run's project name and the resulting model outputs.
+
+    Examples:
+
+    .. code-block:: python
+
+        from langsmith import Client
+        from langchain_openai import ChatOpenAI
+        from langchain.chains import LLMChain
+        from langchain.smith import RunEvalConfig, run_on_dataset
+
+        # Chains may have memory. Passing in a constructor function lets the
+        # evaluation framework avoid cross-contamination between runs.
+        def construct_chain():
+            llm = ChatOpenAI(temperature=0)
+            chain = LLMChain.from_string(
+                llm,
+                "What's the answer to {your_input_key}"
+            )
+            return chain
+
+        # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
+        evaluation_config = RunEvalConfig(
+            evaluators=[
+                "qa",  # "Correctness" against a reference answer
+                "embedding_distance",
+                RunEvalConfig.Criteria("helpfulness"),
+                RunEvalConfig.Criteria({
+                    "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
+                }),
+            ]
+        )
+
+        client = Client()
+        run_on_dataset(
+            client,
+            dataset_name="<my_dataset_name>",
+            llm_or_chain_factory=construct_chain,
+            evaluation=evaluation_config,
+        )
+
+    You can also create custom evaluators by subclassing the
+    :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
+    or LangSmith's `RunEvaluator` classes.
+
+    .. code-block:: python
+
+        from typing import Optional
+        from langchain.evaluation import StringEvaluator
+
+        class MyStringEvaluator(StringEvaluator):
+
+            @property
+            def requires_input(self) -> bool:
+                return False
+
+            @property
+            def requires_reference(self) -> bool:
+                return True
+
+            @property
+            def evaluation_name(self) -> str:
+                return "exact_match"
+
+            def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
+                return {"score": prediction == reference}
+
+
+        evaluation_config = RunEvalConfig(
+            custom_evaluators=[MyStringEvaluator()],
+        )
+
+        run_on_dataset(
+            client,
+            dataset_name="<my_dataset_name>",
+            llm_or_chain_factory=construct_chain,
+            evaluation=evaluation_config,
+        )
+
+    """  # noqa: E501
     input_mapper = kwargs.pop("input_mapper", None)
     if input_mapper:
         warn_deprecated("0.0.305", message=_INPUT_MAPPER_DEP_WARNING, pending=True)
@@ -1401,120 +1683,7 @@ def run_on_dataset(
                 ),
                 container.examples,
                 container.configs,
-            )
+            ),
         )

     return container.finish(batch_results, verbose=verbose)
-
-
-_RUN_ON_DATASET_DOCSTRING = """
-Run the Chain or language model on a dataset and store traces
-to the specified project name.
-
-Args:
-    dataset_name: Name of the dataset to run the chain on.
-    llm_or_chain_factory: Language model or Chain constructor to run
-        over the dataset. The Chain constructor is used to permit
-        independent calls on each example without carrying over state.
-    evaluation: Configuration for evaluators to run on the
-        results of the chain
-    concurrency_level: The number of async tasks to run concurrently.
-    project_name: Name of the project to store the traces in.
-        Defaults to {dataset_name}-{chain class name}-{datetime}.
-    project_metadata: Optional metadata to add to the project.
-        Useful for storing information the test variant.
-        (prompt version, model version, etc.)
-    client: LangSmith client to use to access the dataset and to
-        log feedback and run traces.
-    verbose: Whether to print progress.
-    tags: Tags to add to each run in the project.
-    revision_id: Optional revision identifier to assign this test run to
-        track the performance of different versions of your system.
-Returns:
-    A dictionary containing the run's project name and the resulting model outputs.
-
-
-For the (usually faster) async version of this function, see :func:`arun_on_dataset`.
-
-Examples
---------
-
-.. code-block:: python
-
-    from langsmith import Client
-    from langchain_openai import ChatOpenAI
-    from langchain.chains import LLMChain
-    from langchain.smith import smith_eval.RunEvalConfig, run_on_dataset
-
-    # Chains may have memory. Passing in a constructor function lets the
-    # evaluation framework avoid cross-contamination between runs.
-    def construct_chain():
-        llm = ChatOpenAI(temperature=0)
-        chain = LLMChain.from_string(
-            llm,
-            "What's the answer to {your_input_key}"
-        )
-        return chain
-
-    # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
-    evaluation_config = smith_eval.RunEvalConfig(
-        evaluators=[
-            "qa",  # "Correctness" against a reference answer
-            "embedding_distance",
-            smith_eval.RunEvalConfig.Criteria("helpfulness"),
-            smith_eval.RunEvalConfig.Criteria({
-                "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
-            }),
-        ]
-    )
-
-    client = Client()
-    run_on_dataset(
-        client,
-        dataset_name="<my_dataset_name>",
-        llm_or_chain_factory=construct_chain,
-        evaluation=evaluation_config,
-    )
-
-You can also create custom evaluators by subclassing the
-:class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
-or LangSmith's `RunEvaluator` classes.
-
-.. code-block:: python
-
-    from typing import Optional
-    from langchain.evaluation import StringEvaluator
-
-    class MyStringEvaluator(StringEvaluator):
-
-        @property
-        def requires_input(self) -> bool:
-            return False
-
-        @property
-        def requires_reference(self) -> bool:
-            return True
-
-        @property
-        def evaluation_name(self) -> str:
-            return "exact_match"
-
-        def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
-            return {"score": prediction == reference}
-
-
-    evaluation_config = smith_eval.RunEvalConfig(
-        custom_evaluators = [MyStringEvaluator()],
-    )
-
-    run_on_dataset(
-        client,
-        dataset_name="<my_dataset_name>",
-        llm_or_chain_factory=construct_chain,
-        evaluation=evaluation_config,
-    )
-"""  # noqa: E501
-run_on_dataset.__doc__ = _RUN_ON_DATASET_DOCSTRING
-arun_on_dataset.__doc__ = _RUN_ON_DATASET_DOCSTRING.replace(
-    "run_on_dataset(", "await arun_on_dataset("
-)
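The final hunk removes the module-level `_RUN_ON_DATASET_DOCSTRING`, which was patched onto both functions at import time (with a `str.replace` to fake the async variant), in favor of the inline docstrings added earlier in this file. Inline docstrings are visible to tools that read source rather than runtime objects (docstring linters, IDE hover, documentation builds that avoid importing the package). A minimal sketch of the retired pattern, with hypothetical names:

```python
_SHARED_DOC = """Process the dataset.

Example:
    process_dataset(client)
"""


def process_dataset(client) -> None: ...


async def aprocess_dataset(client) -> None: ...


# Runtime-only docstring sharing: works for help(), but static tools
# that never import the module cannot see these docstrings, and the
# string surgery silently breaks if the call snippet is reworded.
process_dataset.__doc__ = _SHARED_DOC
aprocess_dataset.__doc__ = _SHARED_DOC.replace(
    "process_dataset(", "await aprocess_dataset("
)
```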