langchain 0.3.27__py3-none-any.whl → 0.4.0.dev0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- langchain/agents/agent.py +16 -20
- langchain/agents/agent_iterator.py +19 -12
- langchain/agents/agent_toolkits/vectorstore/base.py +2 -0
- langchain/agents/chat/base.py +2 -0
- langchain/agents/conversational/base.py +2 -0
- langchain/agents/conversational_chat/base.py +2 -0
- langchain/agents/initialize.py +1 -1
- langchain/agents/json_chat/base.py +1 -0
- langchain/agents/mrkl/base.py +2 -0
- langchain/agents/openai_assistant/base.py +1 -1
- langchain/agents/openai_functions_agent/agent_token_buffer_memory.py +2 -0
- langchain/agents/openai_functions_agent/base.py +3 -2
- langchain/agents/openai_functions_multi_agent/base.py +1 -1
- langchain/agents/openai_tools/base.py +1 -0
- langchain/agents/output_parsers/json.py +2 -0
- langchain/agents/output_parsers/openai_functions.py +10 -3
- langchain/agents/output_parsers/openai_tools.py +8 -1
- langchain/agents/output_parsers/react_json_single_input.py +3 -0
- langchain/agents/output_parsers/react_single_input.py +3 -0
- langchain/agents/output_parsers/self_ask.py +2 -0
- langchain/agents/output_parsers/tools.py +16 -2
- langchain/agents/output_parsers/xml.py +3 -0
- langchain/agents/react/agent.py +1 -0
- langchain/agents/react/base.py +4 -0
- langchain/agents/react/output_parser.py +2 -0
- langchain/agents/schema.py +2 -0
- langchain/agents/self_ask_with_search/base.py +4 -0
- langchain/agents/structured_chat/base.py +5 -0
- langchain/agents/structured_chat/output_parser.py +13 -0
- langchain/agents/tool_calling_agent/base.py +1 -0
- langchain/agents/tools.py +3 -0
- langchain/agents/xml/base.py +7 -1
- langchain/callbacks/streaming_aiter.py +13 -2
- langchain/callbacks/streaming_aiter_final_only.py +11 -2
- langchain/callbacks/streaming_stdout_final_only.py +5 -0
- langchain/callbacks/tracers/logging.py +11 -0
- langchain/chains/api/base.py +5 -1
- langchain/chains/base.py +8 -2
- langchain/chains/combine_documents/base.py +7 -1
- langchain/chains/combine_documents/map_reduce.py +3 -0
- langchain/chains/combine_documents/map_rerank.py +6 -4
- langchain/chains/combine_documents/reduce.py +1 -0
- langchain/chains/combine_documents/refine.py +1 -0
- langchain/chains/combine_documents/stuff.py +5 -1
- langchain/chains/constitutional_ai/base.py +7 -0
- langchain/chains/conversation/base.py +4 -1
- langchain/chains/conversational_retrieval/base.py +67 -59
- langchain/chains/elasticsearch_database/base.py +2 -1
- langchain/chains/flare/base.py +2 -0
- langchain/chains/flare/prompts.py +2 -0
- langchain/chains/llm.py +7 -2
- langchain/chains/llm_bash/__init__.py +1 -1
- langchain/chains/llm_checker/base.py +12 -1
- langchain/chains/llm_math/base.py +9 -1
- langchain/chains/llm_summarization_checker/base.py +13 -1
- langchain/chains/llm_symbolic_math/__init__.py +1 -1
- langchain/chains/loading.py +4 -2
- langchain/chains/moderation.py +3 -0
- langchain/chains/natbot/base.py +3 -1
- langchain/chains/natbot/crawler.py +29 -0
- langchain/chains/openai_functions/base.py +2 -0
- langchain/chains/openai_functions/citation_fuzzy_match.py +9 -0
- langchain/chains/openai_functions/openapi.py +4 -0
- langchain/chains/openai_functions/qa_with_structure.py +3 -3
- langchain/chains/openai_functions/tagging.py +2 -0
- langchain/chains/qa_generation/base.py +4 -0
- langchain/chains/qa_with_sources/base.py +3 -0
- langchain/chains/qa_with_sources/retrieval.py +1 -1
- langchain/chains/qa_with_sources/vector_db.py +4 -2
- langchain/chains/query_constructor/base.py +4 -2
- langchain/chains/query_constructor/parser.py +64 -2
- langchain/chains/retrieval_qa/base.py +4 -0
- langchain/chains/router/base.py +14 -2
- langchain/chains/router/embedding_router.py +3 -0
- langchain/chains/router/llm_router.py +6 -4
- langchain/chains/router/multi_prompt.py +3 -0
- langchain/chains/router/multi_retrieval_qa.py +18 -0
- langchain/chains/sql_database/query.py +1 -0
- langchain/chains/structured_output/base.py +2 -0
- langchain/chains/transform.py +4 -0
- langchain/chat_models/base.py +55 -18
- langchain/document_loaders/blob_loaders/schema.py +1 -4
- langchain/embeddings/base.py +2 -0
- langchain/embeddings/cache.py +3 -3
- langchain/evaluation/agents/trajectory_eval_chain.py +3 -2
- langchain/evaluation/comparison/eval_chain.py +1 -0
- langchain/evaluation/criteria/eval_chain.py +3 -0
- langchain/evaluation/embedding_distance/base.py +11 -0
- langchain/evaluation/exact_match/base.py +14 -1
- langchain/evaluation/loading.py +1 -0
- langchain/evaluation/parsing/base.py +16 -3
- langchain/evaluation/parsing/json_distance.py +19 -8
- langchain/evaluation/parsing/json_schema.py +1 -4
- langchain/evaluation/qa/eval_chain.py +8 -0
- langchain/evaluation/qa/generate_chain.py +2 -0
- langchain/evaluation/regex_match/base.py +9 -1
- langchain/evaluation/scoring/eval_chain.py +1 -0
- langchain/evaluation/string_distance/base.py +6 -0
- langchain/memory/buffer.py +5 -0
- langchain/memory/buffer_window.py +2 -0
- langchain/memory/combined.py +1 -1
- langchain/memory/entity.py +47 -0
- langchain/memory/simple.py +3 -0
- langchain/memory/summary.py +30 -0
- langchain/memory/summary_buffer.py +3 -0
- langchain/memory/token_buffer.py +2 -0
- langchain/output_parsers/combining.py +4 -2
- langchain/output_parsers/enum.py +5 -1
- langchain/output_parsers/fix.py +8 -1
- langchain/output_parsers/pandas_dataframe.py +16 -1
- langchain/output_parsers/regex.py +2 -0
- langchain/output_parsers/retry.py +21 -1
- langchain/output_parsers/structured.py +10 -0
- langchain/output_parsers/yaml.py +4 -0
- langchain/pydantic_v1/__init__.py +1 -1
- langchain/retrievers/document_compressors/chain_extract.py +4 -2
- langchain/retrievers/document_compressors/cohere_rerank.py +2 -0
- langchain/retrievers/document_compressors/cross_encoder_rerank.py +2 -0
- langchain/retrievers/document_compressors/embeddings_filter.py +3 -0
- langchain/retrievers/document_compressors/listwise_rerank.py +1 -0
- langchain/retrievers/ensemble.py +2 -2
- langchain/retrievers/multi_query.py +3 -1
- langchain/retrievers/multi_vector.py +4 -1
- langchain/retrievers/parent_document_retriever.py +15 -0
- langchain/retrievers/self_query/base.py +19 -0
- langchain/retrievers/time_weighted_retriever.py +3 -0
- langchain/runnables/hub.py +12 -0
- langchain/runnables/openai_functions.py +6 -0
- langchain/smith/__init__.py +1 -0
- langchain/smith/evaluation/config.py +5 -22
- langchain/smith/evaluation/progress.py +12 -3
- langchain/smith/evaluation/runner_utils.py +240 -123
- langchain/smith/evaluation/string_run_evaluator.py +27 -0
- langchain/storage/encoder_backed.py +1 -0
- langchain/tools/python/__init__.py +1 -1
- {langchain-0.3.27.dist-info → langchain-0.4.0.dev0.dist-info}/METADATA +2 -12
- {langchain-0.3.27.dist-info → langchain-0.4.0.dev0.dist-info}/RECORD +140 -141
- langchain/smith/evaluation/utils.py +0 -0
- {langchain-0.3.27.dist-info → langchain-0.4.0.dev0.dist-info}/WHEEL +0 -0
- {langchain-0.3.27.dist-info → langchain-0.4.0.dev0.dist-info}/entry_points.txt +0 -0
- {langchain-0.3.27.dist-info → langchain-0.4.0.dev0.dist-info}/licenses/LICENSE +0 -0
langchain/smith/evaluation/runner_utils.py

```diff
@@ -155,9 +155,24 @@ class EvalError(dict):
     """Your architecture raised an error."""
 
     def __init__(self, Error: BaseException, **kwargs: Any) -> None:
+        """Initialize the EvalError with an error and additional attributes.
+
+        Args:
+            Error: The error that occurred.
+            **kwargs: Additional attributes to include in the error.
+        """
         super().__init__(Error=Error, **kwargs)
 
     def __getattr__(self, name: str) -> Any:
+        """Get an attribute from the EvalError.
+
+        Args:
+            name: The name of the attribute to get.
+        Returns:
+            The value of the attribute.
+        Raises:
+            AttributeError: If the attribute does not exist.
+        """
         try:
             return self[name]
         except KeyError as e:
```
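Aside from the new docstrings, this hunk shows how `EvalError` behaves: extra keyword arguments are stored as dict entries next to the error, and attribute access falls back to key lookup, with missing keys surfacing as `AttributeError`. A minimal usage sketch of that behavior (the `stage` field is an invented attribute, used purely for illustration):

```python
from langchain.smith.evaluation.runner_utils import EvalError

# Keyword arguments become dict entries alongside the error itself;
# "stage" is a hypothetical extra attribute, not part of the API.
err = EvalError(Error=ValueError("boom"), stage="prediction")

assert err["Error"] is err.Error   # __getattr__ falls back to dict lookup
assert err.stage == "prediction"

try:
    err.missing
except AttributeError:
    pass  # unknown keys raise AttributeError, per the docstring above
```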
```diff
@@ -199,24 +214,24 @@ def _wrap_in_chain_factory(
         return lambda: lcf
     if callable(llm_or_chain_factory):
         if is_traceable_function(llm_or_chain_factory):
-            runnable_ = as_runnable(cast(Callable, llm_or_chain_factory))
+            runnable_ = as_runnable(cast("Callable", llm_or_chain_factory))
             return lambda: runnable_
         try:
             _model = llm_or_chain_factory()  # type: ignore[call-arg]
         except TypeError:
             # It's an arbitrary function, wrap it in a RunnableLambda
-            user_func = cast(Callable, llm_or_chain_factory)
+            user_func = cast("Callable", llm_or_chain_factory)
             sig = inspect.signature(user_func)
             logger.info("Wrapping function %s as RunnableLambda.", sig)
             wrapped = RunnableLambda(user_func)
             return lambda: wrapped
-        constructor = cast(Callable, llm_or_chain_factory)
+        constructor = cast("Callable", llm_or_chain_factory)
         if isinstance(_model, BaseLanguageModel):
             # It's not uncommon to do an LLM constructor instead of raw LLM,
             # so we'll unpack it for the user.
             return _model
-        if is_traceable_function(cast(Callable, _model)):
-            runnable_ = as_runnable(cast(Callable, _model))
+        if is_traceable_function(cast("Callable", _model)):
+            runnable_ = as_runnable(cast("Callable", _model))
             return lambda: runnable_
         if not isinstance(_model, Runnable):
             # This is unlikely to happen - a constructor for a model function
```
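The only change in this hunk is quoting the first argument of `cast()`. Because `typing.cast` is a no-op at runtime, a string target never has to resolve when the program runs, so the referenced name can live behind a `TYPE_CHECKING`-only import. A standalone sketch of that pattern (generic code, not langchain's):

```python
from typing import TYPE_CHECKING, Any, cast

if TYPE_CHECKING:
    # Imported only for the type checker; nothing is imported at runtime.
    from collections.abc import Callable


def ensure_callable(obj: Any) -> "Callable":
    # cast() simply returns its second argument at runtime, so the quoted
    # "Callable" works even though the name is unbound when this executes.
    return cast("Callable", obj)


print(ensure_callable(len)("abc"))  # 3
```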
```diff
@@ -1089,7 +1104,7 @@ class _DatasetRunContainer:
     ) -> dict:
         results: dict = {}
         for example, output in zip(self.examples, batch_results):
-            row_result = cast(_RowResult, all_eval_results.get(str(example.id), {}))
+            row_result = cast("_RowResult", all_eval_results.get(str(example.id), {}))
             results[str(example.id)] = {
                 "input": example.inputs,
                 "feedback": row_result.get("feedback", []),
@@ -1116,7 +1131,7 @@ class _DatasetRunContainer:
                 result = evaluator(runs_list, self.examples)
                 if isinstance(result, EvaluationResult):
                     result = result.dict()
-                aggregate_feedback.append(cast(dict, result))
+                aggregate_feedback.append(cast("dict", result))
                 executor.submit(
                     self.client.create_feedback,
                     **result,
@@ -1133,7 +1148,7 @@ class _DatasetRunContainer:
         all_eval_results: dict = {}
         all_runs: dict = {}
         for c in self.configs:
-            for callback in cast(list, c["callbacks"]):
+            for callback in cast("list", c["callbacks"]):
                 if isinstance(callback, EvaluatorCallbackHandler):
                     eval_results = callback.logged_eval_results
                     for (_, example_id), v in eval_results.items():
@@ -1156,7 +1171,7 @@ class _DatasetRunContainer:
                         },
                     )
                     all_runs[str(callback.example_id)] = run
-        return cast(dict[str, _RowResult], all_eval_results), all_runs
+        return cast("dict[str, _RowResult]", all_eval_results), all_runs
 
     def _collect_test_results(
         self,
@@ -1330,6 +1345,114 @@ async def arun_on_dataset(
     revision_id: Optional[str] = None,
     **kwargs: Any,
 ) -> dict[str, Any]:
+    """Run on dataset.
+
+    Run the Chain or language model on a dataset and store traces
+    to the specified project name.
+
+    For the (usually faster) async version of this function,
+    see :func:`arun_on_dataset`.
+
+    Args:
+        dataset_name: Name of the dataset to run the chain on.
+        llm_or_chain_factory: Language model or Chain constructor to run
+            over the dataset. The Chain constructor is used to permit
+            independent calls on each example without carrying over state.
+        evaluation: Configuration for evaluators to run on the
+            results of the chain
+        concurrency_level: The number of async tasks to run concurrently.
+        project_name: Name of the project to store the traces in.
+            Defaults to {dataset_name}-{chain class name}-{datetime}.
+        project_metadata: Optional metadata to add to the project.
+            Useful for storing information the test variant.
+            (prompt version, model version, etc.)
+        client: LangSmith client to use to access the dataset and to
+            log feedback and run traces.
+        verbose: Whether to print progress.
+        tags: Tags to add to each run in the project.
+        revision_id: Optional revision identifier to assign this test run to
+            track the performance of different versions of your system.
+    Returns:
+        A dictionary containing the run's project name and the resulting model outputs.
+
+    Examples:
+
+    .. code-block:: python
+
+        from langsmith import Client
+        from langchain_openai import ChatOpenAI
+        from langchain.chains import LLMChain
+        from langchain.smith import smith_eval.RunEvalConfig, run_on_dataset
+
+        # Chains may have memory. Passing in a constructor function lets the
+        # evaluation framework avoid cross-contamination between runs.
+        def construct_chain():
+            llm = ChatOpenAI(temperature=0)
+            chain = LLMChain.from_string(
+                llm,
+                "What's the answer to {your_input_key}"
+            )
+            return chain
+
+        # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
+        evaluation_config = smith_eval.RunEvalConfig(
+            evaluators=[
+                "qa",  # "Correctness" against a reference answer
+                "embedding_distance",
+                smith_eval.RunEvalConfig.Criteria("helpfulness"),
+                smith_eval.RunEvalConfig.Criteria({
+                    "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
+                }),
+            ]
+        )
+
+        client = Client()
+        await arun_on_dataset(
+            client,
+            dataset_name="<my_dataset_name>",
+            llm_or_chain_factory=construct_chain,
+            evaluation=evaluation_config,
+        )
+
+    You can also create custom evaluators by subclassing the
+    :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
+    or LangSmith's `RunEvaluator` classes.
+
+    .. code-block:: python
+
+        from typing import Optional
+        from langchain.evaluation import StringEvaluator
+
+        class MyStringEvaluator(StringEvaluator):
+
+            @property
+            def requires_input(self) -> bool:
+                return False
+
+            @property
+            def requires_reference(self) -> bool:
+                return True
+
+            @property
+            def evaluation_name(self) -> str:
+                return "exact_match"
+
+            def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
+                return {"score": prediction == reference}
+
+
+        evaluation_config = smith_eval.RunEvalConfig(
+            custom_evaluators = [MyStringEvaluator()],
+        )
+
+        await arun_on_dataset(
+            client,
+            dataset_name="<my_dataset_name>",
+            llm_or_chain_factory=construct_chain,
+            evaluation=evaluation_config,
+        )
+
+    """  # noqa: E501
     input_mapper = kwargs.pop("input_mapper", None)
     if input_mapper:
         warn_deprecated("0.0.305", message=_INPUT_MAPPER_DEP_WARNING, pending=True)
@@ -1395,6 +1518,114 @@ def run_on_dataset(
     revision_id: Optional[str] = None,
     **kwargs: Any,
 ) -> dict[str, Any]:
+    """Run on dataset.
+
+    Run the Chain or language model on a dataset and store traces
+    to the specified project name.
+
+    For the (usually faster) async version of this function,
+    see :func:`arun_on_dataset`.
+
+    Args:
+        dataset_name: Name of the dataset to run the chain on.
+        llm_or_chain_factory: Language model or Chain constructor to run
+            over the dataset. The Chain constructor is used to permit
+            independent calls on each example without carrying over state.
+        evaluation: Configuration for evaluators to run on the
+            results of the chain
+        concurrency_level: The number of async tasks to run concurrently.
+        project_name: Name of the project to store the traces in.
+            Defaults to {dataset_name}-{chain class name}-{datetime}.
+        project_metadata: Optional metadata to add to the project.
+            Useful for storing information the test variant.
+            (prompt version, model version, etc.)
+        client: LangSmith client to use to access the dataset and to
+            log feedback and run traces.
+        verbose: Whether to print progress.
+        tags: Tags to add to each run in the project.
+        revision_id: Optional revision identifier to assign this test run to
+            track the performance of different versions of your system.
+    Returns:
+        A dictionary containing the run's project name and the resulting model outputs.
+
+    Examples:
+
+    .. code-block:: python
+
+        from langsmith import Client
+        from langchain_openai import ChatOpenAI
+        from langchain.chains import LLMChain
+        from langchain.smith import smith_eval.RunEvalConfig, run_on_dataset
+
+        # Chains may have memory. Passing in a constructor function lets the
+        # evaluation framework avoid cross-contamination between runs.
+        def construct_chain():
+            llm = ChatOpenAI(temperature=0)
+            chain = LLMChain.from_string(
+                llm,
+                "What's the answer to {your_input_key}"
+            )
+            return chain
+
+        # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
+        evaluation_config = smith_eval.RunEvalConfig(
+            evaluators=[
+                "qa",  # "Correctness" against a reference answer
+                "embedding_distance",
+                smith_eval.RunEvalConfig.Criteria("helpfulness"),
+                smith_eval.RunEvalConfig.Criteria({
+                    "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
+                }),
+            ]
+        )
+
+        client = Client()
+        run_on_dataset(
+            client,
+            dataset_name="<my_dataset_name>",
+            llm_or_chain_factory=construct_chain,
+            evaluation=evaluation_config,
+        )
+
+    You can also create custom evaluators by subclassing the
+    :class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
+    or LangSmith's `RunEvaluator` classes.
+
+    .. code-block:: python
+
+        from typing import Optional
+        from langchain.evaluation import StringEvaluator
+
+        class MyStringEvaluator(StringEvaluator):
+
+            @property
+            def requires_input(self) -> bool:
+                return False
+
+            @property
+            def requires_reference(self) -> bool:
+                return True
+
+            @property
+            def evaluation_name(self) -> str:
+                return "exact_match"
+
+            def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
+                return {"score": prediction == reference}
+
+
+        evaluation_config = smith_eval.RunEvalConfig(
+            custom_evaluators = [MyStringEvaluator()],
+        )
+
+        run_on_dataset(
+            client,
+            dataset_name="<my_dataset_name>",
+            llm_or_chain_factory=construct_chain,
+            evaluation=evaluation_config,
+        )
+
+    """  # noqa: E501
     input_mapper = kwargs.pop("input_mapper", None)
     if input_mapper:
         warn_deprecated("0.0.305", message=_INPUT_MAPPER_DEP_WARNING, pending=True)
@@ -1456,117 +1687,3 @@ def run_on_dataset(
     )
 
     return container.finish(batch_results, verbose=verbose)
-
-
-_RUN_ON_DATASET_DOCSTRING = """
-Run the Chain or language model on a dataset and store traces
-to the specified project name.
-
-Args:
-    dataset_name: Name of the dataset to run the chain on.
-    llm_or_chain_factory: Language model or Chain constructor to run
-        over the dataset. The Chain constructor is used to permit
-        independent calls on each example without carrying over state.
-    evaluation: Configuration for evaluators to run on the
-        results of the chain
-    concurrency_level: The number of async tasks to run concurrently.
-    project_name: Name of the project to store the traces in.
-        Defaults to {dataset_name}-{chain class name}-{datetime}.
-    project_metadata: Optional metadata to add to the project.
-        Useful for storing information the test variant.
-        (prompt version, model version, etc.)
-    client: LangSmith client to use to access the dataset and to
-        log feedback and run traces.
-    verbose: Whether to print progress.
-    tags: Tags to add to each run in the project.
-    revision_id: Optional revision identifier to assign this test run to
-        track the performance of different versions of your system.
-Returns:
-    A dictionary containing the run's project name and the resulting model outputs.
-
-
-For the (usually faster) async version of this function, see :func:`arun_on_dataset`.
-
-Examples
---------
-
-.. code-block:: python
-
-    from langsmith import Client
-    from langchain_openai import ChatOpenAI
-    from langchain.chains import LLMChain
-    from langchain.smith import smith_eval.RunEvalConfig, run_on_dataset
-
-    # Chains may have memory. Passing in a constructor function lets the
-    # evaluation framework avoid cross-contamination between runs.
-    def construct_chain():
-        llm = ChatOpenAI(temperature=0)
-        chain = LLMChain.from_string(
-            llm,
-            "What's the answer to {your_input_key}"
-        )
-        return chain
-
-    # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
-    evaluation_config = smith_eval.RunEvalConfig(
-        evaluators=[
-            "qa",  # "Correctness" against a reference answer
-            "embedding_distance",
-            smith_eval.RunEvalConfig.Criteria("helpfulness"),
-            smith_eval.RunEvalConfig.Criteria({
-                "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
-            }),
-        ]
-    )
-
-    client = Client()
-    run_on_dataset(
-        client,
-        dataset_name="<my_dataset_name>",
-        llm_or_chain_factory=construct_chain,
-        evaluation=evaluation_config,
-    )
-
-You can also create custom evaluators by subclassing the
-:class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
-or LangSmith's `RunEvaluator` classes.
-
-.. code-block:: python
-
-    from typing import Optional
-    from langchain.evaluation import StringEvaluator
-
-    class MyStringEvaluator(StringEvaluator):
-
-        @property
-        def requires_input(self) -> bool:
-            return False
-
-        @property
-        def requires_reference(self) -> bool:
-            return True
-
-        @property
-        def evaluation_name(self) -> str:
-            return "exact_match"
-
-        def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
-            return {"score": prediction == reference}
-
-
-    evaluation_config = smith_eval.RunEvalConfig(
-        custom_evaluators = [MyStringEvaluator()],
-    )
-
-    run_on_dataset(
-        client,
-        dataset_name="<my_dataset_name>",
-        llm_or_chain_factory=construct_chain,
-        evaluation=evaluation_config,
-    )
-"""  # noqa: E501
-run_on_dataset.__doc__ = _RUN_ON_DATASET_DOCSTRING
-arun_on_dataset.__doc__ = _RUN_ON_DATASET_DOCSTRING.replace(
-    "run_on_dataset(",
-    "await arun_on_dataset(",
-)
```
langchain/smith/evaluation/string_run_evaluator.py

```diff
@@ -16,6 +16,7 @@ from langchain_core.load.serializable import Serializable
 from langchain_core.messages import BaseMessage, get_buffer_string, messages_from_dict
 from langsmith import EvaluationResult, RunEvaluator
 from langsmith.schemas import DataType, Example, Run
+from typing_extensions import override
 
 from langchain.chains.base import Chain
 from langchain.evaluation.schema import StringEvaluator
@@ -70,6 +71,15 @@ class LLMStringRunMapper(StringRunMapper):
         raise ValueError(msg)
 
     def serialize_inputs(self, inputs: dict) -> str:
+        """Serialize inputs.
+
+        Args:
+            inputs: The inputs from the run, expected to contain prompts or messages.
+        Returns:
+            The serialized input text from the prompts or messages.
+        Raises:
+            ValueError: If neither prompts nor messages are found in the inputs.
+        """
         if "prompts" in inputs:  # Should we even accept this?
             input_ = "\n\n".join(inputs["prompts"])
         elif "prompt" in inputs:
@@ -82,6 +92,18 @@ class LLMStringRunMapper(StringRunMapper):
         return input_
 
     def serialize_outputs(self, outputs: dict) -> str:
+        """Serialize outputs.
+
+        Args:
+            outputs: The outputs from the run, expected to contain generations.
+
+        Returns:
+            The serialized output text from the first generation.
+
+        Raises:
+            ValueError: If no generations are found in the outputs,
+                or if the generations are empty.
+        """
         if not outputs.get("generations"):
             msg = "Cannot evaluate LLM Run without generations."
             raise ValueError(msg)
@@ -185,6 +207,7 @@ class ChainStringRunMapper(StringRunMapper):
 class ToolStringRunMapper(StringRunMapper):
     """Map an input to the tool."""
 
+    @override
     def map(self, run: Run) -> dict[str, str]:
         if not run.outputs:
             msg = f"Run {run.id} has no outputs to evaluate."
@@ -256,10 +279,12 @@ class StringRunEvaluatorChain(Chain, RunEvaluator):
     """The evaluation chain."""
 
     @property
+    @override
    def input_keys(self) -> list[str]:
         return ["run", "example"]
 
     @property
+    @override
     def output_keys(self) -> list[str]:
         return ["feedback"]
 
@@ -330,6 +355,7 @@ class StringRunEvaluatorChain(Chain, RunEvaluator):
             feedback.evaluator_info[RUN_KEY] = output[RUN_KEY]
         return feedback
 
+    @override
     def evaluate_run(
         self,
         run: Run,
@@ -347,6 +373,7 @@ class StringRunEvaluatorChain(Chain, RunEvaluator):
             # TODO: Add run ID once we can declare it via callbacks
         )
 
+    @override
     async def aevaluate_run(
         self,
         run: Run,
```
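The hunks above also add `typing_extensions.override` to methods that implement a parent-class or protocol method (`map`, `input_keys`, `output_keys`, `evaluate_run`, `aevaluate_run`). The decorator does essentially nothing at runtime (it just marks the function); its value is that a type checker will complain if the decorated method stops matching anything in the base class. A small self-contained sketch with generic classes (not the langchain ones):

```python
from typing_extensions import override


class Mapper:
    def map(self, run: dict) -> dict[str, str]:
        return {"input": str(run)}


class ToolMapper(Mapper):
    @override
    def map(self, run: dict) -> dict[str, str]:
        # @override records the intent; if Mapper ever renames or removes
        # `map`, a type checker flags this method instead of silently
        # leaving a dead override behind.
        return {"input": run.get("input", "")}


print(ToolMapper().map({"input": "hi"}))  # {'input': 'hi'}
```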
{langchain-0.3.27.dist-info → langchain-0.4.0.dev0.dist-info}/METADATA

```diff
@@ -1,13 +1,13 @@
 Metadata-Version: 2.1
 Name: langchain
-Version: 0.3.27
+Version: 0.4.0.dev0
 Summary: Building applications with LLMs through composability
 License: MIT
 Project-URL: Source Code, https://github.com/langchain-ai/langchain/tree/master/libs/langchain
 Project-URL: Release Notes, https://github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain%3D%3D0%22&expanded=true
 Project-URL: repository, https://github.com/langchain-ai/langchain
 Requires-Python: <4.0,>=3.9
-Requires-Dist: langchain-core<1.0.0,>=0.
+Requires-Dist: langchain-core<1.0.0,>=0.4.0.dev0
 Requires-Dist: langchain-text-splitters<1.0.0,>=0.3.9
 Requires-Dist: langsmith>=0.1.17
 Requires-Dist: pydantic<3.0.0,>=2.7.4
@@ -21,12 +21,6 @@ Provides-Extra: anthropic
 Requires-Dist: langchain-anthropic; extra == "anthropic"
 Provides-Extra: openai
 Requires-Dist: langchain-openai; extra == "openai"
-Provides-Extra: azure-ai
-Requires-Dist: langchain-azure-ai; extra == "azure-ai"
-Provides-Extra: cohere
-Requires-Dist: langchain-cohere; extra == "cohere"
-Provides-Extra: google-vertexai
-Requires-Dist: langchain-google-vertexai; extra == "google-vertexai"
 Provides-Extra: google-genai
 Requires-Dist: langchain-google-genai; extra == "google-genai"
 Provides-Extra: fireworks
@@ -41,12 +35,8 @@ Provides-Extra: huggingface
 Requires-Dist: langchain-huggingface; extra == "huggingface"
 Provides-Extra: groq
 Requires-Dist: langchain-groq; extra == "groq"
-Provides-Extra: aws
-Requires-Dist: langchain-aws; extra == "aws"
 Provides-Extra: deepseek
 Requires-Dist: langchain-deepseek; extra == "deepseek"
-Provides-Extra: xai
-Requires-Dist: langchain-xai; extra == "xai"
 Provides-Extra: perplexity
 Requires-Dist: langchain-perplexity; extra == "perplexity"
 Description-Content-Type: text/markdown
```