langchain-core 0.3.75__py3-none-any.whl → 0.3.77__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- langchain_core/_api/beta_decorator.py +22 -44
- langchain_core/_api/deprecation.py +30 -17
- langchain_core/_api/path.py +19 -2
- langchain_core/_import_utils.py +7 -0
- langchain_core/agents.py +10 -6
- langchain_core/beta/runnables/context.py +1 -2
- langchain_core/callbacks/base.py +28 -15
- langchain_core/callbacks/manager.py +83 -71
- langchain_core/callbacks/usage.py +6 -4
- langchain_core/chat_history.py +29 -21
- langchain_core/document_loaders/base.py +34 -9
- langchain_core/document_loaders/langsmith.py +4 -1
- langchain_core/documents/base.py +35 -10
- langchain_core/documents/transformers.py +4 -2
- langchain_core/embeddings/fake.py +8 -5
- langchain_core/env.py +2 -3
- langchain_core/example_selectors/base.py +12 -0
- langchain_core/exceptions.py +7 -0
- langchain_core/globals.py +17 -28
- langchain_core/indexing/api.py +88 -76
- langchain_core/indexing/base.py +5 -8
- langchain_core/indexing/in_memory.py +23 -3
- langchain_core/language_models/__init__.py +3 -2
- langchain_core/language_models/base.py +31 -20
- langchain_core/language_models/chat_models.py +98 -27
- langchain_core/language_models/fake_chat_models.py +10 -9
- langchain_core/language_models/llms.py +52 -18
- langchain_core/load/dump.py +2 -3
- langchain_core/load/load.py +15 -1
- langchain_core/load/serializable.py +39 -44
- langchain_core/memory.py +7 -3
- langchain_core/messages/ai.py +53 -24
- langchain_core/messages/base.py +43 -22
- langchain_core/messages/chat.py +4 -1
- langchain_core/messages/content_blocks.py +23 -2
- langchain_core/messages/function.py +9 -5
- langchain_core/messages/human.py +13 -10
- langchain_core/messages/modifier.py +1 -0
- langchain_core/messages/system.py +11 -8
- langchain_core/messages/tool.py +60 -29
- langchain_core/messages/utils.py +250 -131
- langchain_core/output_parsers/base.py +5 -2
- langchain_core/output_parsers/json.py +4 -4
- langchain_core/output_parsers/list.py +7 -22
- langchain_core/output_parsers/openai_functions.py +3 -0
- langchain_core/output_parsers/openai_tools.py +6 -1
- langchain_core/output_parsers/pydantic.py +4 -0
- langchain_core/output_parsers/string.py +5 -1
- langchain_core/output_parsers/xml.py +19 -19
- langchain_core/outputs/chat_generation.py +25 -10
- langchain_core/outputs/generation.py +14 -3
- langchain_core/outputs/llm_result.py +8 -1
- langchain_core/prompt_values.py +16 -6
- langchain_core/prompts/base.py +4 -9
- langchain_core/prompts/chat.py +89 -57
- langchain_core/prompts/dict.py +16 -8
- langchain_core/prompts/few_shot.py +12 -11
- langchain_core/prompts/few_shot_with_templates.py +5 -1
- langchain_core/prompts/image.py +12 -5
- langchain_core/prompts/message.py +5 -6
- langchain_core/prompts/pipeline.py +13 -8
- langchain_core/prompts/prompt.py +22 -8
- langchain_core/prompts/string.py +18 -10
- langchain_core/prompts/structured.py +7 -2
- langchain_core/rate_limiters.py +2 -2
- langchain_core/retrievers.py +7 -6
- langchain_core/runnables/base.py +406 -186
- langchain_core/runnables/branch.py +14 -19
- langchain_core/runnables/config.py +9 -15
- langchain_core/runnables/configurable.py +34 -19
- langchain_core/runnables/fallbacks.py +20 -13
- langchain_core/runnables/graph.py +48 -38
- langchain_core/runnables/graph_ascii.py +41 -18
- langchain_core/runnables/graph_mermaid.py +54 -25
- langchain_core/runnables/graph_png.py +27 -31
- langchain_core/runnables/history.py +55 -58
- langchain_core/runnables/passthrough.py +44 -21
- langchain_core/runnables/retry.py +44 -23
- langchain_core/runnables/router.py +9 -8
- langchain_core/runnables/schema.py +2 -0
- langchain_core/runnables/utils.py +51 -89
- langchain_core/stores.py +19 -31
- langchain_core/sys_info.py +9 -8
- langchain_core/tools/base.py +37 -28
- langchain_core/tools/convert.py +26 -15
- langchain_core/tools/simple.py +36 -8
- langchain_core/tools/structured.py +25 -12
- langchain_core/tracers/base.py +2 -2
- langchain_core/tracers/context.py +5 -1
- langchain_core/tracers/core.py +109 -39
- langchain_core/tracers/evaluation.py +22 -26
- langchain_core/tracers/event_stream.py +45 -34
- langchain_core/tracers/langchain.py +12 -3
- langchain_core/tracers/langchain_v1.py +10 -2
- langchain_core/tracers/log_stream.py +56 -17
- langchain_core/tracers/root_listeners.py +4 -20
- langchain_core/tracers/run_collector.py +6 -16
- langchain_core/tracers/schemas.py +5 -1
- langchain_core/utils/aiter.py +15 -7
- langchain_core/utils/env.py +3 -0
- langchain_core/utils/function_calling.py +50 -28
- langchain_core/utils/interactive_env.py +6 -2
- langchain_core/utils/iter.py +12 -4
- langchain_core/utils/json.py +12 -3
- langchain_core/utils/json_schema.py +156 -40
- langchain_core/utils/loading.py +5 -1
- langchain_core/utils/mustache.py +24 -15
- langchain_core/utils/pydantic.py +38 -9
- langchain_core/utils/utils.py +25 -9
- langchain_core/vectorstores/base.py +7 -20
- langchain_core/vectorstores/in_memory.py +23 -17
- langchain_core/vectorstores/utils.py +18 -12
- langchain_core/version.py +1 -1
- langchain_core-0.3.77.dist-info/METADATA +67 -0
- langchain_core-0.3.77.dist-info/RECORD +174 -0
- langchain_core-0.3.75.dist-info/METADATA +0 -106
- langchain_core-0.3.75.dist-info/RECORD +0 -174
- {langchain_core-0.3.75.dist-info → langchain_core-0.3.77.dist-info}/WHEEL +0 -0
- {langchain_core-0.3.75.dist-info → langchain_core-0.3.77.dist-info}/entry_points.txt +0 -0
langchain_core/documents/base.py
CHANGED
@@ -82,7 +82,7 @@ class Blob(BaseMedia):
             blob = Blob.from_data(
                 data="Hello, world!",
                 mime_type="text/plain",
-                metadata={"source": "https://example.com"}
+                metadata={"source": "https://example.com"},
             )

     Example: Load the blob from a file
@@ -145,7 +145,14 @@ class Blob(BaseMedia):
         return values

     def as_string(self) -> str:
-        """Read data as a string.
+        """Read data as a string.
+
+        Raises:
+            ValueError: If the blob cannot be represented as a string.
+
+        Returns:
+            The data as a string.
+        """
         if self.data is None and self.path:
             return Path(self.path).read_text(encoding=self.encoding)
         if isinstance(self.data, bytes):
@@ -156,7 +163,14 @@ class Blob(BaseMedia):
             raise ValueError(msg)

     def as_bytes(self) -> bytes:
-        """Read data as bytes.
+        """Read data as bytes.
+
+        Raises:
+            ValueError: If the blob cannot be represented as bytes.
+
+        Returns:
+            The data as bytes.
+        """
         if isinstance(self.data, bytes):
             return self.data
         if isinstance(self.data, str):
@@ -168,7 +182,14 @@ class Blob(BaseMedia):

     @contextlib.contextmanager
     def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]:
-        """Read data as a byte stream.
+        """Read data as a byte stream.
+
+        Raises:
+            NotImplementedError: If the blob cannot be represented as a byte stream.
+
+        Yields:
+            The data as a byte stream.
+        """
         if isinstance(self.data, bytes):
             yield BytesIO(self.data)
         elif self.data is None and self.path:
@@ -246,7 +267,7 @@ class Blob(BaseMedia):
         )

     def __repr__(self) -> str:
-        """
+        """Return the blob representation."""
         str_repr = f"Blob {id(self)}"
         if self.source:
             str_repr += f" {self.source}"
@@ -263,8 +284,7 @@ class Document(BaseMedia):
             from langchain_core.documents import Document

             document = Document(
-                page_content="Hello, world!",
-                metadata={"source": "https://example.com"}
+                page_content="Hello, world!", metadata={"source": "https://example.com"}
             )

     """
@@ -281,19 +301,24 @@ class Document(BaseMedia):

     @classmethod
     def is_lc_serializable(cls) -> bool:
-        """Return
+        """Return True as this class is serializable."""
         return True

     @classmethod
     def get_lc_namespace(cls) -> list[str]:
         """Get the namespace of the langchain object.

-
+        Returns:
+            ["langchain", "schema", "document"]
         """
         return ["langchain", "schema", "document"]

     def __str__(self) -> str:
-        """Override __str__ to restrict it to page_content and metadata.
+        """Override __str__ to restrict it to page_content and metadata.
+
+        Returns:
+            A string representation of the Document.
+        """
         # The format matches pydantic format for __str__.
         #
         # The purpose of this change is to make sure that user code that
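For reference, a minimal sketch of how the Blob and Document helpers touched above are used; it follows the docstring examples in the diff, and the try/except only illustrates the ValueError that as_string() now documents.

    from langchain_core.documents import Blob, Document

    blob = Blob.from_data(
        data="Hello, world!",
        mime_type="text/plain",
        metadata={"source": "https://example.com"},
    )
    try:
        text = blob.as_string()  # documented to raise ValueError if the data cannot be read as a string
    except ValueError:
        text = ""

    document = Document(
        page_content=text, metadata={"source": "https://example.com"}
    )
    print(document)  # __str__ is restricted to page_content and metadata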
langchain_core/documents/transformers.py
CHANGED

@@ -38,7 +38,9 @@ class BaseDocumentTransformer(ABC):
                     self.embeddings, stateful_documents
                 )
                 included_idxs = _filter_similar_embeddings(
-                    embedded_documents,
+                    embedded_documents,
+                    self.similarity_fn,
+                    self.similarity_threshold,
                 )
                 return [stateful_documents[i] for i in sorted(included_idxs)]

@@ -47,7 +49,7 @@ class BaseDocumentTransformer(ABC):
             ) -> Sequence[Document]:
                 raise NotImplementedError

-    """
+    """

     @abstractmethod
     def transform_documents(
langchain_core/embeddings/fake.py
CHANGED

@@ -1,6 +1,7 @@
 """Module contains a few fake embedding models for testing purposes."""

 # Please do not add additional fake embedding model implementations here.
+import contextlib
 import hashlib

 from pydantic import BaseModel
@@ -8,6 +9,9 @@ from typing_extensions import override

 from langchain_core.embeddings import Embeddings

+with contextlib.suppress(ImportError):
+    import numpy as np
+

 class FakeEmbeddings(Embeddings, BaseModel):
     """Fake embedding model for unit testing purposes.
@@ -20,6 +24,7 @@ class FakeEmbeddings(Embeddings, BaseModel):
     .. code-block:: python

         from langchain_core.embeddings import FakeEmbeddings
+
         embed = FakeEmbeddings(size=100)

     Embed single text:
@@ -53,8 +58,6 @@ class FakeEmbeddings(Embeddings, BaseModel):
     """The size of the embedding vector."""

     def _get_embedding(self) -> list[float]:
-        import numpy as np
-
         return list(np.random.default_rng().normal(size=self.size))

     @override
@@ -78,6 +81,7 @@ class DeterministicFakeEmbedding(Embeddings, BaseModel):
     .. code-block:: python

         from langchain_core.embeddings import DeterministicFakeEmbedding
+
         embed = DeterministicFakeEmbedding(size=100)

     Embed single text:
@@ -111,13 +115,12 @@ class DeterministicFakeEmbedding(Embeddings, BaseModel):
     """The size of the embedding vector."""

     def _get_embedding(self, seed: int) -> list[float]:
-        import numpy as np
-
         # set the seed for the random generator
         rng = np.random.default_rng(seed)
         return list(rng.normal(size=self.size))

-    def _get_seed(self, text: str) -> int:
+    @staticmethod
+    def _get_seed(text: str) -> int:
         """Get a seed for the random generator, using the hash of the text."""
         return int(hashlib.sha256(text.encode("utf-8")).hexdigest(), 16) % 10**8
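The fake embedding models shown above now import numpy at module level (guarded by contextlib.suppress), so numpy must be installed to use them. A minimal usage sketch mirroring the docstring examples:

    from langchain_core.embeddings import DeterministicFakeEmbedding, FakeEmbeddings

    embed = DeterministicFakeEmbedding(size=100)   # requires numpy
    vector = embed.embed_query("hello world")      # same text always yields the same vector
    assert vector == embed.embed_query("hello world")
    assert len(vector) == 100

    random_embed = FakeEmbeddings(size=100)        # non-deterministic counterpart
    assert len(random_embed.embed_documents(["doc one", "doc two"])) == 2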
langchain_core/env.py
CHANGED
@@ -3,6 +3,8 @@
 import platform
 from functools import lru_cache

+from langchain_core import __version__
+

 @lru_cache(maxsize=1)
 def get_runtime_environment() -> dict:
@@ -11,9 +13,6 @@ def get_runtime_environment() -> dict:
     Returns:
         A dictionary with information about the runtime environment.
     """
-    # Lazy import to avoid circular imports
-    from langchain_core import __version__
-
     return {
         "library_version": __version__,
         "library": "langchain-core",
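A quick sketch of the function this hunk touches; the two keys shown are the ones visible in the diff, and the returned dict also carries runtime/platform information.

    from langchain_core.env import get_runtime_environment

    env = get_runtime_environment()
    print(env["library"])          # "langchain-core"
    print(env["library_version"])  # e.g. "0.3.77"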
langchain_core/example_selectors/base.py
CHANGED

@@ -16,6 +16,9 @@ class BaseExampleSelector(ABC):
         Args:
             example: A dictionary with keys as input variables
                 and values as their values.
+
+        Returns:
+            Any return value.
         """

     async def aadd_example(self, example: dict[str, str]) -> Any:
@@ -24,6 +27,9 @@ class BaseExampleSelector(ABC):
         Args:
             example: A dictionary with keys as input variables
                 and values as their values.
+
+        Returns:
+            Any return value.
         """
         return await run_in_executor(None, self.add_example, example)

@@ -34,6 +40,9 @@ class BaseExampleSelector(ABC):
         Args:
             input_variables: A dictionary with keys as input variables
                 and values as their values.
+
+        Returns:
+            A list of examples.
         """

     async def aselect_examples(self, input_variables: dict[str, str]) -> list[dict]:
@@ -42,5 +51,8 @@ class BaseExampleSelector(ABC):
         Args:
             input_variables: A dictionary with keys as input variables
                 and values as their values.
+
+        Returns:
+            A list of examples.
         """
         return await run_in_executor(None, self.select_examples, input_variables)
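The docstring additions above spell out the return contracts for BaseExampleSelector. A minimal sketch of a custom selector built on this interface; the class name and the selection logic are illustrative only.

    from langchain_core.example_selectors import BaseExampleSelector


    class FirstNExampleSelector(BaseExampleSelector):
        """Toy selector returning the first n stored examples (illustrative only)."""

        def __init__(self, n: int = 2) -> None:
            self.n = n
            self.examples: list[dict] = []

        def add_example(self, example: dict[str, str]) -> None:
            # The return value is unconstrained ("Any return value").
            self.examples.append(example)

        def select_examples(self, input_variables: dict[str, str]) -> list[dict]:
            # Returns a list of examples, as documented above.
            return self.examples[: self.n]


    selector = FirstNExampleSelector(n=1)
    selector.add_example({"input": "2+2", "output": "4"})
    print(selector.select_examples({"input": "3+3"}))  # [{'input': '2+2', 'output': '4'}]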
langchain_core/exceptions.py
CHANGED
@@ -42,6 +42,10 @@ class OutputParserException(ValueError, LangChainException):  # noqa: N818
             previous output was improperly structured, in the hopes that it will
             update the output to the correct format.
             Defaults to False.
+
+        Raises:
+            ValueError: If ``send_to_llm`` is True but either observation or
+                ``llm_output`` are not provided.
         """
         if isinstance(error, str):
             error = create_message(
@@ -77,6 +81,9 @@ def create_message(*, message: str, error_code: ErrorCode) -> str:
     Args:
         message: The message to display.
         error_code: The error code to display.
+
+    Returns:
+        The full message with the troubleshooting link.
     """
     return (
         f"{message}\n"
langchain_core/globals.py
CHANGED
@@ -6,6 +6,13 @@ from typing import TYPE_CHECKING, Optional
 if TYPE_CHECKING:
     from langchain_core.caches import BaseCache

+try:
+    import langchain  # type: ignore[import-not-found]
+
+    _HAS_LANGCHAIN = True
+except ImportError:
+    _HAS_LANGCHAIN = False
+

 # DO NOT USE THESE VALUES DIRECTLY!
 # Use them only via `get_<X>()` and `set_<X>()` below,
@@ -22,9 +29,7 @@ def set_verbose(value: bool) -> None:  # noqa: FBT001
     Args:
         value: The new value for the `verbose` global setting.
     """
-    try:
-        import langchain  # type: ignore[import-not-found]
-
+    if _HAS_LANGCHAIN:
         # We're about to run some deprecated code, don't report warnings from it.
         # The user called the correct (non-deprecated) code path and shouldn't get
         # warnings.
@@ -43,8 +48,6 @@ def set_verbose(value: bool) -> None:  # noqa: FBT001
         # Remove it once `langchain.verbose` is no longer supported, and once all
         # users have migrated to using `set_verbose()` here.
         langchain.verbose = value
-    except ImportError:
-        pass

     global _verbose  # noqa: PLW0603
     _verbose = value
@@ -56,9 +59,7 @@ def get_verbose() -> bool:
     Returns:
         The value of the `verbose` global setting.
     """
-    try:
-        import langchain
-
+    if _HAS_LANGCHAIN:
         # We're about to run some deprecated code, don't report warnings from it.
         # The user called the correct (non-deprecated) code path and shouldn't get
         # warnings.
@@ -83,7 +84,7 @@ def get_verbose() -> bool:
         # deprecation warnings directing them to use `set_verbose()` when they
         # import `langchain.verbose`.
         old_verbose = langchain.verbose
-    except ImportError:
+    else:
         old_verbose = False

     return _verbose or old_verbose
@@ -95,9 +96,7 @@ def set_debug(value: bool) -> None:  # noqa: FBT001
     Args:
         value: The new value for the `debug` global setting.
     """
-    try:
-        import langchain
-
+    if _HAS_LANGCHAIN:
         # We're about to run some deprecated code, don't report warnings from it.
         # The user called the correct (non-deprecated) code path and shouldn't get
         # warnings.
@@ -114,8 +113,6 @@ def set_debug(value: bool) -> None:  # noqa: FBT001
         # Remove it once `langchain.debug` is no longer supported, and once all
         # users have migrated to using `set_debug()` here.
         langchain.debug = value
-    except ImportError:
-        pass

     global _debug  # noqa: PLW0603
     _debug = value
@@ -127,9 +124,7 @@ def get_debug() -> bool:
     Returns:
         The value of the `debug` global setting.
     """
-    try:
-        import langchain
-
+    if _HAS_LANGCHAIN:
         # We're about to run some deprecated code, don't report warnings from it.
         # The user called the correct (non-deprecated) code path and shouldn't get
         # warnings.
@@ -151,7 +146,7 @@ def get_debug() -> bool:
         # to using `set_debug()` yet. Those users are getting deprecation warnings
         # directing them to use `set_debug()` when they import `langchain.debug`.
         old_debug = langchain.debug
-    except ImportError:
+    else:
         old_debug = False

     return _debug or old_debug
@@ -163,9 +158,7 @@ def set_llm_cache(value: Optional["BaseCache"]) -> None:
     Args:
         value: The new LLM cache to use. If `None`, the LLM cache is disabled.
     """
-    try:
-        import langchain
-
+    if _HAS_LANGCHAIN:
         # We're about to run some deprecated code, don't report warnings from it.
         # The user called the correct (non-deprecated) code path and shouldn't get
         # warnings.
@@ -184,22 +177,18 @@ def set_llm_cache(value: Optional["BaseCache"]) -> None:
         # Remove it once `langchain.llm_cache` is no longer supported, and
         # once all users have migrated to using `set_llm_cache()` here.
         langchain.llm_cache = value
-    except ImportError:
-        pass

     global _llm_cache  # noqa: PLW0603
     _llm_cache = value


-def get_llm_cache() -> "BaseCache":
+def get_llm_cache() -> Optional["BaseCache"]:
     """Get the value of the `llm_cache` global setting.

     Returns:
         The value of the `llm_cache` global setting.
     """
-    try:
-        import langchain
-
+    if _HAS_LANGCHAIN:
         # We're about to run some deprecated code, don't report warnings from it.
         # The user called the correct (non-deprecated) code path and shouldn't get
         # warnings.
@@ -225,7 +214,7 @@ def get_llm_cache() -> "BaseCache":
         # Those users are getting deprecation warnings directing them
         # to use `set_llm_cache()` when they import `langchain.llm_cache`.
         old_llm_cache = langchain.llm_cache
-    except ImportError:
+    else:
         old_llm_cache = None

     return _llm_cache or old_llm_cache
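The refactor above moves the optional langchain import to module level behind a _HAS_LANGCHAIN flag and loosens get_llm_cache() to return Optional[BaseCache]. Typical usage of these globals is unchanged; a short sketch using the in-memory cache that ships with langchain-core:

    from langchain_core.caches import InMemoryCache
    from langchain_core.globals import get_debug, get_llm_cache, set_debug, set_llm_cache

    set_debug(True)
    assert get_debug() is True

    set_llm_cache(InMemoryCache())
    print(get_llm_cache())  # the configured cache; None when no cache has been set

    set_debug(False)
    set_llm_cache(None)     # disable LLM caching again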
langchain_core/indexing/api.py
CHANGED
@@ -56,7 +56,7 @@ def _warn_about_sha1() -> None:
         "that map to the same fingerprint. If this matters in your "
         "threat model, switch to a stronger algorithm such "
         "as 'blake2b', 'sha256', or 'sha512' by specifying "
-        " `key_encoder` parameter in the
+        " `key_encoder` parameter in the `index` or `aindex` function. ",
         category=UserWarning,
         stacklevel=2,
     )
@@ -185,6 +185,9 @@ def _get_document_with_hash(
         When changing the key encoder, you must change the
         index as well to avoid duplicated documents in the cache.

+    Raises:
+        ValueError: If the metadata cannot be serialized using json.
+
     Returns:
         Document with a unique identifier based on the hash of the content and metadata.
     """
@@ -291,22 +294,26 @@ def index(
         documents were deleted, which documents should be skipped.

     For the time being, documents are indexed using their hashes, and users
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    are not able to specify the uid of the document.
+
+    .. versionchanged:: 0.3.25
+        Added ``scoped_full`` cleanup mode.
+
+    .. important::
+
+       * In full mode, the loader should be returning
+         the entire dataset, and not just a subset of the dataset.
+         Otherwise, the auto_cleanup will remove documents that it is not
+         supposed to.
+       * In incremental mode, if documents associated with a particular
+         source id appear across different batches, the indexing API
+         will do some redundant work. This will still result in the
+         correct end state of the index, but will unfortunately not be
+         100% efficient. For example, if a given document is split into 15
+         chunks, and we index them using a batch size of 5, we'll have 3 batches
+         all with the same source id. In general, to avoid doing too much
+         redundant work select as big a batch size as possible.
+       * The ``scoped_full`` mode is suitable if determining an appropriate batch size
         is challenging or if your data loader cannot return the entire dataset at
         once. This mode keeps track of source IDs in memory, which should be fine
         for most use cases. If your dataset is large (10M+ docs), you will likely
@@ -315,23 +322,22 @@ def index(
     Args:
         docs_source: Data loader or iterable of documents to index.
         record_manager: Timestamped set to keep track of which documents were
-
+            updated.
         vector_store: VectorStore or DocumentIndex to index the documents into.
         batch_size: Batch size to use when indexing. Default is 100.
         cleanup: How to handle clean up of documents. Default is None.
+
         - incremental: Cleans up all documents that haven't been updated AND
-
-
-
-            to minimize the probability of users seeing duplicated
-            content.
+          that are associated with source ids that were seen during indexing.
+          Clean up is done continuously during indexing helping to minimize the
+          probability of users seeing duplicated content.
         - full: Delete all documents that have not been returned by the loader
-
-
-
+          during this run of indexing.
+          Clean up runs after all documents have been indexed.
+          This means that users may see duplicated content during indexing.
         - scoped_full: Similar to Full, but only deletes all documents
-
-
+          that haven't been updated AND that are associated with
+          source ids that were seen during indexing.
         - None: Do not delete any documents.
         source_id_key: Optional key that helps identify the original source
             of the document. Default is None.
@@ -358,10 +364,9 @@ def index(
             When changing the key encoder, you must change the
             index as well to avoid duplicated documents in the cache.
         upsert_kwargs: Additional keyword arguments to pass to the add_documents
-
-
-
-            upsert_kwargs={"vector_field": "embedding"}
+            method of the VectorStore or the upsert method of the DocumentIndex.
+            For example, you can use this to specify a custom vector_field:
+            upsert_kwargs={"vector_field": "embedding"}
            .. versionadded:: 0.3.10

     Returns:
@@ -374,10 +379,9 @@ def index(
         ValueError: If vectorstore does not have
             "delete" and "add_documents" required methods.
         ValueError: If source_id_key is not None, but is not a string or callable.
-
-
-
-        * Added `scoped_full` cleanup mode.
+        TypeError: If ``vectorstore`` is not a VectorStore or a DocumentIndex.
+        AssertionError: If ``source_id`` is None when cleanup mode is incremental.
+            (should be unreachable code).
     """
     # Behavior is deprecated, but we keep it for backwards compatibility.
     # # Warn only once per process.
@@ -632,46 +636,50 @@ async def aindex(
         documents were deleted, which documents should be skipped.

     For the time being, documents are indexed using their hashes, and users
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    are not able to specify the uid of the document.
+
+    .. versionchanged:: 0.3.25
+        Added ``scoped_full`` cleanup mode.
+
+    .. important::
+
+       * In full mode, the loader should be returning
+         the entire dataset, and not just a subset of the dataset.
+         Otherwise, the auto_cleanup will remove documents that it is not
+         supposed to.
+       * In incremental mode, if documents associated with a particular
+         source id appear across different batches, the indexing API
+         will do some redundant work. This will still result in the
+         correct end state of the index, but will unfortunately not be
+         100% efficient. For example, if a given document is split into 15
+         chunks, and we index them using a batch size of 5, we'll have 3 batches
+         all with the same source id. In general, to avoid doing too much
+         redundant work select as big a batch size as possible.
+       * The ``scoped_full`` mode is suitable if determining an appropriate batch size
+         is challenging or if your data loader cannot return the entire dataset at
+         once. This mode keeps track of source IDs in memory, which should be fine
+         for most use cases. If your dataset is large (10M+ docs), you will likely
+         need to parallelize the indexing process regardless.

     Args:
         docs_source: Data loader or iterable of documents to index.
         record_manager: Timestamped set to keep track of which documents were
-
+            updated.
         vector_store: VectorStore or DocumentIndex to index the documents into.
         batch_size: Batch size to use when indexing. Default is 100.
         cleanup: How to handle clean up of documents. Default is None.
+
         - incremental: Cleans up all documents that haven't been updated AND
-
-
-
-
-
-
-
-            This means that users may see duplicated content during indexing.
+          that are associated with source ids that were seen during indexing.
+          Clean up is done continuously during indexing helping to minimize the
+          probability of users seeing duplicated content.
+        - full: Delete all documents that have not been returned by the loader
+          during this run of indexing.
+          Clean up runs after all documents have been indexed.
+          This means that users may see duplicated content during indexing.
         - scoped_full: Similar to Full, but only deletes all documents
-
-
+          that haven't been updated AND that are associated with
+          source ids that were seen during indexing.
         - None: Do not delete any documents.
         source_id_key: Optional key that helps identify the original source
             of the document. Default is None.
@@ -680,6 +688,12 @@ async def aindex(
         force_update: Force update documents even if they are present in the
             record manager. Useful if you are re-indexing with updated embeddings.
             Default is False.
+        key_encoder: Hashing algorithm to use for hashing the document content and
+            metadata. Default is "sha1".
+            Other options include "blake2b", "sha256", and "sha512".
+
+            .. versionadded:: 0.3.66
+
         key_encoder: Hashing algorithm to use for hashing the document.
             If not provided, a default encoder using SHA-1 will be used.
             SHA-1 is not collision-resistant, and a motivated attacker
@@ -691,11 +705,10 @@ async def aindex(

             When changing the key encoder, you must change the
             index as well to avoid duplicated documents in the cache.
-        upsert_kwargs: Additional keyword arguments to pass to the
-
-
-
-            upsert_kwargs={"vector_field": "embedding"}
+        upsert_kwargs: Additional keyword arguments to pass to the add_documents
+            method of the VectorStore or the upsert method of the DocumentIndex.
+            For example, you can use this to specify a custom vector_field:
+            upsert_kwargs={"vector_field": "embedding"}
            .. versionadded:: 0.3.10

     Returns:
@@ -708,10 +721,9 @@ async def aindex(
         ValueError: If vectorstore does not have
             "adelete" and "aadd_documents" required methods.
         ValueError: If source_id_key is not None, but is not a string or callable.
-
-
-
-        * Added `scoped_full` cleanup mode.
+        TypeError: If ``vector_store`` is not a VectorStore or DocumentIndex.
+        AssertionError: If ``source_id_key`` is None when cleanup mode is
+            incremental or ``scoped_full`` (should be unreachable).
     """
     # Behavior is deprecated, but we keep it for backwards compatibility.
     # # Warn only once per process.