langchain-core 1.0.0a2__py3-none-any.whl → 1.0.0a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (130)
  1. langchain_core/_api/beta_decorator.py +17 -40
  2. langchain_core/_api/deprecation.py +20 -7
  3. langchain_core/_api/path.py +19 -2
  4. langchain_core/_import_utils.py +7 -0
  5. langchain_core/agents.py +10 -6
  6. langchain_core/callbacks/base.py +28 -15
  7. langchain_core/callbacks/manager.py +81 -69
  8. langchain_core/callbacks/usage.py +4 -2
  9. langchain_core/chat_history.py +29 -21
  10. langchain_core/document_loaders/base.py +34 -9
  11. langchain_core/document_loaders/langsmith.py +3 -0
  12. langchain_core/documents/base.py +35 -10
  13. langchain_core/documents/transformers.py +4 -2
  14. langchain_core/embeddings/fake.py +8 -5
  15. langchain_core/env.py +2 -3
  16. langchain_core/example_selectors/base.py +12 -0
  17. langchain_core/exceptions.py +7 -0
  18. langchain_core/globals.py +17 -28
  19. langchain_core/indexing/api.py +57 -45
  20. langchain_core/indexing/base.py +5 -8
  21. langchain_core/indexing/in_memory.py +23 -3
  22. langchain_core/language_models/__init__.py +6 -2
  23. langchain_core/language_models/_utils.py +27 -5
  24. langchain_core/language_models/base.py +33 -21
  25. langchain_core/language_models/chat_models.py +99 -27
  26. langchain_core/language_models/fake_chat_models.py +5 -7
  27. langchain_core/language_models/llms.py +54 -20
  28. langchain_core/load/dump.py +2 -3
  29. langchain_core/load/load.py +15 -1
  30. langchain_core/load/serializable.py +38 -43
  31. langchain_core/memory.py +7 -3
  32. langchain_core/messages/__init__.py +1 -1
  33. langchain_core/messages/ai.py +41 -34
  34. langchain_core/messages/base.py +16 -7
  35. langchain_core/messages/block_translators/__init__.py +10 -8
  36. langchain_core/messages/block_translators/anthropic.py +3 -1
  37. langchain_core/messages/block_translators/bedrock.py +3 -1
  38. langchain_core/messages/block_translators/bedrock_converse.py +3 -1
  39. langchain_core/messages/block_translators/google_genai.py +3 -1
  40. langchain_core/messages/block_translators/google_vertexai.py +3 -1
  41. langchain_core/messages/block_translators/groq.py +3 -1
  42. langchain_core/messages/block_translators/ollama.py +3 -1
  43. langchain_core/messages/block_translators/openai.py +50 -20
  44. langchain_core/messages/content.py +23 -13
  45. langchain_core/messages/human.py +2 -13
  46. langchain_core/messages/system.py +2 -6
  47. langchain_core/messages/tool.py +34 -14
  48. langchain_core/messages/utils.py +186 -73
  49. langchain_core/output_parsers/base.py +5 -2
  50. langchain_core/output_parsers/json.py +4 -4
  51. langchain_core/output_parsers/list.py +7 -22
  52. langchain_core/output_parsers/openai_functions.py +3 -0
  53. langchain_core/output_parsers/openai_tools.py +6 -1
  54. langchain_core/output_parsers/pydantic.py +4 -0
  55. langchain_core/output_parsers/string.py +5 -1
  56. langchain_core/output_parsers/xml.py +19 -19
  57. langchain_core/outputs/chat_generation.py +18 -7
  58. langchain_core/outputs/generation.py +14 -3
  59. langchain_core/outputs/llm_result.py +8 -1
  60. langchain_core/prompt_values.py +10 -4
  61. langchain_core/prompts/base.py +6 -11
  62. langchain_core/prompts/chat.py +88 -60
  63. langchain_core/prompts/dict.py +16 -8
  64. langchain_core/prompts/few_shot.py +9 -11
  65. langchain_core/prompts/few_shot_with_templates.py +5 -1
  66. langchain_core/prompts/image.py +12 -5
  67. langchain_core/prompts/loading.py +2 -2
  68. langchain_core/prompts/message.py +5 -6
  69. langchain_core/prompts/pipeline.py +13 -8
  70. langchain_core/prompts/prompt.py +22 -8
  71. langchain_core/prompts/string.py +18 -10
  72. langchain_core/prompts/structured.py +7 -2
  73. langchain_core/rate_limiters.py +2 -2
  74. langchain_core/retrievers.py +7 -6
  75. langchain_core/runnables/base.py +387 -246
  76. langchain_core/runnables/branch.py +11 -28
  77. langchain_core/runnables/config.py +20 -17
  78. langchain_core/runnables/configurable.py +34 -19
  79. langchain_core/runnables/fallbacks.py +20 -13
  80. langchain_core/runnables/graph.py +48 -38
  81. langchain_core/runnables/graph_ascii.py +40 -17
  82. langchain_core/runnables/graph_mermaid.py +54 -25
  83. langchain_core/runnables/graph_png.py +27 -31
  84. langchain_core/runnables/history.py +55 -58
  85. langchain_core/runnables/passthrough.py +44 -21
  86. langchain_core/runnables/retry.py +44 -23
  87. langchain_core/runnables/router.py +9 -8
  88. langchain_core/runnables/schema.py +9 -0
  89. langchain_core/runnables/utils.py +53 -90
  90. langchain_core/stores.py +19 -31
  91. langchain_core/sys_info.py +9 -8
  92. langchain_core/tools/base.py +36 -27
  93. langchain_core/tools/convert.py +25 -14
  94. langchain_core/tools/simple.py +36 -8
  95. langchain_core/tools/structured.py +25 -12
  96. langchain_core/tracers/base.py +2 -2
  97. langchain_core/tracers/context.py +5 -1
  98. langchain_core/tracers/core.py +110 -46
  99. langchain_core/tracers/evaluation.py +22 -26
  100. langchain_core/tracers/event_stream.py +97 -42
  101. langchain_core/tracers/langchain.py +12 -3
  102. langchain_core/tracers/langchain_v1.py +10 -2
  103. langchain_core/tracers/log_stream.py +56 -17
  104. langchain_core/tracers/root_listeners.py +4 -20
  105. langchain_core/tracers/run_collector.py +6 -16
  106. langchain_core/tracers/schemas.py +5 -1
  107. langchain_core/utils/aiter.py +14 -6
  108. langchain_core/utils/env.py +3 -0
  109. langchain_core/utils/function_calling.py +46 -20
  110. langchain_core/utils/interactive_env.py +6 -2
  111. langchain_core/utils/iter.py +12 -5
  112. langchain_core/utils/json.py +12 -3
  113. langchain_core/utils/json_schema.py +156 -40
  114. langchain_core/utils/loading.py +5 -1
  115. langchain_core/utils/mustache.py +25 -16
  116. langchain_core/utils/pydantic.py +38 -9
  117. langchain_core/utils/utils.py +25 -9
  118. langchain_core/vectorstores/base.py +7 -20
  119. langchain_core/vectorstores/in_memory.py +20 -14
  120. langchain_core/vectorstores/utils.py +18 -12
  121. langchain_core/version.py +1 -1
  122. langchain_core-1.0.0a3.dist-info/METADATA +77 -0
  123. langchain_core-1.0.0a3.dist-info/RECORD +181 -0
  124. langchain_core/beta/__init__.py +0 -1
  125. langchain_core/beta/runnables/__init__.py +0 -1
  126. langchain_core/beta/runnables/context.py +0 -448
  127. langchain_core-1.0.0a2.dist-info/METADATA +0 -106
  128. langchain_core-1.0.0a2.dist-info/RECORD +0 -184
  129. {langchain_core-1.0.0a2.dist-info → langchain_core-1.0.0a3.dist-info}/WHEEL +0 -0
  130. {langchain_core-1.0.0a2.dist-info → langchain_core-1.0.0a3.dist-info}/entry_points.txt +0 -0
@@ -82,7 +82,7 @@ class Blob(BaseMedia):
  blob = Blob.from_data(
  data="Hello, world!",
  mime_type="text/plain",
- metadata={"source": "https://example.com"}
+ metadata={"source": "https://example.com"},
  )

  Example: Load the blob from a file
@@ -145,7 +145,14 @@ class Blob(BaseMedia):
  return values

  def as_string(self) -> str:
- """Read data as a string."""
+ """Read data as a string.
+
+ Raises:
+ ValueError: If the blob cannot be represented as a string.
+
+ Returns:
+ The data as a string.
+ """
  if self.data is None and self.path:
  return Path(self.path).read_text(encoding=self.encoding)
  if isinstance(self.data, bytes):
@@ -156,7 +163,14 @@ class Blob(BaseMedia):
  raise ValueError(msg)

  def as_bytes(self) -> bytes:
- """Read data as bytes."""
+ """Read data as bytes.
+
+ Raises:
+ ValueError: If the blob cannot be represented as bytes.
+
+ Returns:
+ The data as bytes.
+ """
  if isinstance(self.data, bytes):
  return self.data
  if isinstance(self.data, str):
@@ -168,7 +182,14 @@ class Blob(BaseMedia):

  @contextlib.contextmanager
  def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]:
- """Read data as a byte stream."""
+ """Read data as a byte stream.
+
+ Raises:
+ NotImplementedError: If the blob cannot be represented as a byte stream.
+
+ Yields:
+ The data as a byte stream.
+ """
  if isinstance(self.data, bytes):
  yield BytesIO(self.data)
  elif self.data is None and self.path:
@@ -246,7 +267,7 @@ class Blob(BaseMedia):
  )

  def __repr__(self) -> str:
- """Define the blob representation."""
+ """Return the blob representation."""
  str_repr = f"Blob {id(self)}"
  if self.source:
  str_repr += f" {self.source}"
@@ -263,8 +284,7 @@ class Document(BaseMedia):
  from langchain_core.documents import Document

  document = Document(
- page_content="Hello, world!",
- metadata={"source": "https://example.com"}
+ page_content="Hello, world!", metadata={"source": "https://example.com"}
  )

  """
@@ -281,19 +301,24 @@ class Document(BaseMedia):

  @classmethod
  def is_lc_serializable(cls) -> bool:
- """Return whether this class is serializable."""
+ """Return True as this class is serializable."""
  return True

  @classmethod
  def get_lc_namespace(cls) -> list[str]:
  """Get the namespace of the langchain object.

- Default namespace is ["langchain", "schema", "document"].
+ Returns:
+ ["langchain", "schema", "document"]
  """
  return ["langchain", "schema", "document"]

  def __str__(self) -> str:
- """Override __str__ to restrict it to page_content and metadata."""
+ """Override __str__ to restrict it to page_content and metadata.
+
+ Returns:
+ A string representation of the Document.
+ """
  # The format matches pydantic format for __str__.
  #
  # The purpose of this change is to make sure that user code that
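
The docstring updates above only document existing behavior of Blob. As a rough orientation, a minimal usage sketch of that documented behavior (not taken from the package itself) might look like this:

    from langchain_core.documents import Blob

    # Build a blob from in-memory data (mirrors the docstring example above).
    blob = Blob.from_data(
        data="Hello, world!",
        mime_type="text/plain",
        metadata={"source": "https://example.com"},
    )
    print(blob.as_string())  # "Hello, world!"
    print(blob.as_bytes())   # b"Hello, world!"

    # Bytes- or path-backed blobs can also be read as a stream; other cases
    # raise NotImplementedError, per the new Raises section.
    with Blob.from_data(data=b"Hello, world!").as_bytes_io() as stream:
        print(stream.read())
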
@@ -38,7 +38,9 @@ class BaseDocumentTransformer(ABC):
  self.embeddings, stateful_documents
  )
  included_idxs = _filter_similar_embeddings(
- embedded_documents, self.similarity_fn, self.similarity_threshold
+ embedded_documents,
+ self.similarity_fn,
+ self.similarity_threshold,
  )
  return [stateful_documents[i] for i in sorted(included_idxs)]

@@ -47,7 +49,7 @@ class BaseDocumentTransformer(ABC):
  ) -> Sequence[Document]:
  raise NotImplementedError

- """ # noqa: E501
+ """

  @abstractmethod
  def transform_documents(
@@ -1,6 +1,7 @@
  """Module contains a few fake embedding models for testing purposes."""

  # Please do not add additional fake embedding model implementations here.
+ import contextlib
  import hashlib

  from pydantic import BaseModel
@@ -8,6 +9,9 @@ from typing_extensions import override

  from langchain_core.embeddings import Embeddings

+ with contextlib.suppress(ImportError):
+ import numpy as np
+

  class FakeEmbeddings(Embeddings, BaseModel):
  """Fake embedding model for unit testing purposes.
@@ -20,6 +24,7 @@ class FakeEmbeddings(Embeddings, BaseModel):
  .. code-block:: python

  from langchain_core.embeddings import FakeEmbeddings
+
  embed = FakeEmbeddings(size=100)

  Embed single text:
@@ -53,8 +58,6 @@ class FakeEmbeddings(Embeddings, BaseModel):
  """The size of the embedding vector."""

  def _get_embedding(self) -> list[float]:
- import numpy as np
-
  return list(np.random.default_rng().normal(size=self.size))

  @override
@@ -78,6 +81,7 @@ class DeterministicFakeEmbedding(Embeddings, BaseModel):
  .. code-block:: python

  from langchain_core.embeddings import DeterministicFakeEmbedding
+
  embed = DeterministicFakeEmbedding(size=100)

  Embed single text:
@@ -111,13 +115,12 @@ class DeterministicFakeEmbedding(Embeddings, BaseModel):
  """The size of the embedding vector."""

  def _get_embedding(self, seed: int) -> list[float]:
- import numpy as np
-
  # set the seed for the random generator
  rng = np.random.default_rng(seed)
  return list(rng.normal(size=self.size))

- def _get_seed(self, text: str) -> int:
+ @staticmethod
+ def _get_seed(text: str) -> int:
  """Get a seed for the random generator, using the hash of the text."""
  return int(hashlib.sha256(text.encode("utf-8")).hexdigest(), 16) % 10**8
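
Both fakes expose the standard Embeddings interface, as the docstrings above suggest; a small usage sketch (not part of the diff) could be:

    from langchain_core.embeddings import DeterministicFakeEmbedding, FakeEmbeddings

    # Random vectors of a fixed size; values differ on every call.
    fake = FakeEmbeddings(size=100)
    assert len(fake.embed_query("hello")) == 100

    # Deterministic variant: the text is hashed into a seed, so the same
    # input always yields the same vector (handy for reproducible tests).
    det = DeterministicFakeEmbedding(size=100)
    assert det.embed_query("hello") == det.embed_query("hello")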
 
langchain_core/env.py CHANGED
@@ -3,6 +3,8 @@
  import platform
  from functools import lru_cache

+ from langchain_core import __version__
+

  @lru_cache(maxsize=1)
  def get_runtime_environment() -> dict:
@@ -11,9 +13,6 @@ def get_runtime_environment() -> dict:
  Returns:
  A dictionary with information about the runtime environment.
  """
- # Lazy import to avoid circular imports
- from langchain_core import __version__
-
  return {
  "library_version": __version__,
  "library": "langchain-core",
@@ -16,6 +16,9 @@ class BaseExampleSelector(ABC):
  Args:
  example: A dictionary with keys as input variables
  and values as their values.
+
+ Returns:
+ Any return value.
  """

  async def aadd_example(self, example: dict[str, str]) -> Any:
@@ -24,6 +27,9 @@ class BaseExampleSelector(ABC):
  Args:
  example: A dictionary with keys as input variables
  and values as their values.
+
+ Returns:
+ Any return value.
  """
  return await run_in_executor(None, self.add_example, example)

@@ -34,6 +40,9 @@ class BaseExampleSelector(ABC):
  Args:
  input_variables: A dictionary with keys as input variables
  and values as their values.
+
+ Returns:
+ A list of examples.
  """

  async def aselect_examples(self, input_variables: dict[str, str]) -> list[dict]:
@@ -42,5 +51,8 @@ class BaseExampleSelector(ABC):
  Args:
  input_variables: A dictionary with keys as input variables
  and values as their values.
+
+ Returns:
+ A list of examples.
  """
  return await run_in_executor(None, self.select_examples, input_variables)
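
The updated docstrings spell out the sync interface that the async defaults delegate to. A minimal custom selector, assuming only `add_example` and `select_examples` need overriding (the class name here is made up for illustration):

    from typing import Any

    from langchain_core.example_selectors import BaseExampleSelector


    class FirstNExampleSelector(BaseExampleSelector):
        """Toy selector that returns the first N stored examples."""

        def __init__(self, n: int = 2) -> None:
            self.n = n
            self.examples: list[dict[str, str]] = []

        def add_example(self, example: dict[str, str]) -> Any:
            self.examples.append(example)

        def select_examples(self, input_variables: dict[str, str]) -> list[dict]:
            # aadd_example / aselect_examples fall back to these sync methods
            # via run_in_executor, as shown in the hunks above.
            return self.examples[: self.n]
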
@@ -42,6 +42,10 @@ class OutputParserException(ValueError, LangChainException): # noqa: N818
  previous output was improperly structured, in the hopes that it will
  update the output to the correct format.
  Defaults to False.
+
+ Raises:
+ ValueError: If ``send_to_llm`` is True but either observation or
+ ``llm_output`` are not provided.
  """
  if isinstance(error, str):
  error = create_message(
@@ -77,6 +81,9 @@ def create_message(*, message: str, error_code: ErrorCode) -> str:
  Args:
  message: The message to display.
  error_code: The error code to display.
+
+ Returns:
+ The full message with the troubleshooting link.
  """
  return (
  f"{message}\n"
langchain_core/globals.py CHANGED
@@ -6,6 +6,13 @@ from typing import TYPE_CHECKING, Optional
  if TYPE_CHECKING:
  from langchain_core.caches import BaseCache

+ try:
+ import langchain # type: ignore[import-not-found]
+
+ _HAS_LANGCHAIN = True
+ except ImportError:
+ _HAS_LANGCHAIN = False
+

  # DO NOT USE THESE VALUES DIRECTLY!
  # Use them only via `get_<X>()` and `set_<X>()` below,
@@ -22,9 +29,7 @@ def set_verbose(value: bool) -> None: # noqa: FBT001
  Args:
  value: The new value for the `verbose` global setting.
  """
- try:
- import langchain # type: ignore[import-not-found]
-
+ if _HAS_LANGCHAIN:
  # We're about to run some deprecated code, don't report warnings from it.
  # The user called the correct (non-deprecated) code path and shouldn't get
  # warnings.
@@ -43,8 +48,6 @@ def set_verbose(value: bool) -> None: # noqa: FBT001
  # Remove it once `langchain.verbose` is no longer supported, and once all
  # users have migrated to using `set_verbose()` here.
  langchain.verbose = value
- except ImportError:
- pass

  global _verbose # noqa: PLW0603
  _verbose = value
@@ -56,9 +59,7 @@ def get_verbose() -> bool:
  Returns:
  The value of the `verbose` global setting.
  """
- try:
- import langchain
-
+ if _HAS_LANGCHAIN:
  # We're about to run some deprecated code, don't report warnings from it.
  # The user called the correct (non-deprecated) code path and shouldn't get
  # warnings.
@@ -83,7 +84,7 @@ def get_verbose() -> bool:
  # deprecation warnings directing them to use `set_verbose()` when they
  # import `langchain.verbose`.
  old_verbose = langchain.verbose
- except ImportError:
+ else:
  old_verbose = False

  return _verbose or old_verbose
@@ -95,9 +96,7 @@ def set_debug(value: bool) -> None: # noqa: FBT001
  Args:
  value: The new value for the `debug` global setting.
  """
- try:
- import langchain
-
+ if _HAS_LANGCHAIN:
  # We're about to run some deprecated code, don't report warnings from it.
  # The user called the correct (non-deprecated) code path and shouldn't get
  # warnings.
@@ -114,8 +113,6 @@ def set_debug(value: bool) -> None: # noqa: FBT001
  # Remove it once `langchain.debug` is no longer supported, and once all
  # users have migrated to using `set_debug()` here.
  langchain.debug = value
- except ImportError:
- pass

  global _debug # noqa: PLW0603
  _debug = value
@@ -127,9 +124,7 @@ def get_debug() -> bool:
  Returns:
  The value of the `debug` global setting.
  """
- try:
- import langchain
-
+ if _HAS_LANGCHAIN:
  # We're about to run some deprecated code, don't report warnings from it.
  # The user called the correct (non-deprecated) code path and shouldn't get
  # warnings.
@@ -151,7 +146,7 @@ def get_debug() -> bool:
  # to using `set_debug()` yet. Those users are getting deprecation warnings
  # directing them to use `set_debug()` when they import `langchain.debug`.
  old_debug = langchain.debug
- except ImportError:
+ else:
  old_debug = False

  return _debug or old_debug
@@ -163,9 +158,7 @@ def set_llm_cache(value: Optional["BaseCache"]) -> None:
  Args:
  value: The new LLM cache to use. If `None`, the LLM cache is disabled.
  """
- try:
- import langchain
-
+ if _HAS_LANGCHAIN:
  # We're about to run some deprecated code, don't report warnings from it.
  # The user called the correct (non-deprecated) code path and shouldn't get
  # warnings.
@@ -184,22 +177,18 @@ def set_llm_cache(value: Optional["BaseCache"]) -> None:
  # Remove it once `langchain.llm_cache` is no longer supported, and
  # once all users have migrated to using `set_llm_cache()` here.
  langchain.llm_cache = value
- except ImportError:
- pass

  global _llm_cache # noqa: PLW0603
  _llm_cache = value


- def get_llm_cache() -> "BaseCache":
+ def get_llm_cache() -> Optional["BaseCache"]:
  """Get the value of the `llm_cache` global setting.

  Returns:
  The value of the `llm_cache` global setting.
  """
- try:
- import langchain
-
+ if _HAS_LANGCHAIN:
  # We're about to run some deprecated code, don't report warnings from it.
  # The user called the correct (non-deprecated) code path and shouldn't get
  # warnings.
@@ -225,7 +214,7 @@ def get_llm_cache() -> "BaseCache":
  # Those users are getting deprecation warnings directing them
  # to use `set_llm_cache()` when they import `langchain.llm_cache`.
  old_llm_cache = langchain.llm_cache
- except ImportError:
+ else:
  old_llm_cache = None

  return _llm_cache or old_llm_cache
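
For callers, behavior is unchanged; the optional `langchain` mirroring now sits behind the module-level `_HAS_LANGCHAIN` flag, and `get_llm_cache` advertises that it can return `None`. A small hedged usage sketch:

    from langchain_core.caches import InMemoryCache
    from langchain_core.globals import get_debug, get_llm_cache, set_debug, set_llm_cache

    set_debug(True)          # mirrored onto langchain.debug only if langchain is importable
    assert get_debug() is True

    set_llm_cache(InMemoryCache())
    cache = get_llm_cache()  # now typed Optional["BaseCache"]; None when no cache is set
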
@@ -56,7 +56,7 @@ def _warn_about_sha1() -> None:
  "that map to the same fingerprint. If this matters in your "
  "threat model, switch to a stronger algorithm such "
  "as 'blake2b', 'sha256', or 'sha512' by specifying "
- " `key_encoder` parameter in the the `index` or `aindex` function. ",
+ " `key_encoder` parameter in the `index` or `aindex` function. ",
  category=UserWarning,
  stacklevel=2,
  )
@@ -185,6 +185,9 @@ def _get_document_with_hash(
  When changing the key encoder, you must change the
  index as well to avoid duplicated documents in the cache.

+ Raises:
+ ValueError: If the metadata cannot be serialized using json.
+
  Returns:
  Document with a unique identifier based on the hash of the content and metadata.
  """
@@ -291,21 +294,21 @@ def index(
  documents were deleted, which documents should be skipped.

  For the time being, documents are indexed using their hashes, and users
- are not able to specify the uid of the document.
+ are not able to specify the uid of the document.

  Important:
- * In full mode, the loader should be returning
- the entire dataset, and not just a subset of the dataset.
- Otherwise, the auto_cleanup will remove documents that it is not
- supposed to.
- * In incremental mode, if documents associated with a particular
- source id appear across different batches, the indexing API
- will do some redundant work. This will still result in the
- correct end state of the index, but will unfortunately not be
- 100% efficient. For example, if a given document is split into 15
- chunks, and we index them using a batch size of 5, we'll have 3 batches
- all with the same source id. In general, to avoid doing too much
- redundant work select as big a batch size as possible.
+ * In full mode, the loader should be returning
+ the entire dataset, and not just a subset of the dataset.
+ Otherwise, the auto_cleanup will remove documents that it is not
+ supposed to.
+ * In incremental mode, if documents associated with a particular
+ source id appear across different batches, the indexing API
+ will do some redundant work. This will still result in the
+ correct end state of the index, but will unfortunately not be
+ 100% efficient. For example, if a given document is split into 15
+ chunks, and we index them using a batch size of 5, we'll have 3 batches
+ all with the same source id. In general, to avoid doing too much
+ redundant work select as big a batch size as possible.
  * The `scoped_full` mode is suitable if determining an appropriate batch size
  is challenging or if your data loader cannot return the entire dataset at
  once. This mode keeps track of source IDs in memory, which should be fine
@@ -315,23 +318,22 @@ def index(
  Args:
  docs_source: Data loader or iterable of documents to index.
  record_manager: Timestamped set to keep track of which documents were
- updated.
+ updated.
  vector_store: VectorStore or DocumentIndex to index the documents into.
  batch_size: Batch size to use when indexing. Default is 100.
  cleanup: How to handle clean up of documents. Default is None.
+
  - incremental: Cleans up all documents that haven't been updated AND
- that are associated with source ids that were seen
- during indexing.
- Clean up is done continuously during indexing helping
- to minimize the probability of users seeing duplicated
- content.
+ that are associated with source ids that were seen during indexing.
+ Clean up is done continuously during indexing helping to minimize the
+ probability of users seeing duplicated content.
  - full: Delete all documents that have not been returned by the loader
- during this run of indexing.
- Clean up runs after all documents have been indexed.
- This means that users may see duplicated content during indexing.
+ during this run of indexing.
+ Clean up runs after all documents have been indexed.
+ This means that users may see duplicated content during indexing.
  - scoped_full: Similar to Full, but only deletes all documents
- that haven't been updated AND that are associated with
- source ids that were seen during indexing.
+ that haven't been updated AND that are associated with
+ source ids that were seen during indexing.
  - None: Do not delete any documents.
  source_id_key: Optional key that helps identify the original source
  of the document. Default is None.
@@ -358,10 +360,9 @@ def index(
  When changing the key encoder, you must change the
  index as well to avoid duplicated documents in the cache.
  upsert_kwargs: Additional keyword arguments to pass to the add_documents
- method of the VectorStore or the upsert method of the
- DocumentIndex. For example, you can use this to
- specify a custom vector_field:
- upsert_kwargs={"vector_field": "embedding"}
+ method of the VectorStore or the upsert method of the DocumentIndex.
+ For example, you can use this to specify a custom vector_field:
+ upsert_kwargs={"vector_field": "embedding"}
  .. versionadded:: 0.3.10

  Returns:
@@ -374,6 +375,9 @@ def index(
  ValueError: If vectorstore does not have
  "delete" and "add_documents" required methods.
  ValueError: If source_id_key is not None, but is not a string or callable.
+ TypeError: If ``vectorstore`` is not a VectorStore or a DocumentIndex.
+ AssertionError: If ``source_id`` is None when cleanup mode is incremental.
+ (should be unreachable code).

  .. version_modified:: 0.3.25

@@ -656,22 +660,22 @@ async def aindex(
  Args:
  docs_source: Data loader or iterable of documents to index.
  record_manager: Timestamped set to keep track of which documents were
- updated.
+ updated.
  vector_store: VectorStore or DocumentIndex to index the documents into.
  batch_size: Batch size to use when indexing. Default is 100.
  cleanup: How to handle clean up of documents. Default is None.
+
  - incremental: Cleans up all documents that haven't been updated AND
- that are associated with source ids that were seen
- during indexing.
- Clean up is done continuously during indexing helping
- to minimize the probability of users seeing duplicated
- content.
- - full: Delete all documents that haven to been returned by the loader.
- Clean up runs after all documents have been indexed.
- This means that users may see duplicated content during indexing.
+ that are associated with source ids that were seen during indexing.
+ Clean up is done continuously during indexing helping to minimize the
+ probability of users seeing duplicated content.
+ - full: Delete all documents that have not been returned by the loader
+ during this run of indexing.
+ Clean up runs after all documents have been indexed.
+ This means that users may see duplicated content during indexing.
  - scoped_full: Similar to Full, but only deletes all documents
- that haven't been updated AND that are associated with
- source ids that were seen during indexing.
+ that haven't been updated AND that are associated with
+ source ids that were seen during indexing.
  - None: Do not delete any documents.
  source_id_key: Optional key that helps identify the original source
  of the document. Default is None.
@@ -680,6 +684,12 @@ async def aindex(
  force_update: Force update documents even if they are present in the
  record manager. Useful if you are re-indexing with updated embeddings.
  Default is False.
+ key_encoder: Hashing algorithm to use for hashing the document content and
+ metadata. Default is "sha1".
+ Other options include "blake2b", "sha256", and "sha512".
+
+ .. versionadded:: 0.3.66
+
  key_encoder: Hashing algorithm to use for hashing the document.
  If not provided, a default encoder using SHA-1 will be used.
  SHA-1 is not collision-resistant, and a motivated attacker
@@ -691,11 +701,10 @@ async def aindex(

  When changing the key encoder, you must change the
  index as well to avoid duplicated documents in the cache.
- upsert_kwargs: Additional keyword arguments to pass to the aadd_documents
- method of the VectorStore or the aupsert method of the
- DocumentIndex. For example, you can use this to
- specify a custom vector_field:
- upsert_kwargs={"vector_field": "embedding"}
+ upsert_kwargs: Additional keyword arguments to pass to the add_documents
+ method of the VectorStore or the upsert method of the DocumentIndex.
+ For example, you can use this to specify a custom vector_field:
+ upsert_kwargs={"vector_field": "embedding"}
  .. versionadded:: 0.3.10

  Returns:
@@ -708,6 +717,9 @@ async def aindex(
  ValueError: If vectorstore does not have
  "adelete" and "aadd_documents" required methods.
  ValueError: If source_id_key is not None, but is not a string or callable.
+ TypeError: If ``vector_store`` is not a VectorStore or DocumentIndex.
+ AssertionError: If ``source_id_key`` is None when cleanup mode is
+ incremental or ``scoped_full`` (should be unreachable).

  .. version_modified:: 0.3.25
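
Putting the reworked docstring together, a minimal hedged sketch of calling `index` with incremental cleanup and a non-SHA-1 key encoder (the in-memory components stand in for real stores):

    from langchain_core.documents import Document
    from langchain_core.embeddings import DeterministicFakeEmbedding
    from langchain_core.indexing import InMemoryRecordManager, index
    from langchain_core.vectorstores import InMemoryVectorStore

    record_manager = InMemoryRecordManager(namespace="demo")
    record_manager.create_schema()
    vector_store = InMemoryVectorStore(embedding=DeterministicFakeEmbedding(size=16))

    docs = [
        Document(page_content="Hello, world!", metadata={"source": "https://example.com"}),
    ]

    result = index(
        docs,
        record_manager,
        vector_store,
        cleanup="incremental",
        source_id_key="source",
        key_encoder="sha256",  # avoids the SHA-1 warning fixed above
    )
    # Re-running with the same documents reports them as skipped rather than re-added.
    print(result)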
 
@@ -7,6 +7,8 @@ import time
  from abc import ABC, abstractmethod
  from typing import TYPE_CHECKING, Any, Optional, TypedDict

+ from typing_extensions import override
+
  from langchain_core._api import beta
  from langchain_core.retrievers import BaseRetriever
  from langchain_core.runnables import run_in_executor
@@ -254,14 +256,14 @@ class InMemoryRecordManager(RecordManager):
  """In-memory schema creation is simply ensuring the structure is initialized."""

  async def acreate_schema(self) -> None:
- """Async in-memory schema creation is simply ensuring the structure is initialized.""" # noqa: E501
+ """In-memory schema creation is simply ensuring the structure is initialized."""

+ @override
  def get_time(self) -> float:
- """Get the current server time as a high resolution timestamp!"""
  return time.time()

+ @override
  async def aget_time(self) -> float:
- """Async get the current server time as a high resolution timestamp!"""
  return self.get_time()

  def update(
@@ -322,11 +324,6 @@ class InMemoryRecordManager(RecordManager):
  raise an error.
  This is meant to help prevent time-drift issues since
  time may not be monotonically increasing!
-
- Raises:
- ValueError: If the length of keys doesn't match the length of group
- ids.
- ValueError: If time_at_least is in the future.
  """
  self.update(keys, group_ids=group_ids, time_at_least=time_at_least)

@@ -32,7 +32,17 @@ class InMemoryDocumentIndex(DocumentIndex):

  @override
  def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
- """Upsert items into the index."""
+ """Upsert documents into the index.
+
+ Args:
+ items: Sequence of documents to add to the index.
+ **kwargs: Additional keyword arguments.
+
+ Returns:
+ A response object that contains the list of IDs that were
+ successfully added or updated in the index and the list of IDs that
+ failed to be added or updated.
+ """
  ok_ids = []

  for item in items:
@@ -51,7 +61,18 @@ class InMemoryDocumentIndex(DocumentIndex):

  @override
  def delete(self, ids: Optional[list[str]] = None, **kwargs: Any) -> DeleteResponse:
- """Delete by ID."""
+ """Delete by IDs.
+
+ Args:
+ ids: List of ids to delete.
+
+ Raises:
+ ValueError: If ids is None.
+
+ Returns:
+ A response object that contains the list of IDs that were successfully
+ deleted and the list of IDs that failed to be deleted.
+ """
  if ids is None:
  msg = "IDs must be provided for deletion"
  raise ValueError(msg)
@@ -69,7 +90,6 @@ class InMemoryDocumentIndex(DocumentIndex):

  @override
  def get(self, ids: Sequence[str], /, **kwargs: Any) -> list[Document]:
- """Get by ids."""
  return [self.store[id_] for id_ in ids if id_ in self.store]

  @override
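
The expanded docstrings describe the upsert/delete/get contract. A short hedged sketch (the import path is inferred from the file list above, and the example ID is made up):

    from langchain_core.documents import Document
    from langchain_core.indexing.in_memory import InMemoryDocumentIndex

    doc_index = InMemoryDocumentIndex()

    response = doc_index.upsert([Document(page_content="Hello, world!", id="doc-1")])
    print(response)                     # IDs that succeeded / failed to upsert

    print(doc_index.get(["doc-1"]))     # [Document(id="doc-1", ...)]
    print(doc_index.delete(["doc-1"]))  # passing ids=None raises ValueError, per the docstring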