langchain-core 0.3.75__py3-none-any.whl → 0.3.77__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of langchain-core might be problematic.

Files changed (119)
  1. langchain_core/_api/beta_decorator.py +22 -44
  2. langchain_core/_api/deprecation.py +30 -17
  3. langchain_core/_api/path.py +19 -2
  4. langchain_core/_import_utils.py +7 -0
  5. langchain_core/agents.py +10 -6
  6. langchain_core/beta/runnables/context.py +1 -2
  7. langchain_core/callbacks/base.py +28 -15
  8. langchain_core/callbacks/manager.py +83 -71
  9. langchain_core/callbacks/usage.py +6 -4
  10. langchain_core/chat_history.py +29 -21
  11. langchain_core/document_loaders/base.py +34 -9
  12. langchain_core/document_loaders/langsmith.py +4 -1
  13. langchain_core/documents/base.py +35 -10
  14. langchain_core/documents/transformers.py +4 -2
  15. langchain_core/embeddings/fake.py +8 -5
  16. langchain_core/env.py +2 -3
  17. langchain_core/example_selectors/base.py +12 -0
  18. langchain_core/exceptions.py +7 -0
  19. langchain_core/globals.py +17 -28
  20. langchain_core/indexing/api.py +88 -76
  21. langchain_core/indexing/base.py +5 -8
  22. langchain_core/indexing/in_memory.py +23 -3
  23. langchain_core/language_models/__init__.py +3 -2
  24. langchain_core/language_models/base.py +31 -20
  25. langchain_core/language_models/chat_models.py +98 -27
  26. langchain_core/language_models/fake_chat_models.py +10 -9
  27. langchain_core/language_models/llms.py +52 -18
  28. langchain_core/load/dump.py +2 -3
  29. langchain_core/load/load.py +15 -1
  30. langchain_core/load/serializable.py +39 -44
  31. langchain_core/memory.py +7 -3
  32. langchain_core/messages/ai.py +53 -24
  33. langchain_core/messages/base.py +43 -22
  34. langchain_core/messages/chat.py +4 -1
  35. langchain_core/messages/content_blocks.py +23 -2
  36. langchain_core/messages/function.py +9 -5
  37. langchain_core/messages/human.py +13 -10
  38. langchain_core/messages/modifier.py +1 -0
  39. langchain_core/messages/system.py +11 -8
  40. langchain_core/messages/tool.py +60 -29
  41. langchain_core/messages/utils.py +250 -131
  42. langchain_core/output_parsers/base.py +5 -2
  43. langchain_core/output_parsers/json.py +4 -4
  44. langchain_core/output_parsers/list.py +7 -22
  45. langchain_core/output_parsers/openai_functions.py +3 -0
  46. langchain_core/output_parsers/openai_tools.py +6 -1
  47. langchain_core/output_parsers/pydantic.py +4 -0
  48. langchain_core/output_parsers/string.py +5 -1
  49. langchain_core/output_parsers/xml.py +19 -19
  50. langchain_core/outputs/chat_generation.py +25 -10
  51. langchain_core/outputs/generation.py +14 -3
  52. langchain_core/outputs/llm_result.py +8 -1
  53. langchain_core/prompt_values.py +16 -6
  54. langchain_core/prompts/base.py +4 -9
  55. langchain_core/prompts/chat.py +89 -57
  56. langchain_core/prompts/dict.py +16 -8
  57. langchain_core/prompts/few_shot.py +12 -11
  58. langchain_core/prompts/few_shot_with_templates.py +5 -1
  59. langchain_core/prompts/image.py +12 -5
  60. langchain_core/prompts/message.py +5 -6
  61. langchain_core/prompts/pipeline.py +13 -8
  62. langchain_core/prompts/prompt.py +22 -8
  63. langchain_core/prompts/string.py +18 -10
  64. langchain_core/prompts/structured.py +7 -2
  65. langchain_core/rate_limiters.py +2 -2
  66. langchain_core/retrievers.py +7 -6
  67. langchain_core/runnables/base.py +406 -186
  68. langchain_core/runnables/branch.py +14 -19
  69. langchain_core/runnables/config.py +9 -15
  70. langchain_core/runnables/configurable.py +34 -19
  71. langchain_core/runnables/fallbacks.py +20 -13
  72. langchain_core/runnables/graph.py +48 -38
  73. langchain_core/runnables/graph_ascii.py +41 -18
  74. langchain_core/runnables/graph_mermaid.py +54 -25
  75. langchain_core/runnables/graph_png.py +27 -31
  76. langchain_core/runnables/history.py +55 -58
  77. langchain_core/runnables/passthrough.py +44 -21
  78. langchain_core/runnables/retry.py +44 -23
  79. langchain_core/runnables/router.py +9 -8
  80. langchain_core/runnables/schema.py +2 -0
  81. langchain_core/runnables/utils.py +51 -89
  82. langchain_core/stores.py +19 -31
  83. langchain_core/sys_info.py +9 -8
  84. langchain_core/tools/base.py +37 -28
  85. langchain_core/tools/convert.py +26 -15
  86. langchain_core/tools/simple.py +36 -8
  87. langchain_core/tools/structured.py +25 -12
  88. langchain_core/tracers/base.py +2 -2
  89. langchain_core/tracers/context.py +5 -1
  90. langchain_core/tracers/core.py +109 -39
  91. langchain_core/tracers/evaluation.py +22 -26
  92. langchain_core/tracers/event_stream.py +45 -34
  93. langchain_core/tracers/langchain.py +12 -3
  94. langchain_core/tracers/langchain_v1.py +10 -2
  95. langchain_core/tracers/log_stream.py +56 -17
  96. langchain_core/tracers/root_listeners.py +4 -20
  97. langchain_core/tracers/run_collector.py +6 -16
  98. langchain_core/tracers/schemas.py +5 -1
  99. langchain_core/utils/aiter.py +15 -7
  100. langchain_core/utils/env.py +3 -0
  101. langchain_core/utils/function_calling.py +50 -28
  102. langchain_core/utils/interactive_env.py +6 -2
  103. langchain_core/utils/iter.py +12 -4
  104. langchain_core/utils/json.py +12 -3
  105. langchain_core/utils/json_schema.py +156 -40
  106. langchain_core/utils/loading.py +5 -1
  107. langchain_core/utils/mustache.py +24 -15
  108. langchain_core/utils/pydantic.py +38 -9
  109. langchain_core/utils/utils.py +25 -9
  110. langchain_core/vectorstores/base.py +7 -20
  111. langchain_core/vectorstores/in_memory.py +23 -17
  112. langchain_core/vectorstores/utils.py +18 -12
  113. langchain_core/version.py +1 -1
  114. langchain_core-0.3.77.dist-info/METADATA +67 -0
  115. langchain_core-0.3.77.dist-info/RECORD +174 -0
  116. langchain_core-0.3.75.dist-info/METADATA +0 -106
  117. langchain_core-0.3.75.dist-info/RECORD +0 -174
  118. {langchain_core-0.3.75.dist-info → langchain_core-0.3.77.dist-info}/WHEEL +0 -0
  119. {langchain_core-0.3.75.dist-info → langchain_core-0.3.77.dist-info}/entry_points.txt +0 -0
langchain_core/documents/base.py CHANGED
@@ -82,7 +82,7 @@ class Blob(BaseMedia):
  blob = Blob.from_data(
  data="Hello, world!",
  mime_type="text/plain",
- metadata={"source": "https://example.com"}
+ metadata={"source": "https://example.com"},
  )

  Example: Load the blob from a file
@@ -145,7 +145,14 @@ class Blob(BaseMedia):
  return values

  def as_string(self) -> str:
- """Read data as a string."""
+ """Read data as a string.
+
+ Raises:
+ ValueError: If the blob cannot be represented as a string.
+
+ Returns:
+ The data as a string.
+ """
  if self.data is None and self.path:
  return Path(self.path).read_text(encoding=self.encoding)
  if isinstance(self.data, bytes):
@@ -156,7 +163,14 @@ class Blob(BaseMedia):
  raise ValueError(msg)

  def as_bytes(self) -> bytes:
- """Read data as bytes."""
+ """Read data as bytes.
+
+ Raises:
+ ValueError: If the blob cannot be represented as bytes.
+
+ Returns:
+ The data as bytes.
+ """
  if isinstance(self.data, bytes):
  return self.data
  if isinstance(self.data, str):
@@ -168,7 +182,14 @@ class Blob(BaseMedia):

  @contextlib.contextmanager
  def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]:
- """Read data as a byte stream."""
+ """Read data as a byte stream.
+
+ Raises:
+ NotImplementedError: If the blob cannot be represented as a byte stream.
+
+ Yields:
+ The data as a byte stream.
+ """
  if isinstance(self.data, bytes):
  yield BytesIO(self.data)
  elif self.data is None and self.path:
@@ -246,7 +267,7 @@ class Blob(BaseMedia):
  )

  def __repr__(self) -> str:
- """Define the blob representation."""
+ """Return the blob representation."""
  str_repr = f"Blob {id(self)}"
  if self.source:
  str_repr += f" {self.source}"
@@ -263,8 +284,7 @@ class Document(BaseMedia):
  from langchain_core.documents import Document

  document = Document(
- page_content="Hello, world!",
- metadata={"source": "https://example.com"}
+ page_content="Hello, world!", metadata={"source": "https://example.com"}
  )

  """
@@ -281,19 +301,24 @@ class Document(BaseMedia):

  @classmethod
  def is_lc_serializable(cls) -> bool:
- """Return whether this class is serializable."""
+ """Return True as this class is serializable."""
  return True

  @classmethod
  def get_lc_namespace(cls) -> list[str]:
  """Get the namespace of the langchain object.

- Default namespace is ["langchain", "schema", "document"].
+ Returns:
+ ["langchain", "schema", "document"]
  """
  return ["langchain", "schema", "document"]

  def __str__(self) -> str:
- """Override __str__ to restrict it to page_content and metadata."""
+ """Override __str__ to restrict it to page_content and metadata.
+
+ Returns:
+ A string representation of the Document.
+ """
  # The format matches pydantic format for __str__.
  #
  # The purpose of this change is to make sure that user code that
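Note: the Blob accessor docstrings above now spell out their Raises/Returns behavior. A minimal sketch of how those accessors behave in practice (data values are illustrative; requires only langchain-core):

    from langchain_core.documents.base import Blob

    # A blob built from in-memory text, mirroring the docstring example above.
    text_blob = Blob.from_data(
        data="Hello, world!",
        mime_type="text/plain",
        metadata={"source": "https://example.com"},
    )
    print(text_blob.as_string())  # "Hello, world!"
    print(text_blob.as_bytes())   # b"Hello, world!"

    # A blob built from raw bytes can also be read as a byte stream.
    byte_blob = Blob.from_data(data=b"Hello, world!")
    with byte_blob.as_bytes_io() as stream:
        print(stream.read())      # b"Hello, world!"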
langchain_core/documents/transformers.py CHANGED
@@ -38,7 +38,9 @@ class BaseDocumentTransformer(ABC):
  self.embeddings, stateful_documents
  )
  included_idxs = _filter_similar_embeddings(
- embedded_documents, self.similarity_fn, self.similarity_threshold
+ embedded_documents,
+ self.similarity_fn,
+ self.similarity_threshold,
  )
  return [stateful_documents[i] for i in sorted(included_idxs)]

@@ -47,7 +49,7 @@ class BaseDocumentTransformer(ABC):
  ) -> Sequence[Document]:
  raise NotImplementedError

- """ # noqa: E501
+ """

  @abstractmethod
  def transform_documents(
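Note: the hunks above only reflow a docstring example; the BaseDocumentTransformer interface itself is unchanged. A minimal sketch of a concrete transformer, assuming (as in current langchain-core) that only transform_documents is abstract; the class name and lower-casing logic are purely illustrative:

    from collections.abc import Sequence
    from typing import Any

    from langchain_core.documents import Document
    from langchain_core.documents.transformers import BaseDocumentTransformer


    class LowercaseTransformer(BaseDocumentTransformer):
        """Hypothetical transformer used only to illustrate the interface."""

        def transform_documents(
            self, documents: Sequence[Document], **kwargs: Any
        ) -> Sequence[Document]:
            # Return new Documents with lower-cased page_content, keeping metadata.
            return [
                Document(page_content=doc.page_content.lower(), metadata=doc.metadata)
                for doc in documents
            ]


    docs = [Document(page_content="Hello, World!")]
    print(LowercaseTransformer().transform_documents(docs)[0].page_content)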
langchain_core/embeddings/fake.py CHANGED
@@ -1,6 +1,7 @@
  """Module contains a few fake embedding models for testing purposes."""

  # Please do not add additional fake embedding model implementations here.
+ import contextlib
  import hashlib

  from pydantic import BaseModel
@@ -8,6 +9,9 @@ from typing_extensions import override

  from langchain_core.embeddings import Embeddings

+ with contextlib.suppress(ImportError):
+ import numpy as np
+

  class FakeEmbeddings(Embeddings, BaseModel):
  """Fake embedding model for unit testing purposes.
@@ -20,6 +24,7 @@ class FakeEmbeddings(Embeddings, BaseModel):
  .. code-block:: python

  from langchain_core.embeddings import FakeEmbeddings
+
  embed = FakeEmbeddings(size=100)

  Embed single text:
@@ -53,8 +58,6 @@ class FakeEmbeddings(Embeddings, BaseModel):
  """The size of the embedding vector."""

  def _get_embedding(self) -> list[float]:
- import numpy as np
-
  return list(np.random.default_rng().normal(size=self.size))

  @override
@@ -78,6 +81,7 @@ class DeterministicFakeEmbedding(Embeddings, BaseModel):
  .. code-block:: python

  from langchain_core.embeddings import DeterministicFakeEmbedding
+
  embed = DeterministicFakeEmbedding(size=100)

  Embed single text:
@@ -111,13 +115,12 @@ class DeterministicFakeEmbedding(Embeddings, BaseModel):
  """The size of the embedding vector."""

  def _get_embedding(self, seed: int) -> list[float]:
- import numpy as np
-
  # set the seed for the random generator
  rng = np.random.default_rng(seed)
  return list(rng.normal(size=self.size))

- def _get_seed(self, text: str) -> int:
+ @staticmethod
+ def _get_seed(text: str) -> int:
  """Get a seed for the random generator, using the hash of the text."""
  return int(hashlib.sha256(text.encode("utf-8")).hexdigest(), 16) % 10**8

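Note: the change above moves the numpy import to module level (guarded by contextlib.suppress(ImportError)) and turns _get_seed into a staticmethod; the public behaviour of the fake embedding models is unchanged. A quick sketch of their use (assumes numpy is installed):

    from langchain_core.embeddings import DeterministicFakeEmbedding, FakeEmbeddings

    # Random vectors of a fixed size; values differ between calls.
    fake = FakeEmbeddings(size=8)
    print(len(fake.embed_query("hello")))  # 8

    # Deterministic vectors: the same text always hashes to the same seed,
    # so repeated queries return identical vectors.
    det = DeterministicFakeEmbedding(size=8)
    assert det.embed_query("hello") == det.embed_query("hello")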
langchain_core/env.py CHANGED
@@ -3,6 +3,8 @@
  import platform
  from functools import lru_cache

+ from langchain_core import __version__
+

  @lru_cache(maxsize=1)
  def get_runtime_environment() -> dict:
@@ -11,9 +13,6 @@ def get_runtime_environment() -> dict:
  Returns:
  A dictionary with information about the runtime environment.
  """
- # Lazy import to avoid circular imports
- from langchain_core import __version__
-
  return {
  "library_version": __version__,
  "library": "langchain-core",
langchain_core/example_selectors/base.py CHANGED
@@ -16,6 +16,9 @@ class BaseExampleSelector(ABC):
  Args:
  example: A dictionary with keys as input variables
  and values as their values.
+
+ Returns:
+ Any return value.
  """

  async def aadd_example(self, example: dict[str, str]) -> Any:
@@ -24,6 +27,9 @@ class BaseExampleSelector(ABC):
  Args:
  example: A dictionary with keys as input variables
  and values as their values.
+
+ Returns:
+ Any return value.
  """
  return await run_in_executor(None, self.add_example, example)

@@ -34,6 +40,9 @@ class BaseExampleSelector(ABC):
  Args:
  input_variables: A dictionary with keys as input variables
  and values as their values.
+
+ Returns:
+ A list of examples.
  """

  async def aselect_examples(self, input_variables: dict[str, str]) -> list[dict]:
@@ -42,5 +51,8 @@ class BaseExampleSelector(ABC):
  Args:
  input_variables: A dictionary with keys as input variables
  and values as their values.
+
+ Returns:
+ A list of examples.
  """
  return await run_in_executor(None, self.select_examples, input_variables)
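Note: the hunks above only document the Returns values of the abstract BaseExampleSelector API; the async variants keep their run_in_executor defaults, so a subclass only needs the two sync methods. A minimal sketch of a concrete selector, with a hypothetical class name and trivially simple selection logic:

    from typing import Any

    from langchain_core.example_selectors.base import BaseExampleSelector


    class FirstNExampleSelector(BaseExampleSelector):
        """Hypothetical selector that stores examples and returns the first two."""

        def __init__(self) -> None:
            self.examples: list[dict[str, str]] = []

        def add_example(self, example: dict[str, str]) -> Any:
            self.examples.append(example)

        def select_examples(self, input_variables: dict[str, str]) -> list[dict]:
            # A real selector would inspect input_variables; this sketch ignores them.
            return self.examples[:2]


    selector = FirstNExampleSelector()
    selector.add_example({"question": "2 + 2", "answer": "4"})
    print(selector.select_examples({"question": "3 + 3"}))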
langchain_core/exceptions.py CHANGED
@@ -42,6 +42,10 @@ class OutputParserException(ValueError, LangChainException): # noqa: N818
  previous output was improperly structured, in the hopes that it will
  update the output to the correct format.
  Defaults to False.
+
+ Raises:
+ ValueError: If ``send_to_llm`` is True but either observation or
+ ``llm_output`` are not provided.
  """
  if isinstance(error, str):
  error = create_message(
@@ -77,6 +81,9 @@ def create_message(*, message: str, error_code: ErrorCode) -> str:
  Args:
  message: The message to display.
  error_code: The error code to display.
+
+ Returns:
+ The full message with the troubleshooting link.
  """
  return (
  f"{message}\n"
langchain_core/globals.py CHANGED
@@ -6,6 +6,13 @@ from typing import TYPE_CHECKING, Optional
  if TYPE_CHECKING:
  from langchain_core.caches import BaseCache

+ try:
+ import langchain # type: ignore[import-not-found]
+
+ _HAS_LANGCHAIN = True
+ except ImportError:
+ _HAS_LANGCHAIN = False
+

  # DO NOT USE THESE VALUES DIRECTLY!
  # Use them only via `get_<X>()` and `set_<X>()` below,
@@ -22,9 +29,7 @@ def set_verbose(value: bool) -> None: # noqa: FBT001
  Args:
  value: The new value for the `verbose` global setting.
  """
- try:
- import langchain # type: ignore[import-not-found]
-
+ if _HAS_LANGCHAIN:
  # We're about to run some deprecated code, don't report warnings from it.
  # The user called the correct (non-deprecated) code path and shouldn't get
  # warnings.
@@ -43,8 +48,6 @@ def set_verbose(value: bool) -> None: # noqa: FBT001
  # Remove it once `langchain.verbose` is no longer supported, and once all
  # users have migrated to using `set_verbose()` here.
  langchain.verbose = value
- except ImportError:
- pass

  global _verbose # noqa: PLW0603
  _verbose = value
@@ -56,9 +59,7 @@ def get_verbose() -> bool:
  Returns:
  The value of the `verbose` global setting.
  """
- try:
- import langchain
-
+ if _HAS_LANGCHAIN:
  # We're about to run some deprecated code, don't report warnings from it.
  # The user called the correct (non-deprecated) code path and shouldn't get
  # warnings.
@@ -83,7 +84,7 @@ def get_verbose() -> bool:
  # deprecation warnings directing them to use `set_verbose()` when they
  # import `langchain.verbose`.
  old_verbose = langchain.verbose
- except ImportError:
+ else:
  old_verbose = False

  return _verbose or old_verbose
@@ -95,9 +96,7 @@ def set_debug(value: bool) -> None: # noqa: FBT001
  Args:
  value: The new value for the `debug` global setting.
  """
- try:
- import langchain
-
+ if _HAS_LANGCHAIN:
  # We're about to run some deprecated code, don't report warnings from it.
  # The user called the correct (non-deprecated) code path and shouldn't get
  # warnings.
@@ -114,8 +113,6 @@ def set_debug(value: bool) -> None: # noqa: FBT001
  # Remove it once `langchain.debug` is no longer supported, and once all
  # users have migrated to using `set_debug()` here.
  langchain.debug = value
- except ImportError:
- pass

  global _debug # noqa: PLW0603
  _debug = value
@@ -127,9 +124,7 @@ def get_debug() -> bool:
  Returns:
  The value of the `debug` global setting.
  """
- try:
- import langchain
-
+ if _HAS_LANGCHAIN:
  # We're about to run some deprecated code, don't report warnings from it.
  # The user called the correct (non-deprecated) code path and shouldn't get
  # warnings.
@@ -151,7 +146,7 @@ def get_debug() -> bool:
  # to using `set_debug()` yet. Those users are getting deprecation warnings
  # directing them to use `set_debug()` when they import `langchain.debug`.
  old_debug = langchain.debug
- except ImportError:
+ else:
  old_debug = False

  return _debug or old_debug
@@ -163,9 +158,7 @@ def set_llm_cache(value: Optional["BaseCache"]) -> None:
  Args:
  value: The new LLM cache to use. If `None`, the LLM cache is disabled.
  """
- try:
- import langchain
-
+ if _HAS_LANGCHAIN:
  # We're about to run some deprecated code, don't report warnings from it.
  # The user called the correct (non-deprecated) code path and shouldn't get
  # warnings.
@@ -184,22 +177,18 @@ def set_llm_cache(value: Optional["BaseCache"]) -> None:
  # Remove it once `langchain.llm_cache` is no longer supported, and
  # once all users have migrated to using `set_llm_cache()` here.
  langchain.llm_cache = value
- except ImportError:
- pass

  global _llm_cache # noqa: PLW0603
  _llm_cache = value


- def get_llm_cache() -> "BaseCache":
+ def get_llm_cache() -> Optional["BaseCache"]:
  """Get the value of the `llm_cache` global setting.

  Returns:
  The value of the `llm_cache` global setting.
  """
- try:
- import langchain
-
+ if _HAS_LANGCHAIN:
  # We're about to run some deprecated code, don't report warnings from it.
  # The user called the correct (non-deprecated) code path and shouldn't get
  # warnings.
@@ -225,7 +214,7 @@ def get_llm_cache() -> "BaseCache":
  # Those users are getting deprecation warnings directing them
  # to use `set_llm_cache()` when they import `langchain.llm_cache`.
  old_llm_cache = langchain.llm_cache
- except ImportError:
+ else:
  old_llm_cache = None

  return _llm_cache or old_llm_cache
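Note: the diff replaces the per-call try/except ImportError blocks with a single module-level _HAS_LANGCHAIN flag; the public getters and setters keep the same behaviour. A quick sketch of that public API:

    from langchain_core.globals import get_debug, get_verbose, set_debug, set_verbose

    # The setters update process-wide flags; the getters read them back, falling
    # back to the legacy `langchain.debug` / `langchain.verbose` attributes only
    # when the optional `langchain` package is importable.
    set_verbose(True)
    set_debug(False)
    print(get_verbose(), get_debug())  # True False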
langchain_core/indexing/api.py CHANGED
@@ -56,7 +56,7 @@ def _warn_about_sha1() -> None:
  "that map to the same fingerprint. If this matters in your "
  "threat model, switch to a stronger algorithm such "
  "as 'blake2b', 'sha256', or 'sha512' by specifying "
- " `key_encoder` parameter in the the `index` or `aindex` function. ",
+ " `key_encoder` parameter in the `index` or `aindex` function. ",
  category=UserWarning,
  stacklevel=2,
  )
@@ -185,6 +185,9 @@ def _get_document_with_hash(
  When changing the key encoder, you must change the
  index as well to avoid duplicated documents in the cache.

+ Raises:
+ ValueError: If the metadata cannot be serialized using json.
+
  Returns:
  Document with a unique identifier based on the hash of the content and metadata.
  """
@@ -291,22 +294,26 @@ def index(
  documents were deleted, which documents should be skipped.

  For the time being, documents are indexed using their hashes, and users
- are not able to specify the uid of the document.
-
- Important:
- * In full mode, the loader should be returning
- the entire dataset, and not just a subset of the dataset.
- Otherwise, the auto_cleanup will remove documents that it is not
- supposed to.
- * In incremental mode, if documents associated with a particular
- source id appear across different batches, the indexing API
- will do some redundant work. This will still result in the
- correct end state of the index, but will unfortunately not be
- 100% efficient. For example, if a given document is split into 15
- chunks, and we index them using a batch size of 5, we'll have 3 batches
- all with the same source id. In general, to avoid doing too much
- redundant work select as big a batch size as possible.
- * The `scoped_full` mode is suitable if determining an appropriate batch size
+ are not able to specify the uid of the document.
+
+ .. versionchanged:: 0.3.25
+ Added ``scoped_full`` cleanup mode.
+
+ .. important::
+
+ * In full mode, the loader should be returning
+ the entire dataset, and not just a subset of the dataset.
+ Otherwise, the auto_cleanup will remove documents that it is not
+ supposed to.
+ * In incremental mode, if documents associated with a particular
+ source id appear across different batches, the indexing API
+ will do some redundant work. This will still result in the
+ correct end state of the index, but will unfortunately not be
+ 100% efficient. For example, if a given document is split into 15
+ chunks, and we index them using a batch size of 5, we'll have 3 batches
+ all with the same source id. In general, to avoid doing too much
+ redundant work select as big a batch size as possible.
+ * The ``scoped_full`` mode is suitable if determining an appropriate batch size
  is challenging or if your data loader cannot return the entire dataset at
  once. This mode keeps track of source IDs in memory, which should be fine
  for most use cases. If your dataset is large (10M+ docs), you will likely
@@ -315,23 +322,22 @@ def index(
  Args:
  docs_source: Data loader or iterable of documents to index.
  record_manager: Timestamped set to keep track of which documents were
- updated.
+ updated.
  vector_store: VectorStore or DocumentIndex to index the documents into.
  batch_size: Batch size to use when indexing. Default is 100.
  cleanup: How to handle clean up of documents. Default is None.
+
  - incremental: Cleans up all documents that haven't been updated AND
- that are associated with source ids that were seen
- during indexing.
- Clean up is done continuously during indexing helping
- to minimize the probability of users seeing duplicated
- content.
+ that are associated with source ids that were seen during indexing.
+ Clean up is done continuously during indexing helping to minimize the
+ probability of users seeing duplicated content.
  - full: Delete all documents that have not been returned by the loader
- during this run of indexing.
- Clean up runs after all documents have been indexed.
- This means that users may see duplicated content during indexing.
+ during this run of indexing.
+ Clean up runs after all documents have been indexed.
+ This means that users may see duplicated content during indexing.
  - scoped_full: Similar to Full, but only deletes all documents
- that haven't been updated AND that are associated with
- source ids that were seen during indexing.
+ that haven't been updated AND that are associated with
+ source ids that were seen during indexing.
  - None: Do not delete any documents.
  source_id_key: Optional key that helps identify the original source
  of the document. Default is None.
@@ -358,10 +364,9 @@ def index(
  When changing the key encoder, you must change the
  index as well to avoid duplicated documents in the cache.
  upsert_kwargs: Additional keyword arguments to pass to the add_documents
- method of the VectorStore or the upsert method of the
- DocumentIndex. For example, you can use this to
- specify a custom vector_field:
- upsert_kwargs={"vector_field": "embedding"}
+ method of the VectorStore or the upsert method of the DocumentIndex.
+ For example, you can use this to specify a custom vector_field:
+ upsert_kwargs={"vector_field": "embedding"}
  .. versionadded:: 0.3.10

  Returns:
@@ -374,10 +379,9 @@ def index(
  ValueError: If vectorstore does not have
  "delete" and "add_documents" required methods.
  ValueError: If source_id_key is not None, but is not a string or callable.
-
- .. version_modified:: 0.3.25
-
- * Added `scoped_full` cleanup mode.
+ TypeError: If ``vectorstore`` is not a VectorStore or a DocumentIndex.
+ AssertionError: If ``source_id`` is None when cleanup mode is incremental.
+ (should be unreachable code).
  """
  # Behavior is deprecated, but we keep it for backwards compatibility.
  # # Warn only once per process.
@@ -632,46 +636,50 @@ async def aindex(
  documents were deleted, which documents should be skipped.

  For the time being, documents are indexed using their hashes, and users
- are not able to specify the uid of the document.
-
- Important:
- * In full mode, the loader should be returning
- the entire dataset, and not just a subset of the dataset.
- Otherwise, the auto_cleanup will remove documents that it is not
- supposed to.
- * In incremental mode, if documents associated with a particular
- source id appear across different batches, the indexing API
- will do some redundant work. This will still result in the
- correct end state of the index, but will unfortunately not be
- 100% efficient. For example, if a given document is split into 15
- chunks, and we index them using a batch size of 5, we'll have 3 batches
- all with the same source id. In general, to avoid doing too much
- redundant work select as big a batch size as possible.
- * The `scoped_full` mode is suitable if determining an appropriate batch size
- is challenging or if your data loader cannot return the entire dataset at
- once. This mode keeps track of source IDs in memory, which should be fine
- for most use cases. If your dataset is large (10M+ docs), you will likely
- need to parallelize the indexing process regardless.
+ are not able to specify the uid of the document.
+
+ .. versionchanged:: 0.3.25
+ Added ``scoped_full`` cleanup mode.
+
+ .. important::
+
+ * In full mode, the loader should be returning
+ the entire dataset, and not just a subset of the dataset.
+ Otherwise, the auto_cleanup will remove documents that it is not
+ supposed to.
+ * In incremental mode, if documents associated with a particular
+ source id appear across different batches, the indexing API
+ will do some redundant work. This will still result in the
+ correct end state of the index, but will unfortunately not be
+ 100% efficient. For example, if a given document is split into 15
+ chunks, and we index them using a batch size of 5, we'll have 3 batches
+ all with the same source id. In general, to avoid doing too much
+ redundant work select as big a batch size as possible.
+ * The ``scoped_full`` mode is suitable if determining an appropriate batch size
+ is challenging or if your data loader cannot return the entire dataset at
+ once. This mode keeps track of source IDs in memory, which should be fine
+ for most use cases. If your dataset is large (10M+ docs), you will likely
+ need to parallelize the indexing process regardless.

  Args:
  docs_source: Data loader or iterable of documents to index.
  record_manager: Timestamped set to keep track of which documents were
- updated.
+ updated.
  vector_store: VectorStore or DocumentIndex to index the documents into.
  batch_size: Batch size to use when indexing. Default is 100.
  cleanup: How to handle clean up of documents. Default is None.
+
  - incremental: Cleans up all documents that haven't been updated AND
- that are associated with source ids that were seen
- during indexing.
- Clean up is done continuously during indexing helping
- to minimize the probability of users seeing duplicated
- content.
- - full: Delete all documents that haven to been returned by the loader.
- Clean up runs after all documents have been indexed.
- This means that users may see duplicated content during indexing.
+ that are associated with source ids that were seen during indexing.
+ Clean up is done continuously during indexing helping to minimize the
+ probability of users seeing duplicated content.
+ - full: Delete all documents that have not been returned by the loader
+ during this run of indexing.
+ Clean up runs after all documents have been indexed.
+ This means that users may see duplicated content during indexing.
  - scoped_full: Similar to Full, but only deletes all documents
- that haven't been updated AND that are associated with
- source ids that were seen during indexing.
+ that haven't been updated AND that are associated with
+ source ids that were seen during indexing.
  - None: Do not delete any documents.
  source_id_key: Optional key that helps identify the original source
  of the document. Default is None.
@@ -680,6 +688,12 @@ async def aindex(
  force_update: Force update documents even if they are present in the
  record manager. Useful if you are re-indexing with updated embeddings.
  Default is False.
+ key_encoder: Hashing algorithm to use for hashing the document content and
+ metadata. Default is "sha1".
+ Other options include "blake2b", "sha256", and "sha512".
+
+ .. versionadded:: 0.3.66
+
  key_encoder: Hashing algorithm to use for hashing the document.
  If not provided, a default encoder using SHA-1 will be used.
  SHA-1 is not collision-resistant, and a motivated attacker
@@ -691,11 +705,10 @@ async def aindex(

  When changing the key encoder, you must change the
  index as well to avoid duplicated documents in the cache.
- upsert_kwargs: Additional keyword arguments to pass to the aadd_documents
- method of the VectorStore or the aupsert method of the
- DocumentIndex. For example, you can use this to
- specify a custom vector_field:
- upsert_kwargs={"vector_field": "embedding"}
+ upsert_kwargs: Additional keyword arguments to pass to the add_documents
+ method of the VectorStore or the upsert method of the DocumentIndex.
+ For example, you can use this to specify a custom vector_field:
+ upsert_kwargs={"vector_field": "embedding"}
  .. versionadded:: 0.3.10

  Returns:
@@ -708,10 +721,9 @@ async def aindex(
  ValueError: If vectorstore does not have
  "adelete" and "aadd_documents" required methods.
  ValueError: If source_id_key is not None, but is not a string or callable.
-
- .. version_modified:: 0.3.25
-
- * Added `scoped_full` cleanup mode.
+ TypeError: If ``vector_store`` is not a VectorStore or DocumentIndex.
+ AssertionError: If ``source_id_key`` is None when cleanup mode is
+ incremental or ``scoped_full`` (should be unreachable).
  """
  # Behavior is deprecated, but we keep it for backwards compatibility.
  # # Warn only once per process.
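Note: the rewritten index()/aindex() docstrings above describe the cleanup modes and the key_encoder argument. A hedged sketch of a minimal index() call using the in-memory helpers shipped with langchain-core (document contents and the namespace are illustrative; numpy is assumed for the fake embeddings):

    from langchain_core.documents import Document
    from langchain_core.embeddings import DeterministicFakeEmbedding
    from langchain_core.indexing import InMemoryRecordManager, index
    from langchain_core.vectorstores import InMemoryVectorStore

    # In-memory record manager and vector store, both provided by langchain-core.
    record_manager = InMemoryRecordManager(namespace="demo")
    record_manager.create_schema()
    vector_store = InMemoryVectorStore(embedding=DeterministicFakeEmbedding(size=32))

    docs = [
        Document(page_content="kitty", metadata={"source": "kitty.txt"}),
        Document(page_content="doggy", metadata={"source": "doggy.txt"}),
    ]

    # Incremental cleanup removes stale documents that share a source id with
    # documents seen during this indexing run (see the docstring above).
    result = index(
        docs,
        record_manager,
        vector_store,
        cleanup="incremental",
        source_id_key="source",
    )
    print(result)  # e.g. {'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}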