langchain-core 0.4.0.dev0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (172)
  1. langchain_core/__init__.py +1 -1
  2. langchain_core/_api/__init__.py +3 -4
  3. langchain_core/_api/beta_decorator.py +45 -70
  4. langchain_core/_api/deprecation.py +80 -80
  5. langchain_core/_api/path.py +22 -8
  6. langchain_core/_import_utils.py +10 -4
  7. langchain_core/agents.py +25 -21
  8. langchain_core/caches.py +53 -63
  9. langchain_core/callbacks/__init__.py +1 -8
  10. langchain_core/callbacks/base.py +341 -348
  11. langchain_core/callbacks/file.py +55 -44
  12. langchain_core/callbacks/manager.py +546 -683
  13. langchain_core/callbacks/stdout.py +29 -30
  14. langchain_core/callbacks/streaming_stdout.py +35 -36
  15. langchain_core/callbacks/usage.py +65 -70
  16. langchain_core/chat_history.py +48 -55
  17. langchain_core/document_loaders/base.py +46 -21
  18. langchain_core/document_loaders/langsmith.py +39 -36
  19. langchain_core/documents/__init__.py +0 -1
  20. langchain_core/documents/base.py +96 -74
  21. langchain_core/documents/compressor.py +12 -9
  22. langchain_core/documents/transformers.py +29 -28
  23. langchain_core/embeddings/fake.py +56 -57
  24. langchain_core/env.py +2 -3
  25. langchain_core/example_selectors/base.py +12 -0
  26. langchain_core/example_selectors/length_based.py +1 -1
  27. langchain_core/example_selectors/semantic_similarity.py +21 -25
  28. langchain_core/exceptions.py +15 -9
  29. langchain_core/globals.py +4 -163
  30. langchain_core/indexing/api.py +132 -125
  31. langchain_core/indexing/base.py +64 -67
  32. langchain_core/indexing/in_memory.py +26 -6
  33. langchain_core/language_models/__init__.py +15 -27
  34. langchain_core/language_models/_utils.py +267 -117
  35. langchain_core/language_models/base.py +92 -177
  36. langchain_core/language_models/chat_models.py +547 -407
  37. langchain_core/language_models/fake.py +11 -11
  38. langchain_core/language_models/fake_chat_models.py +72 -118
  39. langchain_core/language_models/llms.py +168 -242
  40. langchain_core/load/dump.py +8 -11
  41. langchain_core/load/load.py +32 -28
  42. langchain_core/load/mapping.py +2 -4
  43. langchain_core/load/serializable.py +50 -56
  44. langchain_core/messages/__init__.py +36 -51
  45. langchain_core/messages/ai.py +377 -150
  46. langchain_core/messages/base.py +239 -47
  47. langchain_core/messages/block_translators/__init__.py +111 -0
  48. langchain_core/messages/block_translators/anthropic.py +470 -0
  49. langchain_core/messages/block_translators/bedrock.py +94 -0
  50. langchain_core/messages/block_translators/bedrock_converse.py +297 -0
  51. langchain_core/messages/block_translators/google_genai.py +530 -0
  52. langchain_core/messages/block_translators/google_vertexai.py +21 -0
  53. langchain_core/messages/block_translators/groq.py +143 -0
  54. langchain_core/messages/block_translators/langchain_v0.py +301 -0
  55. langchain_core/messages/block_translators/openai.py +1010 -0
  56. langchain_core/messages/chat.py +2 -3
  57. langchain_core/messages/content.py +1423 -0
  58. langchain_core/messages/function.py +7 -7
  59. langchain_core/messages/human.py +44 -38
  60. langchain_core/messages/modifier.py +3 -2
  61. langchain_core/messages/system.py +40 -27
  62. langchain_core/messages/tool.py +160 -58
  63. langchain_core/messages/utils.py +527 -638
  64. langchain_core/output_parsers/__init__.py +1 -14
  65. langchain_core/output_parsers/base.py +68 -104
  66. langchain_core/output_parsers/json.py +13 -17
  67. langchain_core/output_parsers/list.py +11 -33
  68. langchain_core/output_parsers/openai_functions.py +56 -74
  69. langchain_core/output_parsers/openai_tools.py +68 -109
  70. langchain_core/output_parsers/pydantic.py +15 -13
  71. langchain_core/output_parsers/string.py +6 -2
  72. langchain_core/output_parsers/transform.py +17 -60
  73. langchain_core/output_parsers/xml.py +34 -44
  74. langchain_core/outputs/__init__.py +1 -1
  75. langchain_core/outputs/chat_generation.py +26 -11
  76. langchain_core/outputs/chat_result.py +1 -3
  77. langchain_core/outputs/generation.py +17 -6
  78. langchain_core/outputs/llm_result.py +15 -8
  79. langchain_core/prompt_values.py +29 -123
  80. langchain_core/prompts/__init__.py +3 -27
  81. langchain_core/prompts/base.py +48 -63
  82. langchain_core/prompts/chat.py +259 -288
  83. langchain_core/prompts/dict.py +19 -11
  84. langchain_core/prompts/few_shot.py +84 -90
  85. langchain_core/prompts/few_shot_with_templates.py +14 -12
  86. langchain_core/prompts/image.py +19 -14
  87. langchain_core/prompts/loading.py +6 -8
  88. langchain_core/prompts/message.py +7 -8
  89. langchain_core/prompts/prompt.py +42 -43
  90. langchain_core/prompts/string.py +37 -16
  91. langchain_core/prompts/structured.py +43 -46
  92. langchain_core/rate_limiters.py +51 -60
  93. langchain_core/retrievers.py +52 -192
  94. langchain_core/runnables/base.py +1727 -1683
  95. langchain_core/runnables/branch.py +52 -73
  96. langchain_core/runnables/config.py +89 -103
  97. langchain_core/runnables/configurable.py +128 -130
  98. langchain_core/runnables/fallbacks.py +93 -82
  99. langchain_core/runnables/graph.py +127 -127
  100. langchain_core/runnables/graph_ascii.py +63 -41
  101. langchain_core/runnables/graph_mermaid.py +87 -70
  102. langchain_core/runnables/graph_png.py +31 -36
  103. langchain_core/runnables/history.py +145 -161
  104. langchain_core/runnables/passthrough.py +141 -144
  105. langchain_core/runnables/retry.py +84 -68
  106. langchain_core/runnables/router.py +33 -37
  107. langchain_core/runnables/schema.py +79 -72
  108. langchain_core/runnables/utils.py +95 -139
  109. langchain_core/stores.py +85 -131
  110. langchain_core/structured_query.py +11 -15
  111. langchain_core/sys_info.py +31 -32
  112. langchain_core/tools/__init__.py +1 -14
  113. langchain_core/tools/base.py +221 -247
  114. langchain_core/tools/convert.py +144 -161
  115. langchain_core/tools/render.py +10 -10
  116. langchain_core/tools/retriever.py +12 -19
  117. langchain_core/tools/simple.py +52 -29
  118. langchain_core/tools/structured.py +56 -60
  119. langchain_core/tracers/__init__.py +1 -9
  120. langchain_core/tracers/_streaming.py +6 -7
  121. langchain_core/tracers/base.py +103 -112
  122. langchain_core/tracers/context.py +29 -48
  123. langchain_core/tracers/core.py +142 -105
  124. langchain_core/tracers/evaluation.py +30 -34
  125. langchain_core/tracers/event_stream.py +162 -117
  126. langchain_core/tracers/langchain.py +34 -36
  127. langchain_core/tracers/log_stream.py +87 -49
  128. langchain_core/tracers/memory_stream.py +3 -3
  129. langchain_core/tracers/root_listeners.py +18 -34
  130. langchain_core/tracers/run_collector.py +8 -20
  131. langchain_core/tracers/schemas.py +0 -125
  132. langchain_core/tracers/stdout.py +3 -3
  133. langchain_core/utils/__init__.py +1 -4
  134. langchain_core/utils/_merge.py +47 -9
  135. langchain_core/utils/aiter.py +70 -66
  136. langchain_core/utils/env.py +12 -9
  137. langchain_core/utils/function_calling.py +139 -206
  138. langchain_core/utils/html.py +7 -8
  139. langchain_core/utils/input.py +6 -6
  140. langchain_core/utils/interactive_env.py +6 -2
  141. langchain_core/utils/iter.py +48 -45
  142. langchain_core/utils/json.py +14 -4
  143. langchain_core/utils/json_schema.py +159 -43
  144. langchain_core/utils/mustache.py +32 -25
  145. langchain_core/utils/pydantic.py +67 -40
  146. langchain_core/utils/strings.py +5 -5
  147. langchain_core/utils/usage.py +1 -1
  148. langchain_core/utils/utils.py +104 -62
  149. langchain_core/vectorstores/base.py +131 -179
  150. langchain_core/vectorstores/in_memory.py +113 -182
  151. langchain_core/vectorstores/utils.py +23 -17
  152. langchain_core/version.py +1 -1
  153. langchain_core-1.0.0.dist-info/METADATA +68 -0
  154. langchain_core-1.0.0.dist-info/RECORD +172 -0
  155. {langchain_core-0.4.0.dev0.dist-info → langchain_core-1.0.0.dist-info}/WHEEL +1 -1
  156. langchain_core/beta/__init__.py +0 -1
  157. langchain_core/beta/runnables/__init__.py +0 -1
  158. langchain_core/beta/runnables/context.py +0 -448
  159. langchain_core/memory.py +0 -116
  160. langchain_core/messages/content_blocks.py +0 -1435
  161. langchain_core/prompts/pipeline.py +0 -133
  162. langchain_core/pydantic_v1/__init__.py +0 -30
  163. langchain_core/pydantic_v1/dataclasses.py +0 -23
  164. langchain_core/pydantic_v1/main.py +0 -23
  165. langchain_core/tracers/langchain_v1.py +0 -23
  166. langchain_core/utils/loading.py +0 -31
  167. langchain_core/v1/__init__.py +0 -1
  168. langchain_core/v1/chat_models.py +0 -1047
  169. langchain_core/v1/messages.py +0 -755
  170. langchain_core-0.4.0.dev0.dist-info/METADATA +0 -108
  171. langchain_core-0.4.0.dev0.dist-info/RECORD +0 -177
  172. langchain_core-0.4.0.dev0.dist-info/entry_points.txt +0 -4
langchain_core/indexing/api.py

@@ -6,16 +6,20 @@ import hashlib
 import json
 import uuid
 import warnings
-from collections.abc import AsyncIterable, AsyncIterator, Iterable, Iterator, Sequence
+from collections.abc import (
+    AsyncIterable,
+    AsyncIterator,
+    Callable,
+    Iterable,
+    Iterator,
+    Sequence,
+)
 from itertools import islice
 from typing import (
     Any,
-    Callable,
     Literal,
-    Optional,
     TypedDict,
     TypeVar,
-    Union,
     cast,
 )
 
@@ -56,7 +60,7 @@ def _warn_about_sha1() -> None:
         "that map to the same fingerprint. If this matters in your "
         "threat model, switch to a stronger algorithm such "
         "as 'blake2b', 'sha256', or 'sha512' by specifying "
-        " `key_encoder` parameter in the the `index` or `aindex` function. ",
+        " `key_encoder` parameter in the `index` or `aindex` function. ",
         category=UserWarning,
         stacklevel=2,
     )
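
The warning above points users at the `key_encoder` parameter of `index`/`aindex`. Besides the built-in algorithm names, the signatures further down also accept a callable that maps a `Document` to a string key. A minimal sketch of such a callable is shown below; the helper name and hashing scheme are illustrative only, not part of langchain-core, and the sketch assumes the document metadata is JSON-serializable (the same assumption the Raises note below calls out).

    import hashlib
    import json

    from langchain_core.documents import Document


    def sha256_key_encoder(doc: Document) -> str:
        # Hash page_content together with metadata so identical documents map to the same key.
        payload = json.dumps(
            {"page_content": doc.page_content, "metadata": doc.metadata},
            sort_keys=True,
        )
        return hashlib.sha256(payload.encode("utf-8")).hexdigest()
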
@@ -107,8 +111,8 @@ async def _abatch(size: int, iterable: AsyncIterable[T]) -> AsyncIterator[list[T
 
 
 def _get_source_id_assigner(
-    source_id_key: Union[str, Callable[[Document], str], None],
-) -> Callable[[Document], Union[str, None]]:
+    source_id_key: str | Callable[[Document], str] | None,
+) -> Callable[[Document], str | None]:
     """Get the source id from the document."""
     if source_id_key is None:
         return lambda _doc: None
@@ -162,9 +166,8 @@ def _calculate_hash(
 def _get_document_with_hash(
     document: Document,
     *,
-    key_encoder: Union[
-        Callable[[Document], str], Literal["sha1", "sha256", "sha512", "blake2b"]
-    ],
+    key_encoder: Callable[[Document], str]
+    | Literal["sha1", "sha256", "sha512", "blake2b"],
 ) -> Document:
     """Calculate a hash of the document, and assign it to the uid.
 
@@ -185,6 +188,9 @@ def _get_document_with_hash(
         When changing the key encoder, you must change the
         index as well to avoid duplicated documents in the cache.
 
+    Raises:
+        ValueError: If the metadata cannot be serialized using json.
+
     Returns:
         Document with a unique identifier based on the hash of the content and metadata.
     """
@@ -230,7 +236,7 @@ class _HashedDocument:
 
 
 def _delete(
-    vector_store: Union[VectorStore, DocumentIndex],
+    vector_store: VectorStore | DocumentIndex,
     ids: list[str],
 ) -> None:
     if isinstance(vector_store, VectorStore):
@@ -268,19 +274,18 @@ class IndexingResult(TypedDict):
 
 
 def index(
-    docs_source: Union[BaseLoader, Iterable[Document]],
+    docs_source: BaseLoader | Iterable[Document],
     record_manager: RecordManager,
-    vector_store: Union[VectorStore, DocumentIndex],
+    vector_store: VectorStore | DocumentIndex,
     *,
     batch_size: int = 100,
-    cleanup: Optional[Literal["incremental", "full", "scoped_full"]] = None,
-    source_id_key: Union[str, Callable[[Document], str], None] = None,
+    cleanup: Literal["incremental", "full", "scoped_full"] | None = None,
+    source_id_key: str | Callable[[Document], str] | None = None,
     cleanup_batch_size: int = 1_000,
     force_update: bool = False,
-    key_encoder: Union[
-        Literal["sha1", "sha256", "sha512", "blake2b"], Callable[[Document], str]
-    ] = "sha1",
-    upsert_kwargs: Optional[dict[str, Any]] = None,
+    key_encoder: Literal["sha1", "sha256", "sha512", "blake2b"]
+    | Callable[[Document], str] = "sha1",
+    upsert_kwargs: dict[str, Any] | None = None,
 ) -> IndexingResult:
     """Index data from the loader into the vector store.
 
@@ -291,21 +296,25 @@ def index(
         documents were deleted, which documents should be skipped.
 
     For the time being, documents are indexed using their hashes, and users
-    are not able to specify the uid of the document.
-
-    Important:
-        * In full mode, the loader should be returning
-          the entire dataset, and not just a subset of the dataset.
-          Otherwise, the auto_cleanup will remove documents that it is not
-          supposed to.
-        * In incremental mode, if documents associated with a particular
-          source id appear across different batches, the indexing API
-          will do some redundant work. This will still result in the
-          correct end state of the index, but will unfortunately not be
-          100% efficient. For example, if a given document is split into 15
-          chunks, and we index them using a batch size of 5, we'll have 3 batches
-          all with the same source id. In general, to avoid doing too much
-          redundant work select as big a batch size as possible.
+    are not able to specify the uid of the document.
+
+    !!! warning "Behavior changed in 0.3.25"
+        Added `scoped_full` cleanup mode.
+
+    !!! warning
+
+        * In full mode, the loader should be returning
+          the entire dataset, and not just a subset of the dataset.
+          Otherwise, the auto_cleanup will remove documents that it is not
+          supposed to.
+        * In incremental mode, if documents associated with a particular
+          source id appear across different batches, the indexing API
+          will do some redundant work. This will still result in the
+          correct end state of the index, but will unfortunately not be
+          100% efficient. For example, if a given document is split into 15
+          chunks, and we index them using a batch size of 5, we'll have 3 batches
+          all with the same source id. In general, to avoid doing too much
+          redundant work select as big a batch size as possible.
         * The `scoped_full` mode is suitable if determining an appropriate batch size
           is challenging or if your data loader cannot return the entire dataset at
           once. This mode keeps track of source IDs in memory, which should be fine
@@ -315,36 +324,32 @@ def index(
     Args:
         docs_source: Data loader or iterable of documents to index.
         record_manager: Timestamped set to keep track of which documents were
-            updated.
+            updated.
         vector_store: VectorStore or DocumentIndex to index the documents into.
-        batch_size: Batch size to use when indexing. Default is 100.
-        cleanup: How to handle clean up of documents. Default is None.
+        batch_size: Batch size to use when indexing.
+        cleanup: How to handle clean up of documents.
+
             - incremental: Cleans up all documents that haven't been updated AND
-              that are associated with source ids that were seen
-              during indexing.
-              Clean up is done continuously during indexing helping
-              to minimize the probability of users seeing duplicated
-              content.
+              that are associated with source ids that were seen during indexing.
+              Clean up is done continuously during indexing helping to minimize the
+              probability of users seeing duplicated content.
             - full: Delete all documents that have not been returned by the loader
-              during this run of indexing.
-              Clean up runs after all documents have been indexed.
-              This means that users may see duplicated content during indexing.
+              during this run of indexing.
+              Clean up runs after all documents have been indexed.
+              This means that users may see duplicated content during indexing.
             - scoped_full: Similar to Full, but only deletes all documents
-              that haven't been updated AND that are associated with
-              source ids that were seen during indexing.
+              that haven't been updated AND that are associated with
+              source ids that were seen during indexing.
             - None: Do not delete any documents.
         source_id_key: Optional key that helps identify the original source
-            of the document. Default is None.
+            of the document.
         cleanup_batch_size: Batch size to use when cleaning up documents.
-            Default is 1_000.
         force_update: Force update documents even if they are present in the
            record manager. Useful if you are re-indexing with updated embeddings.
-            Default is False.
        key_encoder: Hashing algorithm to use for hashing the document content and
-            metadata. Default is "sha1".
-            Other options include "blake2b", "sha256", and "sha512".
+            metadata. Options include "blake2b", "sha256", and "sha512".
 
-            .. versionadded:: 0.3.66
+            !!! version-added "Added in version 0.3.66"
 
        key_encoder: Hashing algorithm to use for hashing the document.
        If not provided, a default encoder using SHA-1 will be used.
@@ -358,11 +363,10 @@ def index(
         When changing the key encoder, you must change the
         index as well to avoid duplicated documents in the cache.
         upsert_kwargs: Additional keyword arguments to pass to the add_documents
-            method of the VectorStore or the upsert method of the
-            DocumentIndex. For example, you can use this to
-            specify a custom vector_field:
-            upsert_kwargs={"vector_field": "embedding"}
-            .. versionadded:: 0.3.10
+            method of the VectorStore or the upsert method of the DocumentIndex.
+            For example, you can use this to specify a custom vector_field:
+            upsert_kwargs={"vector_field": "embedding"}
+            !!! version-added "Added in version 0.3.10"
 
     Returns:
         Indexing result which contains information about how many documents
@@ -374,10 +378,9 @@ def index(
         ValueError: If vectorstore does not have
             "delete" and "add_documents" required methods.
         ValueError: If source_id_key is not None, but is not a string or callable.
-
-    .. version_modified:: 0.3.25
-
-        * Added `scoped_full` cleanup mode.
+        TypeError: If `vectorstore` is not a VectorStore or a DocumentIndex.
+        AssertionError: If `source_id` is None when cleanup mode is incremental.
+            (should be unreachable code).
     """
     # Behavior is deprecated, but we keep it for backwards compatibility.
     # # Warn only once per process.
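
The signature and docstring changes above are largely notational (PEP 604 unions, MkDocs-style admonitions), so existing call sites of `index` should keep working. For orientation, here is a minimal, hypothetical usage sketch of the parameters discussed in the docstring; it assumes `InMemoryRecordManager`, `InMemoryVectorStore`, and `DeterministicFakeEmbedding` remain importable from their usual modules, and the namespace, documents, and metadata are invented for illustration.

    from langchain_core.documents import Document
    from langchain_core.embeddings import DeterministicFakeEmbedding
    from langchain_core.indexing import InMemoryRecordManager, index
    from langchain_core.vectorstores import InMemoryVectorStore

    # Fake embeddings keep the sketch self-contained; swap in a real model in practice.
    vector_store = InMemoryVectorStore(DeterministicFakeEmbedding(size=16))
    record_manager = InMemoryRecordManager(namespace="demo/my_docs")
    record_manager.create_schema()

    docs = [
        Document(page_content="kittens", metadata={"source": "kittens.txt"}),
        Document(page_content="doggos", metadata={"source": "doggos.txt"}),
    ]

    result = index(
        docs,
        record_manager,
        vector_store,
        cleanup="incremental",   # remove stale docs for source ids seen during this run
        source_id_key="source",  # metadata key identifying the originating source
        key_encoder="sha256",    # avoid the default SHA-1 hashing
    )
    print(result)  # e.g. {'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

Passing an explicit `key_encoder` such as "sha256" also sidesteps the SHA-1 warning emitted by `_warn_about_sha1` above.
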
@@ -458,13 +461,13 @@ def index(
         # Count documents removed by within-batch deduplication
         num_skipped += original_batch_size - len(hashed_docs)
 
-        source_ids: Sequence[Optional[str]] = [
+        source_ids: Sequence[str | None] = [
             source_id_assigner(hashed_doc) for hashed_doc in hashed_docs
         ]
 
         if cleanup in {"incremental", "scoped_full"}:
             # source ids are required.
-            for source_id, hashed_doc in zip(source_ids, hashed_docs):
+            for source_id, hashed_doc in zip(source_ids, hashed_docs, strict=False):
                 if source_id is None:
                     msg = (
                         f"Source ids are required when cleanup mode is "
@@ -488,7 +491,7 @@ def index(
         docs_to_index = []
         uids_to_refresh = []
         seen_docs: set[str] = set()
-        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch):
+        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch, strict=False):
             hashed_id = cast("str", hashed_doc.id)
             if doc_exists:
                 if force_update:
@@ -559,7 +562,7 @@ def index(
     if cleanup == "full" or (
         cleanup == "scoped_full" and scoped_full_cleanup_source_ids
     ):
-        delete_group_ids: Optional[Sequence[str]] = None
+        delete_group_ids: Sequence[str] | None = None
         if cleanup == "scoped_full":
             delete_group_ids = list(scoped_full_cleanup_source_ids)
         while uids_to_delete := record_manager.list_keys(
@@ -587,7 +590,7 @@ async def _to_async_iterator(iterator: Iterable[T]) -> AsyncIterator[T]:
 
 
 async def _adelete(
-    vector_store: Union[VectorStore, DocumentIndex],
+    vector_store: VectorStore | DocumentIndex,
     ids: list[str],
 ) -> None:
     if isinstance(vector_store, VectorStore):
@@ -609,19 +612,18 @@ async def _adelete(
 
 
 async def aindex(
-    docs_source: Union[BaseLoader, Iterable[Document], AsyncIterator[Document]],
+    docs_source: BaseLoader | Iterable[Document] | AsyncIterator[Document],
     record_manager: RecordManager,
-    vector_store: Union[VectorStore, DocumentIndex],
+    vector_store: VectorStore | DocumentIndex,
     *,
     batch_size: int = 100,
-    cleanup: Optional[Literal["incremental", "full", "scoped_full"]] = None,
-    source_id_key: Union[str, Callable[[Document], str], None] = None,
+    cleanup: Literal["incremental", "full", "scoped_full"] | None = None,
+    source_id_key: str | Callable[[Document], str] | None = None,
     cleanup_batch_size: int = 1_000,
     force_update: bool = False,
-    key_encoder: Union[
-        Literal["sha1", "sha256", "sha512", "blake2b"], Callable[[Document], str]
-    ] = "sha1",
-    upsert_kwargs: Optional[dict[str, Any]] = None,
+    key_encoder: Literal["sha1", "sha256", "sha512", "blake2b"]
+    | Callable[[Document], str] = "sha1",
+    upsert_kwargs: dict[str, Any] | None = None,
 ) -> IndexingResult:
     """Async index data from the loader into the vector store.
 
@@ -632,54 +634,61 @@ async def aindex(
         documents were deleted, which documents should be skipped.
 
     For the time being, documents are indexed using their hashes, and users
-    are not able to specify the uid of the document.
-
-    Important:
-        * In full mode, the loader should be returning
-          the entire dataset, and not just a subset of the dataset.
-          Otherwise, the auto_cleanup will remove documents that it is not
-          supposed to.
-        * In incremental mode, if documents associated with a particular
-          source id appear across different batches, the indexing API
-          will do some redundant work. This will still result in the
-          correct end state of the index, but will unfortunately not be
-          100% efficient. For example, if a given document is split into 15
-          chunks, and we index them using a batch size of 5, we'll have 3 batches
-          all with the same source id. In general, to avoid doing too much
-          redundant work select as big a batch size as possible.
-        * The `scoped_full` mode is suitable if determining an appropriate batch size
-          is challenging or if your data loader cannot return the entire dataset at
-          once. This mode keeps track of source IDs in memory, which should be fine
-          for most use cases. If your dataset is large (10M+ docs), you will likely
-          need to parallelize the indexing process regardless.
+    are not able to specify the uid of the document.
+
+    !!! warning "Behavior changed in 0.3.25"
+        Added `scoped_full` cleanup mode.
+
+    !!! warning
+
+        * In full mode, the loader should be returning
+          the entire dataset, and not just a subset of the dataset.
+          Otherwise, the auto_cleanup will remove documents that it is not
+          supposed to.
+        * In incremental mode, if documents associated with a particular
+          source id appear across different batches, the indexing API
+          will do some redundant work. This will still result in the
+          correct end state of the index, but will unfortunately not be
+          100% efficient. For example, if a given document is split into 15
+          chunks, and we index them using a batch size of 5, we'll have 3 batches
+          all with the same source id. In general, to avoid doing too much
+          redundant work select as big a batch size as possible.
+        * The `scoped_full` mode is suitable if determining an appropriate batch size
+          is challenging or if your data loader cannot return the entire dataset at
+          once. This mode keeps track of source IDs in memory, which should be fine
+          for most use cases. If your dataset is large (10M+ docs), you will likely
+          need to parallelize the indexing process regardless.
 
     Args:
         docs_source: Data loader or iterable of documents to index.
         record_manager: Timestamped set to keep track of which documents were
-            updated.
+            updated.
         vector_store: VectorStore or DocumentIndex to index the documents into.
-        batch_size: Batch size to use when indexing. Default is 100.
-        cleanup: How to handle clean up of documents. Default is None.
+        batch_size: Batch size to use when indexing.
+        cleanup: How to handle clean up of documents.
+
            - incremental: Cleans up all documents that haven't been updated AND
-              that are associated with source ids that were seen
-              during indexing.
-              Clean up is done continuously during indexing helping
-              to minimize the probability of users seeing duplicated
-              content.
-           - full: Delete all documents that haven to been returned by the loader.
-              Clean up runs after all documents have been indexed.
-              This means that users may see duplicated content during indexing.
+              that are associated with source ids that were seen during indexing.
+              Clean up is done continuously during indexing helping to minimize the
+              probability of users seeing duplicated content.
+           - full: Delete all documents that have not been returned by the loader
+              during this run of indexing.
+              Clean up runs after all documents have been indexed.
+              This means that users may see duplicated content during indexing.
           - scoped_full: Similar to Full, but only deletes all documents
-              that haven't been updated AND that are associated with
-              source ids that were seen during indexing.
+              that haven't been updated AND that are associated with
+              source ids that were seen during indexing.
           - None: Do not delete any documents.
         source_id_key: Optional key that helps identify the original source
-            of the document. Default is None.
+            of the document.
        cleanup_batch_size: Batch size to use when cleaning up documents.
-            Default is 1_000.
        force_update: Force update documents even if they are present in the
            record manager. Useful if you are re-indexing with updated embeddings.
-            Default is False.
+        key_encoder: Hashing algorithm to use for hashing the document content and
+            metadata. Options include "blake2b", "sha256", and "sha512".
+
+            !!! version-added "Added in version 0.3.66"
+
        key_encoder: Hashing algorithm to use for hashing the document.
        If not provided, a default encoder using SHA-1 will be used.
        SHA-1 is not collision-resistant, and a motivated attacker
@@ -691,12 +700,11 @@ async def aindex(
 
         When changing the key encoder, you must change the
         index as well to avoid duplicated documents in the cache.
-        upsert_kwargs: Additional keyword arguments to pass to the aadd_documents
-            method of the VectorStore or the aupsert method of the
-            DocumentIndex. For example, you can use this to
-            specify a custom vector_field:
-            upsert_kwargs={"vector_field": "embedding"}
-            .. versionadded:: 0.3.10
+        upsert_kwargs: Additional keyword arguments to pass to the add_documents
+            method of the VectorStore or the upsert method of the DocumentIndex.
+            For example, you can use this to specify a custom vector_field:
+            upsert_kwargs={"vector_field": "embedding"}
+            !!! version-added "Added in version 0.3.10"
 
     Returns:
         Indexing result which contains information about how many documents
@@ -708,10 +716,9 @@ async def aindex(
         ValueError: If vectorstore does not have
             "adelete" and "aadd_documents" required methods.
         ValueError: If source_id_key is not None, but is not a string or callable.
-
-    .. version_modified:: 0.3.25
-
-        * Added `scoped_full` cleanup mode.
+        TypeError: If `vector_store` is not a VectorStore or DocumentIndex.
+        AssertionError: If `source_id_key` is None when cleanup mode is
+            incremental or `scoped_full` (should be unreachable).
     """
     # Behavior is deprecated, but we keep it for backwards compatibility.
     # # Warn only once per process.
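
For the async variant, a comparable hypothetical sketch follows. It feeds `aindex` an async iterator, which the updated `docs_source` annotation above explicitly allows; as before, the in-memory helpers, the namespace, and the `scoped_full`/`blake2b` choices are illustrative assumptions rather than requirements of the API.

    import asyncio

    from langchain_core.documents import Document
    from langchain_core.embeddings import DeterministicFakeEmbedding
    from langchain_core.indexing import InMemoryRecordManager, aindex
    from langchain_core.vectorstores import InMemoryVectorStore


    async def main() -> None:
        vector_store = InMemoryVectorStore(DeterministicFakeEmbedding(size=16))
        record_manager = InMemoryRecordManager(namespace="demo/async_docs")
        await record_manager.acreate_schema()

        async def load_docs():
            # Async iterator source, one of the docs_source forms accepted by aindex.
            for name in ("kittens.txt", "doggos.txt"):
                yield Document(page_content=name.removesuffix(".txt"), metadata={"source": name})

        result = await aindex(
            load_docs(),
            record_manager,
            vector_store,
            cleanup="scoped_full",   # full-style cleanup, scoped to source ids seen in this run
            source_id_key="source",
            key_encoder="blake2b",
        )
        print(result)


    asyncio.run(main())
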
@@ -803,13 +810,13 @@ async def aindex(
         # Count documents removed by within-batch deduplication
         num_skipped += original_batch_size - len(hashed_docs)
 
-        source_ids: Sequence[Optional[str]] = [
+        source_ids: Sequence[str | None] = [
             source_id_assigner(doc) for doc in hashed_docs
         ]
 
         if cleanup in {"incremental", "scoped_full"}:
             # If the cleanup mode is incremental, source ids are required.
-            for source_id, hashed_doc in zip(source_ids, hashed_docs):
+            for source_id, hashed_doc in zip(source_ids, hashed_docs, strict=False):
                 if source_id is None:
                     msg = (
                         f"Source ids are required when cleanup mode is "
@@ -833,7 +840,7 @@ async def aindex(
         docs_to_index: list[Document] = []
         uids_to_refresh = []
         seen_docs: set[str] = set()
-        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch):
+        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch, strict=False):
             hashed_id = cast("str", hashed_doc.id)
             if doc_exists:
                 if force_update:
@@ -905,7 +912,7 @@ async def aindex(
     if cleanup == "full" or (
         cleanup == "scoped_full" and scoped_full_cleanup_source_ids
     ):
-        delete_group_ids: Optional[Sequence[str]] = None
+        delete_group_ids: Sequence[str] | None = None
         if cleanup == "scoped_full":
             delete_group_ids = list(scoped_full_cleanup_source_ids)
         while uids_to_delete := await record_manager.alist_keys(