langchain-core 1.0.0a6__py3-none-any.whl → 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. langchain_core/__init__.py +1 -1
  2. langchain_core/_api/__init__.py +3 -4
  3. langchain_core/_api/beta_decorator.py +23 -26
  4. langchain_core/_api/deprecation.py +51 -64
  5. langchain_core/_api/path.py +3 -6
  6. langchain_core/_import_utils.py +3 -4
  7. langchain_core/agents.py +55 -48
  8. langchain_core/caches.py +65 -66
  9. langchain_core/callbacks/__init__.py +1 -8
  10. langchain_core/callbacks/base.py +321 -336
  11. langchain_core/callbacks/file.py +44 -44
  12. langchain_core/callbacks/manager.py +454 -514
  13. langchain_core/callbacks/stdout.py +29 -30
  14. langchain_core/callbacks/streaming_stdout.py +32 -32
  15. langchain_core/callbacks/usage.py +60 -57
  16. langchain_core/chat_history.py +53 -68
  17. langchain_core/document_loaders/base.py +27 -25
  18. langchain_core/document_loaders/blob_loaders.py +1 -1
  19. langchain_core/document_loaders/langsmith.py +44 -48
  20. langchain_core/documents/__init__.py +23 -3
  21. langchain_core/documents/base.py +102 -94
  22. langchain_core/documents/compressor.py +10 -10
  23. langchain_core/documents/transformers.py +34 -35
  24. langchain_core/embeddings/fake.py +50 -54
  25. langchain_core/example_selectors/length_based.py +2 -2
  26. langchain_core/example_selectors/semantic_similarity.py +28 -32
  27. langchain_core/exceptions.py +21 -20
  28. langchain_core/globals.py +3 -151
  29. langchain_core/indexing/__init__.py +1 -1
  30. langchain_core/indexing/api.py +121 -126
  31. langchain_core/indexing/base.py +73 -75
  32. langchain_core/indexing/in_memory.py +4 -6
  33. langchain_core/language_models/__init__.py +14 -29
  34. langchain_core/language_models/_utils.py +58 -61
  35. langchain_core/language_models/base.py +82 -172
  36. langchain_core/language_models/chat_models.py +329 -402
  37. langchain_core/language_models/fake.py +11 -11
  38. langchain_core/language_models/fake_chat_models.py +42 -36
  39. langchain_core/language_models/llms.py +189 -269
  40. langchain_core/load/dump.py +9 -12
  41. langchain_core/load/load.py +18 -28
  42. langchain_core/load/mapping.py +2 -4
  43. langchain_core/load/serializable.py +42 -40
  44. langchain_core/messages/__init__.py +10 -16
  45. langchain_core/messages/ai.py +148 -148
  46. langchain_core/messages/base.py +53 -51
  47. langchain_core/messages/block_translators/__init__.py +19 -22
  48. langchain_core/messages/block_translators/anthropic.py +6 -6
  49. langchain_core/messages/block_translators/bedrock_converse.py +5 -5
  50. langchain_core/messages/block_translators/google_genai.py +10 -7
  51. langchain_core/messages/block_translators/google_vertexai.py +4 -32
  52. langchain_core/messages/block_translators/groq.py +117 -21
  53. langchain_core/messages/block_translators/langchain_v0.py +5 -5
  54. langchain_core/messages/block_translators/openai.py +11 -11
  55. langchain_core/messages/chat.py +2 -6
  56. langchain_core/messages/content.py +339 -330
  57. langchain_core/messages/function.py +6 -10
  58. langchain_core/messages/human.py +24 -31
  59. langchain_core/messages/modifier.py +2 -2
  60. langchain_core/messages/system.py +19 -29
  61. langchain_core/messages/tool.py +74 -90
  62. langchain_core/messages/utils.py +484 -510
  63. langchain_core/output_parsers/__init__.py +13 -10
  64. langchain_core/output_parsers/base.py +61 -61
  65. langchain_core/output_parsers/format_instructions.py +9 -4
  66. langchain_core/output_parsers/json.py +12 -10
  67. langchain_core/output_parsers/list.py +21 -23
  68. langchain_core/output_parsers/openai_functions.py +49 -47
  69. langchain_core/output_parsers/openai_tools.py +30 -23
  70. langchain_core/output_parsers/pydantic.py +13 -14
  71. langchain_core/output_parsers/string.py +5 -5
  72. langchain_core/output_parsers/transform.py +15 -17
  73. langchain_core/output_parsers/xml.py +35 -34
  74. langchain_core/outputs/__init__.py +1 -1
  75. langchain_core/outputs/chat_generation.py +18 -18
  76. langchain_core/outputs/chat_result.py +1 -3
  77. langchain_core/outputs/generation.py +16 -16
  78. langchain_core/outputs/llm_result.py +10 -10
  79. langchain_core/prompt_values.py +13 -19
  80. langchain_core/prompts/__init__.py +3 -27
  81. langchain_core/prompts/base.py +81 -86
  82. langchain_core/prompts/chat.py +308 -351
  83. langchain_core/prompts/dict.py +6 -6
  84. langchain_core/prompts/few_shot.py +81 -88
  85. langchain_core/prompts/few_shot_with_templates.py +11 -13
  86. langchain_core/prompts/image.py +12 -14
  87. langchain_core/prompts/loading.py +4 -6
  88. langchain_core/prompts/message.py +7 -7
  89. langchain_core/prompts/prompt.py +24 -39
  90. langchain_core/prompts/string.py +26 -10
  91. langchain_core/prompts/structured.py +49 -53
  92. langchain_core/rate_limiters.py +51 -60
  93. langchain_core/retrievers.py +61 -198
  94. langchain_core/runnables/base.py +1551 -1656
  95. langchain_core/runnables/branch.py +68 -70
  96. langchain_core/runnables/config.py +72 -89
  97. langchain_core/runnables/configurable.py +145 -161
  98. langchain_core/runnables/fallbacks.py +102 -96
  99. langchain_core/runnables/graph.py +91 -97
  100. langchain_core/runnables/graph_ascii.py +27 -28
  101. langchain_core/runnables/graph_mermaid.py +42 -51
  102. langchain_core/runnables/graph_png.py +43 -16
  103. langchain_core/runnables/history.py +175 -177
  104. langchain_core/runnables/passthrough.py +151 -167
  105. langchain_core/runnables/retry.py +46 -51
  106. langchain_core/runnables/router.py +30 -35
  107. langchain_core/runnables/schema.py +75 -80
  108. langchain_core/runnables/utils.py +60 -67
  109. langchain_core/stores.py +85 -121
  110. langchain_core/structured_query.py +8 -8
  111. langchain_core/sys_info.py +29 -29
  112. langchain_core/tools/__init__.py +1 -14
  113. langchain_core/tools/base.py +306 -245
  114. langchain_core/tools/convert.py +160 -155
  115. langchain_core/tools/render.py +10 -10
  116. langchain_core/tools/retriever.py +12 -11
  117. langchain_core/tools/simple.py +19 -24
  118. langchain_core/tools/structured.py +32 -39
  119. langchain_core/tracers/__init__.py +1 -9
  120. langchain_core/tracers/base.py +97 -99
  121. langchain_core/tracers/context.py +29 -52
  122. langchain_core/tracers/core.py +49 -53
  123. langchain_core/tracers/evaluation.py +11 -11
  124. langchain_core/tracers/event_stream.py +65 -64
  125. langchain_core/tracers/langchain.py +21 -21
  126. langchain_core/tracers/log_stream.py +45 -45
  127. langchain_core/tracers/memory_stream.py +3 -3
  128. langchain_core/tracers/root_listeners.py +16 -16
  129. langchain_core/tracers/run_collector.py +2 -4
  130. langchain_core/tracers/schemas.py +0 -129
  131. langchain_core/tracers/stdout.py +3 -3
  132. langchain_core/utils/__init__.py +1 -4
  133. langchain_core/utils/_merge.py +2 -2
  134. langchain_core/utils/aiter.py +57 -61
  135. langchain_core/utils/env.py +9 -9
  136. langchain_core/utils/function_calling.py +94 -188
  137. langchain_core/utils/html.py +7 -8
  138. langchain_core/utils/input.py +9 -6
  139. langchain_core/utils/interactive_env.py +1 -1
  140. langchain_core/utils/iter.py +36 -40
  141. langchain_core/utils/json.py +4 -3
  142. langchain_core/utils/json_schema.py +9 -9
  143. langchain_core/utils/mustache.py +8 -10
  144. langchain_core/utils/pydantic.py +35 -37
  145. langchain_core/utils/strings.py +6 -9
  146. langchain_core/utils/usage.py +1 -1
  147. langchain_core/utils/utils.py +66 -62
  148. langchain_core/vectorstores/base.py +182 -216
  149. langchain_core/vectorstores/in_memory.py +101 -176
  150. langchain_core/vectorstores/utils.py +5 -5
  151. langchain_core/version.py +1 -1
  152. langchain_core-1.0.4.dist-info/METADATA +69 -0
  153. langchain_core-1.0.4.dist-info/RECORD +172 -0
  154. {langchain_core-1.0.0a6.dist-info → langchain_core-1.0.4.dist-info}/WHEEL +1 -1
  155. langchain_core/memory.py +0 -120
  156. langchain_core/messages/block_translators/ollama.py +0 -47
  157. langchain_core/prompts/pipeline.py +0 -138
  158. langchain_core/pydantic_v1/__init__.py +0 -30
  159. langchain_core/pydantic_v1/dataclasses.py +0 -23
  160. langchain_core/pydantic_v1/main.py +0 -23
  161. langchain_core/tracers/langchain_v1.py +0 -31
  162. langchain_core/utils/loading.py +0 -35
  163. langchain_core-1.0.0a6.dist-info/METADATA +0 -67
  164. langchain_core-1.0.0a6.dist-info/RECORD +0 -181
  165. langchain_core-1.0.0a6.dist-info/entry_points.txt +0 -4
langchain_core/indexing/api.py

@@ -6,16 +6,20 @@ import hashlib
 import json
 import uuid
 import warnings
-from collections.abc import AsyncIterable, AsyncIterator, Iterable, Iterator, Sequence
+from collections.abc import (
+    AsyncIterable,
+    AsyncIterator,
+    Callable,
+    Iterable,
+    Iterator,
+    Sequence,
+)
 from itertools import islice
 from typing import (
     Any,
-    Callable,
     Literal,
-    Optional,
     TypedDict,
     TypeVar,
-    Union,
     cast,
 )

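The import hunk above sets the pattern for most of this file's changes: `Callable` now comes from `collections.abc`, and `Optional`/`Union` annotations are rewritten as PEP 604 unions, which require Python 3.10+. A minimal sketch of the equivalence (illustrative only, not taken from the package):

```python
from typing import Optional, Union

# Pre-1.0.4 style (removed by this diff):
old_cleanup: Optional[str] = None
old_key: Union[str, int] = "sha1"

# Post-1.0.4 style (PEP 604 unions, Python 3.10+):
new_cleanup: str | None = None
new_key: str | int = "sha1"
```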
@@ -107,8 +111,8 @@ async def _abatch(size: int, iterable: AsyncIterable[T]) -> AsyncIterator[list[T]]:


 def _get_source_id_assigner(
-    source_id_key: Union[str, Callable[[Document], str], None],
-) -> Callable[[Document], Union[str, None]]:
+    source_id_key: str | Callable[[Document], str] | None,
+) -> Callable[[Document], str | None]:
     """Get the source id from the document."""
     if source_id_key is None:
         return lambda _doc: None
@@ -162,9 +166,8 @@ def _calculate_hash(
 def _get_document_with_hash(
     document: Document,
     *,
-    key_encoder: Union[
-        Callable[[Document], str], Literal["sha1", "sha256", "sha512", "blake2b"]
-    ],
+    key_encoder: Callable[[Document], str]
+    | Literal["sha1", "sha256", "sha512", "blake2b"],
 ) -> Document:
     """Calculate a hash of the document, and assign it to the uid.

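`_get_document_with_hash` (and, further down, `index()`/`aindex()`) now types `key_encoder` as either a named algorithm or a `Callable[[Document], str]`. A hypothetical custom encoder matching that shape (my example, not part of the package):

```python
import hashlib

from langchain_core.documents import Document


def content_only_key(doc: Document) -> str:
    """Key on page content only, ignoring metadata."""
    return hashlib.sha256(doc.page_content.encode("utf-8")).hexdigest()
```

Such a callable can be passed as `key_encoder=content_only_key` in place of `"sha1"`, `"sha256"`, etc.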
@@ -233,7 +236,7 @@ class _HashedDocument:


 def _delete(
-    vector_store: Union[VectorStore, DocumentIndex],
+    vector_store: VectorStore | DocumentIndex,
     ids: list[str],
 ) -> None:
     if isinstance(vector_store, VectorStore):
@@ -271,19 +274,18 @@ class IndexingResult(TypedDict):


 def index(
-    docs_source: Union[BaseLoader, Iterable[Document]],
+    docs_source: BaseLoader | Iterable[Document],
     record_manager: RecordManager,
-    vector_store: Union[VectorStore, DocumentIndex],
+    vector_store: VectorStore | DocumentIndex,
     *,
     batch_size: int = 100,
-    cleanup: Optional[Literal["incremental", "full", "scoped_full"]] = None,
-    source_id_key: Union[str, Callable[[Document], str], None] = None,
+    cleanup: Literal["incremental", "full", "scoped_full"] | None = None,
+    source_id_key: str | Callable[[Document], str] | None = None,
     cleanup_batch_size: int = 1_000,
     force_update: bool = False,
-    key_encoder: Union[
-        Literal["sha1", "sha256", "sha512", "blake2b"], Callable[[Document], str]
-    ] = "sha1",
-    upsert_kwargs: Optional[dict[str, Any]] = None,
+    key_encoder: Literal["sha1", "sha256", "sha512", "blake2b"]
+    | Callable[[Document], str] = "sha1",
+    upsert_kwargs: dict[str, Any] | None = None,
 ) -> IndexingResult:
     """Index data from the loader into the vector store.

@@ -296,61 +298,58 @@ def index(
     For the time being, documents are indexed using their hashes, and users
     are not able to specify the uid of the document.

-    .. versionchanged:: 0.3.25
-        Added ``scoped_full`` cleanup mode.
+    !!! warning "Behavior changed in `langchain-core` 0.3.25"
+        Added `scoped_full` cleanup mode.

-    .. important::
+    !!! warning

        * In full mode, the loader should be returning
-          the entire dataset, and not just a subset of the dataset.
-          Otherwise, the auto_cleanup will remove documents that it is not
-          supposed to.
+         the entire dataset, and not just a subset of the dataset.
+         Otherwise, the auto_cleanup will remove documents that it is not
+         supposed to.
        * In incremental mode, if documents associated with a particular
-          source id appear across different batches, the indexing API
-          will do some redundant work. This will still result in the
-          correct end state of the index, but will unfortunately not be
-          100% efficient. For example, if a given document is split into 15
-          chunks, and we index them using a batch size of 5, we'll have 3 batches
-          all with the same source id. In general, to avoid doing too much
-          redundant work select as big a batch size as possible.
-       * The ``scoped_full`` mode is suitable if determining an appropriate batch size
-          is challenging or if your data loader cannot return the entire dataset at
-          once. This mode keeps track of source IDs in memory, which should be fine
-          for most use cases. If your dataset is large (10M+ docs), you will likely
-          need to parallelize the indexing process regardless.
+         source id appear across different batches, the indexing API
+         will do some redundant work. This will still result in the
+         correct end state of the index, but will unfortunately not be
+         100% efficient. For example, if a given document is split into 15
+         chunks, and we index them using a batch size of 5, we'll have 3 batches
+         all with the same source id. In general, to avoid doing too much
+         redundant work select as big a batch size as possible.
+       * The `scoped_full` mode is suitable if determining an appropriate batch size
+         is challenging or if your data loader cannot return the entire dataset at
+         once. This mode keeps track of source IDs in memory, which should be fine
+         for most use cases. If your dataset is large (10M+ docs), you will likely
+         need to parallelize the indexing process regardless.

     Args:
         docs_source: Data loader or iterable of documents to index.
         record_manager: Timestamped set to keep track of which documents were
            updated.
-        vector_store: VectorStore or DocumentIndex to index the documents into.
-        batch_size: Batch size to use when indexing. Default is 100.
-        cleanup: How to handle clean up of documents. Default is None.
+        vector_store: `VectorStore` or DocumentIndex to index the documents into.
+        batch_size: Batch size to use when indexing.
+        cleanup: How to handle clean up of documents.

            - incremental: Cleans up all documents that haven't been updated AND
-              that are associated with source ids that were seen during indexing.
-              Clean up is done continuously during indexing helping to minimize the
-              probability of users seeing duplicated content.
+                that are associated with source IDs that were seen during indexing.
+                Clean up is done continuously during indexing helping to minimize the
+                probability of users seeing duplicated content.
            - full: Delete all documents that have not been returned by the loader
-              during this run of indexing.
-              Clean up runs after all documents have been indexed.
-              This means that users may see duplicated content during indexing.
+                during this run of indexing.
+                Clean up runs after all documents have been indexed.
+                This means that users may see duplicated content during indexing.
            - scoped_full: Similar to Full, but only deletes all documents
-              that haven't been updated AND that are associated with
-              source ids that were seen during indexing.
+                that haven't been updated AND that are associated with
+                source IDs that were seen during indexing.
            - None: Do not delete any documents.
        source_id_key: Optional key that helps identify the original source
-            of the document. Default is None.
+            of the document.
        cleanup_batch_size: Batch size to use when cleaning up documents.
-            Default is 1_000.
        force_update: Force update documents even if they are present in the
            record manager. Useful if you are re-indexing with updated embeddings.
-            Default is False.
        key_encoder: Hashing algorithm to use for hashing the document content and
-            metadata. Default is "sha1".
-            Other options include "blake2b", "sha256", and "sha512".
+            metadata. Options include "blake2b", "sha256", and "sha512".

-            .. versionadded:: 0.3.66
+            !!! version-added "Added in `langchain-core` 0.3.66"

        key_encoder: Hashing algorithm to use for hashing the document.
            If not provided, a default encoder using SHA-1 will be used.
@@ -364,10 +363,10 @@ def index(
            When changing the key encoder, you must change the
            index as well to avoid duplicated documents in the cache.
        upsert_kwargs: Additional keyword arguments to pass to the add_documents
-            method of the VectorStore or the upsert method of the DocumentIndex.
+            method of the `VectorStore` or the upsert method of the DocumentIndex.
            For example, you can use this to specify a custom vector_field:
            upsert_kwargs={"vector_field": "embedding"}
-            .. versionadded:: 0.3.10
+            !!! version-added "Added in `langchain-core` 0.3.10"

    Returns:
        Indexing result which contains information about how many documents
@@ -376,11 +375,11 @@ def index(
    Raises:
        ValueError: If cleanup mode is not one of 'incremental', 'full' or None
        ValueError: If cleanup mode is incremental and source_id_key is None.
-        ValueError: If vectorstore does not have
+        ValueError: If `VectorStore` does not have
            "delete" and "add_documents" required methods.
        ValueError: If source_id_key is not None, but is not a string or callable.
-        TypeError: If ``vectorstore`` is not a VectorStore or a DocumentIndex.
-        AssertionError: If ``source_id`` is None when cleanup mode is incremental.
+        TypeError: If `vectorstore` is not a `VectorStore` or a DocumentIndex.
+        AssertionError: If `source_id` is None when cleanup mode is incremental.
            (should be unreachable code).
    """
    # Behavior is deprecated, but we keep it for backwards compatibility.
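The hunks above migrate `index()`'s signature to PEP 604 unions and its docstring to MkDocs-style admonitions. For context, a minimal usage sketch of the updated call, using the in-memory helpers that ship with langchain-core (my own example, not part of the diff; a real deployment would use a persistent record manager and vector store):

```python
from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.indexing import InMemoryRecordManager, index
from langchain_core.vectorstores import InMemoryVectorStore

record_manager = InMemoryRecordManager(namespace="demo")
record_manager.create_schema()
vector_store = InMemoryVectorStore(embedding=DeterministicFakeEmbedding(size=8))

docs = [
    Document(page_content="chunk one", metadata={"source": "report.txt"}),
    Document(page_content="chunk two", metadata={"source": "report.txt"}),
]

result = index(
    docs,
    record_manager,
    vector_store,
    cleanup="incremental",   # clean up stale chunks per source ID
    source_id_key="source",  # required for incremental / scoped_full cleanup
    key_encoder="sha256",    # or a Callable[[Document], str]
)
# result is an IndexingResult, e.g. on a first run:
# {'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}
```

Re-running the same call with unchanged documents should skip them; documents that change or disappear for a seen source are cleaned up because `cleanup="incremental"`.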
@@ -416,7 +415,7 @@ def index(
        raise ValueError(msg)

    if type(destination).delete == VectorStore.delete:
-        # Checking if the vectorstore has overridden the default delete method
+        # Checking if the VectorStore has overridden the default delete method
        # implementation which just raises a NotImplementedError
        msg = "Vectorstore has not implemented the delete method"
        raise ValueError(msg)
@@ -462,16 +461,16 @@ def index(
        # Count documents removed by within-batch deduplication
        num_skipped += original_batch_size - len(hashed_docs)

-        source_ids: Sequence[Optional[str]] = [
+        source_ids: Sequence[str | None] = [
            source_id_assigner(hashed_doc) for hashed_doc in hashed_docs
        ]

        if cleanup in {"incremental", "scoped_full"}:
-            # source ids are required.
-            for source_id, hashed_doc in zip(source_ids, hashed_docs):
+            # Source IDs are required.
+            for source_id, hashed_doc in zip(source_ids, hashed_docs, strict=False):
                if source_id is None:
                    msg = (
-                        f"Source ids are required when cleanup mode is "
+                        f"Source IDs are required when cleanup mode is "
                        f"incremental or scoped_full. "
                        f"Document that starts with "
                        f"content: {hashed_doc.page_content[:100]} "
@@ -480,7 +479,7 @@ def index(
                    raise ValueError(msg)
                if cleanup == "scoped_full":
                    scoped_full_cleanup_source_ids.add(source_id)
-            # source ids cannot be None after for loop above.
+            # Source IDs cannot be None after for loop above.
            source_ids = cast("Sequence[str]", source_ids)

        exists_batch = record_manager.exists(
@@ -492,7 +491,7 @@ def index(
        docs_to_index = []
        uids_to_refresh = []
        seen_docs: set[str] = set()
-        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch):
+        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch, strict=False):
            hashed_id = cast("str", hashed_doc.id)
            if doc_exists:
                if force_update:
@@ -539,7 +538,7 @@ def index(
        # If source IDs are provided, we can do the deletion incrementally!
        if cleanup == "incremental":
            # Get the uids of the documents that were not returned by the loader.
-            # mypy isn't good enough to determine that source ids cannot be None
+            # mypy isn't good enough to determine that source IDs cannot be None
            # here due to a check that's happening above, so we check again.
            for source_id in source_ids:
                if source_id is None:
@@ -563,7 +562,7 @@ def index(
    if cleanup == "full" or (
        cleanup == "scoped_full" and scoped_full_cleanup_source_ids
    ):
-        delete_group_ids: Optional[Sequence[str]] = None
+        delete_group_ids: Sequence[str] | None = None
        if cleanup == "scoped_full":
            delete_group_ids = list(scoped_full_cleanup_source_ids)
        while uids_to_delete := record_manager.list_keys(
@@ -591,7 +590,7 @@ async def _to_async_iterator(iterator: Iterable[T]) -> AsyncIterator[T]:


 async def _adelete(
-    vector_store: Union[VectorStore, DocumentIndex],
+    vector_store: VectorStore | DocumentIndex,
     ids: list[str],
 ) -> None:
     if isinstance(vector_store, VectorStore):
@@ -613,19 +612,18 @@ async def _adelete(


 async def aindex(
-    docs_source: Union[BaseLoader, Iterable[Document], AsyncIterator[Document]],
+    docs_source: BaseLoader | Iterable[Document] | AsyncIterator[Document],
     record_manager: RecordManager,
-    vector_store: Union[VectorStore, DocumentIndex],
+    vector_store: VectorStore | DocumentIndex,
     *,
     batch_size: int = 100,
-    cleanup: Optional[Literal["incremental", "full", "scoped_full"]] = None,
-    source_id_key: Union[str, Callable[[Document], str], None] = None,
+    cleanup: Literal["incremental", "full", "scoped_full"] | None = None,
+    source_id_key: str | Callable[[Document], str] | None = None,
     cleanup_batch_size: int = 1_000,
     force_update: bool = False,
-    key_encoder: Union[
-        Literal["sha1", "sha256", "sha512", "blake2b"], Callable[[Document], str]
-    ] = "sha1",
-    upsert_kwargs: Optional[dict[str, Any]] = None,
+    key_encoder: Literal["sha1", "sha256", "sha512", "blake2b"]
+    | Callable[[Document], str] = "sha1",
+    upsert_kwargs: dict[str, Any] | None = None,
 ) -> IndexingResult:
     """Async index data from the loader into the vector store.

@@ -638,61 +636,58 @@ async def aindex(
     For the time being, documents are indexed using their hashes, and users
     are not able to specify the uid of the document.

-    .. versionchanged:: 0.3.25
-        Added ``scoped_full`` cleanup mode.
+    !!! warning "Behavior changed in `langchain-core` 0.3.25"
+        Added `scoped_full` cleanup mode.

-    .. important::
+    !!! warning

        * In full mode, the loader should be returning
-          the entire dataset, and not just a subset of the dataset.
-          Otherwise, the auto_cleanup will remove documents that it is not
-          supposed to.
+         the entire dataset, and not just a subset of the dataset.
+         Otherwise, the auto_cleanup will remove documents that it is not
+         supposed to.
        * In incremental mode, if documents associated with a particular
-          source id appear across different batches, the indexing API
-          will do some redundant work. This will still result in the
-          correct end state of the index, but will unfortunately not be
-          100% efficient. For example, if a given document is split into 15
-          chunks, and we index them using a batch size of 5, we'll have 3 batches
-          all with the same source id. In general, to avoid doing too much
-          redundant work select as big a batch size as possible.
-       * The ``scoped_full`` mode is suitable if determining an appropriate batch size
-          is challenging or if your data loader cannot return the entire dataset at
-          once. This mode keeps track of source IDs in memory, which should be fine
-          for most use cases. If your dataset is large (10M+ docs), you will likely
-          need to parallelize the indexing process regardless.
+         source id appear across different batches, the indexing API
+         will do some redundant work. This will still result in the
+         correct end state of the index, but will unfortunately not be
+         100% efficient. For example, if a given document is split into 15
+         chunks, and we index them using a batch size of 5, we'll have 3 batches
+         all with the same source id. In general, to avoid doing too much
+         redundant work select as big a batch size as possible.
+       * The `scoped_full` mode is suitable if determining an appropriate batch size
+         is challenging or if your data loader cannot return the entire dataset at
+         once. This mode keeps track of source IDs in memory, which should be fine
+         for most use cases. If your dataset is large (10M+ docs), you will likely
+         need to parallelize the indexing process regardless.

     Args:
        docs_source: Data loader or iterable of documents to index.
        record_manager: Timestamped set to keep track of which documents were
            updated.
-        vector_store: VectorStore or DocumentIndex to index the documents into.
-        batch_size: Batch size to use when indexing. Default is 100.
-        cleanup: How to handle clean up of documents. Default is None.
+        vector_store: `VectorStore` or DocumentIndex to index the documents into.
+        batch_size: Batch size to use when indexing.
+        cleanup: How to handle clean up of documents.

            - incremental: Cleans up all documents that haven't been updated AND
-              that are associated with source ids that were seen during indexing.
-              Clean up is done continuously during indexing helping to minimize the
-              probability of users seeing duplicated content.
+                that are associated with source IDs that were seen during indexing.
+                Clean up is done continuously during indexing helping to minimize the
+                probability of users seeing duplicated content.
            - full: Delete all documents that have not been returned by the loader
-              during this run of indexing.
-              Clean up runs after all documents have been indexed.
-              This means that users may see duplicated content during indexing.
+                during this run of indexing.
+                Clean up runs after all documents have been indexed.
+                This means that users may see duplicated content during indexing.
            - scoped_full: Similar to Full, but only deletes all documents
-              that haven't been updated AND that are associated with
-              source ids that were seen during indexing.
+                that haven't been updated AND that are associated with
+                source IDs that were seen during indexing.
            - None: Do not delete any documents.
        source_id_key: Optional key that helps identify the original source
-            of the document. Default is None.
+            of the document.
        cleanup_batch_size: Batch size to use when cleaning up documents.
-            Default is 1_000.
        force_update: Force update documents even if they are present in the
            record manager. Useful if you are re-indexing with updated embeddings.
-            Default is False.
        key_encoder: Hashing algorithm to use for hashing the document content and
-            metadata. Default is "sha1".
-            Other options include "blake2b", "sha256", and "sha512".
+            metadata. Options include "blake2b", "sha256", and "sha512".

-            .. versionadded:: 0.3.66
+            !!! version-added "Added in `langchain-core` 0.3.66"

        key_encoder: Hashing algorithm to use for hashing the document.
            If not provided, a default encoder using SHA-1 will be used.
@@ -706,10 +701,10 @@ async def aindex(
            When changing the key encoder, you must change the
            index as well to avoid duplicated documents in the cache.
        upsert_kwargs: Additional keyword arguments to pass to the add_documents
-            method of the VectorStore or the upsert method of the DocumentIndex.
+            method of the `VectorStore` or the upsert method of the DocumentIndex.
            For example, you can use this to specify a custom vector_field:
            upsert_kwargs={"vector_field": "embedding"}
-            .. versionadded:: 0.3.10
+            !!! version-added "Added in `langchain-core` 0.3.10"

    Returns:
        Indexing result which contains information about how many documents
@@ -718,12 +713,12 @@ async def aindex(
    Raises:
        ValueError: If cleanup mode is not one of 'incremental', 'full' or None
        ValueError: If cleanup mode is incremental and source_id_key is None.
-        ValueError: If vectorstore does not have
+        ValueError: If `VectorStore` does not have
            "adelete" and "aadd_documents" required methods.
        ValueError: If source_id_key is not None, but is not a string or callable.
-        TypeError: If ``vector_store`` is not a VectorStore or DocumentIndex.
-        AssertionError: If ``source_id_key`` is None when cleanup mode is
-            incremental or ``scoped_full`` (should be unreachable).
+        TypeError: If `vector_store` is not a `VectorStore` or DocumentIndex.
+        AssertionError: If `source_id_key` is None when cleanup mode is
+            incremental or `scoped_full` (should be unreachable).
    """
    # Behavior is deprecated, but we keep it for backwards compatibility.
    # # Warn only once per process.
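`aindex()` mirrors the sync call. A minimal async sketch with the updated signature (again my own example, not part of the diff, reusing the same in-memory helpers):

```python
import asyncio

from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.indexing import InMemoryRecordManager, aindex
from langchain_core.vectorstores import InMemoryVectorStore


async def main() -> None:
    record_manager = InMemoryRecordManager(namespace="demo-async")
    await record_manager.acreate_schema()
    vector_store = InMemoryVectorStore(embedding=DeterministicFakeEmbedding(size=8))

    docs = [Document(page_content="hello", metadata={"source": "notes.md"})]

    result = await aindex(
        docs,  # also accepts a BaseLoader or an AsyncIterator[Document]
        record_manager,
        vector_store,
        cleanup="scoped_full",
        source_id_key="source",
    )
    print(result)


asyncio.run(main())
```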
@@ -762,7 +757,7 @@ async def aindex(
        type(destination).adelete == VectorStore.adelete
        and type(destination).delete == VectorStore.delete
    ):
-        # Checking if the vectorstore has overridden the default adelete or delete
+        # Checking if the VectorStore has overridden the default adelete or delete
        # methods implementation which just raises a NotImplementedError
        msg = "Vectorstore has not implemented the adelete or delete method"
        raise ValueError(msg)
@@ -815,16 +810,16 @@ async def aindex(
        # Count documents removed by within-batch deduplication
        num_skipped += original_batch_size - len(hashed_docs)

-        source_ids: Sequence[Optional[str]] = [
+        source_ids: Sequence[str | None] = [
            source_id_assigner(doc) for doc in hashed_docs
        ]

        if cleanup in {"incremental", "scoped_full"}:
-            # If the cleanup mode is incremental, source ids are required.
-            for source_id, hashed_doc in zip(source_ids, hashed_docs):
+            # If the cleanup mode is incremental, source IDs are required.
+            for source_id, hashed_doc in zip(source_ids, hashed_docs, strict=False):
                if source_id is None:
                    msg = (
-                        f"Source ids are required when cleanup mode is "
+                        f"Source IDs are required when cleanup mode is "
                        f"incremental or scoped_full. "
                        f"Document that starts with "
                        f"content: {hashed_doc.page_content[:100]} "
@@ -833,7 +828,7 @@ async def aindex(
                    raise ValueError(msg)
                if cleanup == "scoped_full":
                    scoped_full_cleanup_source_ids.add(source_id)
-            # source ids cannot be None after for loop above.
+            # Source IDs cannot be None after for loop above.
            source_ids = cast("Sequence[str]", source_ids)

        exists_batch = await record_manager.aexists(
@@ -845,7 +840,7 @@ async def aindex(
        docs_to_index: list[Document] = []
        uids_to_refresh = []
        seen_docs: set[str] = set()
-        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch):
+        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch, strict=False):
            hashed_id = cast("str", hashed_doc.id)
            if doc_exists:
                if force_update:
@@ -893,7 +888,7 @@ async def aindex(
        if cleanup == "incremental":
            # Get the uids of the documents that were not returned by the loader.

-            # mypy isn't good enough to determine that source ids cannot be None
+            # mypy isn't good enough to determine that source IDs cannot be None
            # here due to a check that's happening above, so we check again.
            for source_id in source_ids:
                if source_id is None:
@@ -917,7 +912,7 @@ async def aindex(
    if cleanup == "full" or (
        cleanup == "scoped_full" and scoped_full_cleanup_source_ids
    ):
-        delete_group_ids: Optional[Sequence[str]] = None
+        delete_group_ids: Sequence[str] | None = None
        if cleanup == "scoped_full":
            delete_group_ids = list(scoped_full_cleanup_source_ids)
        while uids_to_delete := await record_manager.alist_keys(