langchain-core 0.3.75__py3-none-any.whl → 0.3.77__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of langchain-core might be problematic.

Files changed (119)
  1. langchain_core/_api/beta_decorator.py +22 -44
  2. langchain_core/_api/deprecation.py +30 -17
  3. langchain_core/_api/path.py +19 -2
  4. langchain_core/_import_utils.py +7 -0
  5. langchain_core/agents.py +10 -6
  6. langchain_core/beta/runnables/context.py +1 -2
  7. langchain_core/callbacks/base.py +28 -15
  8. langchain_core/callbacks/manager.py +83 -71
  9. langchain_core/callbacks/usage.py +6 -4
  10. langchain_core/chat_history.py +29 -21
  11. langchain_core/document_loaders/base.py +34 -9
  12. langchain_core/document_loaders/langsmith.py +4 -1
  13. langchain_core/documents/base.py +35 -10
  14. langchain_core/documents/transformers.py +4 -2
  15. langchain_core/embeddings/fake.py +8 -5
  16. langchain_core/env.py +2 -3
  17. langchain_core/example_selectors/base.py +12 -0
  18. langchain_core/exceptions.py +7 -0
  19. langchain_core/globals.py +17 -28
  20. langchain_core/indexing/api.py +88 -76
  21. langchain_core/indexing/base.py +5 -8
  22. langchain_core/indexing/in_memory.py +23 -3
  23. langchain_core/language_models/__init__.py +3 -2
  24. langchain_core/language_models/base.py +31 -20
  25. langchain_core/language_models/chat_models.py +98 -27
  26. langchain_core/language_models/fake_chat_models.py +10 -9
  27. langchain_core/language_models/llms.py +52 -18
  28. langchain_core/load/dump.py +2 -3
  29. langchain_core/load/load.py +15 -1
  30. langchain_core/load/serializable.py +39 -44
  31. langchain_core/memory.py +7 -3
  32. langchain_core/messages/ai.py +53 -24
  33. langchain_core/messages/base.py +43 -22
  34. langchain_core/messages/chat.py +4 -1
  35. langchain_core/messages/content_blocks.py +23 -2
  36. langchain_core/messages/function.py +9 -5
  37. langchain_core/messages/human.py +13 -10
  38. langchain_core/messages/modifier.py +1 -0
  39. langchain_core/messages/system.py +11 -8
  40. langchain_core/messages/tool.py +60 -29
  41. langchain_core/messages/utils.py +250 -131
  42. langchain_core/output_parsers/base.py +5 -2
  43. langchain_core/output_parsers/json.py +4 -4
  44. langchain_core/output_parsers/list.py +7 -22
  45. langchain_core/output_parsers/openai_functions.py +3 -0
  46. langchain_core/output_parsers/openai_tools.py +6 -1
  47. langchain_core/output_parsers/pydantic.py +4 -0
  48. langchain_core/output_parsers/string.py +5 -1
  49. langchain_core/output_parsers/xml.py +19 -19
  50. langchain_core/outputs/chat_generation.py +25 -10
  51. langchain_core/outputs/generation.py +14 -3
  52. langchain_core/outputs/llm_result.py +8 -1
  53. langchain_core/prompt_values.py +16 -6
  54. langchain_core/prompts/base.py +4 -9
  55. langchain_core/prompts/chat.py +89 -57
  56. langchain_core/prompts/dict.py +16 -8
  57. langchain_core/prompts/few_shot.py +12 -11
  58. langchain_core/prompts/few_shot_with_templates.py +5 -1
  59. langchain_core/prompts/image.py +12 -5
  60. langchain_core/prompts/message.py +5 -6
  61. langchain_core/prompts/pipeline.py +13 -8
  62. langchain_core/prompts/prompt.py +22 -8
  63. langchain_core/prompts/string.py +18 -10
  64. langchain_core/prompts/structured.py +7 -2
  65. langchain_core/rate_limiters.py +2 -2
  66. langchain_core/retrievers.py +7 -6
  67. langchain_core/runnables/base.py +406 -186
  68. langchain_core/runnables/branch.py +14 -19
  69. langchain_core/runnables/config.py +9 -15
  70. langchain_core/runnables/configurable.py +34 -19
  71. langchain_core/runnables/fallbacks.py +20 -13
  72. langchain_core/runnables/graph.py +48 -38
  73. langchain_core/runnables/graph_ascii.py +41 -18
  74. langchain_core/runnables/graph_mermaid.py +54 -25
  75. langchain_core/runnables/graph_png.py +27 -31
  76. langchain_core/runnables/history.py +55 -58
  77. langchain_core/runnables/passthrough.py +44 -21
  78. langchain_core/runnables/retry.py +44 -23
  79. langchain_core/runnables/router.py +9 -8
  80. langchain_core/runnables/schema.py +2 -0
  81. langchain_core/runnables/utils.py +51 -89
  82. langchain_core/stores.py +19 -31
  83. langchain_core/sys_info.py +9 -8
  84. langchain_core/tools/base.py +37 -28
  85. langchain_core/tools/convert.py +26 -15
  86. langchain_core/tools/simple.py +36 -8
  87. langchain_core/tools/structured.py +25 -12
  88. langchain_core/tracers/base.py +2 -2
  89. langchain_core/tracers/context.py +5 -1
  90. langchain_core/tracers/core.py +109 -39
  91. langchain_core/tracers/evaluation.py +22 -26
  92. langchain_core/tracers/event_stream.py +45 -34
  93. langchain_core/tracers/langchain.py +12 -3
  94. langchain_core/tracers/langchain_v1.py +10 -2
  95. langchain_core/tracers/log_stream.py +56 -17
  96. langchain_core/tracers/root_listeners.py +4 -20
  97. langchain_core/tracers/run_collector.py +6 -16
  98. langchain_core/tracers/schemas.py +5 -1
  99. langchain_core/utils/aiter.py +15 -7
  100. langchain_core/utils/env.py +3 -0
  101. langchain_core/utils/function_calling.py +50 -28
  102. langchain_core/utils/interactive_env.py +6 -2
  103. langchain_core/utils/iter.py +12 -4
  104. langchain_core/utils/json.py +12 -3
  105. langchain_core/utils/json_schema.py +156 -40
  106. langchain_core/utils/loading.py +5 -1
  107. langchain_core/utils/mustache.py +24 -15
  108. langchain_core/utils/pydantic.py +38 -9
  109. langchain_core/utils/utils.py +25 -9
  110. langchain_core/vectorstores/base.py +7 -20
  111. langchain_core/vectorstores/in_memory.py +23 -17
  112. langchain_core/vectorstores/utils.py +18 -12
  113. langchain_core/version.py +1 -1
  114. langchain_core-0.3.77.dist-info/METADATA +67 -0
  115. langchain_core-0.3.77.dist-info/RECORD +174 -0
  116. langchain_core-0.3.75.dist-info/METADATA +0 -106
  117. langchain_core-0.3.75.dist-info/RECORD +0 -174
  118. {langchain_core-0.3.75.dist-info → langchain_core-0.3.77.dist-info}/WHEEL +0 -0
  119. {langchain_core-0.3.75.dist-info → langchain_core-0.3.77.dist-info}/entry_points.txt +0 -0
langchain_core/documents/base.py CHANGED
@@ -82,7 +82,7 @@ class Blob(BaseMedia):
  blob = Blob.from_data(
  data="Hello, world!",
  mime_type="text/plain",
- metadata={"source": "https://example.com"}
+ metadata={"source": "https://example.com"},
  )

  Example: Load the blob from a file
@@ -145,7 +145,14 @@ class Blob(BaseMedia):
  return values

  def as_string(self) -> str:
- """Read data as a string."""
+ """Read data as a string.
+
+ Raises:
+ ValueError: If the blob cannot be represented as a string.
+
+ Returns:
+ The data as a string.
+ """
  if self.data is None and self.path:
  return Path(self.path).read_text(encoding=self.encoding)
  if isinstance(self.data, bytes):
@@ -156,7 +163,14 @@ class Blob(BaseMedia):
  raise ValueError(msg)

  def as_bytes(self) -> bytes:
- """Read data as bytes."""
+ """Read data as bytes.
+
+ Raises:
+ ValueError: If the blob cannot be represented as bytes.
+
+ Returns:
+ The data as bytes.
+ """
  if isinstance(self.data, bytes):
  return self.data
  if isinstance(self.data, str):
@@ -168,7 +182,14 @@ class Blob(BaseMedia):

  @contextlib.contextmanager
  def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]:
- """Read data as a byte stream."""
+ """Read data as a byte stream.
+
+ Raises:
+ NotImplementedError: If the blob cannot be represented as a byte stream.
+
+ Yields:
+ The data as a byte stream.
+ """
  if isinstance(self.data, bytes):
  yield BytesIO(self.data)
  elif self.data is None and self.path:
@@ -246,7 +267,7 @@ class Blob(BaseMedia):
  )

  def __repr__(self) -> str:
- """Define the blob representation."""
+ """Return the blob representation."""
  str_repr = f"Blob {id(self)}"
  if self.source:
  str_repr += f" {self.source}"
@@ -263,8 +284,7 @@ class Document(BaseMedia):
  from langchain_core.documents import Document

  document = Document(
- page_content="Hello, world!",
- metadata={"source": "https://example.com"}
+ page_content="Hello, world!", metadata={"source": "https://example.com"}
  )

  """
@@ -281,19 +301,24 @@ class Document(BaseMedia):

  @classmethod
  def is_lc_serializable(cls) -> bool:
- """Return whether this class is serializable."""
+ """Return True as this class is serializable."""
  return True

  @classmethod
  def get_lc_namespace(cls) -> list[str]:
  """Get the namespace of the langchain object.

- Default namespace is ["langchain", "schema", "document"].
+ Returns:
+ ["langchain", "schema", "document"]
  """
  return ["langchain", "schema", "document"]

  def __str__(self) -> str:
- """Override __str__ to restrict it to page_content and metadata."""
+ """Override __str__ to restrict it to page_content and metadata.
+
+ Returns:
+ A string representation of the Document.
+ """
  # The format matches pydantic format for __str__.
  #
  # The purpose of this change is to make sure that user code that
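Note: the Blob accessor docstrings above now spell out their Raises/Returns behavior. A minimal sketch of how those accessors behave in practice (data values are illustrative; requires only langchain-core):

    from langchain_core.documents.base import Blob

    # A blob built from in-memory text, mirroring the docstring example above.
    text_blob = Blob.from_data(
        data="Hello, world!",
        mime_type="text/plain",
        metadata={"source": "https://example.com"},
    )
    print(text_blob.as_string())  # "Hello, world!"
    print(text_blob.as_bytes())   # b"Hello, world!"

    # A blob built from raw bytes can also be read as a byte stream.
    byte_blob = Blob.from_data(data=b"Hello, world!")
    with byte_blob.as_bytes_io() as stream:
        print(stream.read())      # b"Hello, world!"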
langchain_core/documents/transformers.py CHANGED
@@ -38,7 +38,9 @@ class BaseDocumentTransformer(ABC):
  self.embeddings, stateful_documents
  )
  included_idxs = _filter_similar_embeddings(
- embedded_documents, self.similarity_fn, self.similarity_threshold
+ embedded_documents,
+ self.similarity_fn,
+ self.similarity_threshold,
  )
  return [stateful_documents[i] for i in sorted(included_idxs)]

@@ -47,7 +49,7 @@ class BaseDocumentTransformer(ABC):
  ) -> Sequence[Document]:
  raise NotImplementedError

- """ # noqa: E501
+ """

  @abstractmethod
  def transform_documents(
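Note: the hunks above only reflow a docstring example; the BaseDocumentTransformer interface itself is unchanged. A minimal sketch of a concrete transformer, assuming (as in current langchain-core) that only transform_documents is abstract; the class name and lower-casing logic are purely illustrative:

    from collections.abc import Sequence
    from typing import Any

    from langchain_core.documents import Document
    from langchain_core.documents.transformers import BaseDocumentTransformer


    class LowercaseTransformer(BaseDocumentTransformer):
        """Hypothetical transformer used only to illustrate the interface."""

        def transform_documents(
            self, documents: Sequence[Document], **kwargs: Any
        ) -> Sequence[Document]:
            # Return new Documents with lower-cased page_content, keeping metadata.
            return [
                Document(page_content=doc.page_content.lower(), metadata=doc.metadata)
                for doc in documents
            ]


    docs = [Document(page_content="Hello, World!")]
    print(LowercaseTransformer().transform_documents(docs)[0].page_content)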
langchain_core/embeddings/fake.py CHANGED
@@ -1,6 +1,7 @@
  """Module contains a few fake embedding models for testing purposes."""

  # Please do not add additional fake embedding model implementations here.
+ import contextlib
  import hashlib

  from pydantic import BaseModel
@@ -8,6 +9,9 @@ from typing_extensions import override

  from langchain_core.embeddings import Embeddings

+ with contextlib.suppress(ImportError):
+ import numpy as np
+

  class FakeEmbeddings(Embeddings, BaseModel):
  """Fake embedding model for unit testing purposes.
@@ -20,6 +24,7 @@ class FakeEmbeddings(Embeddings, BaseModel):
  .. code-block:: python

  from langchain_core.embeddings import FakeEmbeddings
+
  embed = FakeEmbeddings(size=100)

  Embed single text:
@@ -53,8 +58,6 @@ class FakeEmbeddings(Embeddings, BaseModel):
  """The size of the embedding vector."""

  def _get_embedding(self) -> list[float]:
- import numpy as np
-
  return list(np.random.default_rng().normal(size=self.size))

  @override
@@ -78,6 +81,7 @@ class DeterministicFakeEmbedding(Embeddings, BaseModel):
  .. code-block:: python

  from langchain_core.embeddings import DeterministicFakeEmbedding
+
  embed = DeterministicFakeEmbedding(size=100)

  Embed single text:
@@ -111,13 +115,12 @@ class DeterministicFakeEmbedding(Embeddings, BaseModel):
  """The size of the embedding vector."""

  def _get_embedding(self, seed: int) -> list[float]:
- import numpy as np
-
  # set the seed for the random generator
  rng = np.random.default_rng(seed)
  return list(rng.normal(size=self.size))

- def _get_seed(self, text: str) -> int:
+ @staticmethod
+ def _get_seed(text: str) -> int:
  """Get a seed for the random generator, using the hash of the text."""
  return int(hashlib.sha256(text.encode("utf-8")).hexdigest(), 16) % 10**8

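Note: the change above moves the numpy import to module level (guarded by contextlib.suppress(ImportError)) and turns _get_seed into a staticmethod; the public behaviour of the fake embedding models is unchanged. A quick sketch of their use (assumes numpy is installed):

    from langchain_core.embeddings import DeterministicFakeEmbedding, FakeEmbeddings

    # Random vectors of a fixed size; values differ between calls.
    fake = FakeEmbeddings(size=8)
    print(len(fake.embed_query("hello")))  # 8

    # Deterministic vectors: the same text always hashes to the same seed,
    # so repeated queries return identical vectors.
    det = DeterministicFakeEmbedding(size=8)
    assert det.embed_query("hello") == det.embed_query("hello")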
langchain_core/env.py CHANGED
@@ -3,6 +3,8 @@
  import platform
  from functools import lru_cache

+ from langchain_core import __version__
+

  @lru_cache(maxsize=1)
  def get_runtime_environment() -> dict:
@@ -11,9 +13,6 @@ def get_runtime_environment() -> dict:
  Returns:
  A dictionary with information about the runtime environment.
  """
- # Lazy import to avoid circular imports
- from langchain_core import __version__
-
  return {
  "library_version": __version__,
  "library": "langchain-core",
langchain_core/example_selectors/base.py CHANGED
@@ -16,6 +16,9 @@ class BaseExampleSelector(ABC):
  Args:
  example: A dictionary with keys as input variables
  and values as their values.
+
+ Returns:
+ Any return value.
  """

  async def aadd_example(self, example: dict[str, str]) -> Any:
@@ -24,6 +27,9 @@ class BaseExampleSelector(ABC):
  Args:
  example: A dictionary with keys as input variables
  and values as their values.
+
+ Returns:
+ Any return value.
  """
  return await run_in_executor(None, self.add_example, example)

@@ -34,6 +40,9 @@ class BaseExampleSelector(ABC):
  Args:
  input_variables: A dictionary with keys as input variables
  and values as their values.
+
+ Returns:
+ A list of examples.
  """

  async def aselect_examples(self, input_variables: dict[str, str]) -> list[dict]:
@@ -42,5 +51,8 @@ class BaseExampleSelector(ABC):
  Args:
  input_variables: A dictionary with keys as input variables
  and values as their values.
+
+ Returns:
+ A list of examples.
  """
  return await run_in_executor(None, self.select_examples, input_variables)
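Note: the hunks above only document the Returns values of the abstract BaseExampleSelector API; the async variants keep their run_in_executor defaults, so a subclass only needs the two sync methods. A minimal sketch of a concrete selector, with a hypothetical class name and trivially simple selection logic:

    from typing import Any

    from langchain_core.example_selectors.base import BaseExampleSelector


    class FirstNExampleSelector(BaseExampleSelector):
        """Hypothetical selector that stores examples and returns the first two."""

        def __init__(self) -> None:
            self.examples: list[dict[str, str]] = []

        def add_example(self, example: dict[str, str]) -> Any:
            self.examples.append(example)

        def select_examples(self, input_variables: dict[str, str]) -> list[dict]:
            # A real selector would inspect input_variables; this sketch ignores them.
            return self.examples[:2]


    selector = FirstNExampleSelector()
    selector.add_example({"question": "2 + 2", "answer": "4"})
    print(selector.select_examples({"question": "3 + 3"}))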
langchain_core/exceptions.py CHANGED
@@ -42,6 +42,10 @@ class OutputParserException(ValueError, LangChainException): # noqa: N818
  previous output was improperly structured, in the hopes that it will
  update the output to the correct format.
  Defaults to False.
+
+ Raises:
+ ValueError: If ``send_to_llm`` is True but either observation or
+ ``llm_output`` are not provided.
  """
  if isinstance(error, str):
  error = create_message(
@@ -77,6 +81,9 @@ def create_message(*, message: str, error_code: ErrorCode) -> str:
  Args:
  message: The message to display.
  error_code: The error code to display.
+
+ Returns:
+ The full message with the troubleshooting link.
  """
  return (
  f"{message}\n"
langchain_core/globals.py CHANGED
@@ -6,6 +6,13 @@ from typing import TYPE_CHECKING, Optional
  if TYPE_CHECKING:
  from langchain_core.caches import BaseCache

+ try:
+ import langchain # type: ignore[import-not-found]
+
+ _HAS_LANGCHAIN = True
+ except ImportError:
+ _HAS_LANGCHAIN = False
+

  # DO NOT USE THESE VALUES DIRECTLY!
  # Use them only via `get_<X>()` and `set_<X>()` below,
@@ -22,9 +29,7 @@ def set_verbose(value: bool) -> None: # noqa: FBT001
  Args:
  value: The new value for the `verbose` global setting.
  """
- try:
- import langchain # type: ignore[import-not-found]
-
+ if _HAS_LANGCHAIN:
  # We're about to run some deprecated code, don't report warnings from it.
  # The user called the correct (non-deprecated) code path and shouldn't get
  # warnings.
@@ -43,8 +48,6 @@ def set_verbose(value: bool) -> None: # noqa: FBT001
  # Remove it once `langchain.verbose` is no longer supported, and once all
  # users have migrated to using `set_verbose()` here.
  langchain.verbose = value
- except ImportError:
- pass

  global _verbose # noqa: PLW0603
  _verbose = value
@@ -56,9 +59,7 @@ def get_verbose() -> bool:
  Returns:
  The value of the `verbose` global setting.
  """
- try:
- import langchain
-
+ if _HAS_LANGCHAIN:
  # We're about to run some deprecated code, don't report warnings from it.
  # The user called the correct (non-deprecated) code path and shouldn't get
  # warnings.
@@ -83,7 +84,7 @@ def get_verbose() -> bool:
  # deprecation warnings directing them to use `set_verbose()` when they
  # import `langchain.verbose`.
  old_verbose = langchain.verbose
- except ImportError:
+ else:
  old_verbose = False

  return _verbose or old_verbose
@@ -95,9 +96,7 @@ def set_debug(value: bool) -> None: # noqa: FBT001
  Args:
  value: The new value for the `debug` global setting.
  """
- try:
- import langchain
-
+ if _HAS_LANGCHAIN:
  # We're about to run some deprecated code, don't report warnings from it.
  # The user called the correct (non-deprecated) code path and shouldn't get
  # warnings.
@@ -114,8 +113,6 @@ def set_debug(value: bool) -> None: # noqa: FBT001
  # Remove it once `langchain.debug` is no longer supported, and once all
  # users have migrated to using `set_debug()` here.
  langchain.debug = value
- except ImportError:
- pass

  global _debug # noqa: PLW0603
  _debug = value
@@ -127,9 +124,7 @@ def get_debug() -> bool:
  Returns:
  The value of the `debug` global setting.
  """
- try:
- import langchain
-
+ if _HAS_LANGCHAIN:
  # We're about to run some deprecated code, don't report warnings from it.
  # The user called the correct (non-deprecated) code path and shouldn't get
  # warnings.
@@ -151,7 +146,7 @@ def get_debug() -> bool:
  # to using `set_debug()` yet. Those users are getting deprecation warnings
  # directing them to use `set_debug()` when they import `langchain.debug`.
  old_debug = langchain.debug
- except ImportError:
+ else:
  old_debug = False

  return _debug or old_debug
@@ -163,9 +158,7 @@ def set_llm_cache(value: Optional["BaseCache"]) -> None:
  Args:
  value: The new LLM cache to use. If `None`, the LLM cache is disabled.
  """
- try:
- import langchain
-
+ if _HAS_LANGCHAIN:
  # We're about to run some deprecated code, don't report warnings from it.
  # The user called the correct (non-deprecated) code path and shouldn't get
  # warnings.
@@ -184,22 +177,18 @@ def set_llm_cache(value: Optional["BaseCache"]) -> None:
  # Remove it once `langchain.llm_cache` is no longer supported, and
  # once all users have migrated to using `set_llm_cache()` here.
  langchain.llm_cache = value
- except ImportError:
- pass

  global _llm_cache # noqa: PLW0603
  _llm_cache = value


- def get_llm_cache() -> "BaseCache":
+ def get_llm_cache() -> Optional["BaseCache"]:
  """Get the value of the `llm_cache` global setting.

  Returns:
  The value of the `llm_cache` global setting.
  """
- try:
- import langchain
-
+ if _HAS_LANGCHAIN:
  # We're about to run some deprecated code, don't report warnings from it.
  # The user called the correct (non-deprecated) code path and shouldn't get
  # warnings.
@@ -225,7 +214,7 @@ def get_llm_cache() -> "BaseCache":
  # Those users are getting deprecation warnings directing them
  # to use `set_llm_cache()` when they import `langchain.llm_cache`.
  old_llm_cache = langchain.llm_cache
- except ImportError:
+ else:
  old_llm_cache = None

  return _llm_cache or old_llm_cache
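Note: the diff replaces the per-call try/except ImportError blocks with a single module-level _HAS_LANGCHAIN flag; the public getters and setters keep the same behaviour. A quick sketch of that public API:

    from langchain_core.globals import get_debug, get_verbose, set_debug, set_verbose

    # The setters update process-wide flags; the getters read them back, falling
    # back to the legacy `langchain.debug` / `langchain.verbose` attributes only
    # when the optional `langchain` package is importable.
    set_verbose(True)
    set_debug(False)
    print(get_verbose(), get_debug())  # True False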
langchain_core/indexing/api.py CHANGED
@@ -56,7 +56,7 @@ def _warn_about_sha1() -> None:
  "that map to the same fingerprint. If this matters in your "
  "threat model, switch to a stronger algorithm such "
  "as 'blake2b', 'sha256', or 'sha512' by specifying "
- " `key_encoder` parameter in the the `index` or `aindex` function. ",
+ " `key_encoder` parameter in the `index` or `aindex` function. ",
  category=UserWarning,
  stacklevel=2,
  )
@@ -185,6 +185,9 @@ def _get_document_with_hash(
  When changing the key encoder, you must change the
  index as well to avoid duplicated documents in the cache.

+ Raises:
+ ValueError: If the metadata cannot be serialized using json.
+
  Returns:
  Document with a unique identifier based on the hash of the content and metadata.
  """
@@ -291,22 +294,26 @@ def index(
  documents were deleted, which documents should be skipped.

  For the time being, documents are indexed using their hashes, and users
- are not able to specify the uid of the document.
-
- Important:
- * In full mode, the loader should be returning
- the entire dataset, and not just a subset of the dataset.
- Otherwise, the auto_cleanup will remove documents that it is not
- supposed to.
- * In incremental mode, if documents associated with a particular
- source id appear across different batches, the indexing API
- will do some redundant work. This will still result in the
- correct end state of the index, but will unfortunately not be
- 100% efficient. For example, if a given document is split into 15
- chunks, and we index them using a batch size of 5, we'll have 3 batches
- all with the same source id. In general, to avoid doing too much
- redundant work select as big a batch size as possible.
- * The `scoped_full` mode is suitable if determining an appropriate batch size
+ are not able to specify the uid of the document.
+
+ .. versionchanged:: 0.3.25
+ Added ``scoped_full`` cleanup mode.
+
+ .. important::
+
+ * In full mode, the loader should be returning
+ the entire dataset, and not just a subset of the dataset.
+ Otherwise, the auto_cleanup will remove documents that it is not
+ supposed to.
+ * In incremental mode, if documents associated with a particular
+ source id appear across different batches, the indexing API
+ will do some redundant work. This will still result in the
+ correct end state of the index, but will unfortunately not be
+ 100% efficient. For example, if a given document is split into 15
+ chunks, and we index them using a batch size of 5, we'll have 3 batches
+ all with the same source id. In general, to avoid doing too much
+ redundant work select as big a batch size as possible.
+ * The ``scoped_full`` mode is suitable if determining an appropriate batch size
  is challenging or if your data loader cannot return the entire dataset at
  once. This mode keeps track of source IDs in memory, which should be fine
  for most use cases. If your dataset is large (10M+ docs), you will likely
@@ -315,23 +322,22 @@ def index(
  Args:
  docs_source: Data loader or iterable of documents to index.
  record_manager: Timestamped set to keep track of which documents were
- updated.
+ updated.
  vector_store: VectorStore or DocumentIndex to index the documents into.
  batch_size: Batch size to use when indexing. Default is 100.
  cleanup: How to handle clean up of documents. Default is None.
+
  - incremental: Cleans up all documents that haven't been updated AND
- that are associated with source ids that were seen
- during indexing.
- Clean up is done continuously during indexing helping
- to minimize the probability of users seeing duplicated
- content.
+ that are associated with source ids that were seen during indexing.
+ Clean up is done continuously during indexing helping to minimize the
+ probability of users seeing duplicated content.
  - full: Delete all documents that have not been returned by the loader
- during this run of indexing.
- Clean up runs after all documents have been indexed.
- This means that users may see duplicated content during indexing.
+ during this run of indexing.
+ Clean up runs after all documents have been indexed.
+ This means that users may see duplicated content during indexing.
  - scoped_full: Similar to Full, but only deletes all documents
- that haven't been updated AND that are associated with
- source ids that were seen during indexing.
+ that haven't been updated AND that are associated with
+ source ids that were seen during indexing.
  - None: Do not delete any documents.
  source_id_key: Optional key that helps identify the original source
  of the document. Default is None.
@@ -358,10 +364,9 @@ def index(
  When changing the key encoder, you must change the
  index as well to avoid duplicated documents in the cache.
  upsert_kwargs: Additional keyword arguments to pass to the add_documents
- method of the VectorStore or the upsert method of the
- DocumentIndex. For example, you can use this to
- specify a custom vector_field:
- upsert_kwargs={"vector_field": "embedding"}
+ method of the VectorStore or the upsert method of the DocumentIndex.
+ For example, you can use this to specify a custom vector_field:
+ upsert_kwargs={"vector_field": "embedding"}
  .. versionadded:: 0.3.10

  Returns:
@@ -374,10 +379,9 @@ def index(
  ValueError: If vectorstore does not have
  "delete" and "add_documents" required methods.
  ValueError: If source_id_key is not None, but is not a string or callable.
-
- .. version_modified:: 0.3.25
-
- * Added `scoped_full` cleanup mode.
+ TypeError: If ``vectorstore`` is not a VectorStore or a DocumentIndex.
+ AssertionError: If ``source_id`` is None when cleanup mode is incremental.
+ (should be unreachable code).
  """
  # Behavior is deprecated, but we keep it for backwards compatibility.
  # # Warn only once per process.
@@ -632,46 +636,50 @@ async def aindex(
  documents were deleted, which documents should be skipped.

  For the time being, documents are indexed using their hashes, and users
- are not able to specify the uid of the document.
-
- Important:
- * In full mode, the loader should be returning
- the entire dataset, and not just a subset of the dataset.
- Otherwise, the auto_cleanup will remove documents that it is not
- supposed to.
- * In incremental mode, if documents associated with a particular
- source id appear across different batches, the indexing API
- will do some redundant work. This will still result in the
- correct end state of the index, but will unfortunately not be
- 100% efficient. For example, if a given document is split into 15
- chunks, and we index them using a batch size of 5, we'll have 3 batches
- all with the same source id. In general, to avoid doing too much
- redundant work select as big a batch size as possible.
- * The `scoped_full` mode is suitable if determining an appropriate batch size
- is challenging or if your data loader cannot return the entire dataset at
- once. This mode keeps track of source IDs in memory, which should be fine
- for most use cases. If your dataset is large (10M+ docs), you will likely
- need to parallelize the indexing process regardless.
+ are not able to specify the uid of the document.
+
+ .. versionchanged:: 0.3.25
+ Added ``scoped_full`` cleanup mode.
+
+ .. important::
+
+ * In full mode, the loader should be returning
+ the entire dataset, and not just a subset of the dataset.
+ Otherwise, the auto_cleanup will remove documents that it is not
+ supposed to.
+ * In incremental mode, if documents associated with a particular
+ source id appear across different batches, the indexing API
+ will do some redundant work. This will still result in the
+ correct end state of the index, but will unfortunately not be
+ 100% efficient. For example, if a given document is split into 15
+ chunks, and we index them using a batch size of 5, we'll have 3 batches
+ all with the same source id. In general, to avoid doing too much
+ redundant work select as big a batch size as possible.
+ * The ``scoped_full`` mode is suitable if determining an appropriate batch size
+ is challenging or if your data loader cannot return the entire dataset at
+ once. This mode keeps track of source IDs in memory, which should be fine
+ for most use cases. If your dataset is large (10M+ docs), you will likely
+ need to parallelize the indexing process regardless.

  Args:
  docs_source: Data loader or iterable of documents to index.
  record_manager: Timestamped set to keep track of which documents were
- updated.
+ updated.
  vector_store: VectorStore or DocumentIndex to index the documents into.
  batch_size: Batch size to use when indexing. Default is 100.
  cleanup: How to handle clean up of documents. Default is None.
+
  - incremental: Cleans up all documents that haven't been updated AND
- that are associated with source ids that were seen
- during indexing.
- Clean up is done continuously during indexing helping
- to minimize the probability of users seeing duplicated
- content.
- - full: Delete all documents that haven to been returned by the loader.
- Clean up runs after all documents have been indexed.
- This means that users may see duplicated content during indexing.
+ that are associated with source ids that were seen during indexing.
+ Clean up is done continuously during indexing helping to minimize the
+ probability of users seeing duplicated content.
+ - full: Delete all documents that have not been returned by the loader
+ during this run of indexing.
+ Clean up runs after all documents have been indexed.
+ This means that users may see duplicated content during indexing.
  - scoped_full: Similar to Full, but only deletes all documents
- that haven't been updated AND that are associated with
- source ids that were seen during indexing.
+ that haven't been updated AND that are associated with
+ source ids that were seen during indexing.
  - None: Do not delete any documents.
  source_id_key: Optional key that helps identify the original source
  of the document. Default is None.
@@ -680,6 +688,12 @@ async def aindex(
  force_update: Force update documents even if they are present in the
  record manager. Useful if you are re-indexing with updated embeddings.
  Default is False.
+ key_encoder: Hashing algorithm to use for hashing the document content and
+ metadata. Default is "sha1".
+ Other options include "blake2b", "sha256", and "sha512".
+
+ .. versionadded:: 0.3.66
+
  key_encoder: Hashing algorithm to use for hashing the document.
  If not provided, a default encoder using SHA-1 will be used.
  SHA-1 is not collision-resistant, and a motivated attacker
@@ -691,11 +705,10 @@ async def aindex(

  When changing the key encoder, you must change the
  index as well to avoid duplicated documents in the cache.
- upsert_kwargs: Additional keyword arguments to pass to the aadd_documents
- method of the VectorStore or the aupsert method of the
- DocumentIndex. For example, you can use this to
- specify a custom vector_field:
- upsert_kwargs={"vector_field": "embedding"}
+ upsert_kwargs: Additional keyword arguments to pass to the add_documents
+ method of the VectorStore or the upsert method of the DocumentIndex.
+ For example, you can use this to specify a custom vector_field:
+ upsert_kwargs={"vector_field": "embedding"}
  .. versionadded:: 0.3.10

  Returns:
@@ -708,10 +721,9 @@ async def aindex(
  ValueError: If vectorstore does not have
  "adelete" and "aadd_documents" required methods.
  ValueError: If source_id_key is not None, but is not a string or callable.
-
- .. version_modified:: 0.3.25
-
- * Added `scoped_full` cleanup mode.
+ TypeError: If ``vector_store`` is not a VectorStore or DocumentIndex.
+ AssertionError: If ``source_id_key`` is None when cleanup mode is
+ incremental or ``scoped_full`` (should be unreachable).
  """
  # Behavior is deprecated, but we keep it for backwards compatibility.
  # # Warn only once per process.
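Note: the rewritten index()/aindex() docstrings above describe the cleanup modes and the key_encoder argument. A hedged sketch of a minimal index() call using the in-memory helpers shipped with langchain-core (document contents and the namespace are illustrative; numpy is assumed for the fake embeddings):

    from langchain_core.documents import Document
    from langchain_core.embeddings import DeterministicFakeEmbedding
    from langchain_core.indexing import InMemoryRecordManager, index
    from langchain_core.vectorstores import InMemoryVectorStore

    # In-memory record manager and vector store, both provided by langchain-core.
    record_manager = InMemoryRecordManager(namespace="demo")
    record_manager.create_schema()
    vector_store = InMemoryVectorStore(embedding=DeterministicFakeEmbedding(size=32))

    docs = [
        Document(page_content="kitty", metadata={"source": "kitty.txt"}),
        Document(page_content="doggy", metadata={"source": "doggy.txt"}),
    ]

    # Incremental cleanup removes stale documents that share a source id with
    # documents seen during this indexing run (see the docstring above).
    result = index(
        docs,
        record_manager,
        vector_store,
        cleanup="incremental",
        source_id_key="source",
    )
    print(result)  # e.g. {'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}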