langchain-core 1.0.0a6__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff compares the contents of publicly available package versions as released to their public registries. It is provided for informational purposes only.
Files changed (165)
  1. langchain_core/__init__.py +1 -1
  2. langchain_core/_api/__init__.py +3 -4
  3. langchain_core/_api/beta_decorator.py +23 -26
  4. langchain_core/_api/deprecation.py +51 -64
  5. langchain_core/_api/path.py +3 -6
  6. langchain_core/_import_utils.py +3 -4
  7. langchain_core/agents.py +20 -22
  8. langchain_core/caches.py +65 -66
  9. langchain_core/callbacks/__init__.py +1 -8
  10. langchain_core/callbacks/base.py +321 -336
  11. langchain_core/callbacks/file.py +44 -44
  12. langchain_core/callbacks/manager.py +436 -513
  13. langchain_core/callbacks/stdout.py +29 -30
  14. langchain_core/callbacks/streaming_stdout.py +32 -32
  15. langchain_core/callbacks/usage.py +60 -57
  16. langchain_core/chat_history.py +53 -68
  17. langchain_core/document_loaders/base.py +27 -25
  18. langchain_core/document_loaders/blob_loaders.py +1 -1
  19. langchain_core/document_loaders/langsmith.py +44 -48
  20. langchain_core/documents/__init__.py +23 -3
  21. langchain_core/documents/base.py +98 -90
  22. langchain_core/documents/compressor.py +10 -10
  23. langchain_core/documents/transformers.py +34 -35
  24. langchain_core/embeddings/fake.py +50 -54
  25. langchain_core/example_selectors/length_based.py +1 -1
  26. langchain_core/example_selectors/semantic_similarity.py +28 -32
  27. langchain_core/exceptions.py +21 -20
  28. langchain_core/globals.py +3 -151
  29. langchain_core/indexing/__init__.py +1 -1
  30. langchain_core/indexing/api.py +121 -126
  31. langchain_core/indexing/base.py +73 -75
  32. langchain_core/indexing/in_memory.py +4 -6
  33. langchain_core/language_models/__init__.py +14 -29
  34. langchain_core/language_models/_utils.py +58 -61
  35. langchain_core/language_models/base.py +53 -162
  36. langchain_core/language_models/chat_models.py +298 -387
  37. langchain_core/language_models/fake.py +11 -11
  38. langchain_core/language_models/fake_chat_models.py +42 -36
  39. langchain_core/language_models/llms.py +125 -235
  40. langchain_core/load/dump.py +9 -12
  41. langchain_core/load/load.py +18 -28
  42. langchain_core/load/mapping.py +2 -4
  43. langchain_core/load/serializable.py +42 -40
  44. langchain_core/messages/__init__.py +10 -16
  45. langchain_core/messages/ai.py +148 -148
  46. langchain_core/messages/base.py +53 -51
  47. langchain_core/messages/block_translators/__init__.py +19 -22
  48. langchain_core/messages/block_translators/anthropic.py +6 -6
  49. langchain_core/messages/block_translators/bedrock_converse.py +5 -5
  50. langchain_core/messages/block_translators/google_genai.py +10 -7
  51. langchain_core/messages/block_translators/google_vertexai.py +4 -32
  52. langchain_core/messages/block_translators/groq.py +117 -21
  53. langchain_core/messages/block_translators/langchain_v0.py +5 -5
  54. langchain_core/messages/block_translators/openai.py +11 -11
  55. langchain_core/messages/chat.py +2 -6
  56. langchain_core/messages/content.py +337 -328
  57. langchain_core/messages/function.py +6 -10
  58. langchain_core/messages/human.py +24 -31
  59. langchain_core/messages/modifier.py +2 -2
  60. langchain_core/messages/system.py +19 -29
  61. langchain_core/messages/tool.py +74 -90
  62. langchain_core/messages/utils.py +474 -504
  63. langchain_core/output_parsers/__init__.py +13 -10
  64. langchain_core/output_parsers/base.py +61 -61
  65. langchain_core/output_parsers/format_instructions.py +9 -4
  66. langchain_core/output_parsers/json.py +12 -10
  67. langchain_core/output_parsers/list.py +21 -23
  68. langchain_core/output_parsers/openai_functions.py +49 -47
  69. langchain_core/output_parsers/openai_tools.py +16 -21
  70. langchain_core/output_parsers/pydantic.py +13 -14
  71. langchain_core/output_parsers/string.py +5 -5
  72. langchain_core/output_parsers/transform.py +15 -17
  73. langchain_core/output_parsers/xml.py +35 -34
  74. langchain_core/outputs/__init__.py +1 -1
  75. langchain_core/outputs/chat_generation.py +18 -18
  76. langchain_core/outputs/chat_result.py +1 -3
  77. langchain_core/outputs/generation.py +10 -11
  78. langchain_core/outputs/llm_result.py +10 -10
  79. langchain_core/prompt_values.py +11 -17
  80. langchain_core/prompts/__init__.py +3 -27
  81. langchain_core/prompts/base.py +48 -56
  82. langchain_core/prompts/chat.py +275 -325
  83. langchain_core/prompts/dict.py +5 -5
  84. langchain_core/prompts/few_shot.py +81 -88
  85. langchain_core/prompts/few_shot_with_templates.py +11 -13
  86. langchain_core/prompts/image.py +12 -14
  87. langchain_core/prompts/loading.py +4 -6
  88. langchain_core/prompts/message.py +3 -3
  89. langchain_core/prompts/prompt.py +24 -39
  90. langchain_core/prompts/string.py +26 -10
  91. langchain_core/prompts/structured.py +49 -53
  92. langchain_core/rate_limiters.py +51 -60
  93. langchain_core/retrievers.py +61 -198
  94. langchain_core/runnables/base.py +1476 -1626
  95. langchain_core/runnables/branch.py +53 -57
  96. langchain_core/runnables/config.py +72 -89
  97. langchain_core/runnables/configurable.py +120 -137
  98. langchain_core/runnables/fallbacks.py +83 -79
  99. langchain_core/runnables/graph.py +91 -97
  100. langchain_core/runnables/graph_ascii.py +27 -28
  101. langchain_core/runnables/graph_mermaid.py +38 -50
  102. langchain_core/runnables/graph_png.py +15 -16
  103. langchain_core/runnables/history.py +135 -148
  104. langchain_core/runnables/passthrough.py +124 -150
  105. langchain_core/runnables/retry.py +46 -51
  106. langchain_core/runnables/router.py +25 -30
  107. langchain_core/runnables/schema.py +75 -80
  108. langchain_core/runnables/utils.py +60 -67
  109. langchain_core/stores.py +85 -121
  110. langchain_core/structured_query.py +8 -8
  111. langchain_core/sys_info.py +27 -29
  112. langchain_core/tools/__init__.py +1 -14
  113. langchain_core/tools/base.py +284 -229
  114. langchain_core/tools/convert.py +160 -155
  115. langchain_core/tools/render.py +10 -10
  116. langchain_core/tools/retriever.py +12 -11
  117. langchain_core/tools/simple.py +19 -24
  118. langchain_core/tools/structured.py +32 -39
  119. langchain_core/tracers/__init__.py +1 -9
  120. langchain_core/tracers/base.py +97 -99
  121. langchain_core/tracers/context.py +29 -52
  122. langchain_core/tracers/core.py +49 -53
  123. langchain_core/tracers/evaluation.py +11 -11
  124. langchain_core/tracers/event_stream.py +65 -64
  125. langchain_core/tracers/langchain.py +21 -21
  126. langchain_core/tracers/log_stream.py +45 -45
  127. langchain_core/tracers/memory_stream.py +3 -3
  128. langchain_core/tracers/root_listeners.py +16 -16
  129. langchain_core/tracers/run_collector.py +2 -4
  130. langchain_core/tracers/schemas.py +0 -129
  131. langchain_core/tracers/stdout.py +3 -3
  132. langchain_core/utils/__init__.py +1 -4
  133. langchain_core/utils/_merge.py +2 -2
  134. langchain_core/utils/aiter.py +57 -61
  135. langchain_core/utils/env.py +9 -9
  136. langchain_core/utils/function_calling.py +89 -186
  137. langchain_core/utils/html.py +7 -8
  138. langchain_core/utils/input.py +6 -6
  139. langchain_core/utils/interactive_env.py +1 -1
  140. langchain_core/utils/iter.py +36 -40
  141. langchain_core/utils/json.py +4 -3
  142. langchain_core/utils/json_schema.py +9 -9
  143. langchain_core/utils/mustache.py +8 -10
  144. langchain_core/utils/pydantic.py +33 -35
  145. langchain_core/utils/strings.py +6 -9
  146. langchain_core/utils/usage.py +1 -1
  147. langchain_core/utils/utils.py +66 -62
  148. langchain_core/vectorstores/base.py +182 -216
  149. langchain_core/vectorstores/in_memory.py +101 -176
  150. langchain_core/vectorstores/utils.py +5 -5
  151. langchain_core/version.py +1 -1
  152. langchain_core-1.0.3.dist-info/METADATA +69 -0
  153. langchain_core-1.0.3.dist-info/RECORD +172 -0
  154. {langchain_core-1.0.0a6.dist-info → langchain_core-1.0.3.dist-info}/WHEEL +1 -1
  155. langchain_core/memory.py +0 -120
  156. langchain_core/messages/block_translators/ollama.py +0 -47
  157. langchain_core/prompts/pipeline.py +0 -138
  158. langchain_core/pydantic_v1/__init__.py +0 -30
  159. langchain_core/pydantic_v1/dataclasses.py +0 -23
  160. langchain_core/pydantic_v1/main.py +0 -23
  161. langchain_core/tracers/langchain_v1.py +0 -31
  162. langchain_core/utils/loading.py +0 -35
  163. langchain_core-1.0.0a6.dist-info/METADATA +0 -67
  164. langchain_core-1.0.0a6.dist-info/RECORD +0 -181
  165. langchain_core-1.0.0a6.dist-info/entry_points.txt +0 -4
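The hunks below (from the document loader and documents modules) illustrate two themes that run through the whole release: `typing.Optional`/`typing.Union` annotations are replaced with PEP 604 union syntax, and Sphinx-style docstring markup (`.. code-block::`, `.. dropdown::`, `.. versionadded::`) is replaced with MkDocs-style fences and admonitions. A minimal sketch of the annotation change (illustrative only, not code from the package):

```python
from __future__ import annotations  # lets the new syntax run on older interpreters

from typing import Optional, Union


# 1.0.0a6 style: typing.Optional / typing.Union
def load_old(path: Optional[str] = None, data: Union[bytes, str, None] = None) -> None: ...


# 1.0.3 style: PEP 604 unions, no typing imports needed for the annotation
def load_new(path: str | None = None, data: bytes | str | None = None) -> None: ...
```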
langchain_core/document_loaders/base.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 from langchain_core.runnables import run_in_executor
 
@@ -27,7 +27,7 @@ class BaseLoader(ABC): # noqa: B024
     """Interface for Document Loader.
 
     Implementations should implement the lazy-loading method using generators
-    to avoid loading all Documents into memory at once.
+    to avoid loading all documents into memory at once.
 
     `load` is provided just for user convenience and should not be overridden.
     """
@@ -35,38 +35,40 @@ class BaseLoader(ABC): # noqa: B024
     # Sub-classes should not implement this method directly. Instead, they
     # should implement the lazy load method.
     def load(self) -> list[Document]:
-        """Load data into Document objects.
+        """Load data into `Document` objects.
 
         Returns:
-            the documents.
+            The documents.
         """
         return list(self.lazy_load())
 
     async def aload(self) -> list[Document]:
-        """Load data into Document objects.
+        """Load data into `Document` objects.
 
         Returns:
-            the documents.
+            The documents.
         """
         return [document async for document in self.alazy_load()]
 
     def load_and_split(
-        self, text_splitter: Optional[TextSplitter] = None
+        self, text_splitter: TextSplitter | None = None
     ) -> list[Document]:
-        """Load Documents and split into chunks. Chunks are returned as Documents.
+        """Load `Document` and split into chunks. Chunks are returned as `Document`.
 
-        Do not override this method. It should be considered to be deprecated!
+        !!! danger
+
+            Do not override this method. It should be considered to be deprecated!
 
         Args:
-            text_splitter: TextSplitter instance to use for splitting documents.
-                Defaults to RecursiveCharacterTextSplitter.
+            text_splitter: `TextSplitter` instance to use for splitting documents.
+                Defaults to `RecursiveCharacterTextSplitter`.
 
         Raises:
-            ImportError: If langchain-text-splitters is not installed
-                and no text_splitter is provided.
+            ImportError: If `langchain-text-splitters` is not installed
+                and no `text_splitter` is provided.
 
         Returns:
-            List of Documents.
+            List of `Document`.
         """
         if text_splitter is None:
             if not _HAS_TEXT_SPLITTERS:
@@ -86,10 +88,10 @@ class BaseLoader(ABC): # noqa: B024
     # Attention: This method will be upgraded into an abstractmethod once it's
     # implemented in all the existing subclasses.
     def lazy_load(self) -> Iterator[Document]:
-        """A lazy loader for Documents.
+        """A lazy loader for `Document`.
 
         Yields:
-            the documents.
+            The `Document` objects.
         """
         if type(self).load != BaseLoader.load:
            return iter(self.load())
@@ -97,10 +99,10 @@ class BaseLoader(ABC): # noqa: B024
        raise NotImplementedError(msg)
 
     async def alazy_load(self) -> AsyncIterator[Document]:
-        """A lazy loader for Documents.
+        """A lazy loader for `Document`.
 
         Yields:
-            the documents.
+            The `Document` objects.
         """
         iterator = await run_in_executor(None, self.lazy_load)
         done = object()
@@ -115,7 +117,7 @@ class BaseBlobParser(ABC):
     """Abstract interface for blob parsers.
 
     A blob parser provides a way to parse raw data stored in a blob into one
-    or more documents.
+    or more `Document` objects.
 
     The parser can be composed with blob loaders, making it easy to reuse
     a parser independent of how the blob was originally loaded.
@@ -128,25 +130,25 @@ class BaseBlobParser(ABC):
         Subclasses are required to implement this method.
 
         Args:
-            blob: Blob instance
+            blob: `Blob` instance
 
         Returns:
-            Generator of documents
+            Generator of `Document` objects
         """
 
     def parse(self, blob: Blob) -> list[Document]:
-        """Eagerly parse the blob into a document or documents.
+        """Eagerly parse the blob into a `Document` or list of `Document` objects.
 
         This is a convenience method for interactive development environment.
 
-        Production applications should favor the lazy_parse method instead.
+        Production applications should favor the `lazy_parse` method instead.
 
         Subclasses should generally not over-ride this parse method.
 
         Args:
-            blob: Blob instance
+            blob: `Blob` instance
 
         Returns:
-            List of documents
+            List of `Document` objects
         """
         return list(self.lazy_parse(blob))
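The `BaseLoader` interface above only requires `lazy_load` to be a generator; `load`, `aload`, and `load_and_split` are derived from it. A minimal sketch of a custom loader against that interface (the `NewlineLoader` class and the sample text are hypothetical; `langchain-text-splitters` is assumed to be installed for the `load_and_split` call):

```python
from collections.abc import Iterator

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter


class NewlineLoader(BaseLoader):
    """Hypothetical loader: one Document per non-empty line of a string."""

    def __init__(self, text: str) -> None:
        self.text = text

    def lazy_load(self) -> Iterator[Document]:
        # Yielding keeps only one Document in memory at a time.
        for i, line in enumerate(self.text.splitlines()):
            if line.strip():
                yield Document(page_content=line, metadata={"line": i})


loader = NewlineLoader("first line\n\nsecond line")
docs = loader.load()  # eager convenience: list(self.lazy_load())

# load_and_split falls back to RecursiveCharacterTextSplitter when no splitter is
# passed and raises ImportError if langchain-text-splitters is not installed.
chunks = loader.load_and_split(
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=0)
)
```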
langchain_core/document_loaders/blob_loaders.py
@@ -28,7 +28,7 @@ class BlobLoader(ABC):
     def yield_blobs(
         self,
     ) -> Iterable[Blob]:
-        """A lazy loader for raw data represented by LangChain's Blob object.
+        """A lazy loader for raw data represented by LangChain's `Blob` object.
 
         Returns:
             A generator over blobs
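`BaseBlobParser` pairs with `BlobLoader`: the loader yields `Blob` objects and the parser turns each `Blob` into `Document` objects, so the same parser can be reused regardless of where the bytes came from. A minimal sketch (the `LineParser` class and sample data are hypothetical):

```python
from collections.abc import Iterator

from langchain_core.document_loaders import BaseBlobParser
from langchain_core.documents import Blob, Document


class LineParser(BaseBlobParser):
    """Hypothetical parser: one Document per non-empty line of the blob."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        for line in blob.as_string().splitlines():
            if line.strip():
                yield Document(page_content=line, metadata={"source": blob.source})


# parse() is the eager convenience wrapper around lazy_parse().
docs = LineParser().parse(Blob.from_data("first line\n\nsecond line"))
```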
langchain_core/document_loaders/langsmith.py
@@ -3,8 +3,8 @@
 import datetime
 import json
 import uuid
-from collections.abc import Iterator, Sequence
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable, Iterator, Sequence
+from typing import Any
 
 from langsmith import Client as LangSmithClient
 from typing_extensions import override
@@ -14,79 +14,75 @@ from langchain_core.documents import Document
 
 
 class LangSmithLoader(BaseLoader):
-    """Load LangSmith Dataset examples as Documents.
+    """Load LangSmith Dataset examples as `Document` objects.
 
-    Loads the example inputs as the Document page content and places the entire example
-    into the Document metadata. This allows you to easily create few-shot example
-    retrievers from the loaded documents.
+    Loads the example inputs as the `Document` page content and places the entire
+    example into the `Document` metadata. This allows you to easily create few-shot
+    example retrievers from the loaded documents.
 
-    .. dropdown:: Lazy load
+    ??? note "Lazy loading example"
 
-        .. code-block:: python
+        ```python
+        from langchain_core.document_loaders import LangSmithLoader
 
-            from langchain_core.document_loaders import LangSmithLoader
+        loader = LangSmithLoader(dataset_id="...", limit=100)
+        docs = []
+        for doc in loader.lazy_load():
+            docs.append(doc)
+        ```
 
-            loader = LangSmithLoader(dataset_id="...", limit=100)
-            docs = []
-            for doc in loader.lazy_load():
-                docs.append(doc)
-
-        .. code-block:: python
-
-            # -> [Document("...", metadata={"inputs": {...}, "outputs": {...}, ...}), ...]
-
-    .. versionadded:: 0.2.34
-
-    """  # noqa: E501
+        ```python
+        # -> [Document("...", metadata={"inputs": {...}, "outputs": {...}, ...}), ...]
+        ```
+    """
 
     def __init__(
         self,
         *,
-        dataset_id: Optional[Union[uuid.UUID, str]] = None,
-        dataset_name: Optional[str] = None,
-        example_ids: Optional[Sequence[Union[uuid.UUID, str]]] = None,
-        as_of: Optional[Union[datetime.datetime, str]] = None,
-        splits: Optional[Sequence[str]] = None,
+        dataset_id: uuid.UUID | str | None = None,
+        dataset_name: str | None = None,
+        example_ids: Sequence[uuid.UUID | str] | None = None,
+        as_of: datetime.datetime | str | None = None,
+        splits: Sequence[str] | None = None,
         inline_s3_urls: bool = True,
         offset: int = 0,
-        limit: Optional[int] = None,
-        metadata: Optional[dict] = None,
-        filter: Optional[str] = None,  # noqa: A002
+        limit: int | None = None,
+        metadata: dict | None = None,
+        filter: str | None = None,  # noqa: A002
         content_key: str = "",
-        format_content: Optional[Callable[..., str]] = None,
-        client: Optional[LangSmithClient] = None,
+        format_content: Callable[..., str] | None = None,
+        client: LangSmithClient | None = None,
         **client_kwargs: Any,
     ) -> None:
         """Create a LangSmith loader.
 
         Args:
-            dataset_id: The ID of the dataset to filter by. Defaults to None.
-            dataset_name: The name of the dataset to filter by. Defaults to None.
-            content_key: The inputs key to set as Document page content. ``'.'`` characters
-                are interpreted as nested keys. E.g. ``content_key="first.second"`` will
+            dataset_id: The ID of the dataset to filter by.
+            dataset_name: The name of the dataset to filter by.
+            content_key: The inputs key to set as Document page content. `'.'` characters
+                are interpreted as nested keys. E.g. `content_key="first.second"` will
                 result in
-                ``Document(page_content=format_content(example.inputs["first"]["second"]))``
+                `Document(page_content=format_content(example.inputs["first"]["second"]))`
            format_content: Function for converting the content extracted from the example
                inputs into a string. Defaults to JSON-encoding the contents.
-            example_ids: The IDs of the examples to filter by. Defaults to None.
-            as_of: The dataset version tag OR
-                timestamp to retrieve the examples as of.
-                Response examples will only be those that were present at the time
-                of the tagged (or timestamped) version.
+            example_ids: The IDs of the examples to filter by.
+            as_of: The dataset version tag or timestamp to retrieve the examples as of.
+                Response examples will only be those that were present at the time of
+                the tagged (or timestamped) version.
            splits: A list of dataset splits, which are
-                divisions of your dataset such as 'train', 'test', or 'validation'.
+                divisions of your dataset such as `train`, `test`, or `validation`.
                Returns examples only from the specified splits.
-            inline_s3_urls: Whether to inline S3 URLs. Defaults to True.
-            offset: The offset to start from. Defaults to 0.
+            inline_s3_urls: Whether to inline S3 URLs.
+            offset: The offset to start from.
            limit: The maximum number of examples to return.
-            metadata: Metadata to filter by. Defaults to None.
+            metadata: Metadata to filter by.
            filter: A structured filter string to apply to the examples.
            client: LangSmith Client. If not provided will be initialized from below args.
            client_kwargs: Keyword args to pass to LangSmith client init. Should only be
-                specified if ``client`` isn't.
+                specified if `client` isn't.
 
         Raises:
-            ValueError: If both ``client`` and ``client_kwargs`` are provided.
+            ValueError: If both `client` and `client_kwargs` are provided.
         """  # noqa: E501
         if client and client_kwargs:
             raise ValueError
@@ -129,7 +125,7 @@ class LangSmithLoader(BaseLoader):
             yield Document(content_str, metadata=metadata)
 
 
-def _stringify(x: Union[str, dict]) -> str:
+def _stringify(x: str | dict) -> str:
     if isinstance(x, str):
         return x
     try:
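Putting the constructor arguments documented above together, a usage sketch (the dataset name and the nested `question.text` input key are hypothetical; a configured LangSmith API key is assumed):

```python
from langchain_core.document_loaders import LangSmithLoader

# content_key uses "." for nested keys, so page_content becomes
# format_content(example.inputs["question"]["text"]) for each example.
loader = LangSmithLoader(
    dataset_name="my-few-shot-dataset",
    content_key="question.text",
    format_content=str,
    limit=50,
)

for doc in loader.lazy_load():
    print(doc.page_content, doc.metadata["outputs"])
```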
langchain_core/documents/__init__.py
@@ -1,8 +1,28 @@
-"""Documents module.
+"""Documents module for data retrieval and processing workflows.
 
-**Document** module is a collection of classes that handle documents
-and their transformations.
+This module provides core abstractions for handling data in retrieval-augmented
+generation (RAG) pipelines, vector stores, and document processing workflows.
 
+!!! warning "Documents vs. message content"
+    This module is distinct from `langchain_core.messages.content`, which provides
+    multimodal content blocks for **LLM chat I/O** (text, images, audio, etc. within
+    messages).
+
+**Key distinction:**
+
+- **Documents** (this module): For **data retrieval and processing workflows**
+    - Vector stores, retrievers, RAG pipelines
+    - Text chunking, embedding, and semantic search
+    - Example: Chunks of a PDF stored in a vector database
+
+- **Content Blocks** (`messages.content`): For **LLM conversational I/O**
+    - Multimodal message content sent to/from models
+    - Tool calls, reasoning, citations within chat
+    - Example: An image sent to a vision model in a chat message (via
+      [`ImageContentBlock`][langchain.messages.ImageContentBlock])
+
+While both can represent similar data types (text, files), they serve different
+architectural purposes in LangChain applications.
 """
 
 from typing import TYPE_CHECKING
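The distinction the new module docstring draws can be summarized in code: a `Document` carries retrievable content plus metadata for indexing, while message objects carry conversational content sent to a model. A minimal side-by-side sketch (illustrative only):

```python
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage

# Retrieval workflow: content to embed, index, and search over.
chunk = Document(
    page_content="LangChain is a framework for building LLM applications.",
    metadata={"source": "intro.md", "chunk": 0},
)

# Chat workflow: content addressed to a model in a conversation.
message = HumanMessage(content="Summarize the retrieved chunk in one sentence.")
```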
langchain_core/documents/base.py
@@ -1,4 +1,16 @@
-"""Base classes for media and documents."""
+"""Base classes for media and documents.
+
+This module contains core abstractions for **data retrieval and processing workflows**:
+
+- `BaseMedia`: Base class providing `id` and `metadata` fields
+- `Blob`: Raw data loading (files, binary data) - used by document loaders
+- `Document`: Text content for retrieval (RAG, vector stores, semantic search)
+
+!!! note "Not for LLM chat messages"
+    These classes are for data processing pipelines, not LLM I/O. For multimodal
+    content in chat messages (images, audio in conversations), see
+    `langchain.messages` content blocks instead.
+"""
 
 from __future__ import annotations
 
@@ -6,7 +18,7 @@ import contextlib
 import mimetypes
 from io import BufferedReader, BytesIO
 from pathlib import Path, PurePath
-from typing import TYPE_CHECKING, Any, Literal, Optional, Union, cast
+from typing import TYPE_CHECKING, Any, Literal, cast
 
 from pydantic import ConfigDict, Field, model_validator
 
@@ -15,31 +27,27 @@ from langchain_core.load.serializable import Serializable
 if TYPE_CHECKING:
     from collections.abc import Generator
 
-PathLike = Union[str, PurePath]
+PathLike = str | PurePath
 
 
 class BaseMedia(Serializable):
-    """Use to represent media content.
-
-    Media objects can be used to represent raw data, such as text or binary data.
+    """Base class for content used in retrieval and data processing workflows.
 
-    LangChain Media objects allow associating metadata and an optional identifier
-    with the content.
+    Provides common fields for content that needs to be stored, indexed, or searched.
 
-    The presence of an ID and metadata make it easier to store, index, and search
-    over the content in a structured way.
+    !!! note
+        For multimodal content in **chat messages** (images, audio sent to/from LLMs),
+        use `langchain.messages` content blocks instead.
     """
 
     # The ID field is optional at the moment.
     # It will likely become required in a future major release after
-    # it has been adopted by enough vectorstore implementations.
-    id: Optional[str] = Field(default=None, coerce_numbers_to_str=True)
+    # it has been adopted by enough VectorStore implementations.
+    id: str | None = Field(default=None, coerce_numbers_to_str=True)
     """An optional identifier for the document.
 
     Ideally this should be unique across the document collection and formatted
     as a UUID, but this will not be enforced.
-
-    .. versionadded:: 0.2.11
     """
 
     metadata: dict = Field(default_factory=dict)
@@ -47,74 +55,72 @@ class BaseMedia(Serializable):
 
 
 class Blob(BaseMedia):
-    """Blob represents raw data by either reference or value.
-
-    Provides an interface to materialize the blob in different representations, and
-    help to decouple the development of data loaders from the downstream parsing of
-    the raw data.
-
-    Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob
-
-    Example: Initialize a blob from in-memory data
+    """Raw data abstraction for document loading and file processing.
 
-        .. code-block:: python
+    Represents raw bytes or text, either in-memory or by file reference. Used
+    primarily by document loaders to decouple data loading from parsing.
 
-            from langchain_core.documents import Blob
+    Inspired by [Mozilla's `Blob`](https://developer.mozilla.org/en-US/docs/Web/API/Blob)
 
-            blob = Blob.from_data("Hello, world!")
+    ???+ example "Initialize a blob from in-memory data"
 
-            # Read the blob as a string
-            print(blob.as_string())
+        ```python
+        from langchain_core.documents import Blob
 
-            # Read the blob as bytes
-            print(blob.as_bytes())
+        blob = Blob.from_data("Hello, world!")
 
-            # Read the blob as a byte stream
-            with blob.as_bytes_io() as f:
-                print(f.read())
+        # Read the blob as a string
+        print(blob.as_string())
 
-    Example: Load from memory and specify mime-type and metadata
+        # Read the blob as bytes
+        print(blob.as_bytes())
 
-        .. code-block:: python
+        # Read the blob as a byte stream
+        with blob.as_bytes_io() as f:
+            print(f.read())
+        ```
 
-            from langchain_core.documents import Blob
+    ??? example "Load from memory and specify MIME type and metadata"
 
-            blob = Blob.from_data(
-                data="Hello, world!",
-                mime_type="text/plain",
-                metadata={"source": "https://example.com"},
-            )
+        ```python
+        from langchain_core.documents import Blob
 
-    Example: Load the blob from a file
-
-        .. code-block:: python
+        blob = Blob.from_data(
+            data="Hello, world!",
+            mime_type="text/plain",
+            metadata={"source": "https://example.com"},
+        )
+        ```
 
-            from langchain_core.documents import Blob
+    ??? example "Load the blob from a file"
 
-            blob = Blob.from_path("path/to/file.txt")
+        ```python
+        from langchain_core.documents import Blob
 
-            # Read the blob as a string
-            print(blob.as_string())
+        blob = Blob.from_path("path/to/file.txt")
 
-            # Read the blob as bytes
-            print(blob.as_bytes())
+        # Read the blob as a string
+        print(blob.as_string())
 
-            # Read the blob as a byte stream
-            with blob.as_bytes_io() as f:
-                print(f.read())
+        # Read the blob as bytes
+        print(blob.as_bytes())
 
+        # Read the blob as a byte stream
+        with blob.as_bytes_io() as f:
+            print(f.read())
+        ```
     """
 
-    data: Union[bytes, str, None] = None
-    """Raw data associated with the blob."""
-    mimetype: Optional[str] = None
+    data: bytes | str | None = None
+    """Raw data associated with the `Blob`."""
+    mimetype: str | None = None
     """MimeType not to be confused with a file extension."""
     encoding: str = "utf-8"
     """Encoding to use if decoding the bytes into a string.
 
-    Use utf-8 as default encoding, if decoding to string.
+    Use `utf-8` as default encoding, if decoding to string.
     """
-    path: Optional[PathLike] = None
+    path: PathLike | None = None
     """Location where the original content was found."""
 
     model_config = ConfigDict(
@@ -123,16 +129,16 @@ class Blob(BaseMedia):
     )
 
     @property
-    def source(self) -> Optional[str]:
+    def source(self) -> str | None:
         """The source location of the blob as string if known otherwise none.
 
-        If a path is associated with the blob, it will default to the path location.
+        If a path is associated with the `Blob`, it will default to the path location.
 
-        Unless explicitly set via a metadata field called "source", in which
+        Unless explicitly set via a metadata field called `"source"`, in which
         case that value will be used instead.
         """
         if self.metadata and "source" in self.metadata:
-            return cast("Optional[str]", self.metadata["source"])
+            return cast("str | None", self.metadata["source"])
         return str(self.path) if self.path else None
 
     @model_validator(mode="before")
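The `source` property above resolves in two steps: an explicit `"source"` metadata entry wins, otherwise the path is used. A small sketch of both cases (file paths are hypothetical; `from_path` does not read the file at construction time):

```python
from langchain_core.documents import Blob

# No "source" in metadata: the path becomes the source.
blob = Blob.from_path("data/report.txt")
assert blob.source == "data/report.txt"

# An explicit "source" metadata entry overrides the path.
blob = Blob.from_path("data/report.txt", metadata={"source": "https://example.com/report"})
assert blob.source == "https://example.com/report"
```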
@@ -181,7 +187,7 @@ class Blob(BaseMedia):
             raise ValueError(msg)
 
     @contextlib.contextmanager
-    def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]:
+    def as_bytes_io(self) -> Generator[BytesIO | BufferedReader, None, None]:
         """Read data as a byte stream.
 
         Raises:
@@ -205,22 +211,22 @@ class Blob(BaseMedia):
         path: PathLike,
         *,
         encoding: str = "utf-8",
-        mime_type: Optional[str] = None,
+        mime_type: str | None = None,
         guess_type: bool = True,
-        metadata: Optional[dict] = None,
+        metadata: dict | None = None,
     ) -> Blob:
         """Load the blob from a path like object.
 
         Args:
-            path: path like object to file to be read
+            path: Path-like object to file to be read
             encoding: Encoding to use if decoding the bytes into a string
-            mime_type: if provided, will be set as the mime-type of the data
-            guess_type: If True, the mimetype will be guessed from the file extension,
-                if a mime-type was not provided
-            metadata: Metadata to associate with the blob
+            mime_type: If provided, will be set as the MIME type of the data
+            guess_type: If `True`, the MIME type will be guessed from the file
+                extension, if a MIME type was not provided
+            metadata: Metadata to associate with the `Blob`
 
         Returns:
-            Blob instance
+            `Blob` instance
         """
         if mime_type is None and guess_type:
             mimetype = mimetypes.guess_type(path)[0] if guess_type else None
@@ -239,24 +245,24 @@ class Blob(BaseMedia):
     @classmethod
     def from_data(
         cls,
-        data: Union[str, bytes],
+        data: str | bytes,
         *,
         encoding: str = "utf-8",
-        mime_type: Optional[str] = None,
-        path: Optional[str] = None,
-        metadata: Optional[dict] = None,
+        mime_type: str | None = None,
+        path: str | None = None,
+        metadata: dict | None = None,
     ) -> Blob:
-        """Initialize the blob from in-memory data.
+        """Initialize the `Blob` from in-memory data.
 
         Args:
-            data: the in-memory data associated with the blob
+            data: The in-memory data associated with the `Blob`
             encoding: Encoding to use if decoding the bytes into a string
-            mime_type: if provided, will be set as the mime-type of the data
-            path: if provided, will be set as the source from which the data came
-            metadata: Metadata to associate with the blob
+            mime_type: If provided, will be set as the MIME type of the data
+            path: If provided, will be set as the source from which the data came
+            metadata: Metadata to associate with the `Blob`
 
         Returns:
-            Blob instance
+            `Blob` instance
         """
         return cls(
             data=data,
@@ -277,16 +283,18 @@ class Blob(BaseMedia):
 class Document(BaseMedia):
     """Class for storing a piece of text and associated metadata.
 
-    Example:
-
-        .. code-block:: python
+    !!! note
+        `Document` is for **retrieval workflows**, not chat I/O. For sending text
+        to an LLM in a conversation, use message types from `langchain.messages`.
 
-            from langchain_core.documents import Document
-
-            document = Document(
-                page_content="Hello, world!", metadata={"source": "https://example.com"}
-            )
+    Example:
+        ```python
+        from langchain_core.documents import Document
 
+        document = Document(
+            page_content="Hello, world!", metadata={"source": "https://example.com"}
+        )
+        ```
     """
 
     page_content: str
@@ -306,7 +314,7 @@ class Document(BaseMedia):
 
     @classmethod
     def get_lc_namespace(cls) -> list[str]:
-        """Get the namespace of the langchain object.
+        """Get the namespace of the LangChain object.
 
         Returns:
             ["langchain", "schema", "document"]