langchain-core 1.0.0a6__py3-none-any.whl → 1.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langchain_core/__init__.py +1 -1
- langchain_core/_api/__init__.py +3 -4
- langchain_core/_api/beta_decorator.py +23 -26
- langchain_core/_api/deprecation.py +51 -64
- langchain_core/_api/path.py +3 -6
- langchain_core/_import_utils.py +3 -4
- langchain_core/agents.py +55 -48
- langchain_core/caches.py +65 -66
- langchain_core/callbacks/__init__.py +1 -8
- langchain_core/callbacks/base.py +321 -336
- langchain_core/callbacks/file.py +44 -44
- langchain_core/callbacks/manager.py +454 -514
- langchain_core/callbacks/stdout.py +29 -30
- langchain_core/callbacks/streaming_stdout.py +32 -32
- langchain_core/callbacks/usage.py +60 -57
- langchain_core/chat_history.py +53 -68
- langchain_core/document_loaders/base.py +27 -25
- langchain_core/document_loaders/blob_loaders.py +1 -1
- langchain_core/document_loaders/langsmith.py +44 -48
- langchain_core/documents/__init__.py +23 -3
- langchain_core/documents/base.py +102 -94
- langchain_core/documents/compressor.py +10 -10
- langchain_core/documents/transformers.py +34 -35
- langchain_core/embeddings/fake.py +50 -54
- langchain_core/example_selectors/length_based.py +2 -2
- langchain_core/example_selectors/semantic_similarity.py +28 -32
- langchain_core/exceptions.py +21 -20
- langchain_core/globals.py +3 -151
- langchain_core/indexing/__init__.py +1 -1
- langchain_core/indexing/api.py +121 -126
- langchain_core/indexing/base.py +73 -75
- langchain_core/indexing/in_memory.py +4 -6
- langchain_core/language_models/__init__.py +14 -29
- langchain_core/language_models/_utils.py +58 -61
- langchain_core/language_models/base.py +82 -172
- langchain_core/language_models/chat_models.py +329 -402
- langchain_core/language_models/fake.py +11 -11
- langchain_core/language_models/fake_chat_models.py +42 -36
- langchain_core/language_models/llms.py +189 -269
- langchain_core/load/dump.py +9 -12
- langchain_core/load/load.py +18 -28
- langchain_core/load/mapping.py +2 -4
- langchain_core/load/serializable.py +42 -40
- langchain_core/messages/__init__.py +10 -16
- langchain_core/messages/ai.py +148 -148
- langchain_core/messages/base.py +53 -51
- langchain_core/messages/block_translators/__init__.py +19 -22
- langchain_core/messages/block_translators/anthropic.py +6 -6
- langchain_core/messages/block_translators/bedrock_converse.py +5 -5
- langchain_core/messages/block_translators/google_genai.py +10 -7
- langchain_core/messages/block_translators/google_vertexai.py +4 -32
- langchain_core/messages/block_translators/groq.py +117 -21
- langchain_core/messages/block_translators/langchain_v0.py +5 -5
- langchain_core/messages/block_translators/openai.py +11 -11
- langchain_core/messages/chat.py +2 -6
- langchain_core/messages/content.py +339 -330
- langchain_core/messages/function.py +6 -10
- langchain_core/messages/human.py +24 -31
- langchain_core/messages/modifier.py +2 -2
- langchain_core/messages/system.py +19 -29
- langchain_core/messages/tool.py +74 -90
- langchain_core/messages/utils.py +484 -510
- langchain_core/output_parsers/__init__.py +13 -10
- langchain_core/output_parsers/base.py +61 -61
- langchain_core/output_parsers/format_instructions.py +9 -4
- langchain_core/output_parsers/json.py +12 -10
- langchain_core/output_parsers/list.py +21 -23
- langchain_core/output_parsers/openai_functions.py +49 -47
- langchain_core/output_parsers/openai_tools.py +30 -23
- langchain_core/output_parsers/pydantic.py +13 -14
- langchain_core/output_parsers/string.py +5 -5
- langchain_core/output_parsers/transform.py +15 -17
- langchain_core/output_parsers/xml.py +35 -34
- langchain_core/outputs/__init__.py +1 -1
- langchain_core/outputs/chat_generation.py +18 -18
- langchain_core/outputs/chat_result.py +1 -3
- langchain_core/outputs/generation.py +16 -16
- langchain_core/outputs/llm_result.py +10 -10
- langchain_core/prompt_values.py +13 -19
- langchain_core/prompts/__init__.py +3 -27
- langchain_core/prompts/base.py +81 -86
- langchain_core/prompts/chat.py +308 -351
- langchain_core/prompts/dict.py +6 -6
- langchain_core/prompts/few_shot.py +81 -88
- langchain_core/prompts/few_shot_with_templates.py +11 -13
- langchain_core/prompts/image.py +12 -14
- langchain_core/prompts/loading.py +4 -6
- langchain_core/prompts/message.py +7 -7
- langchain_core/prompts/prompt.py +24 -39
- langchain_core/prompts/string.py +26 -10
- langchain_core/prompts/structured.py +49 -53
- langchain_core/rate_limiters.py +51 -60
- langchain_core/retrievers.py +61 -198
- langchain_core/runnables/base.py +1551 -1656
- langchain_core/runnables/branch.py +68 -70
- langchain_core/runnables/config.py +72 -89
- langchain_core/runnables/configurable.py +145 -161
- langchain_core/runnables/fallbacks.py +102 -96
- langchain_core/runnables/graph.py +91 -97
- langchain_core/runnables/graph_ascii.py +27 -28
- langchain_core/runnables/graph_mermaid.py +42 -51
- langchain_core/runnables/graph_png.py +43 -16
- langchain_core/runnables/history.py +175 -177
- langchain_core/runnables/passthrough.py +151 -167
- langchain_core/runnables/retry.py +46 -51
- langchain_core/runnables/router.py +30 -35
- langchain_core/runnables/schema.py +75 -80
- langchain_core/runnables/utils.py +60 -67
- langchain_core/stores.py +85 -121
- langchain_core/structured_query.py +8 -8
- langchain_core/sys_info.py +29 -29
- langchain_core/tools/__init__.py +1 -14
- langchain_core/tools/base.py +306 -245
- langchain_core/tools/convert.py +160 -155
- langchain_core/tools/render.py +10 -10
- langchain_core/tools/retriever.py +12 -11
- langchain_core/tools/simple.py +19 -24
- langchain_core/tools/structured.py +32 -39
- langchain_core/tracers/__init__.py +1 -9
- langchain_core/tracers/base.py +97 -99
- langchain_core/tracers/context.py +29 -52
- langchain_core/tracers/core.py +49 -53
- langchain_core/tracers/evaluation.py +11 -11
- langchain_core/tracers/event_stream.py +65 -64
- langchain_core/tracers/langchain.py +21 -21
- langchain_core/tracers/log_stream.py +45 -45
- langchain_core/tracers/memory_stream.py +3 -3
- langchain_core/tracers/root_listeners.py +16 -16
- langchain_core/tracers/run_collector.py +2 -4
- langchain_core/tracers/schemas.py +0 -129
- langchain_core/tracers/stdout.py +3 -3
- langchain_core/utils/__init__.py +1 -4
- langchain_core/utils/_merge.py +2 -2
- langchain_core/utils/aiter.py +57 -61
- langchain_core/utils/env.py +9 -9
- langchain_core/utils/function_calling.py +94 -188
- langchain_core/utils/html.py +7 -8
- langchain_core/utils/input.py +9 -6
- langchain_core/utils/interactive_env.py +1 -1
- langchain_core/utils/iter.py +36 -40
- langchain_core/utils/json.py +4 -3
- langchain_core/utils/json_schema.py +9 -9
- langchain_core/utils/mustache.py +8 -10
- langchain_core/utils/pydantic.py +35 -37
- langchain_core/utils/strings.py +6 -9
- langchain_core/utils/usage.py +1 -1
- langchain_core/utils/utils.py +66 -62
- langchain_core/vectorstores/base.py +182 -216
- langchain_core/vectorstores/in_memory.py +101 -176
- langchain_core/vectorstores/utils.py +5 -5
- langchain_core/version.py +1 -1
- langchain_core-1.0.4.dist-info/METADATA +69 -0
- langchain_core-1.0.4.dist-info/RECORD +172 -0
- {langchain_core-1.0.0a6.dist-info → langchain_core-1.0.4.dist-info}/WHEEL +1 -1
- langchain_core/memory.py +0 -120
- langchain_core/messages/block_translators/ollama.py +0 -47
- langchain_core/prompts/pipeline.py +0 -138
- langchain_core/pydantic_v1/__init__.py +0 -30
- langchain_core/pydantic_v1/dataclasses.py +0 -23
- langchain_core/pydantic_v1/main.py +0 -23
- langchain_core/tracers/langchain_v1.py +0 -31
- langchain_core/utils/loading.py +0 -35
- langchain_core-1.0.0a6.dist-info/METADATA +0 -67
- langchain_core-1.0.0a6.dist-info/RECORD +0 -181
- langchain_core-1.0.0a6.dist-info/entry_points.txt +0 -4
langchain_core/document_loaders/base.py CHANGED
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING
 
 from langchain_core.runnables import run_in_executor
 
@@ -27,7 +27,7 @@ class BaseLoader(ABC): # noqa: B024
     """Interface for Document Loader.
 
     Implementations should implement the lazy-loading method using generators
-    to avoid loading all
+    to avoid loading all documents into memory at once.
 
     `load` is provided just for user convenience and should not be overridden.
     """
@@ -35,38 +35,40 @@ class BaseLoader(ABC): # noqa: B024
     # Sub-classes should not implement this method directly. Instead, they
     # should implement the lazy load method.
     def load(self) -> list[Document]:
-        """Load data into Document objects.
+        """Load data into `Document` objects.
 
         Returns:
-
+            The documents.
         """
         return list(self.lazy_load())
 
     async def aload(self) -> list[Document]:
-        """Load data into Document objects.
+        """Load data into `Document` objects.
 
         Returns:
-
+            The documents.
         """
         return [document async for document in self.alazy_load()]
 
     def load_and_split(
-        self, text_splitter:
+        self, text_splitter: TextSplitter | None = None
     ) -> list[Document]:
-        """Load
+        """Load `Document` and split into chunks. Chunks are returned as `Document`.
 
-
+        !!! danger
+
+            Do not override this method. It should be considered to be deprecated!
 
         Args:
-            text_splitter: TextSplitter instance to use for splitting documents.
-                Defaults to RecursiveCharacterTextSplitter
+            text_splitter: `TextSplitter` instance to use for splitting documents.
+                Defaults to `RecursiveCharacterTextSplitter`.
 
         Raises:
-            ImportError: If langchain-text-splitters is not installed
-                and no text_splitter is provided.
+            ImportError: If `langchain-text-splitters` is not installed
+                and no `text_splitter` is provided.
 
         Returns:
-            List of
+            List of `Document`.
         """
         if text_splitter is None:
             if not _HAS_TEXT_SPLITTERS:
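The contract above — implement `lazy_load` as a generator and inherit `load`/`load_and_split` — is easiest to see in a minimal sketch. The `TextFileLoader` name and the one-`Document`-per-file layout are illustrative assumptions, not part of this diff:

```python
from collections.abc import Iterator
from pathlib import Path

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document


class TextFileLoader(BaseLoader):
    """Hypothetical loader: one Document per .txt file in a directory."""

    def __init__(self, directory: str) -> None:
        self.directory = directory

    def lazy_load(self) -> Iterator[Document]:
        # Yield documents one at a time so callers can stream large
        # corpora without materializing everything in memory.
        for path in sorted(Path(self.directory).glob("*.txt")):
            yield Document(
                page_content=path.read_text(encoding="utf-8"),
                metadata={"source": str(path)},
            )


loader = TextFileLoader("./corpus")
for doc in loader.lazy_load():  # streams; loader.load() collects a list instead
    print(doc.metadata["source"])
```

Because `load()` just collects `lazy_load()`, a single generator implementation provides both APIs.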
@@ -86,10 +88,10 @@ class BaseLoader(ABC): # noqa: B024
     # Attention: This method will be upgraded into an abstractmethod once it's
     # implemented in all the existing subclasses.
     def lazy_load(self) -> Iterator[Document]:
-        """A lazy loader for
+        """A lazy loader for `Document`.
 
         Yields:
-
+            The `Document` objects.
         """
         if type(self).load != BaseLoader.load:
             return iter(self.load())
@@ -97,10 +99,10 @@ class BaseLoader(ABC): # noqa: B024
         raise NotImplementedError(msg)
 
     async def alazy_load(self) -> AsyncIterator[Document]:
-        """A lazy loader for
+        """A lazy loader for `Document`.
 
         Yields:
-
+            The `Document` objects.
         """
         iterator = await run_in_executor(None, self.lazy_load)
         done = object()
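Per the hunk above, the default `alazy_load` hands the synchronous `lazy_load` to a thread executor via `run_in_executor`, so even a sync-only loader can be consumed with `async for`. A sketch reusing the hypothetical `TextFileLoader` from earlier:

```python
import asyncio


async def collect() -> None:
    loader = TextFileLoader("./corpus")  # hypothetical loader sketched above
    async for doc in loader.alazy_load():
        # Iteration is async even though lazy_load itself is synchronous.
        print(doc.metadata["source"])


asyncio.run(collect())
```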
@@ -115,7 +117,7 @@ class BaseBlobParser(ABC):
     """Abstract interface for blob parsers.
 
     A blob parser provides a way to parse raw data stored in a blob into one
-    or more
+    or more `Document` objects.
 
     The parser can be composed with blob loaders, making it easy to reuse
     a parser independent of how the blob was originally loaded.
@@ -128,25 +130,25 @@ class BaseBlobParser(ABC):
         Subclasses are required to implement this method.
 
         Args:
-            blob: Blob instance
+            blob: `Blob` instance
 
         Returns:
-            Generator of
+            Generator of `Document` objects
         """
 
     def parse(self, blob: Blob) -> list[Document]:
-        """Eagerly parse the blob into a
+        """Eagerly parse the blob into a `Document` or list of `Document` objects.
 
         This is a convenience method for interactive development environment.
 
-        Production applications should favor the lazy_parse method instead.
+        Production applications should favor the `lazy_parse` method instead.
 
         Subclasses should generally not over-ride this parse method.
 
         Args:
-            blob: Blob instance
+            blob: `Blob` instance
 
         Returns:
-            List of
+            List of `Document` objects
         """
         return list(self.lazy_parse(blob))
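A minimal sketch of the parser contract above — implement `lazy_parse` and inherit the eager `parse`. The `LineParser` name and line-splitting behavior are illustrative assumptions:

```python
from collections.abc import Iterator

from langchain_core.document_loaders import BaseBlobParser, Blob
from langchain_core.documents import Document


class LineParser(BaseBlobParser):
    """Hypothetical parser: one Document per non-empty line of a blob."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        for line in blob.as_string().splitlines():
            if line.strip():
                yield Document(page_content=line, metadata={"source": blob.source})


blob = Blob.from_data("first line\n\nsecond line")
docs = LineParser().parse(blob)  # parse() simply collects lazy_parse()
```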
langchain_core/document_loaders/blob_loaders.py CHANGED
@@ -28,7 +28,7 @@ class BlobLoader(ABC):
     def yield_blobs(
         self,
     ) -> Iterable[Blob]:
-        """A lazy loader for raw data represented by LangChain's Blob object.
+        """A lazy loader for raw data represented by LangChain's `Blob` object.
 
         Returns:
             A generator over blobs
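A hedged sketch of a `BlobLoader` implementation that pairs naturally with a parser like the one above; `DirectoryBlobLoader` is an illustrative name, not an API in this diff:

```python
from collections.abc import Iterable
from pathlib import Path

from langchain_core.document_loaders import Blob, BlobLoader


class DirectoryBlobLoader(BlobLoader):
    """Hypothetical loader: one Blob per file under a directory tree."""

    def __init__(self, root: str) -> None:
        self.root = root

    def yield_blobs(self) -> Iterable[Blob]:
        for path in sorted(Path(self.root).rglob("*")):
            if path.is_file():
                # from_path records the location and guesses the MIME type.
                yield Blob.from_path(path)


for blob in DirectoryBlobLoader("./data").yield_blobs():
    print(blob.source, blob.mimetype)
```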
langchain_core/document_loaders/langsmith.py CHANGED
@@ -3,8 +3,8 @@
 import datetime
 import json
 import uuid
-from collections.abc import Iterator, Sequence
-from typing import Any
+from collections.abc import Callable, Iterator, Sequence
+from typing import Any
 
 from langsmith import Client as LangSmithClient
 from typing_extensions import override
@@ -14,79 +14,75 @@ from langchain_core.documents import Document
 
 
 class LangSmithLoader(BaseLoader):
-    """Load LangSmith Dataset examples as
+    """Load LangSmith Dataset examples as `Document` objects.
 
-    Loads the example inputs as the Document page content and places the entire
-    into the Document metadata. This allows you to easily create few-shot
-    retrievers from the loaded documents.
+    Loads the example inputs as the `Document` page content and places the entire
+    example into the `Document` metadata. This allows you to easily create few-shot
+    example retrievers from the loaded documents.
 
-
+    ??? note "Lazy loading example"
 
-
+        ```python
+        from langchain_core.document_loaders import LangSmithLoader
 
-
+        loader = LangSmithLoader(dataset_id="...", limit=100)
+        docs = []
+        for doc in loader.lazy_load():
+            docs.append(doc)
+        ```
 
-
-
-
-
-
-    .. code-block:: python
-
-        # -> [Document("...", metadata={"inputs": {...}, "outputs": {...}, ...}), ...]
-
-    .. versionadded:: 0.2.34
-
-    """ # noqa: E501
+        ```python
+        # -> [Document("...", metadata={"inputs": {...}, "outputs": {...}, ...}), ...]
+        ```
+    """
 
     def __init__(
         self,
         *,
-        dataset_id:
-        dataset_name:
-        example_ids:
-        as_of:
-        splits:
+        dataset_id: uuid.UUID | str | None = None,
+        dataset_name: str | None = None,
+        example_ids: Sequence[uuid.UUID | str] | None = None,
+        as_of: datetime.datetime | str | None = None,
+        splits: Sequence[str] | None = None,
         inline_s3_urls: bool = True,
         offset: int = 0,
-        limit:
-        metadata:
-        filter:
+        limit: int | None = None,
+        metadata: dict | None = None,
+        filter: str | None = None,  # noqa: A002
         content_key: str = "",
-        format_content:
-        client:
+        format_content: Callable[..., str] | None = None,
+        client: LangSmithClient | None = None,
         **client_kwargs: Any,
     ) -> None:
         """Create a LangSmith loader.
 
         Args:
-            dataset_id: The ID of the dataset to filter by.
-            dataset_name: The name of the dataset to filter by.
-            content_key: The inputs key to set as Document page content.
-                are interpreted as nested keys. E.g.
+            dataset_id: The ID of the dataset to filter by.
+            dataset_name: The name of the dataset to filter by.
+            content_key: The inputs key to set as Document page content. `'.'` characters
+                are interpreted as nested keys. E.g. `content_key="first.second"` will
                 result in
-
+                `Document(page_content=format_content(example.inputs["first"]["second"]))`
             format_content: Function for converting the content extracted from the example
                 inputs into a string. Defaults to JSON-encoding the contents.
-            example_ids: The IDs of the examples to filter by.
-            as_of: The dataset version tag
-
-
-                of the tagged (or timestamped) version.
+            example_ids: The IDs of the examples to filter by.
+            as_of: The dataset version tag or timestamp to retrieve the examples as of.
+                Response examples will only be those that were present at the time of
+                the tagged (or timestamped) version.
             splits: A list of dataset splits, which are
-                divisions of your dataset such as
+                divisions of your dataset such as `train`, `test`, or `validation`.
                 Returns examples only from the specified splits.
-            inline_s3_urls: Whether to inline S3 URLs.
-            offset: The offset to start from.
+            inline_s3_urls: Whether to inline S3 URLs.
+            offset: The offset to start from.
             limit: The maximum number of examples to return.
-            metadata: Metadata to filter by.
+            metadata: Metadata to filter by.
             filter: A structured filter string to apply to the examples.
             client: LangSmith Client. If not provided will be initialized from below args.
             client_kwargs: Keyword args to pass to LangSmith client init. Should only be
-                specified if
+                specified if `client` isn't.
 
         Raises:
-            ValueError: If both
+            ValueError: If both `client` and `client_kwargs` are provided.
         """ # noqa: E501
         if client and client_kwargs:
             raise ValueError
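Putting the new signature to work — a hedged usage sketch; the dataset name, nested `content_key`, and lambda are placeholders, with behavior as described in the docstring above:

```python
from langchain_core.document_loaders import LangSmithLoader

loader = LangSmithLoader(
    dataset_name="my-dataset",  # placeholder dataset
    content_key="question.text",  # reads example.inputs["question"]["text"]
    format_content=lambda value: str(value).strip(),
    limit=10,
)

for doc in loader.lazy_load():
    # The whole example (inputs, outputs, ...) lands in doc.metadata.
    print(doc.page_content, doc.metadata["inputs"])
```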
@@ -129,7 +125,7 @@ class LangSmithLoader(BaseLoader):
             yield Document(content_str, metadata=metadata)
 
 
-def _stringify(x:
+def _stringify(x: str | dict) -> str:
     if isinstance(x, str):
         return x
     try:
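The hunk truncates `_stringify` after `try:`. Given the docstring above ("Defaults to JSON-encoding the contents"), a plausible reconstruction of the full helper — an assumption, not the shipped source — is:

```python
import json


def _stringify(x: str | dict) -> str:
    if isinstance(x, str):
        return x
    try:
        # Assumed body: JSON-encode non-string inputs, matching the documented
        # default of JSON-encoding example contents.
        return json.dumps(x, indent=2)
    except TypeError:
        # Assumed fallback for values json.dumps cannot serialize.
        return str(x)
```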
langchain_core/documents/__init__.py CHANGED
@@ -1,8 +1,28 @@
-"""Documents module.
+"""Documents module for data retrieval and processing workflows.
 
-
-and
+This module provides core abstractions for handling data in retrieval-augmented
+generation (RAG) pipelines, vector stores, and document processing workflows.
 
+!!! warning "Documents vs. message content"
+    This module is distinct from `langchain_core.messages.content`, which provides
+    multimodal content blocks for **LLM chat I/O** (text, images, audio, etc. within
+    messages).
+
+    **Key distinction:**
+
+    - **Documents** (this module): For **data retrieval and processing workflows**
+        - Vector stores, retrievers, RAG pipelines
+        - Text chunking, embedding, and semantic search
+        - Example: Chunks of a PDF stored in a vector database
+
+    - **Content Blocks** (`messages.content`): For **LLM conversational I/O**
+        - Multimodal message content sent to/from models
+        - Tool calls, reasoning, citations within chat
+        - Example: An image sent to a vision model in a chat message (via
+          [`ImageContentBlock`][langchain.messages.ImageContentBlock])
+
+While both can represent similar data types (text, files), they serve different
+architectural purposes in LangChain applications.
 """
 
 from typing import TYPE_CHECKING
langchain_core/documents/base.py CHANGED
@@ -1,4 +1,16 @@
-"""Base classes for media and documents.
+"""Base classes for media and documents.
+
+This module contains core abstractions for **data retrieval and processing workflows**:
+
+- `BaseMedia`: Base class providing `id` and `metadata` fields
+- `Blob`: Raw data loading (files, binary data) - used by document loaders
+- `Document`: Text content for retrieval (RAG, vector stores, semantic search)
+
+!!! note "Not for LLM chat messages"
+    These classes are for data processing pipelines, not LLM I/O. For multimodal
+    content in chat messages (images, audio in conversations), see
+    `langchain.messages` content blocks instead.
+"""
 
 from __future__ import annotations
 
@@ -6,7 +18,7 @@ import contextlib
 import mimetypes
 from io import BufferedReader, BytesIO
 from pathlib import Path, PurePath
-from typing import TYPE_CHECKING, Any, Literal,
+from typing import TYPE_CHECKING, Any, Literal, cast
 
 from pydantic import ConfigDict, Field, model_validator
 
@@ -15,31 +27,27 @@ from langchain_core.load.serializable import Serializable
 if TYPE_CHECKING:
     from collections.abc import Generator
 
-    PathLike =
+    PathLike = str | PurePath
 
 
 class BaseMedia(Serializable):
-    """
-
-    Media objects can be used to represent raw data, such as text or binary data.
+    """Base class for content used in retrieval and data processing workflows.
 
-
-    with the content.
+    Provides common fields for content that needs to be stored, indexed, or searched.
 
-
-
+    !!! note
+        For multimodal content in **chat messages** (images, audio sent to/from LLMs),
+        use `langchain.messages` content blocks instead.
     """
 
     # The ID field is optional at the moment.
     # It will likely become required in a future major release after
-    # it has been adopted by enough
-    id:
+    # it has been adopted by enough VectorStore implementations.
+    id: str | None = Field(default=None, coerce_numbers_to_str=True)
     """An optional identifier for the document.
 
     Ideally this should be unique across the document collection and formatted
     as a UUID, but this will not be enforced.
-
-    .. versionadded:: 0.2.11
     """
 
     metadata: dict = Field(default_factory=dict)
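One consequence of the new `id` declaration worth noting: `coerce_numbers_to_str=True` means numeric ids are coerced rather than rejected. A quick sketch using `Document`, which inherits the field:

```python
from langchain_core.documents import Document

# coerce_numbers_to_str=True turns a numeric id into a string
# instead of failing pydantic validation.
doc = Document(page_content="hello", id=123)
assert doc.id == "123"

# The id remains optional for now, as the comment above notes.
assert Document(page_content="hello").id is None
```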
@@ -47,74 +55,72 @@ class BaseMedia(Serializable):
 
 
 class Blob(BaseMedia):
-    """
-
-    Provides an interface to materialize the blob in different representations, and
-    help to decouple the development of data loaders from the downstream parsing of
-    the raw data.
-
-    Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob
-
-    Example: Initialize a blob from in-memory data
+    """Raw data abstraction for document loading and file processing.
 
-
+    Represents raw bytes or text, either in-memory or by file reference. Used
+    primarily by document loaders to decouple data loading from parsing.
 
-
+    Inspired by [Mozilla's `Blob`](https://developer.mozilla.org/en-US/docs/Web/API/Blob)
 
-
+    ???+ example "Initialize a blob from in-memory data"
 
-
-
+        ```python
+        from langchain_core.documents import Blob
 
-
-        print(blob.as_bytes())
+        blob = Blob.from_data("Hello, world!")
 
-
-
-            print(f.read())
+        # Read the blob as a string
+        print(blob.as_string())
 
-
+        # Read the blob as bytes
+        print(blob.as_bytes())
 
-
+        # Read the blob as a byte stream
+        with blob.as_bytes_io() as f:
+            print(f.read())
+        ```
 
-
+    ??? example "Load from memory and specify MIME type and metadata"
 
-
-
-            mime_type="text/plain",
-            metadata={"source": "https://example.com"},
-        )
+        ```python
+        from langchain_core.documents import Blob
 
-
-
-
+        blob = Blob.from_data(
+            data="Hello, world!",
+            mime_type="text/plain",
+            metadata={"source": "https://example.com"},
+        )
+        ```
 
-
+    ??? example "Load the blob from a file"
 
-
+        ```python
+        from langchain_core.documents import Blob
 
-
-        print(blob.as_string())
+        blob = Blob.from_path("path/to/file.txt")
 
-
-
+        # Read the blob as a string
+        print(blob.as_string())
 
-
-
-            print(f.read())
+        # Read the blob as bytes
+        print(blob.as_bytes())
 
+        # Read the blob as a byte stream
+        with blob.as_bytes_io() as f:
+            print(f.read())
+        ```
     """
 
-    data:
-    """Raw data associated with the
-    mimetype:
-    """
+    data: bytes | str | None = None
+    """Raw data associated with the `Blob`."""
+    mimetype: str | None = None
+    """MIME type, not to be confused with a file extension."""
     encoding: str = "utf-8"
     """Encoding to use if decoding the bytes into a string.
 
-
+    Uses `utf-8` as default encoding if decoding to string.
     """
-    path:
+    path: PathLike | None = None
     """Location where the original content was found."""
 
     model_config = ConfigDict(
@@ -123,16 +129,16 @@ class Blob(BaseMedia):
     )
 
     @property
-    def source(self) ->
+    def source(self) -> str | None:
         """The source location of the blob as string if known otherwise none.
 
-        If a path is associated with the
+        If a path is associated with the `Blob`, it will default to the path location.
 
-        Unless explicitly set via a metadata field called
+        Unless explicitly set via a metadata field called `'source'`, in which
         case that value will be used instead.
         """
         if self.metadata and "source" in self.metadata:
-            return cast("
+            return cast("str | None", self.metadata["source"])
         return str(self.path) if self.path else None
 
     @model_validator(mode="before")
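The precedence described in the docstring — `metadata["source"]` wins over `path` — in a small sketch:

```python
from langchain_core.documents import Blob

# An explicit metadata "source" overrides the path.
blob = Blob.from_path("data/notes.txt", metadata={"source": "s3://bucket/notes.txt"})
assert blob.source == "s3://bucket/notes.txt"

# Without the metadata override, source falls back to the path.
assert Blob.from_path("data/notes.txt").source == "data/notes.txt"
```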
@@ -181,7 +187,7 @@
             raise ValueError(msg)
 
     @contextlib.contextmanager
-    def as_bytes_io(self) -> Generator[
+    def as_bytes_io(self) -> Generator[BytesIO | BufferedReader, None, None]:
         """Read data as a byte stream.
 
         Raises:
@@ -205,22 +211,22 @@
         path: PathLike,
         *,
         encoding: str = "utf-8",
-        mime_type:
+        mime_type: str | None = None,
         guess_type: bool = True,
-        metadata:
+        metadata: dict | None = None,
     ) -> Blob:
         """Load the blob from a path like object.
 
         Args:
-            path:
+            path: Path-like object to file to be read
             encoding: Encoding to use if decoding the bytes into a string
-            mime_type:
-            guess_type: If True
-
-            metadata: Metadata to associate with the
+            mime_type: If provided, will be set as the MIME type of the data
+            guess_type: If `True`, the MIME type will be guessed from the file
+                extension, if a MIME type was not provided
+            metadata: Metadata to associate with the `Blob`
 
         Returns:
-            Blob instance
+            `Blob` instance
         """
         if mime_type is None and guess_type:
             mimetype = mimetypes.guess_type(path)[0] if guess_type else None
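A sketch of the `guess_type` behavior documented above, assuming the host's standard `mimetypes` table:

```python
from langchain_core.documents import Blob

# guess_type=True (the default) derives the MIME type from the extension.
assert Blob.from_path("notes.txt").mimetype == "text/plain"

# An explicit mime_type takes precedence over guessing.
assert Blob.from_path("data.bin", mime_type="application/x-custom").mimetype == (
    "application/x-custom"
)

# guess_type=False leaves the MIME type unset.
assert Blob.from_path("notes.txt", guess_type=False).mimetype is None
```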
@@ -239,24 +245,24 @@
     @classmethod
     def from_data(
         cls,
-        data:
+        data: str | bytes,
         *,
         encoding: str = "utf-8",
-        mime_type:
-        path:
-        metadata:
+        mime_type: str | None = None,
+        path: str | None = None,
+        metadata: dict | None = None,
     ) -> Blob:
-        """Initialize the
+        """Initialize the `Blob` from in-memory data.
 
         Args:
-            data:
+            data: The in-memory data associated with the `Blob`
             encoding: Encoding to use if decoding the bytes into a string
-            mime_type:
-            path:
-            metadata: Metadata to associate with the
+            mime_type: If provided, will be set as the MIME type of the data
+            path: If provided, will be set as the source from which the data came
+            metadata: Metadata to associate with the `Blob`
 
         Returns:
-            Blob instance
+            `Blob` instance
         """
         return cls(
             data=data,
@@ -277,16 +283,18 @@
 class Document(BaseMedia):
     """Class for storing a piece of text and associated metadata.
 
-
-
-
+    !!! note
+        `Document` is for **retrieval workflows**, not chat I/O. For sending text
+        to an LLM in a conversation, use message types from `langchain.messages`.
 
-
-
-
-            page_content="Hello, world!", metadata={"source": "https://example.com"}
-        )
+    Example:
+        ```python
+        from langchain_core.documents import Document
 
+        document = Document(
+            page_content="Hello, world!", metadata={"source": "https://example.com"}
+        )
+        ```
     """
 
     page_content: str
@@ -301,12 +309,12 @@
 
     @classmethod
     def is_lc_serializable(cls) -> bool:
-        """Return True as this class is serializable."""
+        """Return `True` as this class is serializable."""
         return True
 
     @classmethod
     def get_lc_namespace(cls) -> list[str]:
-        """Get the namespace of the
+        """Get the namespace of the LangChain object.
 
         Returns:
             ["langchain", "schema", "document"]
@@ -314,10 +322,10 @@
         return ["langchain", "schema", "document"]
 
     def __str__(self) -> str:
-        """Override __str__ to restrict it to page_content and metadata.
+        """Override `__str__` to restrict it to page_content and metadata.
 
         Returns:
-            A string representation of the `Document`.
+            A string representation of the `Document`.
         """
         # The format matches pydantic format for __str__.
         #