langchain-core 1.0.0rc3__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of langchain-core might be problematic. Click here for more details.
- langchain_core/agents.py +2 -4
- langchain_core/caches.py +16 -7
- langchain_core/callbacks/base.py +0 -4
- langchain_core/callbacks/manager.py +0 -11
- langchain_core/chat_history.py +5 -5
- langchain_core/document_loaders/base.py +6 -4
- langchain_core/document_loaders/blob_loaders.py +1 -1
- langchain_core/document_loaders/langsmith.py +9 -13
- langchain_core/documents/__init__.py +24 -3
- langchain_core/documents/base.py +72 -61
- langchain_core/documents/compressor.py +6 -6
- langchain_core/documents/transformers.py +6 -6
- langchain_core/embeddings/fake.py +2 -2
- langchain_core/example_selectors/semantic_similarity.py +7 -7
- langchain_core/exceptions.py +2 -2
- langchain_core/indexing/__init__.py +1 -1
- langchain_core/indexing/api.py +62 -62
- langchain_core/indexing/base.py +20 -22
- langchain_core/indexing/in_memory.py +2 -4
- langchain_core/language_models/__init__.py +6 -5
- langchain_core/language_models/base.py +7 -8
- langchain_core/language_models/chat_models.py +84 -78
- langchain_core/language_models/fake_chat_models.py +1 -1
- langchain_core/language_models/llms.py +20 -18
- langchain_core/load/dump.py +6 -8
- langchain_core/load/serializable.py +4 -1
- langchain_core/messages/__init__.py +9 -0
- langchain_core/messages/ai.py +11 -7
- langchain_core/messages/base.py +4 -0
- langchain_core/messages/block_translators/google_genai.py +5 -3
- langchain_core/messages/content.py +4 -4
- langchain_core/messages/utils.py +17 -17
- langchain_core/output_parsers/__init__.py +17 -1
- langchain_core/output_parsers/base.py +3 -0
- langchain_core/output_parsers/format_instructions.py +9 -4
- langchain_core/output_parsers/json.py +5 -2
- langchain_core/output_parsers/list.py +16 -16
- langchain_core/output_parsers/openai_tools.py +2 -2
- langchain_core/output_parsers/pydantic.py +1 -1
- langchain_core/output_parsers/string.py +3 -3
- langchain_core/output_parsers/xml.py +28 -25
- langchain_core/outputs/generation.py +2 -3
- langchain_core/prompt_values.py +0 -6
- langchain_core/prompts/base.py +5 -3
- langchain_core/prompts/chat.py +60 -52
- langchain_core/prompts/string.py +5 -2
- langchain_core/prompts/structured.py +12 -8
- langchain_core/rate_limiters.py +1 -3
- langchain_core/retrievers.py +41 -37
- langchain_core/runnables/base.py +25 -29
- langchain_core/runnables/branch.py +9 -9
- langchain_core/runnables/config.py +2 -4
- langchain_core/runnables/configurable.py +3 -3
- langchain_core/runnables/fallbacks.py +1 -1
- langchain_core/runnables/graph.py +7 -3
- langchain_core/runnables/retry.py +1 -1
- langchain_core/runnables/schema.py +2 -5
- langchain_core/runnables/utils.py +3 -3
- langchain_core/stores.py +4 -6
- langchain_core/tools/base.py +68 -14
- langchain_core/tools/convert.py +8 -7
- langchain_core/tools/retriever.py +6 -5
- langchain_core/tools/structured.py +7 -5
- langchain_core/tracers/event_stream.py +4 -1
- langchain_core/tracers/log_stream.py +6 -3
- langchain_core/utils/function_calling.py +8 -0
- langchain_core/utils/json_schema.py +1 -1
- langchain_core/utils/strings.py +1 -4
- langchain_core/utils/utils.py +12 -5
- langchain_core/vectorstores/base.py +130 -130
- langchain_core/vectorstores/in_memory.py +4 -4
- langchain_core/vectorstores/utils.py +1 -1
- langchain_core/version.py +1 -1
- {langchain_core-1.0.0rc3.dist-info → langchain_core-1.0.2.dist-info}/METADATA +8 -7
- {langchain_core-1.0.0rc3.dist-info → langchain_core-1.0.2.dist-info}/RECORD +76 -76
- {langchain_core-1.0.0rc3.dist-info → langchain_core-1.0.2.dist-info}/WHEEL +0 -0
langchain_core/agents.py
CHANGED
|
@@ -5,12 +5,10 @@
|
|
|
5
5
|
|
|
6
6
|
!!! warning
|
|
7
7
|
New agents should be built using the
|
|
8
|
-
[
|
|
8
|
+
[`langchain` library](https://pypi.org/project/langchain/), which provides a
|
|
9
9
|
simpler and more flexible way to define agents.
|
|
10
10
|
|
|
11
|
-
|
|
12
|
-
[migration guide](https://python.langchain.com/docs/how_to/migrate_agent/) for
|
|
13
|
-
information on how to migrate existing agents to modern langgraph agents.
|
|
11
|
+
See docs on [building agents](https://docs.langchain.com/oss/python/langchain/agents).
|
|
14
12
|
|
|
15
13
|
Agents use language models to choose a sequence of actions to take.
|
|
16
14
|
|
langchain_core/caches.py
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Optional caching layer for language models.
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
3
|
+
Distinct from provider-based [prompt caching](https://docs.langchain.com/oss/python/langchain/models#prompt-caching).
|
|
4
|
+
|
|
5
|
+
!!! warning "Beta feature"
|
|
6
|
+
This is a beta feature. Please be wary of deploying experimental code to production
|
|
5
7
|
unless you've taken appropriate precautions.
|
|
6
8
|
|
|
7
9
|
A cache is useful for two reasons:
|
|
@@ -47,17 +49,18 @@ class BaseCache(ABC):
|
|
|
47
49
|
"""Look up based on `prompt` and `llm_string`.
|
|
48
50
|
|
|
49
51
|
A cache implementation is expected to generate a key from the 2-tuple
|
|
50
|
-
of prompt and llm_string (e.g., by concatenating them with a delimiter).
|
|
52
|
+
of `prompt` and `llm_string` (e.g., by concatenating them with a delimiter).
|
|
51
53
|
|
|
52
54
|
Args:
|
|
53
55
|
prompt: A string representation of the prompt.
|
|
54
56
|
In the case of a chat model, the prompt is a non-trivial
|
|
55
57
|
serialization of the prompt into the language model.
|
|
56
58
|
llm_string: A string representation of the LLM configuration.
|
|
59
|
+
|
|
57
60
|
This is used to capture the invocation parameters of the LLM
|
|
58
61
|
(e.g., model name, temperature, stop tokens, max tokens, etc.).
|
|
59
|
-
|
|
60
|
-
representation.
|
|
62
|
+
|
|
63
|
+
These invocation parameters are serialized into a string representation.
|
|
61
64
|
|
|
62
65
|
Returns:
|
|
63
66
|
On a cache miss, return `None`. On a cache hit, return the cached value.
|
|
@@ -76,8 +79,10 @@ class BaseCache(ABC):
|
|
|
76
79
|
In the case of a chat model, the prompt is a non-trivial
|
|
77
80
|
serialization of the prompt into the language model.
|
|
78
81
|
llm_string: A string representation of the LLM configuration.
|
|
82
|
+
|
|
79
83
|
This is used to capture the invocation parameters of the LLM
|
|
80
84
|
(e.g., model name, temperature, stop tokens, max tokens, etc.).
|
|
85
|
+
|
|
81
86
|
These invocation parameters are serialized into a string
|
|
82
87
|
representation.
|
|
83
88
|
return_val: The value to be cached. The value is a list of `Generation`
|
|
@@ -92,15 +97,17 @@ class BaseCache(ABC):
|
|
|
92
97
|
"""Async look up based on `prompt` and `llm_string`.
|
|
93
98
|
|
|
94
99
|
A cache implementation is expected to generate a key from the 2-tuple
|
|
95
|
-
of prompt and llm_string (e.g., by concatenating them with a delimiter).
|
|
100
|
+
of `prompt` and `llm_string` (e.g., by concatenating them with a delimiter).
|
|
96
101
|
|
|
97
102
|
Args:
|
|
98
103
|
prompt: A string representation of the prompt.
|
|
99
104
|
In the case of a chat model, the prompt is a non-trivial
|
|
100
105
|
serialization of the prompt into the language model.
|
|
101
106
|
llm_string: A string representation of the LLM configuration.
|
|
107
|
+
|
|
102
108
|
This is used to capture the invocation parameters of the LLM
|
|
103
109
|
(e.g., model name, temperature, stop tokens, max tokens, etc.).
|
|
110
|
+
|
|
104
111
|
These invocation parameters are serialized into a string
|
|
105
112
|
representation.
|
|
106
113
|
|
|
@@ -123,8 +130,10 @@ class BaseCache(ABC):
|
|
|
123
130
|
In the case of a chat model, the prompt is a non-trivial
|
|
124
131
|
serialization of the prompt into the language model.
|
|
125
132
|
llm_string: A string representation of the LLM configuration.
|
|
133
|
+
|
|
126
134
|
This is used to capture the invocation parameters of the LLM
|
|
127
135
|
(e.g., model name, temperature, stop tokens, max tokens, etc.).
|
|
136
|
+
|
|
128
137
|
These invocation parameters are serialized into a string
|
|
129
138
|
representation.
|
|
130
139
|
return_val: The value to be cached. The value is a list of `Generation`
|
langchain_core/callbacks/base.py
CHANGED
|
@@ -420,8 +420,6 @@ class RunManagerMixin:
|
|
|
420
420
|
(includes inherited tags).
|
|
421
421
|
metadata: The metadata associated with the custom event
|
|
422
422
|
(includes inherited metadata).
|
|
423
|
-
|
|
424
|
-
!!! version-added "Added in version 0.2.15"
|
|
425
423
|
"""
|
|
426
424
|
|
|
427
425
|
|
|
@@ -882,8 +880,6 @@ class AsyncCallbackHandler(BaseCallbackHandler):
|
|
|
882
880
|
(includes inherited tags).
|
|
883
881
|
metadata: The metadata associated with the custom event
|
|
884
882
|
(includes inherited metadata).
|
|
885
|
-
|
|
886
|
-
!!! version-added "Added in version 0.2.15"
|
|
887
883
|
"""
|
|
888
884
|
|
|
889
885
|
|
|
langchain_core/callbacks/manager.py
CHANGED
|
@@ -1566,9 +1566,6 @@ class CallbackManager(BaseCallbackManager):
|
|
|
1566
1566
|
|
|
1567
1567
|
Raises:
|
|
1568
1568
|
ValueError: If additional keyword arguments are passed.
|
|
1569
|
-
|
|
1570
|
-
!!! version-added "Added in version 0.2.14"
|
|
1571
|
-
|
|
1572
1569
|
"""
|
|
1573
1570
|
if not self.handlers:
|
|
1574
1571
|
return
|
|
@@ -2042,8 +2039,6 @@ class AsyncCallbackManager(BaseCallbackManager):
|
|
|
2042
2039
|
|
|
2043
2040
|
Raises:
|
|
2044
2041
|
ValueError: If additional keyword arguments are passed.
|
|
2045
|
-
|
|
2046
|
-
!!! version-added "Added in version 0.2.14"
|
|
2047
2042
|
"""
|
|
2048
2043
|
if not self.handlers:
|
|
2049
2044
|
return
|
|
@@ -2555,9 +2550,6 @@ async def adispatch_custom_event(
|
|
|
2555
2550
|
This is due to a limitation in asyncio for python <= 3.10 that prevents
|
|
2556
2551
|
LangChain from automatically propagating the config object on the user's
|
|
2557
2552
|
behalf.
|
|
2558
|
-
|
|
2559
|
-
!!! version-added "Added in version 0.2.15"
|
|
2560
|
-
|
|
2561
2553
|
"""
|
|
2562
2554
|
# Import locally to prevent circular imports.
|
|
2563
2555
|
from langchain_core.runnables.config import ( # noqa: PLC0415
|
|
@@ -2630,9 +2622,6 @@ def dispatch_custom_event(
|
|
|
2630
2622
|
foo_ = RunnableLambda(foo)
|
|
2631
2623
|
foo_.invoke({"a": "1"}, {"callbacks": [CustomCallbackManager()]})
|
|
2632
2624
|
```
|
|
2633
|
-
|
|
2634
|
-
!!! version-added "Added in version 0.2.15"
|
|
2635
|
-
|
|
2636
2625
|
"""
|
|
2637
2626
|
# Import locally to prevent circular imports.
|
|
2638
2627
|
from langchain_core.runnables.config import ( # noqa: PLC0415
|
langchain_core/chat_history.py
CHANGED
|
@@ -121,7 +121,7 @@ class BaseChatMessageHistory(ABC):
|
|
|
121
121
|
This method may be deprecated in a future release.
|
|
122
122
|
|
|
123
123
|
Args:
|
|
124
|
-
message: The
|
|
124
|
+
message: The `HumanMessage` to add to the store.
|
|
125
125
|
"""
|
|
126
126
|
if isinstance(message, HumanMessage):
|
|
127
127
|
self.add_message(message)
|
|
@@ -129,7 +129,7 @@ class BaseChatMessageHistory(ABC):
|
|
|
129
129
|
self.add_message(HumanMessage(content=message))
|
|
130
130
|
|
|
131
131
|
def add_ai_message(self, message: AIMessage | str) -> None:
|
|
132
|
-
"""Convenience method for adding an
|
|
132
|
+
"""Convenience method for adding an `AIMessage` string to the store.
|
|
133
133
|
|
|
134
134
|
!!! note
|
|
135
135
|
This is a convenience method. Code should favor the bulk `add_messages`
|
|
@@ -138,7 +138,7 @@ class BaseChatMessageHistory(ABC):
|
|
|
138
138
|
This method may be deprecated in a future release.
|
|
139
139
|
|
|
140
140
|
Args:
|
|
141
|
-
message: The
|
|
141
|
+
message: The `AIMessage` to add.
|
|
142
142
|
"""
|
|
143
143
|
if isinstance(message, AIMessage):
|
|
144
144
|
self.add_message(message)
|
|
@@ -173,7 +173,7 @@ class BaseChatMessageHistory(ABC):
|
|
|
173
173
|
in an efficient manner to avoid unnecessary round-trips to the underlying store.
|
|
174
174
|
|
|
175
175
|
Args:
|
|
176
|
-
messages: A sequence of BaseMessage objects to store.
|
|
176
|
+
messages: A sequence of `BaseMessage` objects to store.
|
|
177
177
|
"""
|
|
178
178
|
for message in messages:
|
|
179
179
|
self.add_message(message)
|
|
@@ -182,7 +182,7 @@ class BaseChatMessageHistory(ABC):
|
|
|
182
182
|
"""Async add a list of messages.
|
|
183
183
|
|
|
184
184
|
Args:
|
|
185
|
-
messages: A sequence of BaseMessage objects to store.
|
|
185
|
+
messages: A sequence of `BaseMessage` objects to store.
|
|
186
186
|
"""
|
|
187
187
|
await run_in_executor(None, self.add_messages, messages)
|
|
188
188
|
|
|
langchain_core/document_loaders/base.py
CHANGED
|
@@ -27,7 +27,7 @@ class BaseLoader(ABC): # noqa: B024
|
|
|
27
27
|
"""Interface for Document Loader.
|
|
28
28
|
|
|
29
29
|
Implementations should implement the lazy-loading method using generators
|
|
30
|
-
to avoid loading all
|
|
30
|
+
to avoid loading all documents into memory at once.
|
|
31
31
|
|
|
32
32
|
`load` is provided just for user convenience and should not be overridden.
|
|
33
33
|
"""
|
|
@@ -53,9 +53,11 @@ class BaseLoader(ABC): # noqa: B024
|
|
|
53
53
|
def load_and_split(
|
|
54
54
|
self, text_splitter: TextSplitter | None = None
|
|
55
55
|
) -> list[Document]:
|
|
56
|
-
"""Load
|
|
56
|
+
"""Load `Document` and split into chunks. Chunks are returned as `Document`.
|
|
57
57
|
|
|
58
|
-
|
|
58
|
+
!!! danger
|
|
59
|
+
|
|
60
|
+
Do not override this method. It should be considered to be deprecated!
|
|
59
61
|
|
|
60
62
|
Args:
|
|
61
63
|
text_splitter: `TextSplitter` instance to use for splitting documents.
|
|
@@ -135,7 +137,7 @@ class BaseBlobParser(ABC):
|
|
|
135
137
|
"""
|
|
136
138
|
|
|
137
139
|
def parse(self, blob: Blob) -> list[Document]:
|
|
138
|
-
"""Eagerly parse the blob into a `Document` or `Document` objects.
|
|
140
|
+
"""Eagerly parse the blob into a `Document` or list of `Document` objects.
|
|
139
141
|
|
|
140
142
|
This is a convenience method for interactive development environment.
|
|
141
143
|
|
|
langchain_core/document_loaders/blob_loaders.py
CHANGED
|
@@ -28,7 +28,7 @@ class BlobLoader(ABC):
|
|
|
28
28
|
def yield_blobs(
|
|
29
29
|
self,
|
|
30
30
|
) -> Iterable[Blob]:
|
|
31
|
-
"""A lazy loader for raw data represented by LangChain's Blob object.
|
|
31
|
+
"""A lazy loader for raw data represented by LangChain's `Blob` object.
|
|
32
32
|
|
|
33
33
|
Returns:
|
|
34
34
|
A generator over blobs
|
|
langchain_core/document_loaders/langsmith.py
CHANGED
|
@@ -14,13 +14,13 @@ from langchain_core.documents import Document
|
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class LangSmithLoader(BaseLoader):
|
|
17
|
-
"""Load LangSmith Dataset examples as
|
|
17
|
+
"""Load LangSmith Dataset examples as `Document` objects.
|
|
18
18
|
|
|
19
|
-
Loads the example inputs as the Document page content and places the entire
|
|
20
|
-
into the Document metadata. This allows you to easily create few-shot
|
|
21
|
-
retrievers from the loaded documents.
|
|
19
|
+
Loads the example inputs as the `Document` page content and places the entire
|
|
20
|
+
example into the `Document` metadata. This allows you to easily create few-shot
|
|
21
|
+
example retrievers from the loaded documents.
|
|
22
22
|
|
|
23
|
-
??? note "Lazy
|
|
23
|
+
??? note "Lazy loading example"
|
|
24
24
|
|
|
25
25
|
```python
|
|
26
26
|
from langchain_core.document_loaders import LangSmithLoader
|
|
@@ -34,9 +34,6 @@ class LangSmithLoader(BaseLoader):
|
|
|
34
34
|
```python
|
|
35
35
|
# -> [Document("...", metadata={"inputs": {...}, "outputs": {...}, ...}), ...]
|
|
36
36
|
```
|
|
37
|
-
|
|
38
|
-
!!! version-added "Added in version 0.2.34"
|
|
39
|
-
|
|
40
37
|
"""
|
|
41
38
|
|
|
42
39
|
def __init__(
|
|
@@ -69,12 +66,11 @@ class LangSmithLoader(BaseLoader):
|
|
|
69
66
|
format_content: Function for converting the content extracted from the example
|
|
70
67
|
inputs into a string. Defaults to JSON-encoding the contents.
|
|
71
68
|
example_ids: The IDs of the examples to filter by.
|
|
72
|
-
as_of: The dataset version tag
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
of the tagged (or timestamped) version.
|
|
69
|
+
as_of: The dataset version tag or timestamp to retrieve the examples as of.
|
|
70
|
+
Response examples will only be those that were present at the time of
|
|
71
|
+
the tagged (or timestamped) version.
|
|
76
72
|
splits: A list of dataset splits, which are
|
|
77
|
-
divisions of your dataset such as
|
|
73
|
+
divisions of your dataset such as `train`, `test`, or `validation`.
|
|
78
74
|
Returns examples only from the specified splits.
|
|
79
75
|
inline_s3_urls: Whether to inline S3 URLs.
|
|
80
76
|
offset: The offset to start from.
|
|
langchain_core/documents/__init__.py
CHANGED
|
@@ -1,7 +1,28 @@
|
|
|
1
|
-
"""Documents module.
|
|
1
|
+
"""Documents module for data retrieval and processing workflows.
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
and
|
|
3
|
+
This module provides core abstractions for handling data in retrieval-augmented
|
|
4
|
+
generation (RAG) pipelines, vector stores, and document processing workflows.
|
|
5
|
+
|
|
6
|
+
!!! warning "Documents vs. message content"
|
|
7
|
+
This module is distinct from `langchain_core.messages.content`, which provides
|
|
8
|
+
multimodal content blocks for **LLM chat I/O** (text, images, audio, etc. within
|
|
9
|
+
messages).
|
|
10
|
+
|
|
11
|
+
**Key distinction:**
|
|
12
|
+
|
|
13
|
+
- **Documents** (this module): For **data retrieval and processing workflows**
|
|
14
|
+
- Vector stores, retrievers, RAG pipelines
|
|
15
|
+
- Text chunking, embedding, and semantic search
|
|
16
|
+
- Example: Chunks of a PDF stored in a vector database
|
|
17
|
+
|
|
18
|
+
- **Content Blocks** (`messages.content`): For **LLM conversational I/O**
|
|
19
|
+
- Multimodal message content sent to/from models
|
|
20
|
+
- Tool calls, reasoning, citations within chat
|
|
21
|
+
- Example: An image sent to a vision model in a chat message (via
|
|
22
|
+
[`ImageContentBlock`][langchain.messages.ImageContentBlock])
|
|
23
|
+
|
|
24
|
+
While both can represent similar data types (text, files), they serve different
|
|
25
|
+
architectural purposes in LangChain applications.
|
|
5
26
|
"""
|
|
6
27
|
|
|
7
28
|
from typing import TYPE_CHECKING
|
langchain_core/documents/base.py
CHANGED
|
@@ -1,4 +1,16 @@
|
|
|
1
|
-
"""Base classes for media and documents.
|
|
1
|
+
"""Base classes for media and documents.
|
|
2
|
+
|
|
3
|
+
This module contains core abstractions for **data retrieval and processing workflows**:
|
|
4
|
+
|
|
5
|
+
- `BaseMedia`: Base class providing `id` and `metadata` fields
|
|
6
|
+
- `Blob`: Raw data loading (files, binary data) - used by document loaders
|
|
7
|
+
- `Document`: Text content for retrieval (RAG, vector stores, semantic search)
|
|
8
|
+
|
|
9
|
+
!!! note "Not for LLM chat messages"
|
|
10
|
+
These classes are for data processing pipelines, not LLM I/O. For multimodal
|
|
11
|
+
content in chat messages (images, audio in conversations), see
|
|
12
|
+
`langchain.messages` content blocks instead.
|
|
13
|
+
"""
|
|
2
14
|
|
|
3
15
|
from __future__ import annotations
|
|
4
16
|
|
|
@@ -19,27 +31,23 @@ PathLike = str | PurePath
|
|
|
19
31
|
|
|
20
32
|
|
|
21
33
|
class BaseMedia(Serializable):
|
|
22
|
-
"""
|
|
23
|
-
|
|
24
|
-
Media objects can be used to represent raw data, such as text or binary data.
|
|
34
|
+
"""Base class for content used in retrieval and data processing workflows.
|
|
25
35
|
|
|
26
|
-
|
|
27
|
-
with the content.
|
|
36
|
+
Provides common fields for content that needs to be stored, indexed, or searched.
|
|
28
37
|
|
|
29
|
-
|
|
30
|
-
|
|
38
|
+
!!! note
|
|
39
|
+
For multimodal content in **chat messages** (images, audio sent to/from LLMs),
|
|
40
|
+
use `langchain.messages` content blocks instead.
|
|
31
41
|
"""
|
|
32
42
|
|
|
33
43
|
# The ID field is optional at the moment.
|
|
34
44
|
# It will likely become required in a future major release after
|
|
35
|
-
# it has been adopted by enough
|
|
45
|
+
# it has been adopted by enough VectorStore implementations.
|
|
36
46
|
id: str | None = Field(default=None, coerce_numbers_to_str=True)
|
|
37
47
|
"""An optional identifier for the document.
|
|
38
48
|
|
|
39
49
|
Ideally this should be unique across the document collection and formatted
|
|
40
50
|
as a UUID, but this will not be enforced.
|
|
41
|
-
|
|
42
|
-
!!! version-added "Added in version 0.2.11"
|
|
43
51
|
"""
|
|
44
52
|
|
|
45
53
|
metadata: dict = Field(default_factory=dict)
|
|
@@ -47,65 +55,64 @@ class BaseMedia(Serializable):
|
|
|
47
55
|
|
|
48
56
|
|
|
49
57
|
class Blob(BaseMedia):
|
|
50
|
-
"""
|
|
58
|
+
"""Raw data abstraction for document loading and file processing.
|
|
51
59
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
the raw data.
|
|
60
|
+
Represents raw bytes or text, either in-memory or by file reference. Used
|
|
61
|
+
primarily by document loaders to decouple data loading from parsing.
|
|
55
62
|
|
|
56
|
-
Inspired by
|
|
63
|
+
Inspired by [Mozilla's `Blob`](https://developer.mozilla.org/en-US/docs/Web/API/Blob)
|
|
57
64
|
|
|
58
|
-
|
|
65
|
+
???+ example "Initialize a blob from in-memory data"
|
|
59
66
|
|
|
60
|
-
|
|
61
|
-
|
|
67
|
+
```python
|
|
68
|
+
from langchain_core.documents import Blob
|
|
62
69
|
|
|
63
|
-
|
|
70
|
+
blob = Blob.from_data("Hello, world!")
|
|
64
71
|
|
|
65
|
-
|
|
66
|
-
|
|
72
|
+
# Read the blob as a string
|
|
73
|
+
print(blob.as_string())
|
|
67
74
|
|
|
68
|
-
|
|
69
|
-
|
|
75
|
+
# Read the blob as bytes
|
|
76
|
+
print(blob.as_bytes())
|
|
70
77
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
78
|
+
# Read the blob as a byte stream
|
|
79
|
+
with blob.as_bytes_io() as f:
|
|
80
|
+
print(f.read())
|
|
81
|
+
```
|
|
75
82
|
|
|
76
|
-
|
|
83
|
+
??? example "Load from memory and specify MIME type and metadata"
|
|
77
84
|
|
|
78
|
-
|
|
79
|
-
|
|
85
|
+
```python
|
|
86
|
+
from langchain_core.documents import Blob
|
|
80
87
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
88
|
+
blob = Blob.from_data(
|
|
89
|
+
data="Hello, world!",
|
|
90
|
+
mime_type="text/plain",
|
|
91
|
+
metadata={"source": "https://example.com"},
|
|
92
|
+
)
|
|
93
|
+
```
|
|
87
94
|
|
|
88
|
-
|
|
95
|
+
??? example "Load the blob from a file"
|
|
89
96
|
|
|
90
|
-
|
|
91
|
-
|
|
97
|
+
```python
|
|
98
|
+
from langchain_core.documents import Blob
|
|
92
99
|
|
|
93
|
-
|
|
100
|
+
blob = Blob.from_path("path/to/file.txt")
|
|
94
101
|
|
|
95
|
-
|
|
96
|
-
|
|
102
|
+
# Read the blob as a string
|
|
103
|
+
print(blob.as_string())
|
|
97
104
|
|
|
98
|
-
|
|
99
|
-
|
|
105
|
+
# Read the blob as bytes
|
|
106
|
+
print(blob.as_bytes())
|
|
100
107
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
108
|
+
# Read the blob as a byte stream
|
|
109
|
+
with blob.as_bytes_io() as f:
|
|
110
|
+
print(f.read())
|
|
111
|
+
```
|
|
105
112
|
"""
|
|
106
113
|
|
|
107
114
|
data: bytes | str | None = None
|
|
108
|
-
"""Raw data associated with the
|
|
115
|
+
"""Raw data associated with the `Blob`."""
|
|
109
116
|
mimetype: str | None = None
|
|
110
117
|
"""MimeType not to be confused with a file extension."""
|
|
111
118
|
encoding: str = "utf-8"
|
|
@@ -125,7 +132,7 @@ class Blob(BaseMedia):
|
|
|
125
132
|
def source(self) -> str | None:
|
|
126
133
|
"""The source location of the blob as string if known otherwise none.
|
|
127
134
|
|
|
128
|
-
If a path is associated with the
|
|
135
|
+
If a path is associated with the `Blob`, it will default to the path location.
|
|
129
136
|
|
|
130
137
|
Unless explicitly set via a metadata field called `"source"`, in which
|
|
131
138
|
case that value will be used instead.
|
|
@@ -213,13 +220,13 @@ class Blob(BaseMedia):
|
|
|
213
220
|
Args:
|
|
214
221
|
path: Path-like object to file to be read
|
|
215
222
|
encoding: Encoding to use if decoding the bytes into a string
|
|
216
|
-
mime_type: If provided, will be set as the
|
|
217
|
-
guess_type: If `True`, the
|
|
218
|
-
if a
|
|
219
|
-
metadata: Metadata to associate with the
|
|
223
|
+
mime_type: If provided, will be set as the MIME type of the data
|
|
224
|
+
guess_type: If `True`, the MIME type will be guessed from the file
|
|
225
|
+
extension, if a MIME type was not provided
|
|
226
|
+
metadata: Metadata to associate with the `Blob`
|
|
220
227
|
|
|
221
228
|
Returns:
|
|
222
|
-
Blob instance
|
|
229
|
+
`Blob` instance
|
|
223
230
|
"""
|
|
224
231
|
if mime_type is None and guess_type:
|
|
225
232
|
mimetype = mimetypes.guess_type(path)[0] if guess_type else None
|
|
@@ -245,17 +252,17 @@ class Blob(BaseMedia):
|
|
|
245
252
|
path: str | None = None,
|
|
246
253
|
metadata: dict | None = None,
|
|
247
254
|
) -> Blob:
|
|
248
|
-
"""Initialize the
|
|
255
|
+
"""Initialize the `Blob` from in-memory data.
|
|
249
256
|
|
|
250
257
|
Args:
|
|
251
|
-
data: The in-memory data associated with the
|
|
258
|
+
data: The in-memory data associated with the `Blob`
|
|
252
259
|
encoding: Encoding to use if decoding the bytes into a string
|
|
253
|
-
mime_type: If provided, will be set as the
|
|
260
|
+
mime_type: If provided, will be set as the MIME type of the data
|
|
254
261
|
path: If provided, will be set as the source from which the data came
|
|
255
|
-
metadata: Metadata to associate with the
|
|
262
|
+
metadata: Metadata to associate with the `Blob`
|
|
256
263
|
|
|
257
264
|
Returns:
|
|
258
|
-
Blob instance
|
|
265
|
+
`Blob` instance
|
|
259
266
|
"""
|
|
260
267
|
return cls(
|
|
261
268
|
data=data,
|
|
@@ -276,6 +283,10 @@ class Blob(BaseMedia):
|
|
|
276
283
|
class Document(BaseMedia):
|
|
277
284
|
"""Class for storing a piece of text and associated metadata.
|
|
278
285
|
|
|
286
|
+
!!! note
|
|
287
|
+
`Document` is for **retrieval workflows**, not chat I/O. For sending text
|
|
288
|
+
to an LLM in a conversation, use message types from `langchain.messages`.
|
|
289
|
+
|
|
279
290
|
Example:
|
|
280
291
|
```python
|
|
281
292
|
from langchain_core.documents import Document
|
|
langchain_core/documents/compressor.py
CHANGED
|
@@ -21,14 +21,14 @@ class BaseDocumentCompressor(BaseModel, ABC):
|
|
|
21
21
|
|
|
22
22
|
This abstraction is primarily used for post-processing of retrieved documents.
|
|
23
23
|
|
|
24
|
-
|
|
24
|
+
`Document` objects matching a given query are first retrieved.
|
|
25
25
|
|
|
26
26
|
Then the list of documents can be further processed.
|
|
27
27
|
|
|
28
28
|
For example, one could re-rank the retrieved documents using an LLM.
|
|
29
29
|
|
|
30
30
|
!!! note
|
|
31
|
-
Users should favor using a RunnableLambda instead of sub-classing from this
|
|
31
|
+
Users should favor using a `RunnableLambda` instead of sub-classing from this
|
|
32
32
|
interface.
|
|
33
33
|
|
|
34
34
|
"""
|
|
@@ -43,9 +43,9 @@ class BaseDocumentCompressor(BaseModel, ABC):
|
|
|
43
43
|
"""Compress retrieved documents given the query context.
|
|
44
44
|
|
|
45
45
|
Args:
|
|
46
|
-
documents: The retrieved
|
|
46
|
+
documents: The retrieved `Document` objects.
|
|
47
47
|
query: The query context.
|
|
48
|
-
callbacks: Optional
|
|
48
|
+
callbacks: Optional `Callbacks` to run during compression.
|
|
49
49
|
|
|
50
50
|
Returns:
|
|
51
51
|
The compressed documents.
|
|
@@ -61,9 +61,9 @@ class BaseDocumentCompressor(BaseModel, ABC):
|
|
|
61
61
|
"""Async compress retrieved documents given the query context.
|
|
62
62
|
|
|
63
63
|
Args:
|
|
64
|
-
documents: The retrieved
|
|
64
|
+
documents: The retrieved `Document` objects.
|
|
65
65
|
query: The query context.
|
|
66
|
-
callbacks: Optional
|
|
66
|
+
callbacks: Optional `Callbacks` to run during compression.
|
|
67
67
|
|
|
68
68
|
Returns:
|
|
69
69
|
The compressed documents.
|
|
langchain_core/documents/transformers.py
CHANGED
|
@@ -16,8 +16,8 @@ if TYPE_CHECKING:
|
|
|
16
16
|
class BaseDocumentTransformer(ABC):
|
|
17
17
|
"""Abstract base class for document transformation.
|
|
18
18
|
|
|
19
|
-
A document transformation takes a sequence of
|
|
20
|
-
sequence of transformed
|
|
19
|
+
A document transformation takes a sequence of `Document` objects and returns a
|
|
20
|
+
sequence of transformed `Document` objects.
|
|
21
21
|
|
|
22
22
|
Example:
|
|
23
23
|
```python
|
|
@@ -57,10 +57,10 @@ class BaseDocumentTransformer(ABC):
|
|
|
57
57
|
"""Transform a list of documents.
|
|
58
58
|
|
|
59
59
|
Args:
|
|
60
|
-
documents: A sequence of
|
|
60
|
+
documents: A sequence of `Document` objects to be transformed.
|
|
61
61
|
|
|
62
62
|
Returns:
|
|
63
|
-
A sequence of transformed
|
|
63
|
+
A sequence of transformed `Document` objects.
|
|
64
64
|
"""
|
|
65
65
|
|
|
66
66
|
async def atransform_documents(
|
|
@@ -69,10 +69,10 @@ class BaseDocumentTransformer(ABC):
|
|
|
69
69
|
"""Asynchronously transform a list of documents.
|
|
70
70
|
|
|
71
71
|
Args:
|
|
72
|
-
documents: A sequence of
|
|
72
|
+
documents: A sequence of `Document` objects to be transformed.
|
|
73
73
|
|
|
74
74
|
Returns:
|
|
75
|
-
A sequence of transformed
|
|
75
|
+
A sequence of transformed `Document` objects.
|
|
76
76
|
"""
|
|
77
77
|
return await run_in_executor(
|
|
78
78
|
None, self.transform_documents, documents, **kwargs
|
|
langchain_core/embeddings/fake.py
CHANGED
|
@@ -18,7 +18,7 @@ class FakeEmbeddings(Embeddings, BaseModel):
|
|
|
18
18
|
|
|
19
19
|
This embedding model creates embeddings by sampling from a normal distribution.
|
|
20
20
|
|
|
21
|
-
!!!
|
|
21
|
+
!!! danger "Toy model"
|
|
22
22
|
Do not use this outside of testing, as it is not a real embedding model.
|
|
23
23
|
|
|
24
24
|
Instantiate:
|
|
@@ -73,7 +73,7 @@ class DeterministicFakeEmbedding(Embeddings, BaseModel):
|
|
|
73
73
|
This embedding model creates embeddings by sampling from a normal distribution
|
|
74
74
|
with a seed based on the hash of the text.
|
|
75
75
|
|
|
76
|
-
!!!
|
|
76
|
+
!!! danger "Toy model"
|
|
77
77
|
Do not use this outside of testing, as it is not a real embedding model.
|
|
78
78
|
|
|
79
79
|
Instantiate:
|