langchain-core 0.4.0.dev0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langchain_core/__init__.py +1 -1
- langchain_core/_api/__init__.py +3 -4
- langchain_core/_api/beta_decorator.py +45 -70
- langchain_core/_api/deprecation.py +80 -80
- langchain_core/_api/path.py +22 -8
- langchain_core/_import_utils.py +10 -4
- langchain_core/agents.py +25 -21
- langchain_core/caches.py +53 -63
- langchain_core/callbacks/__init__.py +1 -8
- langchain_core/callbacks/base.py +341 -348
- langchain_core/callbacks/file.py +55 -44
- langchain_core/callbacks/manager.py +546 -683
- langchain_core/callbacks/stdout.py +29 -30
- langchain_core/callbacks/streaming_stdout.py +35 -36
- langchain_core/callbacks/usage.py +65 -70
- langchain_core/chat_history.py +48 -55
- langchain_core/document_loaders/base.py +46 -21
- langchain_core/document_loaders/langsmith.py +39 -36
- langchain_core/documents/__init__.py +0 -1
- langchain_core/documents/base.py +96 -74
- langchain_core/documents/compressor.py +12 -9
- langchain_core/documents/transformers.py +29 -28
- langchain_core/embeddings/fake.py +56 -57
- langchain_core/env.py +2 -3
- langchain_core/example_selectors/base.py +12 -0
- langchain_core/example_selectors/length_based.py +1 -1
- langchain_core/example_selectors/semantic_similarity.py +21 -25
- langchain_core/exceptions.py +15 -9
- langchain_core/globals.py +4 -163
- langchain_core/indexing/api.py +132 -125
- langchain_core/indexing/base.py +64 -67
- langchain_core/indexing/in_memory.py +26 -6
- langchain_core/language_models/__init__.py +15 -27
- langchain_core/language_models/_utils.py +267 -117
- langchain_core/language_models/base.py +92 -177
- langchain_core/language_models/chat_models.py +547 -407
- langchain_core/language_models/fake.py +11 -11
- langchain_core/language_models/fake_chat_models.py +72 -118
- langchain_core/language_models/llms.py +168 -242
- langchain_core/load/dump.py +8 -11
- langchain_core/load/load.py +32 -28
- langchain_core/load/mapping.py +2 -4
- langchain_core/load/serializable.py +50 -56
- langchain_core/messages/__init__.py +36 -51
- langchain_core/messages/ai.py +377 -150
- langchain_core/messages/base.py +239 -47
- langchain_core/messages/block_translators/__init__.py +111 -0
- langchain_core/messages/block_translators/anthropic.py +470 -0
- langchain_core/messages/block_translators/bedrock.py +94 -0
- langchain_core/messages/block_translators/bedrock_converse.py +297 -0
- langchain_core/messages/block_translators/google_genai.py +530 -0
- langchain_core/messages/block_translators/google_vertexai.py +21 -0
- langchain_core/messages/block_translators/groq.py +143 -0
- langchain_core/messages/block_translators/langchain_v0.py +301 -0
- langchain_core/messages/block_translators/openai.py +1010 -0
- langchain_core/messages/chat.py +2 -3
- langchain_core/messages/content.py +1423 -0
- langchain_core/messages/function.py +7 -7
- langchain_core/messages/human.py +44 -38
- langchain_core/messages/modifier.py +3 -2
- langchain_core/messages/system.py +40 -27
- langchain_core/messages/tool.py +160 -58
- langchain_core/messages/utils.py +527 -638
- langchain_core/output_parsers/__init__.py +1 -14
- langchain_core/output_parsers/base.py +68 -104
- langchain_core/output_parsers/json.py +13 -17
- langchain_core/output_parsers/list.py +11 -33
- langchain_core/output_parsers/openai_functions.py +56 -74
- langchain_core/output_parsers/openai_tools.py +68 -109
- langchain_core/output_parsers/pydantic.py +15 -13
- langchain_core/output_parsers/string.py +6 -2
- langchain_core/output_parsers/transform.py +17 -60
- langchain_core/output_parsers/xml.py +34 -44
- langchain_core/outputs/__init__.py +1 -1
- langchain_core/outputs/chat_generation.py +26 -11
- langchain_core/outputs/chat_result.py +1 -3
- langchain_core/outputs/generation.py +17 -6
- langchain_core/outputs/llm_result.py +15 -8
- langchain_core/prompt_values.py +29 -123
- langchain_core/prompts/__init__.py +3 -27
- langchain_core/prompts/base.py +48 -63
- langchain_core/prompts/chat.py +259 -288
- langchain_core/prompts/dict.py +19 -11
- langchain_core/prompts/few_shot.py +84 -90
- langchain_core/prompts/few_shot_with_templates.py +14 -12
- langchain_core/prompts/image.py +19 -14
- langchain_core/prompts/loading.py +6 -8
- langchain_core/prompts/message.py +7 -8
- langchain_core/prompts/prompt.py +42 -43
- langchain_core/prompts/string.py +37 -16
- langchain_core/prompts/structured.py +43 -46
- langchain_core/rate_limiters.py +51 -60
- langchain_core/retrievers.py +52 -192
- langchain_core/runnables/base.py +1727 -1683
- langchain_core/runnables/branch.py +52 -73
- langchain_core/runnables/config.py +89 -103
- langchain_core/runnables/configurable.py +128 -130
- langchain_core/runnables/fallbacks.py +93 -82
- langchain_core/runnables/graph.py +127 -127
- langchain_core/runnables/graph_ascii.py +63 -41
- langchain_core/runnables/graph_mermaid.py +87 -70
- langchain_core/runnables/graph_png.py +31 -36
- langchain_core/runnables/history.py +145 -161
- langchain_core/runnables/passthrough.py +141 -144
- langchain_core/runnables/retry.py +84 -68
- langchain_core/runnables/router.py +33 -37
- langchain_core/runnables/schema.py +79 -72
- langchain_core/runnables/utils.py +95 -139
- langchain_core/stores.py +85 -131
- langchain_core/structured_query.py +11 -15
- langchain_core/sys_info.py +31 -32
- langchain_core/tools/__init__.py +1 -14
- langchain_core/tools/base.py +221 -247
- langchain_core/tools/convert.py +144 -161
- langchain_core/tools/render.py +10 -10
- langchain_core/tools/retriever.py +12 -19
- langchain_core/tools/simple.py +52 -29
- langchain_core/tools/structured.py +56 -60
- langchain_core/tracers/__init__.py +1 -9
- langchain_core/tracers/_streaming.py +6 -7
- langchain_core/tracers/base.py +103 -112
- langchain_core/tracers/context.py +29 -48
- langchain_core/tracers/core.py +142 -105
- langchain_core/tracers/evaluation.py +30 -34
- langchain_core/tracers/event_stream.py +162 -117
- langchain_core/tracers/langchain.py +34 -36
- langchain_core/tracers/log_stream.py +87 -49
- langchain_core/tracers/memory_stream.py +3 -3
- langchain_core/tracers/root_listeners.py +18 -34
- langchain_core/tracers/run_collector.py +8 -20
- langchain_core/tracers/schemas.py +0 -125
- langchain_core/tracers/stdout.py +3 -3
- langchain_core/utils/__init__.py +1 -4
- langchain_core/utils/_merge.py +47 -9
- langchain_core/utils/aiter.py +70 -66
- langchain_core/utils/env.py +12 -9
- langchain_core/utils/function_calling.py +139 -206
- langchain_core/utils/html.py +7 -8
- langchain_core/utils/input.py +6 -6
- langchain_core/utils/interactive_env.py +6 -2
- langchain_core/utils/iter.py +48 -45
- langchain_core/utils/json.py +14 -4
- langchain_core/utils/json_schema.py +159 -43
- langchain_core/utils/mustache.py +32 -25
- langchain_core/utils/pydantic.py +67 -40
- langchain_core/utils/strings.py +5 -5
- langchain_core/utils/usage.py +1 -1
- langchain_core/utils/utils.py +104 -62
- langchain_core/vectorstores/base.py +131 -179
- langchain_core/vectorstores/in_memory.py +113 -182
- langchain_core/vectorstores/utils.py +23 -17
- langchain_core/version.py +1 -1
- langchain_core-1.0.0.dist-info/METADATA +68 -0
- langchain_core-1.0.0.dist-info/RECORD +172 -0
- {langchain_core-0.4.0.dev0.dist-info → langchain_core-1.0.0.dist-info}/WHEEL +1 -1
- langchain_core/beta/__init__.py +0 -1
- langchain_core/beta/runnables/__init__.py +0 -1
- langchain_core/beta/runnables/context.py +0 -448
- langchain_core/memory.py +0 -116
- langchain_core/messages/content_blocks.py +0 -1435
- langchain_core/prompts/pipeline.py +0 -133
- langchain_core/pydantic_v1/__init__.py +0 -30
- langchain_core/pydantic_v1/dataclasses.py +0 -23
- langchain_core/pydantic_v1/main.py +0 -23
- langchain_core/tracers/langchain_v1.py +0 -23
- langchain_core/utils/loading.py +0 -31
- langchain_core/v1/__init__.py +0 -1
- langchain_core/v1/chat_models.py +0 -1047
- langchain_core/v1/messages.py +0 -755
- langchain_core-0.4.0.dev0.dist-info/METADATA +0 -108
- langchain_core-0.4.0.dev0.dist-info/RECORD +0 -177
- langchain_core-0.4.0.dev0.dist-info/entry_points.txt +0 -4
langchain_core/document_loaders/base.py
CHANGED

@@ -3,7 +3,7 @@
 from __future__ import annotations

 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING

 from langchain_core.runnables import run_in_executor

@@ -15,6 +15,13 @@ if TYPE_CHECKING:
     from langchain_core.documents import Document
     from langchain_core.documents.base import Blob

+try:
+    from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+    _HAS_TEXT_SPLITTERS = True
+except ImportError:
+    _HAS_TEXT_SPLITTERS = False
+

 class BaseLoader(ABC):  # noqa: B024
     """Interface for Document Loader.
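The module-level guard above replaces the per-call try/except import that the next hunk removes from `load_and_split`. As a minimal sketch of the same optional-dependency pattern in isolation (`some_extra` and its `split` function are hypothetical, not a real package):

```python
# Optional-dependency guard: attempt the import once, at module import time,
# and record the outcome in a flag instead of wrapping every call site.
try:
    import some_extra  # hypothetical optional dependency

    _HAS_SOME_EXTRA = True
except ImportError:
    _HAS_SOME_EXTRA = False


def split_text(text: str) -> list[str]:
    """Split text, failing with an actionable message if the extra is missing."""
    if not _HAS_SOME_EXTRA:
        msg = "Install `some-extra` to use split_text()."
        raise ImportError(msg)
    return some_extra.split(text)
```

The flag costs one import attempt up front and makes every later check a cheap boolean test.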
@@ -28,37 +35,47 @@ class BaseLoader(ABC):  # noqa: B024
     # Sub-classes should not implement this method directly. Instead, they
     # should implement the lazy load method.
     def load(self) -> list[Document]:
-        """Load data into Document objects."""
+        """Load data into `Document` objects.
+
+        Returns:
+            The documents.
+        """
         return list(self.lazy_load())

     async def aload(self) -> list[Document]:
-        """Load data into Document objects."""
+        """Load data into `Document` objects.
+
+        Returns:
+            The documents.
+        """
         return [document async for document in self.alazy_load()]

     def load_and_split(
-        self, text_splitter: Optional[TextSplitter] = None
+        self, text_splitter: TextSplitter | None = None
     ) -> list[Document]:
-        """Load Documents and split into chunks. Chunks are returned as Documents.
+        """Load Documents and split into chunks. Chunks are returned as `Document`.

         Do not override this method. It should be considered to be deprecated!

         Args:
-            text_splitter: TextSplitter instance to use for splitting documents.
-                Defaults to RecursiveCharacterTextSplitter.
+            text_splitter: `TextSplitter` instance to use for splitting documents.
+                Defaults to `RecursiveCharacterTextSplitter`.
+
+        Raises:
+            ImportError: If `langchain-text-splitters` is not installed
+                and no `text_splitter` is provided.

         Returns:
-            List of Documents.
+            List of `Document`.
         """
         if text_splitter is None:
-            try:
-                from langchain_text_splitters import RecursiveCharacterTextSplitter
-            except ImportError as e:
+            if not _HAS_TEXT_SPLITTERS:
                 msg = (
                     "Unable to import from langchain_text_splitters. Please specify "
                     "text_splitter or install langchain_text_splitters with "
                     "`pip install -U langchain-text-splitters`."
                 )
-                raise ImportError(msg) from e
+                raise ImportError(msg)

             text_splitter_: TextSplitter = RecursiveCharacterTextSplitter()
         else:
@@ -69,14 +86,22 @@ class BaseLoader(ABC):  # noqa: B024
     # Attention: This method will be upgraded into an abstractmethod once it's
     # implemented in all the existing subclasses.
     def lazy_load(self) -> Iterator[Document]:
-        """A lazy loader for Documents."""
+        """A lazy loader for `Document`.
+
+        Yields:
+            The `Document` objects.
+        """
         if type(self).load != BaseLoader.load:
             return iter(self.load())
         msg = f"{self.__class__.__name__} does not implement lazy_load()"
         raise NotImplementedError(msg)

     async def alazy_load(self) -> AsyncIterator[Document]:
-        """A lazy loader for Documents."""
+        """A lazy loader for `Document`.
+
+        Yields:
+            The `Document` objects.
+        """
         iterator = await run_in_executor(None, self.lazy_load)
         done = object()
         while True:
@@ -90,7 +115,7 @@ class BaseBlobParser(ABC):
     """Abstract interface for blob parsers.

     A blob parser provides a way to parse raw data stored in a blob into one
-    or more documents.
+    or more `Document` objects.

     The parser can be composed with blob loaders, making it easy to reuse
     a parser independent of how the blob was originally loaded.
@@ -103,25 +128,25 @@ class BaseBlobParser(ABC):
         Subclasses are required to implement this method.

         Args:
-            blob: Blob instance
+            blob: `Blob` instance

         Returns:
-            Generator of documents
+            Generator of `Document` objects
         """

     def parse(self, blob: Blob) -> list[Document]:
-        """Eagerly parse the blob into a document or documents.
+        """Eagerly parse the blob into a `Document` or `Document` objects.

         This is a convenience method for interactive development environment.

-        Production applications should favor the lazy_parse method instead.
+        Production applications should favor the `lazy_parse` method instead.

         Subclasses should generally not over-ride this parse method.

         Args:
-            blob: Blob instance
+            blob: `Blob` instance

         Returns:
-            List of documents
+            List of `Document` objects
         """
         return list(self.lazy_parse(blob))
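Caller-visible behavior is unchanged unless `langchain-text-splitters` is missing, in which case `load_and_split()` still raises `ImportError`, now via the module-level flag. A usage sketch with a toy subclass (`MyLoader` is illustrative, not part of the package):

```python
from collections.abc import Iterator

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document


class MyLoader(BaseLoader):
    """Toy loader that yields a single document."""

    def lazy_load(self) -> Iterator[Document]:
        yield Document(page_content="Hello, world!")


loader = MyLoader()
try:
    # Raises ImportError if langchain-text-splitters is not installed
    # and no text_splitter argument is supplied.
    chunks = loader.load_and_split()
except ImportError:
    chunks = loader.load()  # fall back to the unsplit documents
print(chunks)
```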
langchain_core/document_loaders/langsmith.py
CHANGED

@@ -3,8 +3,8 @@
 import datetime
 import json
 import uuid
-from collections.abc import Iterator, Sequence
-from typing import Any, Callable, Optional, Union
+from collections.abc import Callable, Iterator, Sequence
+from typing import Any

 from langsmith import Client as LangSmithClient
 from typing_extensions import override
@@ -20,55 +20,55 @@ class LangSmithLoader(BaseLoader):
     into the Document metadata. This allows you to easily create few-shot example
     retrievers from the loaded documents.

-    Lazy load:
-        .. code-block:: python
-
-            from langchain_core.document_loaders import LangSmithLoader
-
-            loader = LangSmithLoader(dataset_id="...", limit=100)
-            docs = []
-            for doc in loader.lazy_load():
-                docs.append(doc)
-
-            # -> [Document("...", metadata={"inputs": {...}, "outputs": {...}, ...}), ...]
-
-    .. versionadded:: 0.2.34
-
-    """  # noqa: E501
+    ??? note "Lazy load"
+
+        ```python
+        from langchain_core.document_loaders import LangSmithLoader
+
+        loader = LangSmithLoader(dataset_id="...", limit=100)
+        docs = []
+        for doc in loader.lazy_load():
+            docs.append(doc)
+        ```
+
+        ```python
+        # -> [Document("...", metadata={"inputs": {...}, "outputs": {...}, ...}), ...]
+        ```
+
+    !!! version-added "Added in version 0.2.34"
+
+    """

     def __init__(
         self,
         *,
-        dataset_id: Optional[Union[uuid.UUID, str]] = None,
-        dataset_name: Optional[str] = None,
-        example_ids: Optional[Sequence[Union[uuid.UUID, str]]] = None,
-        as_of: Optional[Union[datetime.datetime, str]] = None,
-        splits: Optional[Sequence[str]] = None,
+        dataset_id: uuid.UUID | str | None = None,
+        dataset_name: str | None = None,
+        example_ids: Sequence[uuid.UUID | str] | None = None,
+        as_of: datetime.datetime | str | None = None,
+        splits: Sequence[str] | None = None,
         inline_s3_urls: bool = True,
         offset: int = 0,
-        limit: Optional[int] = None,
-        metadata: Optional[dict] = None,
-        filter: Optional[str] = None,  # noqa: A002
+        limit: int | None = None,
+        metadata: dict | None = None,
+        filter: str | None = None,  # noqa: A002
         content_key: str = "",
-        format_content: Optional[Callable[..., str]] = None,
-        client: Optional[LangSmithClient] = None,
+        format_content: Callable[..., str] | None = None,
+        client: LangSmithClient | None = None,
         **client_kwargs: Any,
     ) -> None:
         """Create a LangSmith loader.

         Args:
-            dataset_id: The ID of the dataset to filter by. Defaults to None.
-            dataset_name: The name of the dataset to filter by. Defaults to None.
-            content_key: The inputs key to set as Document page content. ``'.'`` characters
-                are interpreted as nested keys. E.g. ``content_key="first.second"`` will
+            dataset_id: The ID of the dataset to filter by.
+            dataset_name: The name of the dataset to filter by.
+            content_key: The inputs key to set as Document page content. `'.'` characters
+                are interpreted as nested keys. E.g. `content_key="first.second"` will
                 result in
-                ``Document(page_content=format_content(example.inputs["first"]["second"]))``
+                `Document(page_content=format_content(example.inputs["first"]["second"]))`
             format_content: Function for converting the content extracted from the example
                 inputs into a string. Defaults to JSON-encoding the contents.
-            example_ids: The IDs of the examples to filter by. Defaults to None.
+            example_ids: The IDs of the examples to filter by.
             as_of: The dataset version tag OR
                 timestamp to retrieve the examples as of.
                 Response examples will only be those that were present at the time
@@ -76,14 +76,17 @@ class LangSmithLoader(BaseLoader):
             splits: A list of dataset splits, which are
                 divisions of your dataset such as 'train', 'test', or 'validation'.
                 Returns examples only from the specified splits.
-            inline_s3_urls: Whether to inline S3 URLs. Defaults to True.
-            offset: The offset to start from. Defaults to 0.
+            inline_s3_urls: Whether to inline S3 URLs.
+            offset: The offset to start from.
             limit: The maximum number of examples to return.
-            metadata: Metadata to filter by. Defaults to None.
+            metadata: Metadata to filter by.
             filter: A structured filter string to apply to the examples.
             client: LangSmith Client. If not provided will be initialized from below args.
             client_kwargs: Keyword args to pass to LangSmith client init. Should only be
-                specified if ``client`` isn't.
+                specified if `client` isn't.
+
+        Raises:
+            ValueError: If both `client` and `client_kwargs` are provided.
         """  # noqa: E501
         if client and client_kwargs:
             raise ValueError
@@ -126,7 +129,7 @@ class LangSmithLoader(BaseLoader):
         yield Document(content_str, metadata=metadata)


-def _stringify(x: Union[str, dict]) -> str:
+def _stringify(x: str | dict) -> str:
     if isinstance(x, str):
         return x
     try:
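As a usage sketch of the keyword-only signature above — the dataset name, nested `content_key`, and `format_content` values are illustrative, and a configured LangSmith client/API key is assumed:

```python
from langchain_core.document_loaders import LangSmithLoader

loader = LangSmithLoader(
    dataset_name="my-dataset",  # hypothetical dataset
    content_key="question.text",  # '.' drills into example.inputs["question"]["text"]
    format_content=str,  # replace the default JSON-encoding of the extracted content
    limit=50,
)
for doc in loader.lazy_load():
    # Per the docstring, example inputs/outputs are merged into each
    # Document's metadata.
    print(doc.page_content, doc.metadata["outputs"])
```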
langchain_core/documents/base.py
CHANGED

@@ -6,7 +6,7 @@ import contextlib
 import mimetypes
 from io import BufferedReader, BytesIO
 from pathlib import Path, PurePath
-from typing import TYPE_CHECKING, Any, Literal, Optional, Union, cast
+from typing import TYPE_CHECKING, Any, Literal, cast

 from pydantic import ConfigDict, Field, model_validator

@@ -15,7 +15,7 @@ from langchain_core.load.serializable import Serializable
 if TYPE_CHECKING:
     from collections.abc import Generator

-    PathLike = Union[str, PurePath]
+    PathLike = str | PurePath


 class BaseMedia(Serializable):
@@ -33,13 +33,13 @@ class BaseMedia(Serializable):
     # The ID field is optional at the moment.
     # It will likely become required in a future major release after
     # it has been adopted by enough vectorstore implementations.
-    id: Optional[str] = Field(default=None, coerce_numbers_to_str=True)
+    id: str | None = Field(default=None, coerce_numbers_to_str=True)
     """An optional identifier for the document.

     Ideally this should be unique across the document collection and formatted
     as a UUID, but this will not be enforced.

-    .. versionadded:: 0.2.11
+    !!! version-added "Added in version 0.2.11"
     """

     metadata: dict = Field(default_factory=dict)
@@ -57,64 +57,63 @@ class Blob(BaseMedia):

     Example: Initialize a blob from in-memory data

-        .. code-block:: python
-
-            from langchain_core.documents import Blob
-
-            blob = Blob.from_data("Hello, world!")
-
-            # Read the blob as a string
-            print(blob.as_string())
-
-            # Read the blob as bytes
-            print(blob.as_bytes())
-
-            # Read the blob as a byte stream
-            with blob.as_bytes_io() as f:
-                print(f.read())
+        ```python
+        from langchain_core.documents import Blob
+
+        blob = Blob.from_data("Hello, world!")
+
+        # Read the blob as a string
+        print(blob.as_string())
+
+        # Read the blob as bytes
+        print(blob.as_bytes())
+
+        # Read the blob as a byte stream
+        with blob.as_bytes_io() as f:
+            print(f.read())
+        ```

     Example: Load from memory and specify mime-type and metadata

-        .. code-block:: python
-
-            from langchain_core.documents import Blob
-
-            blob = Blob.from_data(
-                data="Hello, world!",
-                mime_type="text/plain",
-                metadata={"source": "https://example.com"},
-            )
+        ```python
+        from langchain_core.documents import Blob
+
+        blob = Blob.from_data(
+            data="Hello, world!",
+            mime_type="text/plain",
+            metadata={"source": "https://example.com"},
+        )
+        ```

     Example: Load the blob from a file

-        .. code-block:: python
-
-            from langchain_core.documents import Blob
-
-            blob = Blob.from_path("path/to/file.txt")
-
-            # Read the blob as a string
-            print(blob.as_string())
-
-            # Read the blob as bytes
-            print(blob.as_bytes())
-
-            # Read the blob as a byte stream
-            with blob.as_bytes_io() as f:
-                print(f.read())
+        ```python
+        from langchain_core.documents import Blob
+
+        blob = Blob.from_path("path/to/file.txt")
+
+        # Read the blob as a string
+        print(blob.as_string())
+
+        # Read the blob as bytes
+        print(blob.as_bytes())
+
+        # Read the blob as a byte stream
+        with blob.as_bytes_io() as f:
+            print(f.read())
+        ```
     """

-    data: Optional[Union[bytes, str]] = None
+    data: bytes | str | None = None
     """Raw data associated with the blob."""
-    mimetype: Optional[str] = None
+    mimetype: str | None = None
     """MimeType not to be confused with a file extension."""
     encoding: str = "utf-8"
     """Encoding to use if decoding the bytes into a string.

-    Use utf-8 as default encoding, if decoding to string.
+    Use `utf-8` as default encoding, if decoding to string.
     """
-    path: Optional[PathLike] = None
+    path: PathLike | None = None
     """Location where the original content was found."""

     model_config = ConfigDict(
@@ -123,16 +122,16 @@ class Blob(BaseMedia):
     )

     @property
-    def source(self) -> Optional[str]:
+    def source(self) -> str | None:
         """The source location of the blob as string if known otherwise none.

         If a path is associated with the blob, it will default to the path location.

-        Unless explicitly set via a metadata field called "source", in which
+        Unless explicitly set via a metadata field called `"source"`, in which
         case that value will be used instead.
         """
         if self.metadata and "source" in self.metadata:
-            return cast("Optional[str]", self.metadata["source"])
+            return cast("str | None", self.metadata["source"])
         return str(self.path) if self.path else None

     @model_validator(mode="before")
@@ -145,7 +144,14 @@ class Blob(BaseMedia):
         return values

     def as_string(self) -> str:
-        """Read data as a string."""
+        """Read data as a string.
+
+        Raises:
+            ValueError: If the blob cannot be represented as a string.
+
+        Returns:
+            The data as a string.
+        """
         if self.data is None and self.path:
             return Path(self.path).read_text(encoding=self.encoding)
         if isinstance(self.data, bytes):
@@ -156,7 +162,14 @@ class Blob(BaseMedia):
         raise ValueError(msg)

     def as_bytes(self) -> bytes:
-        """Read data as bytes."""
+        """Read data as bytes.
+
+        Raises:
+            ValueError: If the blob cannot be represented as bytes.
+
+        Returns:
+            The data as bytes.
+        """
         if isinstance(self.data, bytes):
             return self.data
         if isinstance(self.data, str):
@@ -167,8 +180,15 @@ class Blob(BaseMedia):
         raise ValueError(msg)

     @contextlib.contextmanager
-    def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]:
-        """Read data as a byte stream."""
+    def as_bytes_io(self) -> Generator[BytesIO | BufferedReader, None, None]:
+        """Read data as a byte stream.
+
+        Raises:
+            NotImplementedError: If the blob cannot be represented as a byte stream.
+
+        Yields:
+            The data as a byte stream.
+        """
         if isinstance(self.data, bytes):
             yield BytesIO(self.data)
         elif self.data is None and self.path:
@@ -184,18 +204,18 @@ class Blob(BaseMedia):
         path: PathLike,
         *,
         encoding: str = "utf-8",
-        mime_type: Optional[str] = None,
+        mime_type: str | None = None,
         guess_type: bool = True,
-        metadata: Optional[dict] = None,
+        metadata: dict | None = None,
     ) -> Blob:
         """Load the blob from a path like object.

         Args:
-            path: path like object to file to be read
+            path: Path-like object to file to be read
             encoding: Encoding to use if decoding the bytes into a string
-            mime_type: if provided, will be set as the mime-type of the data
-            guess_type: If True, the mimetype will be guessed from the file extension,
-                if a mime-type was not provided
+            mime_type: If provided, will be set as the mime-type of the data
+            guess_type: If `True`, the mimetype will be guessed from the file extension,
+                if a mime-type was not provided
             metadata: Metadata to associate with the blob

         Returns:
@@ -218,20 +238,20 @@ class Blob(BaseMedia):
     @classmethod
     def from_data(
         cls,
-        data: Union[str, bytes],
+        data: str | bytes,
         *,
         encoding: str = "utf-8",
-        mime_type: Optional[str] = None,
-        path: Optional[str] = None,
-        metadata: Optional[dict] = None,
+        mime_type: str | None = None,
+        path: str | None = None,
+        metadata: dict | None = None,
     ) -> Blob:
         """Initialize the blob from in-memory data.

         Args:
-            data: the in-memory data associated with the blob
+            data: The in-memory data associated with the blob
             encoding: Encoding to use if decoding the bytes into a string
-            mime_type: if provided, will be set as the mime-type of the data
-            path: if provided, will be set as the source from which the data came
+            mime_type: If provided, will be set as the mime-type of the data
+            path: If provided, will be set as the source from which the data came
             metadata: Metadata to associate with the blob

         Returns:
@@ -246,7 +266,7 @@ class Blob(BaseMedia):
         )

     def __repr__(self) -> str:
-        """
+        """Return the blob representation."""
         str_repr = f"Blob {id(self)}"
         if self.source:
             str_repr += f" {self.source}"
@@ -257,16 +277,13 @@ class Document(BaseMedia):
     """Class for storing a piece of text and associated metadata.

     Example:
+        ```python
+        from langchain_core.documents import Document

-        .. code-block:: python
-
-            from langchain_core.documents import Document
-
-            document = Document(
-                page_content="Hello, world!",
-                metadata={"source": "https://example.com"}
-            )
-
+        document = Document(
+            page_content="Hello, world!", metadata={"source": "https://example.com"}
+        )
+        ```
     """

     page_content: str
@@ -277,23 +294,28 @@ class Document(BaseMedia):
         """Pass page_content in as positional or named arg."""
         # my-py is complaining that page_content is not defined on the base class.
         # Here, we're relying on pydantic base class to handle the validation.
-        super().__init__(page_content=page_content, **kwargs)  # type: ignore[call-arg]
+        super().__init__(page_content=page_content, **kwargs)

     @classmethod
     def is_lc_serializable(cls) -> bool:
-        """Return whether this class is serializable."""
+        """Return True as this class is serializable."""
         return True

     @classmethod
     def get_lc_namespace(cls) -> list[str]:
-        """Get the namespace of the langchain object.
+        """Get the namespace of the LangChain object.

-        Returns ["langchain", "schema", "document"].
+        Returns:
+            ["langchain", "schema", "document"]
         """
         return ["langchain", "schema", "document"]

     def __str__(self) -> str:
-        """Override __str__ to restrict it to page_content and metadata."""
+        """Override __str__ to restrict it to page_content and metadata.
+
+        Returns:
+            A string representation of the Document.
+        """
         # The format matches pydantic format for __str__.
         #
         # The purpose of this change is to make sure that user code that
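A short sketch of the `source` resolution order documented above: an explicit `"source"` metadata key wins over the stored path (the path and URL here are illustrative):

```python
from langchain_core.documents import Blob

# With only a path, source falls back to the path string.
blob = Blob.from_path("path/to/file.txt")
assert blob.source == "path/to/file.txt"

# A "source" metadata key takes precedence over the path.
blob = Blob.from_data("Hello, world!", metadata={"source": "https://example.com"})
assert blob.source == "https://example.com"
```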
langchain_core/documents/compressor.py
CHANGED

@@ -3,7 +3,7 @@
 from __future__ import annotations

 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING

 from pydantic import BaseModel

@@ -19,17 +19,18 @@ if TYPE_CHECKING:
 class BaseDocumentCompressor(BaseModel, ABC):
     """Base class for document compressors.

-    This abstraction is primarily used for
-    post-processing of retrieved documents.
+    This abstraction is primarily used for post-processing of retrieved documents.

     Documents matching a given query are first retrieved.
+
     Then the list of documents can be further processed.

-    For example, one could re-rank the retrieved documents
-    using an LLM.
+    For example, one could re-rank the retrieved documents using an LLM.
+
+    !!! note
+        Users should favor using a RunnableLambda instead of sub-classing from this
+        interface.

-    **Note** users should favor using a RunnableLambda
-    instead of sub-classing from this interface.
     """

     @abstractmethod
@@ -37,7 +38,7 @@ class BaseDocumentCompressor(BaseModel, ABC):
         self,
         documents: Sequence[Document],
         query: str,
-        callbacks: Optional[Callbacks] = None,
+        callbacks: Callbacks | None = None,
     ) -> Sequence[Document]:
         """Compress retrieved documents given the query context.

@@ -48,13 +49,14 @@ class BaseDocumentCompressor(BaseModel, ABC):

         Returns:
             The compressed documents.
+
         """

     async def acompress_documents(
         self,
         documents: Sequence[Document],
         query: str,
-        callbacks: Optional[Callbacks] = None,
+        callbacks: Callbacks | None = None,
     ) -> Sequence[Document]:
         """Async compress retrieved documents given the query context.

@@ -65,6 +67,7 @@ class BaseDocumentCompressor(BaseModel, ABC):

         Returns:
             The compressed documents.
+
         """
         return await run_in_executor(
             None, self.compress_documents, documents, query, callbacks