langchain-core 0.3.79__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their respective public registries.
- langchain_core/__init__.py +1 -1
- langchain_core/_api/__init__.py +3 -4
- langchain_core/_api/beta_decorator.py +23 -26
- langchain_core/_api/deprecation.py +52 -65
- langchain_core/_api/path.py +3 -6
- langchain_core/_import_utils.py +3 -4
- langchain_core/agents.py +19 -19
- langchain_core/caches.py +53 -63
- langchain_core/callbacks/__init__.py +1 -8
- langchain_core/callbacks/base.py +323 -334
- langchain_core/callbacks/file.py +44 -44
- langchain_core/callbacks/manager.py +441 -507
- langchain_core/callbacks/stdout.py +29 -30
- langchain_core/callbacks/streaming_stdout.py +32 -32
- langchain_core/callbacks/usage.py +60 -57
- langchain_core/chat_history.py +48 -63
- langchain_core/document_loaders/base.py +23 -23
- langchain_core/document_loaders/langsmith.py +37 -37
- langchain_core/documents/__init__.py +0 -1
- langchain_core/documents/base.py +62 -65
- langchain_core/documents/compressor.py +4 -4
- langchain_core/documents/transformers.py +28 -29
- langchain_core/embeddings/fake.py +50 -54
- langchain_core/example_selectors/length_based.py +1 -1
- langchain_core/example_selectors/semantic_similarity.py +21 -25
- langchain_core/exceptions.py +10 -11
- langchain_core/globals.py +3 -151
- langchain_core/indexing/api.py +61 -66
- langchain_core/indexing/base.py +58 -58
- langchain_core/indexing/in_memory.py +3 -3
- langchain_core/language_models/__init__.py +14 -27
- langchain_core/language_models/_utils.py +270 -84
- langchain_core/language_models/base.py +55 -162
- langchain_core/language_models/chat_models.py +442 -402
- langchain_core/language_models/fake.py +11 -11
- langchain_core/language_models/fake_chat_models.py +61 -39
- langchain_core/language_models/llms.py +123 -231
- langchain_core/load/dump.py +4 -5
- langchain_core/load/load.py +18 -28
- langchain_core/load/mapping.py +2 -4
- langchain_core/load/serializable.py +39 -40
- langchain_core/messages/__init__.py +61 -22
- langchain_core/messages/ai.py +368 -163
- langchain_core/messages/base.py +214 -43
- langchain_core/messages/block_translators/__init__.py +111 -0
- langchain_core/messages/block_translators/anthropic.py +470 -0
- langchain_core/messages/block_translators/bedrock.py +94 -0
- langchain_core/messages/block_translators/bedrock_converse.py +297 -0
- langchain_core/messages/block_translators/google_genai.py +530 -0
- langchain_core/messages/block_translators/google_vertexai.py +21 -0
- langchain_core/messages/block_translators/groq.py +143 -0
- langchain_core/messages/block_translators/langchain_v0.py +301 -0
- langchain_core/messages/block_translators/openai.py +1010 -0
- langchain_core/messages/chat.py +2 -6
- langchain_core/messages/content.py +1423 -0
- langchain_core/messages/function.py +6 -10
- langchain_core/messages/human.py +41 -38
- langchain_core/messages/modifier.py +2 -2
- langchain_core/messages/system.py +38 -28
- langchain_core/messages/tool.py +96 -103
- langchain_core/messages/utils.py +478 -504
- langchain_core/output_parsers/__init__.py +1 -14
- langchain_core/output_parsers/base.py +58 -61
- langchain_core/output_parsers/json.py +7 -8
- langchain_core/output_parsers/list.py +5 -7
- langchain_core/output_parsers/openai_functions.py +49 -47
- langchain_core/output_parsers/openai_tools.py +14 -19
- langchain_core/output_parsers/pydantic.py +12 -13
- langchain_core/output_parsers/string.py +2 -2
- langchain_core/output_parsers/transform.py +15 -17
- langchain_core/output_parsers/xml.py +8 -10
- langchain_core/outputs/__init__.py +1 -1
- langchain_core/outputs/chat_generation.py +18 -18
- langchain_core/outputs/chat_result.py +1 -3
- langchain_core/outputs/generation.py +8 -8
- langchain_core/outputs/llm_result.py +10 -10
- langchain_core/prompt_values.py +12 -12
- langchain_core/prompts/__init__.py +3 -27
- langchain_core/prompts/base.py +45 -55
- langchain_core/prompts/chat.py +254 -313
- langchain_core/prompts/dict.py +5 -5
- langchain_core/prompts/few_shot.py +81 -88
- langchain_core/prompts/few_shot_with_templates.py +11 -13
- langchain_core/prompts/image.py +12 -14
- langchain_core/prompts/loading.py +6 -8
- langchain_core/prompts/message.py +3 -3
- langchain_core/prompts/prompt.py +24 -39
- langchain_core/prompts/string.py +4 -4
- langchain_core/prompts/structured.py +42 -50
- langchain_core/rate_limiters.py +51 -60
- langchain_core/retrievers.py +49 -190
- langchain_core/runnables/base.py +1484 -1709
- langchain_core/runnables/branch.py +45 -61
- langchain_core/runnables/config.py +80 -88
- langchain_core/runnables/configurable.py +117 -134
- langchain_core/runnables/fallbacks.py +83 -79
- langchain_core/runnables/graph.py +85 -95
- langchain_core/runnables/graph_ascii.py +27 -28
- langchain_core/runnables/graph_mermaid.py +38 -50
- langchain_core/runnables/graph_png.py +15 -16
- langchain_core/runnables/history.py +135 -148
- langchain_core/runnables/passthrough.py +124 -150
- langchain_core/runnables/retry.py +46 -51
- langchain_core/runnables/router.py +25 -30
- langchain_core/runnables/schema.py +79 -74
- langchain_core/runnables/utils.py +62 -68
- langchain_core/stores.py +81 -115
- langchain_core/structured_query.py +8 -8
- langchain_core/sys_info.py +27 -29
- langchain_core/tools/__init__.py +1 -14
- langchain_core/tools/base.py +179 -187
- langchain_core/tools/convert.py +131 -139
- langchain_core/tools/render.py +10 -10
- langchain_core/tools/retriever.py +11 -11
- langchain_core/tools/simple.py +19 -24
- langchain_core/tools/structured.py +30 -39
- langchain_core/tracers/__init__.py +1 -9
- langchain_core/tracers/base.py +97 -99
- langchain_core/tracers/context.py +29 -52
- langchain_core/tracers/core.py +50 -60
- langchain_core/tracers/evaluation.py +11 -11
- langchain_core/tracers/event_stream.py +115 -70
- langchain_core/tracers/langchain.py +21 -21
- langchain_core/tracers/log_stream.py +43 -43
- langchain_core/tracers/memory_stream.py +3 -3
- langchain_core/tracers/root_listeners.py +16 -16
- langchain_core/tracers/run_collector.py +2 -4
- langchain_core/tracers/schemas.py +0 -129
- langchain_core/tracers/stdout.py +3 -3
- langchain_core/utils/__init__.py +1 -4
- langchain_core/utils/_merge.py +46 -8
- langchain_core/utils/aiter.py +57 -61
- langchain_core/utils/env.py +9 -9
- langchain_core/utils/function_calling.py +89 -191
- langchain_core/utils/html.py +7 -8
- langchain_core/utils/input.py +6 -6
- langchain_core/utils/interactive_env.py +1 -1
- langchain_core/utils/iter.py +37 -42
- langchain_core/utils/json.py +4 -3
- langchain_core/utils/json_schema.py +8 -8
- langchain_core/utils/mustache.py +9 -11
- langchain_core/utils/pydantic.py +33 -35
- langchain_core/utils/strings.py +5 -5
- langchain_core/utils/usage.py +1 -1
- langchain_core/utils/utils.py +80 -54
- langchain_core/vectorstores/base.py +129 -164
- langchain_core/vectorstores/in_memory.py +99 -174
- langchain_core/vectorstores/utils.py +5 -5
- langchain_core/version.py +1 -1
- {langchain_core-0.3.79.dist-info → langchain_core-1.0.0.dist-info}/METADATA +28 -27
- langchain_core-1.0.0.dist-info/RECORD +172 -0
- {langchain_core-0.3.79.dist-info → langchain_core-1.0.0.dist-info}/WHEEL +1 -1
- langchain_core/beta/__init__.py +0 -1
- langchain_core/beta/runnables/__init__.py +0 -1
- langchain_core/beta/runnables/context.py +0 -447
- langchain_core/memory.py +0 -120
- langchain_core/messages/content_blocks.py +0 -176
- langchain_core/prompts/pipeline.py +0 -138
- langchain_core/pydantic_v1/__init__.py +0 -30
- langchain_core/pydantic_v1/dataclasses.py +0 -23
- langchain_core/pydantic_v1/main.py +0 -23
- langchain_core/tracers/langchain_v1.py +0 -31
- langchain_core/utils/loading.py +0 -35
- langchain_core-0.3.79.dist-info/RECORD +0 -174
- langchain_core-0.3.79.dist-info/entry_points.txt +0 -4
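
Most of the churn in the list above is mechanical: 1.0.0 replaces `typing.Optional`/`typing.Union` with PEP 604 `X | Y` unions and imports `Callable`, `Sequence`, and friends from `collections.abc` instead of `typing`. A minimal before/after sketch of the pattern (the function itself is illustrative, not from the package):

# 0.3.x style, as removed throughout this diff
from typing import Optional, Union

def lookup(key: Optional[str] = None) -> Union[str, int]:
    return key if key is not None else 0

# 1.0.0 style, as added throughout this diff: PEP 604 unions,
# with Callable/Sequence now coming from collections.abc
def lookup(key: str | None = None) -> str | int:
    return key if key is not None else 0

The two spellings are equivalent at runtime; the `X | Y` syntax needs Python 3.10, or `from __future__ import annotations` on older interpreters (indexing/base.py below uses the latter).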
langchain_core/indexing/api.py
CHANGED
@@ -6,16 +6,20 @@ import hashlib
 import json
 import uuid
 import warnings
-from collections.abc import
+from collections.abc import (
+    AsyncIterable,
+    AsyncIterator,
+    Callable,
+    Iterable,
+    Iterator,
+    Sequence,
+)
 from itertools import islice
 from typing import (
     Any,
-    Callable,
     Literal,
-    Optional,
     TypedDict,
     TypeVar,
-    Union,
     cast,
 )

@@ -107,8 +111,8 @@ async def _abatch(size: int, iterable: AsyncIterable[T]) -> AsyncIterator[list[T


 def _get_source_id_assigner(
-    source_id_key:
-) -> Callable[[Document],
+    source_id_key: str | Callable[[Document], str] | None,
+) -> Callable[[Document], str | None]:
     """Get the source id from the document."""
     if source_id_key is None:
         return lambda _doc: None
@@ -162,9 +166,8 @@ def _calculate_hash(
 def _get_document_with_hash(
     document: Document,
     *,
-    key_encoder:
-
-    ],
+    key_encoder: Callable[[Document], str]
+    | Literal["sha1", "sha256", "sha512", "blake2b"],
 ) -> Document:
     """Calculate a hash of the document, and assign it to the uid.

@@ -233,7 +236,7 @@ class _HashedDocument:


 def _delete(
-    vector_store:
+    vector_store: VectorStore | DocumentIndex,
     ids: list[str],
 ) -> None:
     if isinstance(vector_store, VectorStore):
@@ -271,19 +274,18 @@ class IndexingResult(TypedDict):


 def index(
-    docs_source:
+    docs_source: BaseLoader | Iterable[Document],
     record_manager: RecordManager,
-    vector_store:
+    vector_store: VectorStore | DocumentIndex,
     *,
     batch_size: int = 100,
-    cleanup:
-    source_id_key:
+    cleanup: Literal["incremental", "full", "scoped_full"] | None = None,
+    source_id_key: str | Callable[[Document], str] | None = None,
     cleanup_batch_size: int = 1_000,
     force_update: bool = False,
-    key_encoder:
-
-    ] =
-    upsert_kwargs: Optional[dict[str, Any]] = None,
+    key_encoder: Literal["sha1", "sha256", "sha512", "blake2b"]
+    | Callable[[Document], str] = "sha1",
+    upsert_kwargs: dict[str, Any] | None = None,
 ) -> IndexingResult:
     """Index data from the loader into the vector store.

@@ -296,10 +298,10 @@ def index(
     For the time being, documents are indexed using their hashes, and users
     are not able to specify the uid of the document.

-
-    Added
+    !!! warning "Behavior changed in 0.3.25"
+        Added `scoped_full` cleanup mode.

-
+    !!! warning

     * In full mode, the loader should be returning
       the entire dataset, and not just a subset of the dataset.
@@ -313,7 +315,7 @@ def index(
     chunks, and we index them using a batch size of 5, we'll have 3 batches
     all with the same source id. In general, to avoid doing too much
     redundant work select as big a batch size as possible.
-    * The
+    * The `scoped_full` mode is suitable if determining an appropriate batch size
       is challenging or if your data loader cannot return the entire dataset at
       once. This mode keeps track of source IDs in memory, which should be fine
       for most use cases. If your dataset is large (10M+ docs), you will likely
@@ -324,8 +326,8 @@ def index(
         record_manager: Timestamped set to keep track of which documents were
             updated.
         vector_store: VectorStore or DocumentIndex to index the documents into.
-        batch_size: Batch size to use when indexing.
-        cleanup: How to handle clean up of documents.
+        batch_size: Batch size to use when indexing.
+        cleanup: How to handle clean up of documents.

            - incremental: Cleans up all documents that haven't been updated AND
              that are associated with source ids that were seen during indexing.
@@ -340,17 +342,14 @@ def index(
              source ids that were seen during indexing.
            - None: Do not delete any documents.
        source_id_key: Optional key that helps identify the original source
-            of the document.
+            of the document.
        cleanup_batch_size: Batch size to use when cleaning up documents.
-            Default is 1_000.
        force_update: Force update documents even if they are present in the
            record manager. Useful if you are re-indexing with updated embeddings.
-            Default is False.
        key_encoder: Hashing algorithm to use for hashing the document content and
-            metadata.
-            Other options include "blake2b", "sha256", and "sha512".
+            metadata. Options include "blake2b", "sha256", and "sha512".

-
+            !!! version-added "Added in version 0.3.66"

        key_encoder: Hashing algorithm to use for hashing the document.
            If not provided, a default encoder using SHA-1 will be used.
@@ -367,7 +366,7 @@ def index(
            method of the VectorStore or the upsert method of the DocumentIndex.
            For example, you can use this to specify a custom vector_field:
            upsert_kwargs={"vector_field": "embedding"}
-
+            !!! version-added "Added in version 0.3.10"

    Returns:
        Indexing result which contains information about how many documents
@@ -379,8 +378,8 @@ def index(
        ValueError: If vectorstore does not have
            "delete" and "add_documents" required methods.
        ValueError: If source_id_key is not None, but is not a string or callable.
-        TypeError: If
-        AssertionError: If
+        TypeError: If `vectorstore` is not a VectorStore or a DocumentIndex.
+        AssertionError: If `source_id` is None when cleanup mode is incremental.
            (should be unreachable code).
    """
    # Behavior is deprecated, but we keep it for backwards compatibility.
@@ -462,13 +461,13 @@ def index(
            # Count documents removed by within-batch deduplication
            num_skipped += original_batch_size - len(hashed_docs)

-            source_ids: Sequence[
+            source_ids: Sequence[str | None] = [
                source_id_assigner(hashed_doc) for hashed_doc in hashed_docs
            ]

            if cleanup in {"incremental", "scoped_full"}:
                # source ids are required.
-                for source_id, hashed_doc in zip(source_ids, hashed_docs):
+                for source_id, hashed_doc in zip(source_ids, hashed_docs, strict=False):
                    if source_id is None:
                        msg = (
                            f"Source ids are required when cleanup mode is "
@@ -492,7 +491,7 @@ def index(
            docs_to_index = []
            uids_to_refresh = []
            seen_docs: set[str] = set()
-            for hashed_doc, doc_exists in zip(hashed_docs, exists_batch):
+            for hashed_doc, doc_exists in zip(hashed_docs, exists_batch, strict=False):
                hashed_id = cast("str", hashed_doc.id)
                if doc_exists:
                    if force_update:
@@ -563,7 +562,7 @@ def index(
    if cleanup == "full" or (
        cleanup == "scoped_full" and scoped_full_cleanup_source_ids
    ):
-        delete_group_ids:
+        delete_group_ids: Sequence[str] | None = None
        if cleanup == "scoped_full":
            delete_group_ids = list(scoped_full_cleanup_source_ids)
        while uids_to_delete := record_manager.list_keys(
@@ -591,7 +590,7 @@ async def _to_async_iterator(iterator: Iterable[T]) -> AsyncIterator[T]:


 async def _adelete(
-    vector_store:
+    vector_store: VectorStore | DocumentIndex,
     ids: list[str],
 ) -> None:
     if isinstance(vector_store, VectorStore):
@@ -613,19 +612,18 @@ async def _adelete(


 async def aindex(
-    docs_source:
+    docs_source: BaseLoader | Iterable[Document] | AsyncIterator[Document],
     record_manager: RecordManager,
-    vector_store:
+    vector_store: VectorStore | DocumentIndex,
     *,
     batch_size: int = 100,
-    cleanup:
-    source_id_key:
+    cleanup: Literal["incremental", "full", "scoped_full"] | None = None,
+    source_id_key: str | Callable[[Document], str] | None = None,
     cleanup_batch_size: int = 1_000,
     force_update: bool = False,
-    key_encoder:
-
-    ] =
-    upsert_kwargs: Optional[dict[str, Any]] = None,
+    key_encoder: Literal["sha1", "sha256", "sha512", "blake2b"]
+    | Callable[[Document], str] = "sha1",
+    upsert_kwargs: dict[str, Any] | None = None,
 ) -> IndexingResult:
     """Async index data from the loader into the vector store.

@@ -638,10 +636,10 @@ async def aindex(
     For the time being, documents are indexed using their hashes, and users
     are not able to specify the uid of the document.

-
-    Added
+    !!! warning "Behavior changed in 0.3.25"
+        Added `scoped_full` cleanup mode.

-
+    !!! warning

     * In full mode, the loader should be returning
       the entire dataset, and not just a subset of the dataset.
@@ -655,7 +653,7 @@ async def aindex(
     chunks, and we index them using a batch size of 5, we'll have 3 batches
     all with the same source id. In general, to avoid doing too much
     redundant work select as big a batch size as possible.
-    * The
+    * The `scoped_full` mode is suitable if determining an appropriate batch size
       is challenging or if your data loader cannot return the entire dataset at
       once. This mode keeps track of source IDs in memory, which should be fine
       for most use cases. If your dataset is large (10M+ docs), you will likely
@@ -666,8 +664,8 @@ async def aindex(
         record_manager: Timestamped set to keep track of which documents were
             updated.
         vector_store: VectorStore or DocumentIndex to index the documents into.
-        batch_size: Batch size to use when indexing.
-        cleanup: How to handle clean up of documents.
+        batch_size: Batch size to use when indexing.
+        cleanup: How to handle clean up of documents.

            - incremental: Cleans up all documents that haven't been updated AND
              that are associated with source ids that were seen during indexing.
@@ -682,17 +680,14 @@ async def aindex(
              source ids that were seen during indexing.
            - None: Do not delete any documents.
        source_id_key: Optional key that helps identify the original source
-            of the document.
+            of the document.
        cleanup_batch_size: Batch size to use when cleaning up documents.
-            Default is 1_000.
        force_update: Force update documents even if they are present in the
            record manager. Useful if you are re-indexing with updated embeddings.
-            Default is False.
        key_encoder: Hashing algorithm to use for hashing the document content and
-            metadata.
-            Other options include "blake2b", "sha256", and "sha512".
+            metadata. Options include "blake2b", "sha256", and "sha512".

-
+            !!! version-added "Added in version 0.3.66"

        key_encoder: Hashing algorithm to use for hashing the document.
            If not provided, a default encoder using SHA-1 will be used.
@@ -709,7 +704,7 @@ async def aindex(
            method of the VectorStore or the upsert method of the DocumentIndex.
            For example, you can use this to specify a custom vector_field:
            upsert_kwargs={"vector_field": "embedding"}
-
+            !!! version-added "Added in version 0.3.10"

    Returns:
        Indexing result which contains information about how many documents
@@ -721,9 +716,9 @@ async def aindex(
        ValueError: If vectorstore does not have
            "adelete" and "aadd_documents" required methods.
        ValueError: If source_id_key is not None, but is not a string or callable.
-        TypeError: If
-        AssertionError: If
-            incremental or
+        TypeError: If `vector_store` is not a VectorStore or DocumentIndex.
+        AssertionError: If `source_id_key` is None when cleanup mode is
+            incremental or `scoped_full` (should be unreachable).
    """
    # Behavior is deprecated, but we keep it for backwards compatibility.
    # # Warn only once per process.
@@ -815,13 +810,13 @@ async def aindex(
            # Count documents removed by within-batch deduplication
            num_skipped += original_batch_size - len(hashed_docs)

-            source_ids: Sequence[
+            source_ids: Sequence[str | None] = [
                source_id_assigner(doc) for doc in hashed_docs
            ]

            if cleanup in {"incremental", "scoped_full"}:
                # If the cleanup mode is incremental, source ids are required.
-                for source_id, hashed_doc in zip(source_ids, hashed_docs):
+                for source_id, hashed_doc in zip(source_ids, hashed_docs, strict=False):
                    if source_id is None:
                        msg = (
                            f"Source ids are required when cleanup mode is "
@@ -845,7 +840,7 @@ async def aindex(
            docs_to_index: list[Document] = []
            uids_to_refresh = []
            seen_docs: set[str] = set()
-            for hashed_doc, doc_exists in zip(hashed_docs, exists_batch):
+            for hashed_doc, doc_exists in zip(hashed_docs, exists_batch, strict=False):
                hashed_id = cast("str", hashed_doc.id)
                if doc_exists:
                    if force_update:
@@ -917,7 +912,7 @@ async def aindex(
    if cleanup == "full" or (
        cleanup == "scoped_full" and scoped_full_cleanup_source_ids
    ):
-        delete_group_ids:
+        delete_group_ids: Sequence[str] | None = None
        if cleanup == "scoped_full":
            delete_group_ids = list(scoped_full_cleanup_source_ids)
        while uids_to_delete := await record_manager.alist_keys(
langchain_core/indexing/base.py
CHANGED
@@ -5,7 +5,7 @@ from __future__ import annotations
 import abc
 import time
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any,
+from typing import TYPE_CHECKING, Any, TypedDict

 from typing_extensions import override

@@ -61,7 +61,7 @@ class RecordManager(ABC):
         """Initialize the record manager.

         Args:
-            namespace
+            namespace: The namespace for the record manager.
         """
         self.namespace = namespace

@@ -100,8 +100,8 @@ class RecordManager(ABC):
         self,
         keys: Sequence[str],
         *,
-        group_ids:
-        time_at_least:
+        group_ids: Sequence[str | None] | None = None,
+        time_at_least: float | None = None,
     ) -> None:
         """Upsert records into the database.

@@ -128,8 +128,8 @@ class RecordManager(ABC):
         self,
         keys: Sequence[str],
         *,
-        group_ids:
-        time_at_least:
+        group_ids: Sequence[str | None] | None = None,
+        time_at_least: float | None = None,
     ) -> None:
         """Asynchronously upsert records into the database.

@@ -177,10 +177,10 @@ class RecordManager(ABC):
     def list_keys(
         self,
         *,
-        before:
-        after:
-        group_ids:
-        limit:
+        before: float | None = None,
+        after: float | None = None,
+        group_ids: Sequence[str] | None = None,
+        limit: int | None = None,
     ) -> list[str]:
         """List records in the database based on the provided filters.

@@ -198,10 +198,10 @@ class RecordManager(ABC):
     async def alist_keys(
         self,
         *,
-        before:
-        after:
-        group_ids:
-        limit:
+        before: float | None = None,
+        after: float | None = None,
+        group_ids: Sequence[str] | None = None,
+        limit: int | None = None,
     ) -> list[str]:
         """Asynchronously list records in the database based on the provided filters.

@@ -233,7 +233,7 @@ class RecordManager(ABC):


 class _Record(TypedDict):
-    group_id:
+    group_id: str | None
     updated_at: float


@@ -244,7 +244,7 @@ class InMemoryRecordManager(RecordManager):
         """Initialize the in-memory record manager.

         Args:
-            namespace
+            namespace: The namespace for the record manager.
         """
         super().__init__(namespace)
         # Each key points to a dictionary
@@ -270,18 +270,18 @@ class InMemoryRecordManager(RecordManager):
         self,
         keys: Sequence[str],
         *,
-        group_ids:
-        time_at_least:
+        group_ids: Sequence[str | None] | None = None,
+        time_at_least: float | None = None,
     ) -> None:
         """Upsert records into the database.

         Args:
             keys: A list of record keys to upsert.
             group_ids: A list of group IDs corresponding to the keys.
-
+
             time_at_least: Optional timestamp. Implementation can use this
                 to optionally verify that the timestamp IS at least this time
-                in the system that stores.
+                in the system that stores.
                 E.g., use to validate that the time in the postgres database
                 is equal to or larger than the given timestamp, if not
                 raise an error.
@@ -307,18 +307,18 @@ class InMemoryRecordManager(RecordManager):
         self,
         keys: Sequence[str],
         *,
-        group_ids:
-        time_at_least:
+        group_ids: Sequence[str | None] | None = None,
+        time_at_least: float | None = None,
     ) -> None:
         """Async upsert records into the database.

         Args:
             keys: A list of record keys to upsert.
             group_ids: A list of group IDs corresponding to the keys.
-
+
             time_at_least: Optional timestamp. Implementation can use this
                 to optionally verify that the timestamp IS at least this time
-                in the system that stores.
+                in the system that stores.
                 E.g., use to validate that the time in the postgres database
                 is equal to or larger than the given timestamp, if not
                 raise an error.
@@ -352,22 +352,22 @@ class InMemoryRecordManager(RecordManager):
     def list_keys(
         self,
         *,
-        before:
-        after:
-        group_ids:
-        limit:
+        before: float | None = None,
+        after: float | None = None,
+        group_ids: Sequence[str] | None = None,
+        limit: int | None = None,
     ) -> list[str]:
         """List records in the database based on the provided filters.

         Args:
             before: Filter to list records updated before this time.
-
+
             after: Filter to list records updated after this time.
-
+
             group_ids: Filter to list records with specific group IDs.
-
+
             limit: optional limit on the number of records to return.
-
+

         Returns:
             A list of keys for the matching records.
@@ -388,22 +388,22 @@ class InMemoryRecordManager(RecordManager):
     async def alist_keys(
         self,
         *,
-        before:
-        after:
-        group_ids:
-        limit:
+        before: float | None = None,
+        after: float | None = None,
+        group_ids: Sequence[str] | None = None,
+        limit: int | None = None,
     ) -> list[str]:
         """Async list records in the database based on the provided filters.

         Args:
             before: Filter to list records updated before this time.
-
+
             after: Filter to list records updated after this time.
-
+
             group_ids: Filter to list records with specific group IDs.
-
+
             limit: optional limit on the number of records to return.
-
+

         Returns:
             A list of keys for the matching records.
@@ -485,7 +485,7 @@ class DeleteResponse(TypedDict, total=False):
     failed: Sequence[str]
     """The IDs that failed to be deleted.

-
+    !!! warning
         Deleting an ID that does not exist is **NOT** considered a failure.
     """

@@ -509,7 +509,7 @@ class DocumentIndex(BaseRetriever):
     2. Fetching document by ID.
     3. Searching for document using a query.

-
+    !!! version-added "Added in version 0.2.29"
     """

     @abc.abstractmethod
@@ -522,14 +522,14 @@ class DocumentIndex(BaseRetriever):

         When an ID is specified and the content already exists in the vectorstore,
         the upsert method should update the content with the new data. If the content
-        does not exist, the upsert method should add the item to the
+        does not exist, the upsert method should add the item to the `VectorStore`.

         Args:
-            items: Sequence of documents to add to the
+            items: Sequence of documents to add to the `VectorStore`.
             **kwargs: Additional keyword arguments.

         Returns:
-
+            A response object that contains the list of IDs that were
             successfully added or updated in the vectorstore and the list of IDs that
             failed to be added or updated.
         """
@@ -545,14 +545,14 @@ class DocumentIndex(BaseRetriever):

         When an ID is specified and the item already exists in the vectorstore,
         the upsert method should update the item with the new data. If the item
-        does not exist, the upsert method should add the item to the
+        does not exist, the upsert method should add the item to the `VectorStore`.

         Args:
-            items: Sequence of documents to add to the
+            items: Sequence of documents to add to the `VectorStore`.
             **kwargs: Additional keyword arguments.

         Returns:
-
+            A response object that contains the list of IDs that were
             successfully added or updated in the vectorstore and the list of IDs that
             failed to be added or updated.
         """
@@ -564,24 +564,24 @@ class DocumentIndex(BaseRetriever):
         )

     @abc.abstractmethod
-    def delete(self, ids:
+    def delete(self, ids: list[str] | None = None, **kwargs: Any) -> DeleteResponse:
         """Delete by IDs or other criteria.

         Calling delete without any input parameters should raise a ValueError!

         Args:
             ids: List of ids to delete.
-            kwargs: Additional keyword arguments. This is up to the implementation.
+            **kwargs: Additional keyword arguments. This is up to the implementation.
                 For example, can include an option to delete the entire index,
                 or else issue a non-blocking delete etc.

         Returns:
-
+            A response object that contains the list of IDs that were
             successfully deleted and the list of IDs that failed to be deleted.
         """

     async def adelete(
-        self, ids:
+        self, ids: list[str] | None = None, **kwargs: Any
     ) -> DeleteResponse:
         """Delete by IDs or other criteria. Async variant.

@@ -589,11 +589,11 @@ class DocumentIndex(BaseRetriever):

         Args:
             ids: List of ids to delete.
-            kwargs: Additional keyword arguments. This is up to the implementation.
+            **kwargs: Additional keyword arguments. This is up to the implementation.
                 For example, can include an option to delete the entire index.

         Returns:
-
+            A response object that contains the list of IDs that were
             successfully deleted and the list of IDs that failed to be deleted.
         """
         return await run_in_executor(
@@ -624,10 +624,10 @@ class DocumentIndex(BaseRetriever):

         Args:
             ids: List of IDs to get.
-            kwargs: Additional keyword arguments. These are up to the implementation.
+            **kwargs: Additional keyword arguments. These are up to the implementation.

         Returns:
-
+            List of documents that were found.
         """

     async def aget(
@@ -650,10 +650,10 @@ class DocumentIndex(BaseRetriever):

         Args:
             ids: List of IDs to get.
-            kwargs: Additional keyword arguments. These are up to the implementation.
+            **kwargs: Additional keyword arguments. These are up to the implementation.

         Returns:
-
+            List of documents that were found.
         """
         return await run_in_executor(
             None,
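
As a reference for the annotations above, a short sketch of the `RecordManager` surface using the in-memory implementation (keys and group ids are illustrative; the upsert methods shown in the hunks are named `update`/`aupdate` in this module):

from langchain_core.indexing import InMemoryRecordManager

manager = InMemoryRecordManager(namespace="demo/records")
manager.create_schema()

t0 = manager.get_time()

# group_ids is now Sequence[str | None] | None: individual entries may be None
manager.update(["doc-1", "doc-2"], group_ids=["src-a", None])

# All list_keys filters default to None; before/after are float timestamps
print(manager.list_keys(after=t0 - 1, group_ids=["src-a"]))  # ['doc-1']
print(manager.exists(["doc-1", "doc-3"]))  # [True, False]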
langchain_core/indexing/in_memory.py
CHANGED
@@ -3,7 +3,7 @@
 import operator
 import uuid
 from collections.abc import Sequence
-from typing import Any,
+from typing import Any, cast

 from pydantic import Field
 from typing_extensions import override
@@ -24,7 +24,7 @@ class InMemoryDocumentIndex(DocumentIndex):
     It provides a simple search API that returns documents by the number of
     counts the given query appears in the document.

-
+    !!! version-added "Added in version 0.2.29"
     """

     store: dict[str, Document] = Field(default_factory=dict)
@@ -60,7 +60,7 @@ class InMemoryDocumentIndex(DocumentIndex):
         return UpsertResponse(succeeded=ok_ids, failed=[])

     @override
-    def delete(self, ids:
+    def delete(self, ids: list[str] | None = None, **kwargs: Any) -> DeleteResponse:
         """Delete by IDs.

         Args: