langchain-core 1.0.0a5__py3-none-any.whl → 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langchain_core/__init__.py +1 -1
- langchain_core/_api/__init__.py +3 -4
- langchain_core/_api/beta_decorator.py +23 -26
- langchain_core/_api/deprecation.py +51 -64
- langchain_core/_api/path.py +3 -6
- langchain_core/_import_utils.py +3 -4
- langchain_core/agents.py +20 -22
- langchain_core/caches.py +65 -66
- langchain_core/callbacks/__init__.py +1 -8
- langchain_core/callbacks/base.py +321 -336
- langchain_core/callbacks/file.py +44 -44
- langchain_core/callbacks/manager.py +436 -513
- langchain_core/callbacks/stdout.py +29 -30
- langchain_core/callbacks/streaming_stdout.py +32 -32
- langchain_core/callbacks/usage.py +60 -57
- langchain_core/chat_history.py +53 -68
- langchain_core/document_loaders/base.py +27 -25
- langchain_core/document_loaders/blob_loaders.py +1 -1
- langchain_core/document_loaders/langsmith.py +44 -48
- langchain_core/documents/__init__.py +23 -3
- langchain_core/documents/base.py +98 -90
- langchain_core/documents/compressor.py +10 -10
- langchain_core/documents/transformers.py +34 -35
- langchain_core/embeddings/fake.py +50 -54
- langchain_core/example_selectors/length_based.py +1 -1
- langchain_core/example_selectors/semantic_similarity.py +28 -32
- langchain_core/exceptions.py +21 -20
- langchain_core/globals.py +3 -151
- langchain_core/indexing/__init__.py +1 -1
- langchain_core/indexing/api.py +121 -126
- langchain_core/indexing/base.py +73 -75
- langchain_core/indexing/in_memory.py +4 -6
- langchain_core/language_models/__init__.py +14 -29
- langchain_core/language_models/_utils.py +58 -61
- langchain_core/language_models/base.py +53 -162
- langchain_core/language_models/chat_models.py +298 -387
- langchain_core/language_models/fake.py +11 -11
- langchain_core/language_models/fake_chat_models.py +42 -36
- langchain_core/language_models/llms.py +125 -235
- langchain_core/load/dump.py +9 -12
- langchain_core/load/load.py +18 -28
- langchain_core/load/mapping.py +2 -4
- langchain_core/load/serializable.py +42 -40
- langchain_core/messages/__init__.py +10 -16
- langchain_core/messages/ai.py +148 -148
- langchain_core/messages/base.py +58 -52
- langchain_core/messages/block_translators/__init__.py +27 -17
- langchain_core/messages/block_translators/anthropic.py +6 -6
- langchain_core/messages/block_translators/bedrock_converse.py +5 -5
- langchain_core/messages/block_translators/google_genai.py +505 -20
- langchain_core/messages/block_translators/google_vertexai.py +4 -32
- langchain_core/messages/block_translators/groq.py +117 -21
- langchain_core/messages/block_translators/langchain_v0.py +5 -5
- langchain_core/messages/block_translators/openai.py +11 -11
- langchain_core/messages/chat.py +2 -6
- langchain_core/messages/content.py +337 -328
- langchain_core/messages/function.py +6 -10
- langchain_core/messages/human.py +24 -31
- langchain_core/messages/modifier.py +2 -2
- langchain_core/messages/system.py +19 -29
- langchain_core/messages/tool.py +74 -90
- langchain_core/messages/utils.py +474 -504
- langchain_core/output_parsers/__init__.py +13 -10
- langchain_core/output_parsers/base.py +61 -61
- langchain_core/output_parsers/format_instructions.py +9 -4
- langchain_core/output_parsers/json.py +12 -10
- langchain_core/output_parsers/list.py +21 -23
- langchain_core/output_parsers/openai_functions.py +49 -47
- langchain_core/output_parsers/openai_tools.py +16 -21
- langchain_core/output_parsers/pydantic.py +13 -14
- langchain_core/output_parsers/string.py +5 -5
- langchain_core/output_parsers/transform.py +15 -17
- langchain_core/output_parsers/xml.py +35 -34
- langchain_core/outputs/__init__.py +1 -1
- langchain_core/outputs/chat_generation.py +18 -18
- langchain_core/outputs/chat_result.py +1 -3
- langchain_core/outputs/generation.py +10 -11
- langchain_core/outputs/llm_result.py +10 -10
- langchain_core/prompt_values.py +11 -17
- langchain_core/prompts/__init__.py +3 -27
- langchain_core/prompts/base.py +48 -56
- langchain_core/prompts/chat.py +275 -325
- langchain_core/prompts/dict.py +5 -5
- langchain_core/prompts/few_shot.py +81 -88
- langchain_core/prompts/few_shot_with_templates.py +11 -13
- langchain_core/prompts/image.py +12 -14
- langchain_core/prompts/loading.py +4 -6
- langchain_core/prompts/message.py +3 -3
- langchain_core/prompts/prompt.py +24 -39
- langchain_core/prompts/string.py +26 -10
- langchain_core/prompts/structured.py +49 -53
- langchain_core/rate_limiters.py +51 -60
- langchain_core/retrievers.py +61 -198
- langchain_core/runnables/base.py +1478 -1630
- langchain_core/runnables/branch.py +53 -57
- langchain_core/runnables/config.py +72 -89
- langchain_core/runnables/configurable.py +120 -137
- langchain_core/runnables/fallbacks.py +83 -79
- langchain_core/runnables/graph.py +91 -97
- langchain_core/runnables/graph_ascii.py +27 -28
- langchain_core/runnables/graph_mermaid.py +38 -50
- langchain_core/runnables/graph_png.py +15 -16
- langchain_core/runnables/history.py +135 -148
- langchain_core/runnables/passthrough.py +124 -150
- langchain_core/runnables/retry.py +46 -51
- langchain_core/runnables/router.py +25 -30
- langchain_core/runnables/schema.py +75 -80
- langchain_core/runnables/utils.py +60 -67
- langchain_core/stores.py +85 -121
- langchain_core/structured_query.py +8 -8
- langchain_core/sys_info.py +27 -29
- langchain_core/tools/__init__.py +1 -14
- langchain_core/tools/base.py +285 -229
- langchain_core/tools/convert.py +160 -155
- langchain_core/tools/render.py +10 -10
- langchain_core/tools/retriever.py +12 -11
- langchain_core/tools/simple.py +19 -24
- langchain_core/tools/structured.py +32 -39
- langchain_core/tracers/__init__.py +1 -9
- langchain_core/tracers/base.py +97 -99
- langchain_core/tracers/context.py +29 -52
- langchain_core/tracers/core.py +49 -53
- langchain_core/tracers/evaluation.py +11 -11
- langchain_core/tracers/event_stream.py +65 -64
- langchain_core/tracers/langchain.py +21 -21
- langchain_core/tracers/log_stream.py +45 -45
- langchain_core/tracers/memory_stream.py +3 -3
- langchain_core/tracers/root_listeners.py +16 -16
- langchain_core/tracers/run_collector.py +2 -4
- langchain_core/tracers/schemas.py +0 -129
- langchain_core/tracers/stdout.py +3 -3
- langchain_core/utils/__init__.py +1 -4
- langchain_core/utils/_merge.py +2 -2
- langchain_core/utils/aiter.py +57 -61
- langchain_core/utils/env.py +9 -9
- langchain_core/utils/function_calling.py +89 -186
- langchain_core/utils/html.py +7 -8
- langchain_core/utils/input.py +6 -6
- langchain_core/utils/interactive_env.py +1 -1
- langchain_core/utils/iter.py +36 -40
- langchain_core/utils/json.py +4 -3
- langchain_core/utils/json_schema.py +9 -9
- langchain_core/utils/mustache.py +8 -10
- langchain_core/utils/pydantic.py +33 -35
- langchain_core/utils/strings.py +6 -9
- langchain_core/utils/usage.py +1 -1
- langchain_core/utils/utils.py +66 -62
- langchain_core/vectorstores/base.py +182 -216
- langchain_core/vectorstores/in_memory.py +101 -176
- langchain_core/vectorstores/utils.py +5 -5
- langchain_core/version.py +1 -1
- langchain_core-1.0.3.dist-info/METADATA +69 -0
- langchain_core-1.0.3.dist-info/RECORD +172 -0
- {langchain_core-1.0.0a5.dist-info → langchain_core-1.0.3.dist-info}/WHEEL +1 -1
- langchain_core/memory.py +0 -120
- langchain_core/messages/block_translators/ollama.py +0 -47
- langchain_core/prompts/pipeline.py +0 -138
- langchain_core/pydantic_v1/__init__.py +0 -30
- langchain_core/pydantic_v1/dataclasses.py +0 -23
- langchain_core/pydantic_v1/main.py +0 -23
- langchain_core/tracers/langchain_v1.py +0 -31
- langchain_core/utils/loading.py +0 -35
- langchain_core-1.0.0a5.dist-info/METADATA +0 -77
- langchain_core-1.0.0a5.dist-info/RECORD +0 -181
- langchain_core-1.0.0a5.dist-info/entry_points.txt +0 -4
langchain_core/indexing/api.py
CHANGED
@@ -6,16 +6,20 @@ import hashlib
 import json
 import uuid
 import warnings
-from collections.abc import
+from collections.abc import (
+    AsyncIterable,
+    AsyncIterator,
+    Callable,
+    Iterable,
+    Iterator,
+    Sequence,
+)
 from itertools import islice
 from typing import (
     Any,
-    Callable,
     Literal,
-    Optional,
     TypedDict,
     TypeVar,
-    Union,
     cast,
 )

@@ -107,8 +111,8 @@ async def _abatch(size: int, iterable: AsyncIterable[T]) -> AsyncIterator[list[T


 def _get_source_id_assigner(
-    source_id_key:
-) -> Callable[[Document],
+    source_id_key: str | Callable[[Document], str] | None,
+) -> Callable[[Document], str | None]:
     """Get the source id from the document."""
     if source_id_key is None:
         return lambda _doc: None

@@ -162,9 +166,8 @@ def _calculate_hash(
 def _get_document_with_hash(
     document: Document,
     *,
-    key_encoder:
-    ],
+    key_encoder: Callable[[Document], str]
+    | Literal["sha1", "sha256", "sha512", "blake2b"],
 ) -> Document:
     """Calculate a hash of the document, and assign it to the uid.

@@ -233,7 +236,7 @@ class _HashedDocument:


 def _delete(
-    vector_store:
+    vector_store: VectorStore | DocumentIndex,
     ids: list[str],
 ) -> None:
     if isinstance(vector_store, VectorStore):
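The `_get_source_id_assigner` hunk above only modernizes the annotations, but it is easier to read next to a concrete resolver. Below is a minimal, self-contained sketch of how a `source_id_key` given as a string or as a callable is typically turned into a `Callable[[Document], str | None]`; the private helper's full body is not shown in this diff, so the metadata lookup here is an assumption, not the library's exact code.

from collections.abc import Callable

from langchain_core.documents import Document


def make_source_id_assigner(
    source_id_key: str | Callable[[Document], str] | None,
) -> Callable[[Document], str | None]:
    # Illustrative stand-in for the private helper in the hunk above.
    if source_id_key is None:
        # No key given: every document maps to "no source id".
        return lambda _doc: None
    if isinstance(source_id_key, str):
        # Assumption: a string key is looked up in the document's metadata.
        return lambda doc: doc.metadata[source_id_key]
    if callable(source_id_key):
        # A callable is used as-is.
        return source_id_key
    msg = f"source_id_key should be None, a string or a callable, got {source_id_key!r}"
    raise ValueError(msg)


doc = Document(page_content="hello", metadata={"source": "docs/intro.md"})
assert make_source_id_assigner("source")(doc) == "docs/intro.md"
assert make_source_id_assigner(lambda d: d.metadata["source"])(doc) == "docs/intro.md"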
@@ -271,19 +274,18 @@ class IndexingResult(TypedDict):


 def index(
-    docs_source:
+    docs_source: BaseLoader | Iterable[Document],
     record_manager: RecordManager,
-    vector_store:
+    vector_store: VectorStore | DocumentIndex,
     *,
     batch_size: int = 100,
-    cleanup:
-    source_id_key:
+    cleanup: Literal["incremental", "full", "scoped_full"] | None = None,
+    source_id_key: str | Callable[[Document], str] | None = None,
     cleanup_batch_size: int = 1_000,
     force_update: bool = False,
-    key_encoder:
-    ] =
-    upsert_kwargs: Optional[dict[str, Any]] = None,
+    key_encoder: Literal["sha1", "sha256", "sha512", "blake2b"]
+    | Callable[[Document], str] = "sha1",
+    upsert_kwargs: dict[str, Any] | None = None,
 ) -> IndexingResult:
     """Index data from the loader into the vector store.

@@ -296,61 +298,58 @@ def index(
     For the time being, documents are indexed using their hashes, and users
     are not able to specify the uid of the document.

-    Added
+    !!! warning "Behavior changed in 0.3.25"
+        Added `scoped_full` cleanup mode.

+    !!! warning

         * In full mode, the loader should be returning
+          the entire dataset, and not just a subset of the dataset.
+          Otherwise, the auto_cleanup will remove documents that it is not
+          supposed to.
         * In incremental mode, if documents associated with a particular
-        * The
+          source id appear across different batches, the indexing API
+          will do some redundant work. This will still result in the
+          correct end state of the index, but will unfortunately not be
+          100% efficient. For example, if a given document is split into 15
+          chunks, and we index them using a batch size of 5, we'll have 3 batches
+          all with the same source id. In general, to avoid doing too much
+          redundant work select as big a batch size as possible.
+        * The `scoped_full` mode is suitable if determining an appropriate batch size
+          is challenging or if your data loader cannot return the entire dataset at
+          once. This mode keeps track of source IDs in memory, which should be fine
+          for most use cases. If your dataset is large (10M+ docs), you will likely
+          need to parallelize the indexing process regardless.

     Args:
         docs_source: Data loader or iterable of documents to index.
         record_manager: Timestamped set to keep track of which documents were
             updated.
-        vector_store: VectorStore or DocumentIndex to index the documents into.
-        batch_size: Batch size to use when indexing.
-        cleanup: How to handle clean up of documents.
+        vector_store: `VectorStore` or DocumentIndex to index the documents into.
+        batch_size: Batch size to use when indexing.
+        cleanup: How to handle clean up of documents.

             - incremental: Cleans up all documents that haven't been updated AND
+              that are associated with source IDs that were seen during indexing.
+              Clean up is done continuously during indexing helping to minimize the
+              probability of users seeing duplicated content.
             - full: Delete all documents that have not been returned by the loader
+              during this run of indexing.
+              Clean up runs after all documents have been indexed.
+              This means that users may see duplicated content during indexing.
             - scoped_full: Similar to Full, but only deletes all documents
+              that haven't been updated AND that are associated with
+              source IDs that were seen during indexing.
             - None: Do not delete any documents.
         source_id_key: Optional key that helps identify the original source
-            of the document.
+            of the document.
         cleanup_batch_size: Batch size to use when cleaning up documents.
-            Default is 1_000.
         force_update: Force update documents even if they are present in the
             record manager. Useful if you are re-indexing with updated embeddings.
-            Default is False.
         key_encoder: Hashing algorithm to use for hashing the document content and
-            metadata.
-            Other options include "blake2b", "sha256", and "sha512".
+            metadata. Options include "blake2b", "sha256", and "sha512".

+            !!! version-added "Added in version 0.3.66"

         key_encoder: Hashing algorithm to use for hashing the document.
             If not provided, a default encoder using SHA-1 will be used.

@@ -364,10 +363,10 @@ def index(
             When changing the key encoder, you must change the
             index as well to avoid duplicated documents in the cache.
         upsert_kwargs: Additional keyword arguments to pass to the add_documents
-            method of the VectorStore or the upsert method of the DocumentIndex.
+            method of the `VectorStore` or the upsert method of the DocumentIndex.
             For example, you can use this to specify a custom vector_field:
             upsert_kwargs={"vector_field": "embedding"}
+            !!! version-added "Added in version 0.3.10"

     Returns:
         Indexing result which contains information about how many documents
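The new `key_encoder` annotation above accepts either a named hash algorithm or a `Callable[[Document], str]`. A hedged sketch of what a custom callable encoder could look like follows; the exact serialization the built-in encoders apply to content and metadata is not shown in this diff, so the JSON payload below is only illustrative, and (per the docstring) switching encoders on an existing index means changing the index as well to avoid duplicates.

import hashlib
import json

from langchain_core.documents import Document


def sha256_key_encoder(doc: Document) -> str:
    # Hash the page content together with the (sorted) metadata so that any
    # change to either produces a new key.
    payload = json.dumps(
        {"page_content": doc.page_content, "metadata": doc.metadata},
        sort_keys=True,
    )
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()


# Could then be passed as key_encoder=sha256_key_encoder to index() / aindex().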
@@ -376,11 +375,11 @@ def index(
     Raises:
         ValueError: If cleanup mode is not one of 'incremental', 'full' or None
         ValueError: If cleanup mode is incremental and source_id_key is None.
-        ValueError: If
+        ValueError: If `VectorStore` does not have
             "delete" and "add_documents" required methods.
         ValueError: If source_id_key is not None, but is not a string or callable.
-        TypeError: If
-        AssertionError: If
+        TypeError: If `vectorstore` is not a `VectorStore` or a DocumentIndex.
+        AssertionError: If `source_id` is None when cleanup mode is incremental.
             (should be unreachable code).
     """
     # Behavior is deprecated, but we keep it for backwards compatibility.
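Taken together, the signature and docstring changes above describe the public `index()` entry point. A short, hedged usage sketch against the in-memory implementations that ship with `langchain_core` (constructor details assumed to match this release):

from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.indexing import InMemoryRecordManager, index
from langchain_core.vectorstores import InMemoryVectorStore

record_manager = InMemoryRecordManager(namespace="demo")
record_manager.create_schema()
vector_store = InMemoryVectorStore(embedding=DeterministicFakeEmbedding(size=8))

docs = [
    Document(page_content="chunk one", metadata={"source": "a.txt"}),
    Document(page_content="chunk two", metadata={"source": "a.txt"}),
]

# First run adds both chunks; re-running with unchanged docs skips them, and
# with cleanup="incremental" a chunk dropped from `docs` is deleted on the next run.
result = index(
    docs,
    record_manager,
    vector_store,
    cleanup="incremental",
    source_id_key="source",
)
print(result)  # IndexingResult with num_added / num_updated / num_skipped / num_deleted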
@@ -416,7 +415,7 @@ def index(
         raise ValueError(msg)

     if type(destination).delete == VectorStore.delete:
-        # Checking if the
+        # Checking if the VectorStore has overridden the default delete method
         # implementation which just raises a NotImplementedError
         msg = "Vectorstore has not implemented the delete method"
         raise ValueError(msg)
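The comment rewritten above refers to a small override-detection idiom: comparing the class attribute against the base-class implementation. A self-contained illustration (not `langchain_core` code):

class Base:
    def delete(self) -> None:
        raise NotImplementedError


class WithDelete(Base):
    def delete(self) -> None:
        print("deleted")


class WithoutDelete(Base):
    pass


# True only when the subclass still uses the base implementation, which is how
# index() spots a vector store whose delete() would just raise.
assert type(WithoutDelete()).delete == Base.delete
assert type(WithDelete()).delete != Base.delete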
@@ -462,16 +461,16 @@ def index(
         # Count documents removed by within-batch deduplication
         num_skipped += original_batch_size - len(hashed_docs)

-        source_ids: Sequence[
+        source_ids: Sequence[str | None] = [
             source_id_assigner(hashed_doc) for hashed_doc in hashed_docs
         ]

         if cleanup in {"incremental", "scoped_full"}:
-            #
-            for source_id, hashed_doc in zip(source_ids, hashed_docs):
+            # Source IDs are required.
+            for source_id, hashed_doc in zip(source_ids, hashed_docs, strict=False):
                 if source_id is None:
                     msg = (
-                        f"Source
+                        f"Source IDs are required when cleanup mode is "
                         f"incremental or scoped_full. "
                         f"Document that starts with "
                         f"content: {hashed_doc.page_content[:100]} "

@@ -480,7 +479,7 @@ def index(
                     raise ValueError(msg)
                 if cleanup == "scoped_full":
                     scoped_full_cleanup_source_ids.add(source_id)
-            #
+            # Source IDs cannot be None after for loop above.
             source_ids = cast("Sequence[str]", source_ids)

         exists_batch = record_manager.exists(

@@ -492,7 +491,7 @@ def index(
         docs_to_index = []
         uids_to_refresh = []
         seen_docs: set[str] = set()
-        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch):
+        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch, strict=False):
             hashed_id = cast("str", hashed_doc.id)
             if doc_exists:
                 if force_update:

@@ -539,7 +538,7 @@ def index(
         # If source IDs are provided, we can do the deletion incrementally!
         if cleanup == "incremental":
             # Get the uids of the documents that were not returned by the loader.
-            # mypy isn't good enough to determine that source
+            # mypy isn't good enough to determine that source IDs cannot be None
             # here due to a check that's happening above, so we check again.
             for source_id in source_ids:
                 if source_id is None:

@@ -563,7 +562,7 @@ def index(
     if cleanup == "full" or (
         cleanup == "scoped_full" and scoped_full_cleanup_source_ids
     ):
-        delete_group_ids:
+        delete_group_ids: Sequence[str] | None = None
         if cleanup == "scoped_full":
             delete_group_ids = list(scoped_full_cleanup_source_ids)
         while uids_to_delete := record_manager.list_keys(

@@ -591,7 +590,7 @@ async def _to_async_iterator(iterator: Iterable[T]) -> AsyncIterator[T]:


 async def _adelete(
-    vector_store:
+    vector_store: VectorStore | DocumentIndex,
     ids: list[str],
 ) -> None:
     if isinstance(vector_store, VectorStore):
@@ -613,19 +612,18 @@ async def _adelete(


 async def aindex(
-    docs_source:
+    docs_source: BaseLoader | Iterable[Document] | AsyncIterator[Document],
     record_manager: RecordManager,
-    vector_store:
+    vector_store: VectorStore | DocumentIndex,
     *,
     batch_size: int = 100,
-    cleanup:
-    source_id_key:
+    cleanup: Literal["incremental", "full", "scoped_full"] | None = None,
+    source_id_key: str | Callable[[Document], str] | None = None,
     cleanup_batch_size: int = 1_000,
     force_update: bool = False,
-    key_encoder:
-    ] =
-    upsert_kwargs: Optional[dict[str, Any]] = None,
+    key_encoder: Literal["sha1", "sha256", "sha512", "blake2b"]
+    | Callable[[Document], str] = "sha1",
+    upsert_kwargs: dict[str, Any] | None = None,
 ) -> IndexingResult:
     """Async index data from the loader into the vector store.

@@ -638,61 +636,58 @@ async def aindex(
     For the time being, documents are indexed using their hashes, and users
     are not able to specify the uid of the document.

-    Added
+    !!! warning "Behavior changed in 0.3.25"
+        Added `scoped_full` cleanup mode.

+    !!! warning

         * In full mode, the loader should be returning
+          the entire dataset, and not just a subset of the dataset.
+          Otherwise, the auto_cleanup will remove documents that it is not
+          supposed to.
         * In incremental mode, if documents associated with a particular
-        * The
+          source id appear across different batches, the indexing API
+          will do some redundant work. This will still result in the
+          correct end state of the index, but will unfortunately not be
+          100% efficient. For example, if a given document is split into 15
+          chunks, and we index them using a batch size of 5, we'll have 3 batches
+          all with the same source id. In general, to avoid doing too much
+          redundant work select as big a batch size as possible.
+        * The `scoped_full` mode is suitable if determining an appropriate batch size
+          is challenging or if your data loader cannot return the entire dataset at
+          once. This mode keeps track of source IDs in memory, which should be fine
+          for most use cases. If your dataset is large (10M+ docs), you will likely
+          need to parallelize the indexing process regardless.

     Args:
         docs_source: Data loader or iterable of documents to index.
         record_manager: Timestamped set to keep track of which documents were
             updated.
-        vector_store: VectorStore or DocumentIndex to index the documents into.
-        batch_size: Batch size to use when indexing.
-        cleanup: How to handle clean up of documents.
+        vector_store: `VectorStore` or DocumentIndex to index the documents into.
+        batch_size: Batch size to use when indexing.
+        cleanup: How to handle clean up of documents.

             - incremental: Cleans up all documents that haven't been updated AND
+              that are associated with source IDs that were seen during indexing.
+              Clean up is done continuously during indexing helping to minimize the
+              probability of users seeing duplicated content.
             - full: Delete all documents that have not been returned by the loader
+              during this run of indexing.
+              Clean up runs after all documents have been indexed.
+              This means that users may see duplicated content during indexing.
             - scoped_full: Similar to Full, but only deletes all documents
+              that haven't been updated AND that are associated with
+              source IDs that were seen during indexing.
             - None: Do not delete any documents.
         source_id_key: Optional key that helps identify the original source
-            of the document.
+            of the document.
         cleanup_batch_size: Batch size to use when cleaning up documents.
-            Default is 1_000.
         force_update: Force update documents even if they are present in the
             record manager. Useful if you are re-indexing with updated embeddings.
-            Default is False.
         key_encoder: Hashing algorithm to use for hashing the document content and
-            metadata.
-            Other options include "blake2b", "sha256", and "sha512".
+            metadata. Options include "blake2b", "sha256", and "sha512".

+            !!! version-added "Added in version 0.3.66"

         key_encoder: Hashing algorithm to use for hashing the document.
             If not provided, a default encoder using SHA-1 will be used.

@@ -706,10 +701,10 @@ async def aindex(
             When changing the key encoder, you must change the
             index as well to avoid duplicated documents in the cache.
         upsert_kwargs: Additional keyword arguments to pass to the add_documents
-            method of the VectorStore or the upsert method of the DocumentIndex.
+            method of the `VectorStore` or the upsert method of the DocumentIndex.
             For example, you can use this to specify a custom vector_field:
             upsert_kwargs={"vector_field": "embedding"}
+            !!! version-added "Added in version 0.3.10"

     Returns:
         Indexing result which contains information about how many documents
@@ -718,12 +713,12 @@ async def aindex(
     Raises:
         ValueError: If cleanup mode is not one of 'incremental', 'full' or None
         ValueError: If cleanup mode is incremental and source_id_key is None.
-        ValueError: If
+        ValueError: If `VectorStore` does not have
             "adelete" and "aadd_documents" required methods.
         ValueError: If source_id_key is not None, but is not a string or callable.
-        TypeError: If
-        AssertionError: If
-            incremental or
+        TypeError: If `vector_store` is not a `VectorStore` or DocumentIndex.
+        AssertionError: If `source_id_key` is None when cleanup mode is
+            incremental or `scoped_full` (should be unreachable).
     """
     # Behavior is deprecated, but we keep it for backwards compatibility.
     # # Warn only once per process.
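The async variant mirrors `index()`, and the widened `docs_source` annotation above also admits an async iterator. A hedged usage sketch, again assuming the in-memory helpers behave as in this release:

import asyncio

from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_core.indexing import InMemoryRecordManager, aindex
from langchain_core.vectorstores import InMemoryVectorStore


async def main() -> None:
    record_manager = InMemoryRecordManager(namespace="demo-async")
    await record_manager.acreate_schema()
    vector_store = InMemoryVectorStore(embedding=DeterministicFakeEmbedding(size=8))

    async def load_docs():
        # An async generator exercises the AsyncIterator[Document] arm.
        yield Document(page_content="chunk one", metadata={"source": "a.txt"})
        yield Document(page_content="chunk two", metadata={"source": "a.txt"})

    result = await aindex(
        load_docs(),
        record_manager,
        vector_store,
        cleanup="scoped_full",
        source_id_key="source",
    )
    print(result)


asyncio.run(main())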
@@ -762,7 +757,7 @@ async def aindex(
         type(destination).adelete == VectorStore.adelete
         and type(destination).delete == VectorStore.delete
     ):
-        # Checking if the
+        # Checking if the VectorStore has overridden the default adelete or delete
         # methods implementation which just raises a NotImplementedError
         msg = "Vectorstore has not implemented the adelete or delete method"
         raise ValueError(msg)

@@ -815,16 +810,16 @@ async def aindex(
         # Count documents removed by within-batch deduplication
         num_skipped += original_batch_size - len(hashed_docs)

-        source_ids: Sequence[
+        source_ids: Sequence[str | None] = [
             source_id_assigner(doc) for doc in hashed_docs
         ]

         if cleanup in {"incremental", "scoped_full"}:
-            # If the cleanup mode is incremental, source
-            for source_id, hashed_doc in zip(source_ids, hashed_docs):
+            # If the cleanup mode is incremental, source IDs are required.
+            for source_id, hashed_doc in zip(source_ids, hashed_docs, strict=False):
                 if source_id is None:
                     msg = (
-                        f"Source
+                        f"Source IDs are required when cleanup mode is "
                         f"incremental or scoped_full. "
                         f"Document that starts with "
                         f"content: {hashed_doc.page_content[:100]} "

@@ -833,7 +828,7 @@ async def aindex(
                     raise ValueError(msg)
                 if cleanup == "scoped_full":
                     scoped_full_cleanup_source_ids.add(source_id)
-            #
+            # Source IDs cannot be None after for loop above.
             source_ids = cast("Sequence[str]", source_ids)

         exists_batch = await record_manager.aexists(

@@ -845,7 +840,7 @@ async def aindex(
         docs_to_index: list[Document] = []
         uids_to_refresh = []
         seen_docs: set[str] = set()
-        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch):
+        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch, strict=False):
             hashed_id = cast("str", hashed_doc.id)
             if doc_exists:
                 if force_update:

@@ -893,7 +888,7 @@ async def aindex(
         if cleanup == "incremental":
             # Get the uids of the documents that were not returned by the loader.

-            # mypy isn't good enough to determine that source
+            # mypy isn't good enough to determine that source IDs cannot be None
             # here due to a check that's happening above, so we check again.
             for source_id in source_ids:
                 if source_id is None:

@@ -917,7 +912,7 @@ async def aindex(
     if cleanup == "full" or (
         cleanup == "scoped_full" and scoped_full_cleanup_source_ids
     ):
-        delete_group_ids:
+        delete_group_ids: Sequence[str] | None = None
         if cleanup == "scoped_full":
             delete_group_ids = list(scoped_full_cleanup_source_ids)
         while uids_to_delete := await record_manager.alist_keys(
|