langchain-core 0.4.0.dev0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langchain_core/__init__.py +1 -1
- langchain_core/_api/__init__.py +3 -4
- langchain_core/_api/beta_decorator.py +45 -70
- langchain_core/_api/deprecation.py +80 -80
- langchain_core/_api/path.py +22 -8
- langchain_core/_import_utils.py +10 -4
- langchain_core/agents.py +25 -21
- langchain_core/caches.py +53 -63
- langchain_core/callbacks/__init__.py +1 -8
- langchain_core/callbacks/base.py +341 -348
- langchain_core/callbacks/file.py +55 -44
- langchain_core/callbacks/manager.py +546 -683
- langchain_core/callbacks/stdout.py +29 -30
- langchain_core/callbacks/streaming_stdout.py +35 -36
- langchain_core/callbacks/usage.py +65 -70
- langchain_core/chat_history.py +48 -55
- langchain_core/document_loaders/base.py +46 -21
- langchain_core/document_loaders/langsmith.py +39 -36
- langchain_core/documents/__init__.py +0 -1
- langchain_core/documents/base.py +96 -74
- langchain_core/documents/compressor.py +12 -9
- langchain_core/documents/transformers.py +29 -28
- langchain_core/embeddings/fake.py +56 -57
- langchain_core/env.py +2 -3
- langchain_core/example_selectors/base.py +12 -0
- langchain_core/example_selectors/length_based.py +1 -1
- langchain_core/example_selectors/semantic_similarity.py +21 -25
- langchain_core/exceptions.py +15 -9
- langchain_core/globals.py +4 -163
- langchain_core/indexing/api.py +132 -125
- langchain_core/indexing/base.py +64 -67
- langchain_core/indexing/in_memory.py +26 -6
- langchain_core/language_models/__init__.py +15 -27
- langchain_core/language_models/_utils.py +267 -117
- langchain_core/language_models/base.py +92 -177
- langchain_core/language_models/chat_models.py +547 -407
- langchain_core/language_models/fake.py +11 -11
- langchain_core/language_models/fake_chat_models.py +72 -118
- langchain_core/language_models/llms.py +168 -242
- langchain_core/load/dump.py +8 -11
- langchain_core/load/load.py +32 -28
- langchain_core/load/mapping.py +2 -4
- langchain_core/load/serializable.py +50 -56
- langchain_core/messages/__init__.py +36 -51
- langchain_core/messages/ai.py +377 -150
- langchain_core/messages/base.py +239 -47
- langchain_core/messages/block_translators/__init__.py +111 -0
- langchain_core/messages/block_translators/anthropic.py +470 -0
- langchain_core/messages/block_translators/bedrock.py +94 -0
- langchain_core/messages/block_translators/bedrock_converse.py +297 -0
- langchain_core/messages/block_translators/google_genai.py +530 -0
- langchain_core/messages/block_translators/google_vertexai.py +21 -0
- langchain_core/messages/block_translators/groq.py +143 -0
- langchain_core/messages/block_translators/langchain_v0.py +301 -0
- langchain_core/messages/block_translators/openai.py +1010 -0
- langchain_core/messages/chat.py +2 -3
- langchain_core/messages/content.py +1423 -0
- langchain_core/messages/function.py +7 -7
- langchain_core/messages/human.py +44 -38
- langchain_core/messages/modifier.py +3 -2
- langchain_core/messages/system.py +40 -27
- langchain_core/messages/tool.py +160 -58
- langchain_core/messages/utils.py +527 -638
- langchain_core/output_parsers/__init__.py +1 -14
- langchain_core/output_parsers/base.py +68 -104
- langchain_core/output_parsers/json.py +13 -17
- langchain_core/output_parsers/list.py +11 -33
- langchain_core/output_parsers/openai_functions.py +56 -74
- langchain_core/output_parsers/openai_tools.py +68 -109
- langchain_core/output_parsers/pydantic.py +15 -13
- langchain_core/output_parsers/string.py +6 -2
- langchain_core/output_parsers/transform.py +17 -60
- langchain_core/output_parsers/xml.py +34 -44
- langchain_core/outputs/__init__.py +1 -1
- langchain_core/outputs/chat_generation.py +26 -11
- langchain_core/outputs/chat_result.py +1 -3
- langchain_core/outputs/generation.py +17 -6
- langchain_core/outputs/llm_result.py +15 -8
- langchain_core/prompt_values.py +29 -123
- langchain_core/prompts/__init__.py +3 -27
- langchain_core/prompts/base.py +48 -63
- langchain_core/prompts/chat.py +259 -288
- langchain_core/prompts/dict.py +19 -11
- langchain_core/prompts/few_shot.py +84 -90
- langchain_core/prompts/few_shot_with_templates.py +14 -12
- langchain_core/prompts/image.py +19 -14
- langchain_core/prompts/loading.py +6 -8
- langchain_core/prompts/message.py +7 -8
- langchain_core/prompts/prompt.py +42 -43
- langchain_core/prompts/string.py +37 -16
- langchain_core/prompts/structured.py +43 -46
- langchain_core/rate_limiters.py +51 -60
- langchain_core/retrievers.py +52 -192
- langchain_core/runnables/base.py +1727 -1683
- langchain_core/runnables/branch.py +52 -73
- langchain_core/runnables/config.py +89 -103
- langchain_core/runnables/configurable.py +128 -130
- langchain_core/runnables/fallbacks.py +93 -82
- langchain_core/runnables/graph.py +127 -127
- langchain_core/runnables/graph_ascii.py +63 -41
- langchain_core/runnables/graph_mermaid.py +87 -70
- langchain_core/runnables/graph_png.py +31 -36
- langchain_core/runnables/history.py +145 -161
- langchain_core/runnables/passthrough.py +141 -144
- langchain_core/runnables/retry.py +84 -68
- langchain_core/runnables/router.py +33 -37
- langchain_core/runnables/schema.py +79 -72
- langchain_core/runnables/utils.py +95 -139
- langchain_core/stores.py +85 -131
- langchain_core/structured_query.py +11 -15
- langchain_core/sys_info.py +31 -32
- langchain_core/tools/__init__.py +1 -14
- langchain_core/tools/base.py +221 -247
- langchain_core/tools/convert.py +144 -161
- langchain_core/tools/render.py +10 -10
- langchain_core/tools/retriever.py +12 -19
- langchain_core/tools/simple.py +52 -29
- langchain_core/tools/structured.py +56 -60
- langchain_core/tracers/__init__.py +1 -9
- langchain_core/tracers/_streaming.py +6 -7
- langchain_core/tracers/base.py +103 -112
- langchain_core/tracers/context.py +29 -48
- langchain_core/tracers/core.py +142 -105
- langchain_core/tracers/evaluation.py +30 -34
- langchain_core/tracers/event_stream.py +162 -117
- langchain_core/tracers/langchain.py +34 -36
- langchain_core/tracers/log_stream.py +87 -49
- langchain_core/tracers/memory_stream.py +3 -3
- langchain_core/tracers/root_listeners.py +18 -34
- langchain_core/tracers/run_collector.py +8 -20
- langchain_core/tracers/schemas.py +0 -125
- langchain_core/tracers/stdout.py +3 -3
- langchain_core/utils/__init__.py +1 -4
- langchain_core/utils/_merge.py +47 -9
- langchain_core/utils/aiter.py +70 -66
- langchain_core/utils/env.py +12 -9
- langchain_core/utils/function_calling.py +139 -206
- langchain_core/utils/html.py +7 -8
- langchain_core/utils/input.py +6 -6
- langchain_core/utils/interactive_env.py +6 -2
- langchain_core/utils/iter.py +48 -45
- langchain_core/utils/json.py +14 -4
- langchain_core/utils/json_schema.py +159 -43
- langchain_core/utils/mustache.py +32 -25
- langchain_core/utils/pydantic.py +67 -40
- langchain_core/utils/strings.py +5 -5
- langchain_core/utils/usage.py +1 -1
- langchain_core/utils/utils.py +104 -62
- langchain_core/vectorstores/base.py +131 -179
- langchain_core/vectorstores/in_memory.py +113 -182
- langchain_core/vectorstores/utils.py +23 -17
- langchain_core/version.py +1 -1
- langchain_core-1.0.0.dist-info/METADATA +68 -0
- langchain_core-1.0.0.dist-info/RECORD +172 -0
- {langchain_core-0.4.0.dev0.dist-info → langchain_core-1.0.0.dist-info}/WHEEL +1 -1
- langchain_core/beta/__init__.py +0 -1
- langchain_core/beta/runnables/__init__.py +0 -1
- langchain_core/beta/runnables/context.py +0 -448
- langchain_core/memory.py +0 -116
- langchain_core/messages/content_blocks.py +0 -1435
- langchain_core/prompts/pipeline.py +0 -133
- langchain_core/pydantic_v1/__init__.py +0 -30
- langchain_core/pydantic_v1/dataclasses.py +0 -23
- langchain_core/pydantic_v1/main.py +0 -23
- langchain_core/tracers/langchain_v1.py +0 -23
- langchain_core/utils/loading.py +0 -31
- langchain_core/v1/__init__.py +0 -1
- langchain_core/v1/chat_models.py +0 -1047
- langchain_core/v1/messages.py +0 -755
- langchain_core-0.4.0.dev0.dist-info/METADATA +0 -108
- langchain_core-0.4.0.dev0.dist-info/RECORD +0 -177
- langchain_core-0.4.0.dev0.dist-info/entry_points.txt +0 -4
langchain_core/indexing/api.py
CHANGED
@@ -6,16 +6,20 @@ import hashlib
 import json
 import uuid
 import warnings
-from collections.abc import
+from collections.abc import (
+    AsyncIterable,
+    AsyncIterator,
+    Callable,
+    Iterable,
+    Iterator,
+    Sequence,
+)
 from itertools import islice
 from typing import (
     Any,
-    Callable,
     Literal,
-    Optional,
     TypedDict,
     TypeVar,
-    Union,
     cast,
 )

@@ -56,7 +60,7 @@ def _warn_about_sha1() -> None:
         "that map to the same fingerprint. If this matters in your "
         "threat model, switch to a stronger algorithm such "
         "as 'blake2b', 'sha256', or 'sha512' by specifying "
-        " `key_encoder` parameter in the
+        " `key_encoder` parameter in the `index` or `aindex` function. ",
         category=UserWarning,
         stacklevel=2,
     )
@@ -107,8 +111,8 @@ async def _abatch(size: int, iterable: AsyncIterable[T]) -> AsyncIterator[list[T


 def _get_source_id_assigner(
-    source_id_key:
-) -> Callable[[Document],
+    source_id_key: str | Callable[[Document], str] | None,
+) -> Callable[[Document], str | None]:
     """Get the source id from the document."""
     if source_id_key is None:
         return lambda _doc: None
@@ -162,9 +166,8 @@ def _calculate_hash(
 def _get_document_with_hash(
     document: Document,
     *,
-    key_encoder:
-
-    ],
+    key_encoder: Callable[[Document], str]
+    | Literal["sha1", "sha256", "sha512", "blake2b"],
 ) -> Document:
     """Calculate a hash of the document, and assign it to the uid.

@@ -185,6 +188,9 @@ def _get_document_with_hash(
         When changing the key encoder, you must change the
         index as well to avoid duplicated documents in the cache.

+    Raises:
+        ValueError: If the metadata cannot be serialized using json.
+
     Returns:
         Document with a unique identifier based on the hash of the content and metadata.
     """
@@ -230,7 +236,7 @@ class _HashedDocument:


 def _delete(
-    vector_store:
+    vector_store: VectorStore | DocumentIndex,
     ids: list[str],
 ) -> None:
     if isinstance(vector_store, VectorStore):
@@ -268,19 +274,18 @@ class IndexingResult(TypedDict):


 def index(
-    docs_source:
+    docs_source: BaseLoader | Iterable[Document],
     record_manager: RecordManager,
-    vector_store:
+    vector_store: VectorStore | DocumentIndex,
     *,
     batch_size: int = 100,
-    cleanup:
-    source_id_key:
+    cleanup: Literal["incremental", "full", "scoped_full"] | None = None,
+    source_id_key: str | Callable[[Document], str] | None = None,
     cleanup_batch_size: int = 1_000,
     force_update: bool = False,
-    key_encoder:
-
-    ] =
-    upsert_kwargs: Optional[dict[str, Any]] = None,
+    key_encoder: Literal["sha1", "sha256", "sha512", "blake2b"]
+    | Callable[[Document], str] = "sha1",
+    upsert_kwargs: dict[str, Any] | None = None,
 ) -> IndexingResult:
     """Index data from the loader into the vector store.

@@ -291,21 +296,25 @@ def index(
     documents were deleted, which documents should be skipped.

     For the time being, documents are indexed using their hashes, and users
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    are not able to specify the uid of the document.
+
+    !!! warning "Behavior changed in 0.3.25"
+        Added `scoped_full` cleanup mode.
+
+    !!! warning
+
+        * In full mode, the loader should be returning
+          the entire dataset, and not just a subset of the dataset.
+          Otherwise, the auto_cleanup will remove documents that it is not
+          supposed to.
+        * In incremental mode, if documents associated with a particular
+          source id appear across different batches, the indexing API
+          will do some redundant work. This will still result in the
+          correct end state of the index, but will unfortunately not be
+          100% efficient. For example, if a given document is split into 15
+          chunks, and we index them using a batch size of 5, we'll have 3 batches
+          all with the same source id. In general, to avoid doing too much
+          redundant work select as big a batch size as possible.
         * The `scoped_full` mode is suitable if determining an appropriate batch size
           is challenging or if your data loader cannot return the entire dataset at
           once. This mode keeps track of source IDs in memory, which should be fine
@@ -315,36 +324,32 @@ def index(
     Args:
         docs_source: Data loader or iterable of documents to index.
         record_manager: Timestamped set to keep track of which documents were
-
+            updated.
         vector_store: VectorStore or DocumentIndex to index the documents into.
-        batch_size: Batch size to use when indexing.
-        cleanup: How to handle clean up of documents.
+        batch_size: Batch size to use when indexing.
+        cleanup: How to handle clean up of documents.
+
             - incremental: Cleans up all documents that haven't been updated AND
-
-
-
-              to minimize the probability of users seeing duplicated
-              content.
+              that are associated with source ids that were seen during indexing.
+              Clean up is done continuously during indexing helping to minimize the
+              probability of users seeing duplicated content.
             - full: Delete all documents that have not been returned by the loader
-
-
-
+              during this run of indexing.
+              Clean up runs after all documents have been indexed.
+              This means that users may see duplicated content during indexing.
             - scoped_full: Similar to Full, but only deletes all documents
-
-
+              that haven't been updated AND that are associated with
+              source ids that were seen during indexing.
             - None: Do not delete any documents.
         source_id_key: Optional key that helps identify the original source
-            of the document.
+            of the document.
         cleanup_batch_size: Batch size to use when cleaning up documents.
-            Default is 1_000.
         force_update: Force update documents even if they are present in the
             record manager. Useful if you are re-indexing with updated embeddings.
-            Default is False.
         key_encoder: Hashing algorithm to use for hashing the document content and
-            metadata.
-            Other options include "blake2b", "sha256", and "sha512".
+            metadata. Options include "blake2b", "sha256", and "sha512".

-
+            !!! version-added "Added in version 0.3.66"

         key_encoder: Hashing algorithm to use for hashing the document.
             If not provided, a default encoder using SHA-1 will be used.
@@ -358,11 +363,10 @@ def index(
             When changing the key encoder, you must change the
             index as well to avoid duplicated documents in the cache.
         upsert_kwargs: Additional keyword arguments to pass to the add_documents
-
-
-
-
-            .. versionadded:: 0.3.10
+            method of the VectorStore or the upsert method of the DocumentIndex.
+            For example, you can use this to specify a custom vector_field:
+            upsert_kwargs={"vector_field": "embedding"}
+            !!! version-added "Added in version 0.3.10"

     Returns:
         Indexing result which contains information about how many documents
@@ -374,10 +378,9 @@ def index(
         ValueError: If vectorstore does not have
             "delete" and "add_documents" required methods.
         ValueError: If source_id_key is not None, but is not a string or callable.
-
-
-
-            * Added `scoped_full` cleanup mode.
+        TypeError: If `vectorstore` is not a VectorStore or a DocumentIndex.
+        AssertionError: If `source_id` is None when cleanup mode is incremental.
+            (should be unreachable code).
     """
     # Behavior is deprecated, but we keep it for backwards compatibility.
     # # Warn only once per process.
@@ -458,13 +461,13 @@ def index(
         # Count documents removed by within-batch deduplication
         num_skipped += original_batch_size - len(hashed_docs)

-        source_ids: Sequence[
+        source_ids: Sequence[str | None] = [
             source_id_assigner(hashed_doc) for hashed_doc in hashed_docs
         ]

         if cleanup in {"incremental", "scoped_full"}:
             # source ids are required.
-            for source_id, hashed_doc in zip(source_ids, hashed_docs):
+            for source_id, hashed_doc in zip(source_ids, hashed_docs, strict=False):
                 if source_id is None:
                     msg = (
                         f"Source ids are required when cleanup mode is "
@@ -488,7 +491,7 @@ def index(
         docs_to_index = []
         uids_to_refresh = []
         seen_docs: set[str] = set()
-        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch):
+        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch, strict=False):
             hashed_id = cast("str", hashed_doc.id)
             if doc_exists:
                 if force_update:
@@ -559,7 +562,7 @@ def index(
     if cleanup == "full" or (
         cleanup == "scoped_full" and scoped_full_cleanup_source_ids
     ):
-        delete_group_ids:
+        delete_group_ids: Sequence[str] | None = None
         if cleanup == "scoped_full":
             delete_group_ids = list(scoped_full_cleanup_source_ids)
         while uids_to_delete := record_manager.list_keys(
@@ -587,7 +590,7 @@ async def _to_async_iterator(iterator: Iterable[T]) -> AsyncIterator[T]:


 async def _adelete(
-    vector_store:
+    vector_store: VectorStore | DocumentIndex,
     ids: list[str],
 ) -> None:
     if isinstance(vector_store, VectorStore):
@@ -609,19 +612,18 @@ async def _adelete(


 async def aindex(
-    docs_source:
+    docs_source: BaseLoader | Iterable[Document] | AsyncIterator[Document],
     record_manager: RecordManager,
-    vector_store:
+    vector_store: VectorStore | DocumentIndex,
     *,
     batch_size: int = 100,
-    cleanup:
-    source_id_key:
+    cleanup: Literal["incremental", "full", "scoped_full"] | None = None,
+    source_id_key: str | Callable[[Document], str] | None = None,
     cleanup_batch_size: int = 1_000,
     force_update: bool = False,
-    key_encoder:
-
-    ] =
-    upsert_kwargs: Optional[dict[str, Any]] = None,
+    key_encoder: Literal["sha1", "sha256", "sha512", "blake2b"]
+    | Callable[[Document], str] = "sha1",
+    upsert_kwargs: dict[str, Any] | None = None,
 ) -> IndexingResult:
     """Async index data from the loader into the vector store.

@@ -632,54 +634,61 @@ async def aindex(
     documents were deleted, which documents should be skipped.

     For the time being, documents are indexed using their hashes, and users
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    are not able to specify the uid of the document.
+
+    !!! warning "Behavior changed in 0.3.25"
+        Added `scoped_full` cleanup mode.
+
+    !!! warning
+
+        * In full mode, the loader should be returning
+          the entire dataset, and not just a subset of the dataset.
+          Otherwise, the auto_cleanup will remove documents that it is not
+          supposed to.
+        * In incremental mode, if documents associated with a particular
+          source id appear across different batches, the indexing API
+          will do some redundant work. This will still result in the
+          correct end state of the index, but will unfortunately not be
+          100% efficient. For example, if a given document is split into 15
+          chunks, and we index them using a batch size of 5, we'll have 3 batches
+          all with the same source id. In general, to avoid doing too much
+          redundant work select as big a batch size as possible.
+        * The `scoped_full` mode is suitable if determining an appropriate batch size
+          is challenging or if your data loader cannot return the entire dataset at
+          once. This mode keeps track of source IDs in memory, which should be fine
+          for most use cases. If your dataset is large (10M+ docs), you will likely
+          need to parallelize the indexing process regardless.

     Args:
         docs_source: Data loader or iterable of documents to index.
         record_manager: Timestamped set to keep track of which documents were
-
+            updated.
         vector_store: VectorStore or DocumentIndex to index the documents into.
-        batch_size: Batch size to use when indexing.
-        cleanup: How to handle clean up of documents.
+        batch_size: Batch size to use when indexing.
+        cleanup: How to handle clean up of documents.
+
             - incremental: Cleans up all documents that haven't been updated AND
-
-
-
-
-
-
-
-            This means that users may see duplicated content during indexing.
+              that are associated with source ids that were seen during indexing.
+              Clean up is done continuously during indexing helping to minimize the
+              probability of users seeing duplicated content.
+            - full: Delete all documents that have not been returned by the loader
+              during this run of indexing.
+              Clean up runs after all documents have been indexed.
+              This means that users may see duplicated content during indexing.
             - scoped_full: Similar to Full, but only deletes all documents
-
-
+              that haven't been updated AND that are associated with
+              source ids that were seen during indexing.
             - None: Do not delete any documents.
         source_id_key: Optional key that helps identify the original source
-            of the document.
+            of the document.
         cleanup_batch_size: Batch size to use when cleaning up documents.
-            Default is 1_000.
         force_update: Force update documents even if they are present in the
             record manager. Useful if you are re-indexing with updated embeddings.
-
+        key_encoder: Hashing algorithm to use for hashing the document content and
+            metadata. Options include "blake2b", "sha256", and "sha512".
+
+            !!! version-added "Added in version 0.3.66"
+
         key_encoder: Hashing algorithm to use for hashing the document.
             If not provided, a default encoder using SHA-1 will be used.
             SHA-1 is not collision-resistant, and a motivated attacker
@@ -691,12 +700,11 @@ async def aindex(

             When changing the key encoder, you must change the
             index as well to avoid duplicated documents in the cache.
-        upsert_kwargs: Additional keyword arguments to pass to the
-
-
-
-
-            .. versionadded:: 0.3.10
+        upsert_kwargs: Additional keyword arguments to pass to the add_documents
+            method of the VectorStore or the upsert method of the DocumentIndex.
+            For example, you can use this to specify a custom vector_field:
+            upsert_kwargs={"vector_field": "embedding"}
+            !!! version-added "Added in version 0.3.10"

     Returns:
         Indexing result which contains information about how many documents
@@ -708,10 +716,9 @@ async def aindex(
         ValueError: If vectorstore does not have
             "adelete" and "aadd_documents" required methods.
         ValueError: If source_id_key is not None, but is not a string or callable.
-
-
-
-            * Added `scoped_full` cleanup mode.
+        TypeError: If `vector_store` is not a VectorStore or DocumentIndex.
+        AssertionError: If `source_id_key` is None when cleanup mode is
+            incremental or `scoped_full` (should be unreachable).
     """
     # Behavior is deprecated, but we keep it for backwards compatibility.
     # # Warn only once per process.
@@ -803,13 +810,13 @@ async def aindex(
         # Count documents removed by within-batch deduplication
         num_skipped += original_batch_size - len(hashed_docs)

-        source_ids: Sequence[
+        source_ids: Sequence[str | None] = [
             source_id_assigner(doc) for doc in hashed_docs
         ]

         if cleanup in {"incremental", "scoped_full"}:
             # If the cleanup mode is incremental, source ids are required.
-            for source_id, hashed_doc in zip(source_ids, hashed_docs):
+            for source_id, hashed_doc in zip(source_ids, hashed_docs, strict=False):
                 if source_id is None:
                     msg = (
                         f"Source ids are required when cleanup mode is "
@@ -833,7 +840,7 @@ async def aindex(
         docs_to_index: list[Document] = []
         uids_to_refresh = []
         seen_docs: set[str] = set()
-        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch):
+        for hashed_doc, doc_exists in zip(hashed_docs, exists_batch, strict=False):
             hashed_id = cast("str", hashed_doc.id)
             if doc_exists:
                 if force_update:
@@ -905,7 +912,7 @@ async def aindex(
     if cleanup == "full" or (
         cleanup == "scoped_full" and scoped_full_cleanup_source_ids
     ):
-        delete_group_ids:
+        delete_group_ids: Sequence[str] | None = None
         if cleanup == "scoped_full":
             delete_group_ids = list(scoped_full_cleanup_source_ids)
         while uids_to_delete := await record_manager.alist_keys(