langchain-core 1.0.0rc3__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langchain_core/agents.py +2 -4
- langchain_core/caches.py +16 -7
- langchain_core/callbacks/base.py +0 -4
- langchain_core/callbacks/manager.py +0 -11
- langchain_core/chat_history.py +5 -5
- langchain_core/document_loaders/base.py +6 -4
- langchain_core/document_loaders/blob_loaders.py +1 -1
- langchain_core/document_loaders/langsmith.py +9 -13
- langchain_core/documents/__init__.py +24 -3
- langchain_core/documents/base.py +72 -61
- langchain_core/documents/compressor.py +6 -6
- langchain_core/documents/transformers.py +6 -6
- langchain_core/embeddings/fake.py +2 -2
- langchain_core/example_selectors/semantic_similarity.py +7 -7
- langchain_core/exceptions.py +2 -2
- langchain_core/indexing/__init__.py +1 -1
- langchain_core/indexing/api.py +62 -62
- langchain_core/indexing/base.py +20 -22
- langchain_core/indexing/in_memory.py +2 -4
- langchain_core/language_models/__init__.py +6 -5
- langchain_core/language_models/base.py +7 -8
- langchain_core/language_models/chat_models.py +84 -78
- langchain_core/language_models/fake_chat_models.py +1 -1
- langchain_core/language_models/llms.py +20 -18
- langchain_core/load/dump.py +6 -8
- langchain_core/load/serializable.py +4 -1
- langchain_core/messages/__init__.py +9 -0
- langchain_core/messages/ai.py +11 -7
- langchain_core/messages/base.py +4 -0
- langchain_core/messages/block_translators/google_genai.py +5 -3
- langchain_core/messages/content.py +4 -4
- langchain_core/messages/utils.py +17 -17
- langchain_core/output_parsers/__init__.py +17 -1
- langchain_core/output_parsers/base.py +3 -0
- langchain_core/output_parsers/format_instructions.py +9 -4
- langchain_core/output_parsers/json.py +5 -2
- langchain_core/output_parsers/list.py +16 -16
- langchain_core/output_parsers/openai_tools.py +2 -2
- langchain_core/output_parsers/pydantic.py +1 -1
- langchain_core/output_parsers/string.py +3 -3
- langchain_core/output_parsers/xml.py +28 -25
- langchain_core/outputs/generation.py +2 -3
- langchain_core/prompt_values.py +0 -6
- langchain_core/prompts/base.py +5 -3
- langchain_core/prompts/chat.py +60 -52
- langchain_core/prompts/string.py +5 -2
- langchain_core/prompts/structured.py +12 -8
- langchain_core/rate_limiters.py +1 -3
- langchain_core/retrievers.py +41 -37
- langchain_core/runnables/base.py +25 -29
- langchain_core/runnables/branch.py +9 -9
- langchain_core/runnables/config.py +2 -4
- langchain_core/runnables/configurable.py +3 -3
- langchain_core/runnables/fallbacks.py +1 -1
- langchain_core/runnables/graph.py +7 -3
- langchain_core/runnables/retry.py +1 -1
- langchain_core/runnables/schema.py +2 -5
- langchain_core/runnables/utils.py +3 -3
- langchain_core/stores.py +4 -6
- langchain_core/tools/base.py +68 -14
- langchain_core/tools/convert.py +8 -7
- langchain_core/tools/retriever.py +6 -5
- langchain_core/tools/structured.py +7 -5
- langchain_core/tracers/event_stream.py +4 -1
- langchain_core/tracers/log_stream.py +6 -3
- langchain_core/utils/function_calling.py +8 -0
- langchain_core/utils/json_schema.py +1 -1
- langchain_core/utils/strings.py +1 -4
- langchain_core/utils/utils.py +12 -5
- langchain_core/vectorstores/base.py +130 -130
- langchain_core/vectorstores/in_memory.py +4 -4
- langchain_core/vectorstores/utils.py +1 -1
- langchain_core/version.py +1 -1
- {langchain_core-1.0.0rc3.dist-info → langchain_core-1.0.2.dist-info}/METADATA +8 -7
- {langchain_core-1.0.0rc3.dist-info → langchain_core-1.0.2.dist-info}/RECORD +76 -76
- {langchain_core-1.0.0rc3.dist-info → langchain_core-1.0.2.dist-info}/WHEEL +0 -0
langchain_core/example_selectors/semantic_similarity.py
CHANGED
@@ -41,7 +41,7 @@ class _VectorStoreExampleSelector(BaseExampleSelector, BaseModel, ABC):
     """Optional keys to filter input to. If provided, the search is based on
     the input variables instead of all variables."""
     vectorstore_kwargs: dict[str, Any] | None = None
-    """Extra arguments passed to similarity_search function of the
+    """Extra arguments passed to similarity_search function of the `VectorStore`."""
 
     model_config = ConfigDict(
         arbitrary_types_allowed=True,
@@ -159,7 +159,7 @@ class SemanticSimilarityExampleSelector(_VectorStoreExampleSelector):
                 instead of all variables.
             example_keys: If provided, keys to filter examples to.
             vectorstore_kwargs: Extra arguments passed to similarity_search function
-                of the
+                of the `VectorStore`.
             vectorstore_cls_kwargs: optional kwargs containing url for vector store
 
         Returns:
@@ -203,7 +203,7 @@ class SemanticSimilarityExampleSelector(_VectorStoreExampleSelector):
                 instead of all variables.
             example_keys: If provided, keys to filter examples to.
             vectorstore_kwargs: Extra arguments passed to similarity_search function
-                of the
+                of the `VectorStore`.
             vectorstore_cls_kwargs: optional kwargs containing url for vector store
 
         Returns:
@@ -286,12 +286,12 @@ class MaxMarginalRelevanceExampleSelector(_VectorStoreExampleSelector):
             embeddings: An initialized embedding API interface, e.g. OpenAIEmbeddings().
             vectorstore_cls: A vector store DB interface class, e.g. FAISS.
             k: Number of examples to select.
-            fetch_k: Number of
+            fetch_k: Number of `Document` objects to fetch to pass to MMR algorithm.
             input_keys: If provided, the search is based on the input variables
                 instead of all variables.
             example_keys: If provided, keys to filter examples to.
             vectorstore_kwargs: Extra arguments passed to similarity_search function
-                of the
+                of the `VectorStore`.
             vectorstore_cls_kwargs: optional kwargs containing url for vector store
 
         Returns:
@@ -333,12 +333,12 @@ class MaxMarginalRelevanceExampleSelector(_VectorStoreExampleSelector):
             embeddings: An initialized embedding API interface, e.g. OpenAIEmbeddings().
             vectorstore_cls: A vector store DB interface class, e.g. FAISS.
             k: Number of examples to select.
-            fetch_k: Number of
+            fetch_k: Number of `Document` objects to fetch to pass to MMR algorithm.
             input_keys: If provided, the search is based on the input variables
                 instead of all variables.
             example_keys: If provided, keys to filter examples to.
             vectorstore_kwargs: Extra arguments passed to similarity_search function
-                of the
+                of the `VectorStore`.
             vectorstore_cls_kwargs: optional kwargs containing url for vector store
 
         Returns:
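For orientation while reading these docstring fixes, a minimal sketch (not part of the diff) of how the vector-store-backed selector is typically constructed, using the fake embeddings and in-memory vector store that ship with langchain-core as stand-ins for real integrations:

    from langchain_core.embeddings import FakeEmbeddings
    from langchain_core.example_selectors import SemanticSimilarityExampleSelector
    from langchain_core.vectorstores import InMemoryVectorStore

    examples = [
        {"input": "happy", "output": "sad"},
        {"input": "tall", "output": "short"},
    ]

    # Build a selector over an in-memory VectorStore; vectorstore_kwargs
    # (not used here) is forwarded to similarity_search, as the corrected
    # docstrings note.
    selector = SemanticSimilarityExampleSelector.from_examples(
        examples,
        embeddings=FakeEmbeddings(size=8),  # stand-in for a real embedding model
        vectorstore_cls=InMemoryVectorStore,
        k=1,
    )
    print(selector.select_examples({"input": "joyful"}))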
langchain_core/exceptions.py
CHANGED
@@ -86,6 +86,6 @@ def create_message(*, message: str, error_code: ErrorCode) -> str:
     """
     return (
         f"{message}\n"
-        "For troubleshooting, visit: https://
-        f"
+        "For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain"
+        f"/errors/{error_code.value} "
     )
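Based on the new strings in this hunk, `create_message` now appends a docs.langchain.com troubleshooting URL to the error text. A minimal sketch of the resulting behavior; the enum member chosen here is one of `ErrorCode`'s existing values, and the output shown is approximate:

    from langchain_core.exceptions import ErrorCode, create_message

    msg = create_message(
        message="Could not parse tool output.",
        error_code=ErrorCode.OUTPUT_PARSING_FAILURE,
    )
    print(msg)
    # Could not parse tool output.
    # For troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE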
langchain_core/indexing/__init__.py
CHANGED
@@ -1,7 +1,7 @@
 """Code to help indexing data into a vectorstore.
 
 This package contains helper logic to help deal with indexing data into
-a
+a `VectorStore` while avoiding duplicated content and over-writing content
 if it's unchanged.
 """
 
langchain_core/indexing/api.py
CHANGED
@@ -304,42 +304,42 @@ def index(
     !!! warning
 
         * In full mode, the loader should be returning
-
-
-
+          the entire dataset, and not just a subset of the dataset.
+          Otherwise, the auto_cleanup will remove documents that it is not
+          supposed to.
         * In incremental mode, if documents associated with a particular
-
-
-
-
-
-
-
+          source id appear across different batches, the indexing API
+          will do some redundant work. This will still result in the
+          correct end state of the index, but will unfortunately not be
+          100% efficient. For example, if a given document is split into 15
+          chunks, and we index them using a batch size of 5, we'll have 3 batches
+          all with the same source id. In general, to avoid doing too much
+          redundant work select as big a batch size as possible.
         * The `scoped_full` mode is suitable if determining an appropriate batch size
-
-
-
-
+          is challenging or if your data loader cannot return the entire dataset at
+          once. This mode keeps track of source IDs in memory, which should be fine
+          for most use cases. If your dataset is large (10M+ docs), you will likely
+          need to parallelize the indexing process regardless.
 
     Args:
         docs_source: Data loader or iterable of documents to index.
         record_manager: Timestamped set to keep track of which documents were
             updated.
-        vector_store: VectorStore or DocumentIndex to index the documents into.
+        vector_store: `VectorStore` or DocumentIndex to index the documents into.
         batch_size: Batch size to use when indexing.
         cleanup: How to handle clean up of documents.
 
             - incremental: Cleans up all documents that haven't been updated AND
-
-
-
+              that are associated with source IDs that were seen during indexing.
+              Clean up is done continuously during indexing helping to minimize the
+              probability of users seeing duplicated content.
             - full: Delete all documents that have not been returned by the loader
-
-
-
+              during this run of indexing.
+              Clean up runs after all documents have been indexed.
+              This means that users may see duplicated content during indexing.
             - scoped_full: Similar to Full, but only deletes all documents
-
-
+              that haven't been updated AND that are associated with
+              source IDs that were seen during indexing.
             - None: Do not delete any documents.
         source_id_key: Optional key that helps identify the original source
             of the document.
@@ -363,7 +363,7 @@ def index(
         When changing the key encoder, you must change the
         index as well to avoid duplicated documents in the cache.
         upsert_kwargs: Additional keyword arguments to pass to the add_documents
-            method of the VectorStore or the upsert method of the DocumentIndex.
+            method of the `VectorStore` or the upsert method of the DocumentIndex.
            For example, you can use this to specify a custom vector_field:
            upsert_kwargs={"vector_field": "embedding"}
            !!! version-added "Added in version 0.3.10"
@@ -375,10 +375,10 @@ def index(
     Raises:
        ValueError: If cleanup mode is not one of 'incremental', 'full' or None
        ValueError: If cleanup mode is incremental and source_id_key is None.
-        ValueError: If
+        ValueError: If `VectorStore` does not have
            "delete" and "add_documents" required methods.
        ValueError: If source_id_key is not None, but is not a string or callable.
-        TypeError: If `vectorstore` is not a VectorStore or a DocumentIndex.
+        TypeError: If `vectorstore` is not a `VectorStore` or a DocumentIndex.
        AssertionError: If `source_id` is None when cleanup mode is incremental.
            (should be unreachable code).
    """
@@ -415,7 +415,7 @@ def index(
         raise ValueError(msg)
 
     if type(destination).delete == VectorStore.delete:
-        # Checking if the
+        # Checking if the VectorStore has overridden the default delete method
         # implementation which just raises a NotImplementedError
         msg = "Vectorstore has not implemented the delete method"
         raise ValueError(msg)
@@ -466,11 +466,11 @@ def index(
     ]
 
     if cleanup in {"incremental", "scoped_full"}:
-        #
+        # Source IDs are required.
         for source_id, hashed_doc in zip(source_ids, hashed_docs, strict=False):
             if source_id is None:
                 msg = (
-                    f"Source
+                    f"Source IDs are required when cleanup mode is "
                     f"incremental or scoped_full. "
                     f"Document that starts with "
                     f"content: {hashed_doc.page_content[:100]} "
@@ -479,7 +479,7 @@ def index(
                 raise ValueError(msg)
             if cleanup == "scoped_full":
                 scoped_full_cleanup_source_ids.add(source_id)
-        #
+        # Source IDs cannot be None after for loop above.
         source_ids = cast("Sequence[str]", source_ids)
 
     exists_batch = record_manager.exists(
@@ -538,7 +538,7 @@ def index(
         # If source IDs are provided, we can do the deletion incrementally!
         if cleanup == "incremental":
             # Get the uids of the documents that were not returned by the loader.
-            # mypy isn't good enough to determine that source
+            # mypy isn't good enough to determine that source IDs cannot be None
             # here due to a check that's happening above, so we check again.
             for source_id in source_ids:
                 if source_id is None:
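Putting the corrected docstring together, a minimal sketch of driving `index()` with the in-memory implementations from langchain-core (the documents and namespace here are illustrative, and the printed result is indicative):

    from langchain_core.documents import Document
    from langchain_core.embeddings import FakeEmbeddings
    from langchain_core.indexing import InMemoryRecordManager, index
    from langchain_core.vectorstores import InMemoryVectorStore

    record_manager = InMemoryRecordManager(namespace="demo")
    record_manager.create_schema()
    vector_store = InMemoryVectorStore(embedding=FakeEmbeddings(size=8))

    docs = [
        Document(page_content="hello", metadata={"source": "a.txt"}),
        Document(page_content="world", metadata={"source": "b.txt"}),
    ]

    # Incremental cleanup needs a source_id_key so stale chunks from the
    # same source can be deleted as new ones arrive.
    result = index(
        docs,
        record_manager,
        vector_store,
        cleanup="incremental",
        source_id_key="source",
    )
    print(result)
    # typically: {'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}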
@@ -642,42 +642,42 @@ async def aindex(
     !!! warning
 
         * In full mode, the loader should be returning
-
-
-
+          the entire dataset, and not just a subset of the dataset.
+          Otherwise, the auto_cleanup will remove documents that it is not
+          supposed to.
         * In incremental mode, if documents associated with a particular
-
-
-
-
-
-
-
+          source id appear across different batches, the indexing API
+          will do some redundant work. This will still result in the
+          correct end state of the index, but will unfortunately not be
+          100% efficient. For example, if a given document is split into 15
+          chunks, and we index them using a batch size of 5, we'll have 3 batches
+          all with the same source id. In general, to avoid doing too much
+          redundant work select as big a batch size as possible.
         * The `scoped_full` mode is suitable if determining an appropriate batch size
-
-
-
-
+          is challenging or if your data loader cannot return the entire dataset at
+          once. This mode keeps track of source IDs in memory, which should be fine
+          for most use cases. If your dataset is large (10M+ docs), you will likely
+          need to parallelize the indexing process regardless.
 
     Args:
         docs_source: Data loader or iterable of documents to index.
         record_manager: Timestamped set to keep track of which documents were
             updated.
-        vector_store: VectorStore or DocumentIndex to index the documents into.
+        vector_store: `VectorStore` or DocumentIndex to index the documents into.
         batch_size: Batch size to use when indexing.
         cleanup: How to handle clean up of documents.
 
             - incremental: Cleans up all documents that haven't been updated AND
-
-
-
+              that are associated with source IDs that were seen during indexing.
+              Clean up is done continuously during indexing helping to minimize the
+              probability of users seeing duplicated content.
             - full: Delete all documents that have not been returned by the loader
-
-
-
+              during this run of indexing.
+              Clean up runs after all documents have been indexed.
+              This means that users may see duplicated content during indexing.
             - scoped_full: Similar to Full, but only deletes all documents
-
-
+              that haven't been updated AND that are associated with
+              source IDs that were seen during indexing.
             - None: Do not delete any documents.
         source_id_key: Optional key that helps identify the original source
             of the document.
@@ -701,7 +701,7 @@ async def aindex(
         When changing the key encoder, you must change the
         index as well to avoid duplicated documents in the cache.
         upsert_kwargs: Additional keyword arguments to pass to the add_documents
-            method of the VectorStore or the upsert method of the DocumentIndex.
+            method of the `VectorStore` or the upsert method of the DocumentIndex.
            For example, you can use this to specify a custom vector_field:
            upsert_kwargs={"vector_field": "embedding"}
            !!! version-added "Added in version 0.3.10"
@@ -713,10 +713,10 @@ async def aindex(
     Raises:
        ValueError: If cleanup mode is not one of 'incremental', 'full' or None
        ValueError: If cleanup mode is incremental and source_id_key is None.
-        ValueError: If
+        ValueError: If `VectorStore` does not have
            "adelete" and "aadd_documents" required methods.
        ValueError: If source_id_key is not None, but is not a string or callable.
-        TypeError: If `vector_store` is not a VectorStore or DocumentIndex.
+        TypeError: If `vector_store` is not a `VectorStore` or DocumentIndex.
        AssertionError: If `source_id_key` is None when cleanup mode is
            incremental or `scoped_full` (should be unreachable).
    """
@@ -757,7 +757,7 @@ async def aindex(
         type(destination).adelete == VectorStore.adelete
         and type(destination).delete == VectorStore.delete
     ):
-        # Checking if the
+        # Checking if the VectorStore has overridden the default adelete or delete
         # methods implementation which just raises a NotImplementedError
         msg = "Vectorstore has not implemented the adelete or delete method"
         raise ValueError(msg)
@@ -815,11 +815,11 @@ async def aindex(
     ]
 
     if cleanup in {"incremental", "scoped_full"}:
-        # If the cleanup mode is incremental, source
+        # If the cleanup mode is incremental, source IDs are required.
         for source_id, hashed_doc in zip(source_ids, hashed_docs, strict=False):
             if source_id is None:
                 msg = (
-                    f"Source
+                    f"Source IDs are required when cleanup mode is "
                     f"incremental or scoped_full. "
                     f"Document that starts with "
                     f"content: {hashed_doc.page_content[:100]} "
@@ -828,7 +828,7 @@ async def aindex(
                 raise ValueError(msg)
             if cleanup == "scoped_full":
                 scoped_full_cleanup_source_ids.add(source_id)
-        #
+        # Source IDs cannot be None after for loop above.
         source_ids = cast("Sequence[str]", source_ids)
 
     exists_batch = await record_manager.aexists(
@@ -888,7 +888,7 @@ async def aindex(
         if cleanup == "incremental":
             # Get the uids of the documents that were not returned by the loader.
 
-            # mypy isn't good enough to determine that source
+            # mypy isn't good enough to determine that source IDs cannot be None
             # here due to a check that's happening above, so we check again.
             for source_id in source_ids:
                 if source_id is None:
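`aindex` mirrors `index` for async callers; a minimal sketch of the call shape, reusing `docs`, `record_manager`, and `vector_store` from the `index()` sketch above:

    import asyncio

    from langchain_core.indexing import aindex

    async def main() -> None:
        # Same arguments as index(); cleanup="full" deletes anything the
        # loader did not return during this run.
        result = await aindex(
            docs, record_manager, vector_store, cleanup="full", source_id_key="source"
        )
        print(result)

    asyncio.run(main())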
langchain_core/indexing/base.py
CHANGED
@@ -25,7 +25,7 @@ class RecordManager(ABC):
     The record manager abstraction is used by the langchain indexing API.
 
     The record manager keeps track of which documents have been
-    written into a
+    written into a `VectorStore` and when they were written.
 
     The indexing API computes hashes for each document and stores the hash
     together with the write time and the source id in the record manager.
@@ -37,7 +37,7 @@ class RecordManager(ABC):
     already been indexed, and to only index new documents.
 
     The main benefit of this abstraction is that it works across many vectorstores.
-    To be supported, a
+    To be supported, a `VectorStore` needs to only support the ability to add and
     delete documents by ID. Using the record manager, the indexing API will
     be able to delete outdated documents and avoid redundant indexing of documents
     that have already been indexed.
@@ -45,13 +45,13 @@ class RecordManager(ABC):
     The main constraints of this abstraction are:
 
     1. It relies on the time-stamps to determine which documents have been
-
-
-
+       indexed and which have not. This means that the time-stamps must be
+       monotonically increasing. The timestamp should be the timestamp
+       as measured by the server to minimize issues.
     2. The record manager is currently implemented separately from the
-
-
-
+       vectorstore, which means that the overall system becomes distributed
+       and may create issues with consistency. For example, writing to
+       record manager succeeds, but corresponding writing to `VectorStore` fails.
     """
 
     def __init__(
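For reference while reading these docstring fixes, the in-memory implementation exercises the core RecordManager operations (namespace, schema creation, timestamped upserts, existence checks); a minimal sketch with illustrative keys:

    from langchain_core.indexing import InMemoryRecordManager

    manager = InMemoryRecordManager(namespace="kittens")
    manager.create_schema()

    # Record two document hashes; group_ids carry the source of each document.
    manager.update(["hash-1", "hash-2"], group_ids=["a.txt", "a.txt"])
    print(manager.exists(["hash-1", "hash-3"]))  # [True, False]

    # Keys written before a cutoff time can be listed for cleanup.
    stale = manager.list_keys(before=manager.get_time())
    print(stale)  # ['hash-1', 'hash-2']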
@@ -460,7 +460,7 @@ class UpsertResponse(TypedDict):
 class DeleteResponse(TypedDict, total=False):
     """A generic response for delete operation.
 
-    The fields in this response are optional and whether the
+    The fields in this response are optional and whether the `VectorStore`
     returns them or not is up to the implementation.
     """
 
@@ -508,8 +508,6 @@ class DocumentIndex(BaseRetriever):
     1. Storing document in the index.
     2. Fetching document by ID.
     3. Searching for document using a query.
-
-    !!! version-added "Added in version 0.2.29"
     """
 
     @abc.abstractmethod
@@ -520,40 +518,40 @@ class DocumentIndex(BaseRetriever):
         if it is provided. If the ID is not provided, the upsert method is free
         to generate an ID for the content.
 
-        When an ID is specified and the content already exists in the
+        When an ID is specified and the content already exists in the `VectorStore`,
         the upsert method should update the content with the new data. If the content
-        does not exist, the upsert method should add the item to the
+        does not exist, the upsert method should add the item to the `VectorStore`.
 
         Args:
-            items: Sequence of documents to add to the
+            items: Sequence of documents to add to the `VectorStore`.
             **kwargs: Additional keyword arguments.
 
         Returns:
             A response object that contains the list of IDs that were
-                successfully added or updated in the
+                successfully added or updated in the `VectorStore` and the list of IDs that
                 failed to be added or updated.
         """
 
     async def aupsert(
         self, items: Sequence[Document], /, **kwargs: Any
     ) -> UpsertResponse:
-        """Add or update documents in the
+        """Add or update documents in the `VectorStore`. Async version of `upsert`.
 
         The upsert functionality should utilize the ID field of the item
         if it is provided. If the ID is not provided, the upsert method is free
         to generate an ID for the item.
 
-        When an ID is specified and the item already exists in the
+        When an ID is specified and the item already exists in the `VectorStore`,
         the upsert method should update the item with the new data. If the item
-        does not exist, the upsert method should add the item to the
+        does not exist, the upsert method should add the item to the `VectorStore`.
 
         Args:
-            items: Sequence of documents to add to the
+            items: Sequence of documents to add to the `VectorStore`.
             **kwargs: Additional keyword arguments.
 
         Returns:
             A response object that contains the list of IDs that were
-                successfully added or updated in the
+                successfully added or updated in the `VectorStore` and the list of IDs that
                 failed to be added or updated.
         """
         return await run_in_executor(
@@ -570,7 +568,7 @@ class DocumentIndex(BaseRetriever):
         Calling delete without any input parameters should raise a ValueError!
 
         Args:
-            ids: List of
+            ids: List of IDs to delete.
             **kwargs: Additional keyword arguments. This is up to the implementation.
                 For example, can include an option to delete the entire index,
                 or else issue a non-blocking delete etc.
@@ -588,7 +586,7 @@ class DocumentIndex(BaseRetriever):
         Calling adelete without any input parameters should raise a ValueError!
 
         Args:
-            ids: List of
+            ids: List of IDs to delete.
             **kwargs: Additional keyword arguments. This is up to the implementation.
                 For example, can include an option to delete the entire index.
 
langchain_core/indexing/in_memory.py
CHANGED
@@ -23,8 +23,6 @@ class InMemoryDocumentIndex(DocumentIndex):
 
     It provides a simple search API that returns documents by the number of
     counts the given query appears in the document.
-
-    !!! version-added "Added in version 0.2.29"
     """
 
     store: dict[str, Document] = Field(default_factory=dict)
@@ -64,10 +62,10 @@ class InMemoryDocumentIndex(DocumentIndex):
     """Delete by IDs.
 
     Args:
-        ids: List of
+        ids: List of IDs to delete.
 
     Raises:
-        ValueError: If
+        ValueError: If IDs is None.
 
     Returns:
         A response object that contains the list of IDs that were successfully
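Since `InMemoryDocumentIndex` is the reference implementation of the DocumentIndex contract discussed above, a minimal sketch of upsert, retrieval, and delete-by-IDs (output values shown are indicative):

    from langchain_core.documents import Document
    from langchain_core.indexing.in_memory import InMemoryDocumentIndex

    doc_index = InMemoryDocumentIndex()

    # Upsert honors provided IDs; missing IDs are generated by the index.
    response = doc_index.upsert(
        [
            Document(id="1", page_content="the cat sat"),
            Document(id="2", page_content="the dog ran"),
        ]
    )
    print(response["succeeded"])  # ['1', '2']

    # DocumentIndex is a BaseRetriever, so it can be invoked with a query.
    print(doc_index.invoke("cat")[0].page_content)  # 'the cat sat'

    # Delete by IDs, per the corrected docstring; returns a DeleteResponse.
    print(doc_index.delete(["2"]))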
langchain_core/language_models/__init__.py
CHANGED
@@ -6,12 +6,13 @@ LangChain has two main classes to work with language models: chat models and
 **Chat models**
 
 Language models that use a sequence of messages as inputs and return chat messages
-as outputs (as opposed to using plain text).
-distinct roles to conversation messages, helping to distinguish messages from the AI,
-users, and instructions such as system messages.
+as outputs (as opposed to using plain text).
 
-
-
+Chat models support the assignment of distinct roles to conversation messages, helping
+to distinguish messages from the AI, users, and instructions such as system messages.
+
+The key abstraction for chat models is `BaseChatModel`. Implementations should inherit
+from this class.
 
 See existing [chat model integrations](https://docs.langchain.com/oss/python/integrations/chat).
 
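To make the rewritten module docstring concrete, a small sketch of the message-role model using the fake chat model bundled with langchain-core (a stand-in for a real `BaseChatModel` integration):

    from langchain_core.language_models import FakeListChatModel
    from langchain_core.messages import HumanMessage, SystemMessage

    model = FakeListChatModel(responses=["Arr, hello matey!"])

    # Distinct roles: a system instruction and a user turn.
    result = model.invoke(
        [
            SystemMessage("You are a pirate."),
            HumanMessage("Say hello."),
        ]
    )
    print(type(result).__name__, result.content)  # AIMessage Arr, hello matey!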
langchain_core/language_models/base.py
CHANGED
@@ -200,14 +200,14 @@ class BaseLanguageModel(
                 pure text generation models and `BaseMessage` objects for chat models).
             stop: Stop words to use when generating. Model output is cut off at the
                 first occurrence of any of these substrings.
-            callbacks: Callbacks to pass through. Used for executing additional
+            callbacks: `Callbacks` to pass through. Used for executing additional
                 functionality, such as logging or streaming, throughout generation.
             **kwargs: Arbitrary additional keyword arguments. These are usually passed
                 to the model provider API call.
 
         Returns:
             An `LLMResult`, which contains a list of candidate `Generation` objects for
-
+                each input prompt and additional model provider-specific output.
 
         """
 
@@ -237,14 +237,14 @@ class BaseLanguageModel(
                 pure text generation models and `BaseMessage` objects for chat models).
             stop: Stop words to use when generating. Model output is cut off at the
                 first occurrence of any of these substrings.
-            callbacks: Callbacks to pass through. Used for executing additional
+            callbacks: `Callbacks` to pass through. Used for executing additional
                 functionality, such as logging or streaming, throughout generation.
             **kwargs: Arbitrary additional keyword arguments. These are usually passed
                 to the model provider API call.
 
         Returns:
             An `LLMResult`, which contains a list of candidate `Generation` objects for
-
+                each input prompt and additional model provider-specific output.
 
         """
 
@@ -262,15 +262,14 @@ class BaseLanguageModel(
         return self.lc_attributes
 
     def get_token_ids(self, text: str) -> list[int]:
-        """Return the ordered
+        """Return the ordered IDs of the tokens in a text.
 
         Args:
             text: The string input to tokenize.
 
         Returns:
-            A list of
-
-
+            A list of IDs corresponding to the tokens in the text, in order they occur
+            in the text.
         """
         if self.custom_get_token_ids is not None:
             return self.custom_get_token_ids(text)
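The `custom_get_token_ids` hook referenced at the end of this hunk lets callers swap in their own tokenizer; a minimal sketch with a toy whitespace "tokenizer" (illustrative only; real integrations use the provider's tokenizer):

    from langchain_core.language_models import FakeListChatModel

    def whitespace_token_ids(text: str) -> list[int]:
        # Toy tokenizer: one "ID" per whitespace-separated token.
        return list(range(len(text.split())))

    model = FakeListChatModel(
        responses=["ok"],
        custom_get_token_ids=whitespace_token_ids,
    )
    print(model.get_token_ids("three word input"))  # [0, 1, 2]
    print(model.get_num_tokens("three word input"))  # 3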