cohere-haystack 2.0.2__tar.gz → 3.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/CHANGELOG.md +12 -0
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/PKG-INFO +1 -1
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/pyproject.toml +1 -0
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/src/haystack_integrations/components/embedders/cohere/document_embedder.py +17 -4
- cohere_haystack-3.0.0/src/haystack_integrations/components/embedders/cohere/embedding_types.py +37 -0
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/src/haystack_integrations/components/embedders/cohere/text_embedder.py +20 -6
- cohere_haystack-3.0.0/src/haystack_integrations/components/embedders/cohere/utils.py +112 -0
- cohere_haystack-3.0.0/src/haystack_integrations/components/generators/cohere/chat/chat_generator.py +430 -0
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/src/haystack_integrations/components/rankers/cohere/ranker.py +16 -2
- cohere_haystack-3.0.0/tests/test_cohere_chat_generator.py +473 -0
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/tests/test_cohere_generator.py +3 -0
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/tests/test_cohere_ranker.py +10 -1
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/tests/test_document_embedder.py +7 -1
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/tests/test_text_embedder.py +5 -0
- cohere_haystack-2.0.2/src/haystack_integrations/components/embedders/cohere/utils.py +0 -71
- cohere_haystack-2.0.2/src/haystack_integrations/components/generators/cohere/chat/chat_generator.py +0 -236
- cohere_haystack-2.0.2/tests/test_cohere_chat_generator.py +0 -280
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/.gitignore +0 -0
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/LICENSE.txt +0 -0
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/README.md +0 -0
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/examples/cohere_embedding.py +0 -0
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/examples/cohere_generation.py +0 -0
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/examples/cohere_ranker.py +0 -0
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/pydoc/config.yml +0 -0
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/src/haystack_integrations/components/embedders/cohere/__init__.py +0 -0
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/src/haystack_integrations/components/generators/cohere/__init__.py +0 -0
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/src/haystack_integrations/components/generators/cohere/chat/__init__.py +0 -0
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/src/haystack_integrations/components/generators/cohere/generator.py +0 -0
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/src/haystack_integrations/components/rankers/cohere/__init__.py +0 -0
- {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/tests/__init__.py +0 -0
|
@@ -1,5 +1,17 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [integrations/cohere-v2.0.2] - 2025-01-15
|
|
4
|
+
|
|
5
|
+
### 🐛 Bug Fixes
|
|
6
|
+
|
|
7
|
+
- Make GoogleAI Chat Generator compatible with new `ChatMessage`; small fixes to Cohere tests (#1253)
|
|
8
|
+
- Cohere - fix chat message creation (#1289)
|
|
9
|
+
|
|
10
|
+
### 🌀 Miscellaneous
|
|
11
|
+
|
|
12
|
+
- Test: remove tests involving serialization of lambdas (#1281)
|
|
13
|
+
- Test: remove more tests involving serialization of lambdas (#1285)
|
|
14
|
+
|
|
3
15
|
## [integrations/cohere-v2.0.1] - 2024-12-09
|
|
4
16
|
|
|
5
17
|
### ⚙️ CI
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cohere-haystack
|
|
3
|
-
Version: 2.0.2
|
|
3
|
+
Version: 3.0.0
|
|
4
4
|
Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/cohere#readme
|
|
5
5
|
Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
|
|
6
6
|
Project-URL: Source, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/cohere
|
|
@@ -7,7 +7,8 @@ from typing import Any, Dict, List, Optional
|
|
|
7
7
|
from haystack import Document, component, default_from_dict, default_to_dict
|
|
8
8
|
from haystack.utils import Secret, deserialize_secrets_inplace
|
|
9
9
|
|
|
10
|
-
from cohere import AsyncClient, Client
|
|
10
|
+
from cohere import AsyncClientV2, ClientV2
|
|
11
|
+
from haystack_integrations.components.embedders.cohere.embedding_types import EmbeddingTypes
|
|
11
12
|
from haystack_integrations.components.embedders.cohere.utils import get_async_response, get_response
|
|
12
13
|
|
|
13
14
|
|
|
@@ -47,6 +48,7 @@ class CohereDocumentEmbedder:
|
|
|
47
48
|
progress_bar: bool = True,
|
|
48
49
|
meta_fields_to_embed: Optional[List[str]] = None,
|
|
49
50
|
embedding_separator: str = "\n",
|
|
51
|
+
embedding_type: Optional[EmbeddingTypes] = None,
|
|
50
52
|
):
|
|
51
53
|
"""
|
|
52
54
|
:param api_key: the Cohere API key.
|
|
@@ -72,6 +74,8 @@ class CohereDocumentEmbedder:
|
|
|
72
74
|
to keep the logs clean.
|
|
73
75
|
:param meta_fields_to_embed: list of meta fields that should be embedded along with the Document text.
|
|
74
76
|
:param embedding_separator: separator used to concatenate the meta fields to the Document text.
|
|
77
|
+
:param embedding_type: the type of embeddings to return. Defaults to float embeddings.
|
|
78
|
+
Note that int8, uint8, binary, and ubinary are only valid for v3 models.
|
|
75
79
|
"""
|
|
76
80
|
|
|
77
81
|
self.api_key = api_key
|
|
@@ -85,6 +89,7 @@ class CohereDocumentEmbedder:
|
|
|
85
89
|
self.progress_bar = progress_bar
|
|
86
90
|
self.meta_fields_to_embed = meta_fields_to_embed or []
|
|
87
91
|
self.embedding_separator = embedding_separator
|
|
92
|
+
self.embedding_type = embedding_type or EmbeddingTypes.FLOAT
|
|
88
93
|
|
|
89
94
|
def to_dict(self) -> Dict[str, Any]:
|
|
90
95
|
"""
|
|
@@ -106,6 +111,7 @@ class CohereDocumentEmbedder:
|
|
|
106
111
|
progress_bar=self.progress_bar,
|
|
107
112
|
meta_fields_to_embed=self.meta_fields_to_embed,
|
|
108
113
|
embedding_separator=self.embedding_separator,
|
|
114
|
+
embedding_type=self.embedding_type.value,
|
|
109
115
|
)
|
|
110
116
|
|
|
111
117
|
@classmethod
|
|
@@ -120,6 +126,10 @@ class CohereDocumentEmbedder:
|
|
|
120
126
|
"""
|
|
121
127
|
init_params = data.get("init_parameters", {})
|
|
122
128
|
deserialize_secrets_inplace(init_params, ["api_key"])
|
|
129
|
+
|
|
130
|
+
# Convert embedding_type string to EmbeddingTypes enum value
|
|
131
|
+
init_params["embedding_type"] = EmbeddingTypes.from_str(init_params["embedding_type"])
|
|
132
|
+
|
|
123
133
|
return default_from_dict(cls, data)
|
|
124
134
|
|
|
125
135
|
def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]:
|
|
@@ -163,17 +173,19 @@ class CohereDocumentEmbedder:
|
|
|
163
173
|
assert api_key is not None
|
|
164
174
|
|
|
165
175
|
if self.use_async_client:
|
|
166
|
-
cohere_client =
|
|
176
|
+
cohere_client = AsyncClientV2(
|
|
167
177
|
api_key,
|
|
168
178
|
base_url=self.api_base_url,
|
|
169
179
|
timeout=self.timeout,
|
|
170
180
|
client_name="haystack",
|
|
171
181
|
)
|
|
172
182
|
all_embeddings, metadata = asyncio.run(
|
|
173
|
-
get_async_response(
|
|
183
|
+
get_async_response(
|
|
184
|
+
cohere_client, texts_to_embed, self.model, self.input_type, self.truncate, self.embedding_type
|
|
185
|
+
)
|
|
174
186
|
)
|
|
175
187
|
else:
|
|
176
|
-
cohere_client =
|
|
188
|
+
cohere_client = ClientV2(
|
|
177
189
|
api_key,
|
|
178
190
|
base_url=self.api_base_url,
|
|
179
191
|
timeout=self.timeout,
|
|
@@ -187,6 +199,7 @@ class CohereDocumentEmbedder:
|
|
|
187
199
|
self.truncate,
|
|
188
200
|
self.batch_size,
|
|
189
201
|
self.progress_bar,
|
|
202
|
+
self.embedding_type,
|
|
190
203
|
)
|
|
191
204
|
|
|
192
205
|
for doc, embeddings in zip(documents, all_embeddings):
|
cohere_haystack-3.0.0/src/haystack_integrations/components/embedders/cohere/embedding_types.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
from enum import Enum
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class EmbeddingTypes(Enum):
    """
    Supported types for Cohere embeddings.

    FLOAT: Default float embeddings. Valid for all models.
    INT8: Signed int8 embeddings. Valid for only v3 models.
    UINT8: Unsigned int8 embeddings. Valid for only v3 models.
    BINARY: Signed binary embeddings. Valid for only v3 models.
    UBINARY: Unsigned binary embeddings. Valid for only v3 models.
    """

    FLOAT = "float"
    INT8 = "int8"
    UINT8 = "uint8"
    BINARY = "binary"
    UBINARY = "ubinary"

    def __str__(self):
        # Render as the raw string value (e.g. "int8") rather than "EmbeddingTypes.INT8".
        return self.value

    @staticmethod
    def from_str(string: str) -> "EmbeddingTypes":
        """
        Convert a string to an EmbeddingTypes enum.

        The lookup is case-insensitive. An `EmbeddingTypes` member is returned unchanged,
        so the conversion is safe to apply to a value that has already been converted.

        :param string: the embedding type name, e.g. "float" or "INT8".
        :returns: the matching `EmbeddingTypes` member.
        :raises ValueError: if `string` is not a supported embedding type
            (including values that are not strings, such as `None`).
        """
        # Already an enum member: nothing to convert.
        if isinstance(string, EmbeddingTypes):
            return string

        enum_map = {e.value: e for e in EmbeddingTypes}
        # Only strings can be normalised; anything else falls through to the ValueError
        # below instead of failing with an AttributeError on `.lower()`.
        embedding_type = enum_map.get(string.lower()) if isinstance(string, str) else None
        if embedding_type is None:
            msg = f"Unknown embedding type '{string}'. Supported types are: {list(enum_map.keys())}"
            raise ValueError(msg)
        return embedding_type
|
|
@@ -2,12 +2,13 @@
|
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: Apache-2.0
|
|
4
4
|
import asyncio
|
|
5
|
-
from typing import Any, Dict, List
|
|
5
|
+
from typing import Any, Dict, List, Optional
|
|
6
6
|
|
|
7
7
|
from haystack import component, default_from_dict, default_to_dict
|
|
8
8
|
from haystack.utils import Secret, deserialize_secrets_inplace
|
|
9
9
|
|
|
10
|
-
from cohere import AsyncClient, Client
|
|
10
|
+
from cohere import AsyncClientV2, ClientV2
|
|
11
|
+
from haystack_integrations.components.embedders.cohere.embedding_types import EmbeddingTypes
|
|
11
12
|
from haystack_integrations.components.embedders.cohere.utils import get_async_response, get_response
|
|
12
13
|
|
|
13
14
|
|
|
@@ -40,6 +41,7 @@ class CohereTextEmbedder:
|
|
|
40
41
|
truncate: str = "END",
|
|
41
42
|
use_async_client: bool = False,
|
|
42
43
|
timeout: int = 120,
|
|
44
|
+
embedding_type: Optional[EmbeddingTypes] = None,
|
|
43
45
|
):
|
|
44
46
|
"""
|
|
45
47
|
:param api_key: the Cohere API key.
|
|
@@ -60,6 +62,8 @@ class CohereTextEmbedder:
|
|
|
60
62
|
:param use_async_client: flag to select the AsyncClient. It is recommended to use
|
|
61
63
|
AsyncClient for applications with many concurrent calls.
|
|
62
64
|
:param timeout: request timeout in seconds.
|
|
65
|
+
:param embedding_type: the type of embeddings to return. Defaults to float embeddings.
|
|
66
|
+
Note that int8, uint8, binary, and ubinary are only valid for v3 models.
|
|
63
67
|
"""
|
|
64
68
|
|
|
65
69
|
self.api_key = api_key
|
|
@@ -69,6 +73,7 @@ class CohereTextEmbedder:
|
|
|
69
73
|
self.truncate = truncate
|
|
70
74
|
self.use_async_client = use_async_client
|
|
71
75
|
self.timeout = timeout
|
|
76
|
+
self.embedding_type = embedding_type or EmbeddingTypes.FLOAT
|
|
72
77
|
|
|
73
78
|
def to_dict(self) -> Dict[str, Any]:
|
|
74
79
|
"""
|
|
@@ -86,6 +91,7 @@ class CohereTextEmbedder:
|
|
|
86
91
|
truncate=self.truncate,
|
|
87
92
|
use_async_client=self.use_async_client,
|
|
88
93
|
timeout=self.timeout,
|
|
94
|
+
embedding_type=self.embedding_type.value,
|
|
89
95
|
)
|
|
90
96
|
|
|
91
97
|
@classmethod
|
|
@@ -100,6 +106,10 @@ class CohereTextEmbedder:
|
|
|
100
106
|
"""
|
|
101
107
|
init_params = data.get("init_parameters", {})
|
|
102
108
|
deserialize_secrets_inplace(init_params, ["api_key"])
|
|
109
|
+
|
|
110
|
+
# Convert embedding_type string to EmbeddingTypes enum value
|
|
111
|
+
init_params["embedding_type"] = EmbeddingTypes.from_str(init_params["embedding_type"])
|
|
112
|
+
|
|
103
113
|
return default_from_dict(cls, data)
|
|
104
114
|
|
|
105
115
|
@component.output_types(embedding=List[float], meta=Dict[str, Any])
|
|
@@ -125,22 +135,26 @@ class CohereTextEmbedder:
|
|
|
125
135
|
assert api_key is not None
|
|
126
136
|
|
|
127
137
|
if self.use_async_client:
|
|
128
|
-
cohere_client =
|
|
138
|
+
cohere_client = AsyncClientV2(
|
|
129
139
|
api_key,
|
|
130
140
|
base_url=self.api_base_url,
|
|
131
141
|
timeout=self.timeout,
|
|
132
142
|
client_name="haystack",
|
|
133
143
|
)
|
|
134
144
|
embedding, metadata = asyncio.run(
|
|
135
|
-
get_async_response(
|
|
145
|
+
get_async_response(
|
|
146
|
+
cohere_client, [text], self.model, self.input_type, self.truncate, self.embedding_type
|
|
147
|
+
)
|
|
136
148
|
)
|
|
137
149
|
else:
|
|
138
|
-
cohere_client =
|
|
150
|
+
cohere_client = ClientV2(
|
|
139
151
|
api_key,
|
|
140
152
|
base_url=self.api_base_url,
|
|
141
153
|
timeout=self.timeout,
|
|
142
154
|
client_name="haystack",
|
|
143
155
|
)
|
|
144
|
-
embedding, metadata = get_response(
|
|
156
|
+
embedding, metadata = get_response(
|
|
157
|
+
cohere_client, [text], self.model, self.input_type, self.truncate, embedding_type=self.embedding_type
|
|
158
|
+
)
|
|
145
159
|
|
|
146
160
|
return {"embedding": embedding[0], "meta": metadata}
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
5
|
+
|
|
6
|
+
from tqdm import tqdm
|
|
7
|
+
|
|
8
|
+
from cohere import AsyncClientV2, ClientV2
|
|
9
|
+
from haystack_integrations.components.embedders.cohere.embedding_types import EmbeddingTypes
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
async def get_async_response(
    cohere_async_client: AsyncClientV2,
    texts: List[str],
    model_name,
    input_type,
    truncate,
    embedding_type: Optional[EmbeddingTypes] = None,
):
    """Embed a list of texts with a single asynchronous call to the Cohere API.

    :param cohere_async_client: the Cohere `AsyncClientV2`
    :param texts: the texts to embed; all of them are sent in one request
    :param model_name: the name of the model to use
    :param input_type: one of "classification", "clustering", "search_document", "search_query".
        The type of input text provided to embed.
    :param truncate: one of "NONE", "START", "END". How the API handles text longer than the maximum token length.
    :param embedding_type: the type of embeddings to return. Defaults to float embeddings.

    :returns: A tuple of the embeddings and metadata. The metadata is an empty dict
        when the response carries no `meta`.

    Errors raised by the client's `embed` call are not handled here and propagate to the caller.
    """
    requested_type = embedding_type if embedding_type else EmbeddingTypes.FLOAT

    response = await cohere_async_client.embed(
        texts=texts,
        model=model_name,
        input_type=input_type,
        truncate=truncate,
        embedding_types=[requested_type.value],
    )

    metadata: Dict[str, Any] = response.meta if response.meta is not None else {}

    # Iterating response.embeddings yields pairs whose first item names the embedding
    # type ("float", "int8", ...) and whose second item is the list of embeddings for
    # that type (or None). Only the first populated entry is used.
    first_populated = next(
        (pair[1] for pair in response.embeddings if pair[1] is not None),
        None,
    )

    all_embeddings: List[List[float]] = []
    if first_populated is not None:
        all_embeddings.extend(first_populated)

    return all_embeddings, metadata
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def get_response(
    cohere_client: ClientV2,
    texts: List[str],
    model_name,
    input_type,
    truncate,
    batch_size=32,
    progress_bar=False,
    embedding_type: Optional[EmbeddingTypes] = None,
) -> Tuple[List[List[float]], Dict[str, Any]]:
    """Embeds a list of texts using the Cohere API, one batch per request.

    :param cohere_client: the Cohere `ClientV2`
    :param texts: the texts to embed
    :param model_name: the name of the model to use
    :param input_type: one of "classification", "clustering", "search_document", "search_query".
        The type of input text provided to embed.
    :param truncate: one of "NONE", "START", "END". How the API handles text longer than the maximum token length.
    :param batch_size: the number of texts sent per request
    :param progress_bar: if `True`, show a progress bar
    :param embedding_type: the type of embeddings to return. Defaults to float embeddings.

    :returns: A tuple of the embeddings (in the order of `texts`) and metadata. The metadata
        comes from the last batch whose response carried a `meta`; it is an empty dict
        if none did.

    Errors raised by the client's `embed` call are not handled here and propagate to the caller.
    """

    all_embeddings: List[List[float]] = []
    metadata: Dict[str, Any] = {}
    embedding_type = embedding_type or EmbeddingTypes.FLOAT

    for i in tqdm(
        range(0, len(texts), batch_size),
        disable=not progress_bar,
        desc="Calculating embeddings",
    ):
        batch = texts[i : i + batch_size]
        response = cohere_client.embed(
            texts=batch,
            model=model_name,
            input_type=input_type,
            truncate=truncate,
            embedding_types=[embedding_type.value],
        )
        # Iterating response.embeddings yields one (type name, embeddings) pair per
        # embedding type; only the requested type is expected to be populated.
        # Take the first populated pair and stop, exactly as get_async_response does,
        # so a batch can never contribute more vectors than it has texts.
        for emb_tuple in response.embeddings:
            # emb_tuple[0] is a str denoting the embedding type (e.g. "float", "int8", etc.)
            if emb_tuple[1] is not None:
                all_embeddings.extend(emb_tuple[1])
                break
        if response.meta is not None:
            metadata = response.meta

    return all_embeddings, metadata
|