cohere-haystack 2.0.2__tar.gz → 3.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/CHANGELOG.md +12 -0
  2. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/PKG-INFO +1 -1
  3. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/pyproject.toml +1 -0
  4. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/src/haystack_integrations/components/embedders/cohere/document_embedder.py +17 -4
  5. cohere_haystack-3.0.0/src/haystack_integrations/components/embedders/cohere/embedding_types.py +37 -0
  6. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/src/haystack_integrations/components/embedders/cohere/text_embedder.py +20 -6
  7. cohere_haystack-3.0.0/src/haystack_integrations/components/embedders/cohere/utils.py +112 -0
  8. cohere_haystack-3.0.0/src/haystack_integrations/components/generators/cohere/chat/chat_generator.py +430 -0
  9. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/src/haystack_integrations/components/rankers/cohere/ranker.py +16 -2
  10. cohere_haystack-3.0.0/tests/test_cohere_chat_generator.py +473 -0
  11. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/tests/test_cohere_generator.py +3 -0
  12. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/tests/test_cohere_ranker.py +10 -1
  13. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/tests/test_document_embedder.py +7 -1
  14. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/tests/test_text_embedder.py +5 -0
  15. cohere_haystack-2.0.2/src/haystack_integrations/components/embedders/cohere/utils.py +0 -71
  16. cohere_haystack-2.0.2/src/haystack_integrations/components/generators/cohere/chat/chat_generator.py +0 -236
  17. cohere_haystack-2.0.2/tests/test_cohere_chat_generator.py +0 -280
  18. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/.gitignore +0 -0
  19. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/LICENSE.txt +0 -0
  20. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/README.md +0 -0
  21. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/examples/cohere_embedding.py +0 -0
  22. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/examples/cohere_generation.py +0 -0
  23. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/examples/cohere_ranker.py +0 -0
  24. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/pydoc/config.yml +0 -0
  25. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/src/haystack_integrations/components/embedders/cohere/__init__.py +0 -0
  26. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/src/haystack_integrations/components/generators/cohere/__init__.py +0 -0
  27. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/src/haystack_integrations/components/generators/cohere/chat/__init__.py +0 -0
  28. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/src/haystack_integrations/components/generators/cohere/generator.py +0 -0
  29. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/src/haystack_integrations/components/rankers/cohere/__init__.py +0 -0
  30. {cohere_haystack-2.0.2 → cohere_haystack-3.0.0}/tests/__init__.py +0 -0
@@ -1,5 +1,17 @@
1
1
  # Changelog
2
2
 
3
+ ## [integrations/cohere-v2.0.2] - 2025-01-15
4
+
5
+ ### 🐛 Bug Fixes
6
+
7
+ - Make GoogleAI Chat Generator compatible with new `ChatMessage`; small fixes to Cohere tests (#1253)
8
+ - Cohere - fix chat message creation (#1289)
9
+
10
+ ### 🌀 Miscellaneous
11
+
12
+ - Test: remove tests involving serialization of lambdas (#1281)
13
+ - Test: remove more tests involving serialization of lambdas (#1285)
14
+
3
15
  ## [integrations/cohere-v2.0.1] - 2024-12-09
4
16
 
5
17
  ### ⚙️ CI
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cohere-haystack
3
- Version: 2.0.2
3
+ Version: 3.0.0
4
4
  Project-URL: Documentation, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/cohere#readme
5
5
  Project-URL: Issues, https://github.com/deepset-ai/haystack-core-integrations/issues
6
6
  Project-URL: Source, https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/cohere
@@ -47,6 +47,7 @@ dependencies = [
47
47
  "pytest",
48
48
  "pytest-rerunfailures",
49
49
  "haystack-pydoc-tools",
50
+ "jsonschema" # for tools
50
51
  ]
51
52
  [tool.hatch.envs.default.scripts]
52
53
  test = "pytest {args:tests}"
@@ -7,7 +7,8 @@ from typing import Any, Dict, List, Optional
7
7
  from haystack import Document, component, default_from_dict, default_to_dict
8
8
  from haystack.utils import Secret, deserialize_secrets_inplace
9
9
 
10
- from cohere import AsyncClient, Client
10
+ from cohere import AsyncClientV2, ClientV2
11
+ from haystack_integrations.components.embedders.cohere.embedding_types import EmbeddingTypes
11
12
  from haystack_integrations.components.embedders.cohere.utils import get_async_response, get_response
12
13
 
13
14
 
@@ -47,6 +48,7 @@ class CohereDocumentEmbedder:
47
48
  progress_bar: bool = True,
48
49
  meta_fields_to_embed: Optional[List[str]] = None,
49
50
  embedding_separator: str = "\n",
51
+ embedding_type: Optional[EmbeddingTypes] = None,
50
52
  ):
51
53
  """
52
54
  :param api_key: the Cohere API key.
@@ -72,6 +74,8 @@ class CohereDocumentEmbedder:
72
74
  to keep the logs clean.
73
75
  :param meta_fields_to_embed: list of meta fields that should be embedded along with the Document text.
74
76
  :param embedding_separator: separator used to concatenate the meta fields to the Document text.
77
+ :param embedding_type: the type of embeddings to return. Defaults to float embeddings.
78
+ Note that int8, uint8, binary, and ubinary are only valid for v3 models.
75
79
  """
76
80
 
77
81
  self.api_key = api_key
@@ -85,6 +89,7 @@ class CohereDocumentEmbedder:
85
89
  self.progress_bar = progress_bar
86
90
  self.meta_fields_to_embed = meta_fields_to_embed or []
87
91
  self.embedding_separator = embedding_separator
92
+ self.embedding_type = embedding_type or EmbeddingTypes.FLOAT
88
93
 
89
94
  def to_dict(self) -> Dict[str, Any]:
90
95
  """
@@ -106,6 +111,7 @@ class CohereDocumentEmbedder:
106
111
  progress_bar=self.progress_bar,
107
112
  meta_fields_to_embed=self.meta_fields_to_embed,
108
113
  embedding_separator=self.embedding_separator,
114
+ embedding_type=self.embedding_type.value,
109
115
  )
110
116
 
111
117
  @classmethod
@@ -120,6 +126,10 @@ class CohereDocumentEmbedder:
120
126
  """
121
127
  init_params = data.get("init_parameters", {})
122
128
  deserialize_secrets_inplace(init_params, ["api_key"])
129
+
130
+ # Convert embedding_type string to EmbeddingTypes enum value
131
+ init_params["embedding_type"] = EmbeddingTypes.from_str(init_params["embedding_type"])
132
+
123
133
  return default_from_dict(cls, data)
124
134
 
125
135
  def _prepare_texts_to_embed(self, documents: List[Document]) -> List[str]:
@@ -163,17 +173,19 @@ class CohereDocumentEmbedder:
163
173
  assert api_key is not None
164
174
 
165
175
  if self.use_async_client:
166
- cohere_client = AsyncClient(
176
+ cohere_client = AsyncClientV2(
167
177
  api_key,
168
178
  base_url=self.api_base_url,
169
179
  timeout=self.timeout,
170
180
  client_name="haystack",
171
181
  )
172
182
  all_embeddings, metadata = asyncio.run(
173
- get_async_response(cohere_client, texts_to_embed, self.model, self.input_type, self.truncate)
183
+ get_async_response(
184
+ cohere_client, texts_to_embed, self.model, self.input_type, self.truncate, self.embedding_type
185
+ )
174
186
  )
175
187
  else:
176
- cohere_client = Client(
188
+ cohere_client = ClientV2(
177
189
  api_key,
178
190
  base_url=self.api_base_url,
179
191
  timeout=self.timeout,
@@ -187,6 +199,7 @@ class CohereDocumentEmbedder:
187
199
  self.truncate,
188
200
  self.batch_size,
189
201
  self.progress_bar,
202
+ self.embedding_type,
190
203
  )
191
204
 
192
205
  for doc, embeddings in zip(documents, all_embeddings):
@@ -0,0 +1,37 @@
1
+ # SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ from enum import Enum
5
+
6
+
7
+ class EmbeddingTypes(Enum):
8
+ """
9
+ Supported types for Cohere embeddings.
10
+
11
+ FLOAT: Default float embeddings. Valid for all models.
12
+ INT8: Signed int8 embeddings. Valid for only v3 models.
13
+ UINT8: Unsigned int8 embeddings. Valid for only v3 models.
14
+ BINARY: Signed binary embeddings. Valid for only v3 models.
15
+ UBINARY: Unsigned binary embeddings. Valid for only v3 models.
16
+ """
17
+
18
+ FLOAT = "float"
19
+ INT8 = "int8"
20
+ UINT8 = "uint8"
21
+ BINARY = "binary"
22
+ UBINARY = "ubinary"
23
+
24
+ def __str__(self):
25
+ return self.value
26
+
27
+ @staticmethod
28
+ def from_str(string: str) -> "EmbeddingTypes":
29
+ """
30
+ Convert a string to an EmbeddingTypes enum.
31
+ """
32
+ enum_map = {e.value: e for e in EmbeddingTypes}
33
+ embedding_type = enum_map.get(string.lower())
34
+ if embedding_type is None:
35
+ msg = f"Unknown embedding type '{string}'. Supported types are: {list(enum_map.keys())}"
36
+ raise ValueError(msg)
37
+ return embedding_type
@@ -2,12 +2,13 @@
2
2
  #
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
  import asyncio
5
- from typing import Any, Dict, List
5
+ from typing import Any, Dict, List, Optional
6
6
 
7
7
  from haystack import component, default_from_dict, default_to_dict
8
8
  from haystack.utils import Secret, deserialize_secrets_inplace
9
9
 
10
- from cohere import AsyncClient, Client
10
+ from cohere import AsyncClientV2, ClientV2
11
+ from haystack_integrations.components.embedders.cohere.embedding_types import EmbeddingTypes
11
12
  from haystack_integrations.components.embedders.cohere.utils import get_async_response, get_response
12
13
 
13
14
 
@@ -40,6 +41,7 @@ class CohereTextEmbedder:
40
41
  truncate: str = "END",
41
42
  use_async_client: bool = False,
42
43
  timeout: int = 120,
44
+ embedding_type: Optional[EmbeddingTypes] = None,
43
45
  ):
44
46
  """
45
47
  :param api_key: the Cohere API key.
@@ -60,6 +62,8 @@ class CohereTextEmbedder:
60
62
  :param use_async_client: flag to select the AsyncClient. It is recommended to use
61
63
  AsyncClient for applications with many concurrent calls.
62
64
  :param timeout: request timeout in seconds.
65
+ :param embedding_type: the type of embeddings to return. Defaults to float embeddings.
66
+ Note that int8, uint8, binary, and ubinary are only valid for v3 models.
63
67
  """
64
68
 
65
69
  self.api_key = api_key
@@ -69,6 +73,7 @@ class CohereTextEmbedder:
69
73
  self.truncate = truncate
70
74
  self.use_async_client = use_async_client
71
75
  self.timeout = timeout
76
+ self.embedding_type = embedding_type or EmbeddingTypes.FLOAT
72
77
 
73
78
  def to_dict(self) -> Dict[str, Any]:
74
79
  """
@@ -86,6 +91,7 @@ class CohereTextEmbedder:
86
91
  truncate=self.truncate,
87
92
  use_async_client=self.use_async_client,
88
93
  timeout=self.timeout,
94
+ embedding_type=self.embedding_type.value,
89
95
  )
90
96
 
91
97
  @classmethod
@@ -100,6 +106,10 @@ class CohereTextEmbedder:
100
106
  """
101
107
  init_params = data.get("init_parameters", {})
102
108
  deserialize_secrets_inplace(init_params, ["api_key"])
109
+
110
+ # Convert embedding_type string to EmbeddingTypes enum value
111
+ init_params["embedding_type"] = EmbeddingTypes.from_str(init_params["embedding_type"])
112
+
103
113
  return default_from_dict(cls, data)
104
114
 
105
115
  @component.output_types(embedding=List[float], meta=Dict[str, Any])
@@ -125,22 +135,26 @@ class CohereTextEmbedder:
125
135
  assert api_key is not None
126
136
 
127
137
  if self.use_async_client:
128
- cohere_client = AsyncClient(
138
+ cohere_client = AsyncClientV2(
129
139
  api_key,
130
140
  base_url=self.api_base_url,
131
141
  timeout=self.timeout,
132
142
  client_name="haystack",
133
143
  )
134
144
  embedding, metadata = asyncio.run(
135
- get_async_response(cohere_client, [text], self.model, self.input_type, self.truncate)
145
+ get_async_response(
146
+ cohere_client, [text], self.model, self.input_type, self.truncate, self.embedding_type
147
+ )
136
148
  )
137
149
  else:
138
- cohere_client = Client(
150
+ cohere_client = ClientV2(
139
151
  api_key,
140
152
  base_url=self.api_base_url,
141
153
  timeout=self.timeout,
142
154
  client_name="haystack",
143
155
  )
144
- embedding, metadata = get_response(cohere_client, [text], self.model, self.input_type, self.truncate)
156
+ embedding, metadata = get_response(
157
+ cohere_client, [text], self.model, self.input_type, self.truncate, embedding_type=self.embedding_type
158
+ )
145
159
 
146
160
  return {"embedding": embedding[0], "meta": metadata}
@@ -0,0 +1,112 @@
1
+ # SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ from typing import Any, Dict, List, Optional, Tuple
5
+
6
+ from tqdm import tqdm
7
+
8
+ from cohere import AsyncClientV2, ClientV2
9
+ from haystack_integrations.components.embedders.cohere.embedding_types import EmbeddingTypes
10
+
11
+
12
+ async def get_async_response(
13
+ cohere_async_client: AsyncClientV2,
14
+ texts: List[str],
15
+ model_name,
16
+ input_type,
17
+ truncate,
18
+ embedding_type: Optional[EmbeddingTypes] = None,
19
+ ):
20
+ """Embeds a list of texts asynchronously using the Cohere API.
21
+
22
+ :param cohere_async_client: the Cohere `AsyncClientV2`
23
+ :param texts: the texts to embed
24
+ :param model_name: the name of the model to use
25
+ :param input_type: one of "classification", "clustering", "search_document", "search_query".
26
+ The type of input text provided to embed.
27
+ :param truncate: one of "NONE", "START", "END". How the API handles text longer than the maximum token length.
28
+ :param embedding_type: the type of embeddings to return. Defaults to float embeddings.
29
+
30
+ :returns: A tuple of the embeddings and metadata.
31
+
32
+ :raises ValueError: If an error occurs while querying the Cohere API.
33
+ """
34
+ all_embeddings: List[List[float]] = []
35
+ metadata: Dict[str, Any] = {}
36
+
37
+ embedding_type = embedding_type or EmbeddingTypes.FLOAT
38
+ response = await cohere_async_client.embed(
39
+ texts=texts,
40
+ model=model_name,
41
+ input_type=input_type,
42
+ truncate=truncate,
43
+ embedding_types=[embedding_type.value],
44
+ )
45
+ if response.meta is not None:
46
+ metadata = response.meta
47
+ for emb_tuple in response.embeddings:
48
+ # emb_tuple[0] is a str denoting the embedding type (e.g. "float", "int8", etc.)
49
+ if emb_tuple[1] is not None:
50
+ # ok we have embeddings for this type, let's take all
51
+ # the embeddings (a list of embeddings) and break the loop
52
+ all_embeddings.extend(emb_tuple[1])
53
+ break
54
+
55
+ return all_embeddings, metadata
56
+
57
+
58
+ def get_response(
59
+ cohere_client: ClientV2,
60
+ texts: List[str],
61
+ model_name,
62
+ input_type,
63
+ truncate,
64
+ batch_size=32,
65
+ progress_bar=False,
66
+ embedding_type: Optional[EmbeddingTypes] = None,
67
+ ) -> Tuple[List[List[float]], Dict[str, Any]]:
68
+ """Embeds a list of texts using the Cohere API.
69
+
70
+ :param cohere_client: the Cohere `ClientV2`
71
+ :param texts: the texts to embed
72
+ :param model_name: the name of the model to use
73
+ :param input_type: one of "classification", "clustering", "search_document", "search_query".
74
+ The type of input text provided to embed.
75
+ :param truncate: one of "NONE", "START", "END". How the API handles text longer than the maximum token length.
76
+ :param batch_size: the batch size to use
77
+ :param progress_bar: if `True`, show a progress bar
78
+ :param embedding_type: the type of embeddings to return. Defaults to float embeddings.
79
+
80
+ :returns: A tuple of the embeddings and metadata.
81
+
82
+ :raises ValueError: If an error occurs while querying the Cohere API.
83
+ """
84
+
85
+ all_embeddings: List[List[float]] = []
86
+ metadata: Dict[str, Any] = {}
87
+ embedding_type = embedding_type or EmbeddingTypes.FLOAT
88
+
89
+ for i in tqdm(
90
+ range(0, len(texts), batch_size),
91
+ disable=not progress_bar,
92
+ desc="Calculating embeddings",
93
+ ):
94
+ batch = texts[i : i + batch_size]
95
+ response = cohere_client.embed(
96
+ texts=batch,
97
+ model=model_name,
98
+ input_type=input_type,
99
+ truncate=truncate,
100
+ embedding_types=[embedding_type.value],
101
+ )
102
+ ## response.embeddings always returns 5 tuples, one tuple per embedding type
103
+ ## let's take first non None tuple as that's the one we want
104
+ for emb_tuple in response.embeddings:
105
+ # emb_tuple[0] is a str denoting the embedding type (e.g. "float", "int8", etc.)
106
+ if emb_tuple[1] is not None:
107
+ # ok we have embeddings for this type, let's take all the embeddings (a list of embeddings)
108
+ all_embeddings.extend(emb_tuple[1])
109
+ if response.meta is not None:
110
+ metadata = response.meta
111
+
112
+ return all_embeddings, metadata