chunkey-bert 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Yaniv Shulman
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,52 @@
1
+ Metadata-Version: 2.1
2
+ Name: chunkey-bert
3
+ Version: 0.1.0
4
+ Summary: Modification of the KeyBERT method to extract keywords and keyphrases using chunks. This provides better results, especially when handling long documents.
5
+ Home-page: https://github.com/yaniv-shulman/chunkey-bert
6
+ Keywords: machine learning
7
+ Author: Yaniv Shulman
8
+ Author-email: yaniv@shulman.info
9
+ Requires-Python: >=3.9,<4.0
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Requires-Dist: keybert (>=0.8.4,<0.9.0)
21
+ Project-URL: Repository, https://github.com/yaniv-shulman/chunkey-bert
22
+ Description-Content-Type: text/markdown
23
+
24
+ ![Tests](https://github.com/yaniv-shulman/chunkey-bert/actions/workflows/linting_and_tests.yml/badge.svg?branch=main)
25
+ [![phorm.ai](https://img.shields.io/badge/ask%20phorm.ai-8A2BE2)](https://www.phorm.ai/query?projectId=f7ddaf97-2b90-4515-a364-855258454655)
26
+
27
+ # ChunkeyBERT #
28
+ ## Overview ##
29
+ ChunkeyBert is a minimal and easy-to-use keyword extraction technique that leverages BERT embeddings for unsupervised
30
+ keyphrase extraction from text documents. ChunkeyBert is a modification of the
31
+ [KeyBERT method](https://towardsdatascience.com/keyword-extraction-with-bert-724efca412ea) to handle documents with
32
+ arbitrary length with better results. ChunkeyBERT works by chunking the documents and uses KeyBERT to extract candidate
33
+ keywords/keyphrases from all chunks followed by a similarity based selection stage to produce the final keywords for the
34
+ entire document. ChunkeyBert can use any document chunking method as long as it can be wrapped in a simple function,
35
+ however it can also work without a chunker and process the entire document as a single chunk. ChunkeyBert works with any
36
+ configuration of KeyBERT and can handle batches of documents.
37
+
38
+ ## Installation ##
39
+ Install from [PyPI](https://pypi.org/project/chunkey-bert/) using pip (preferred method):
40
+ ```bash
41
+ pip install chunkey-bert
42
+ ```
43
+
44
+ ## Experimental results ##
45
+ Very limited experimental results and demonstration of the library on a small number of documents is available at
46
+ https://nbviewer.org/github/yaniv-shulman/chunkey-bert/tree/main/src/experiments/.
47
+
48
+
49
+ ## Contribution and feedback ##
50
+ Contributions and feedback are most welcome. Please see
51
+ [CONTRIBUTING.md](https://github.com/yaniv-shulman/chunkey-bert/tree/main/CONTRIBUTING.md) for further details.
52
+
@@ -0,0 +1,28 @@
1
+ ![Tests](https://github.com/yaniv-shulman/chunkey-bert/actions/workflows/linting_and_tests.yml/badge.svg?branch=main)
2
+ [![phorm.ai](https://img.shields.io/badge/ask%20phorm.ai-8A2BE2)](https://www.phorm.ai/query?projectId=f7ddaf97-2b90-4515-a364-855258454655)
3
+
4
+ # ChunkeyBERT #
5
+ ## Overview ##
6
+ ChunkeyBert is a minimal and easy-to-use keyword extraction technique that leverages BERT embeddings for unsupervised
7
+ keyphrase extraction from text documents. ChunkeyBert is a modification of the
8
+ [KeyBERT method](https://towardsdatascience.com/keyword-extraction-with-bert-724efca412ea) to handle documents with
9
+ arbitrary length with better results. ChunkeyBERT works by chunking the documents and uses KeyBERT to extract candidate
10
+ keywords/keyphrases from all chunks followed by a similarity based selection stage to produce the final keywords for the
11
+ entire document. ChunkeyBert can use any document chunking method as long as it can be wrapped in a simple function,
12
+ however it can also work without a chunker and process the entire document as a single chunk. ChunkeyBert works with any
13
+ configuration of KeyBERT and can handle batches of documents.
14
+
15
+ ## Installation ##
16
+ Install from [PyPI](https://pypi.org/project/chunkey-bert/) using pip (preferred method):
17
+ ```bash
18
+ pip install chunkey-bert
19
+ ```
20
+
21
+ ## Experimental results ##
22
+ Very limited experimental results and demonstration of the library on a small number of documents is available at
23
+ https://nbviewer.org/github/yaniv-shulman/chunkey-bert/tree/main/src/experiments/.
24
+
25
+
26
+ ## Contribution and feedback ##
27
+ Contributions and feedback are most welcome. Please see
28
+ [CONTRIBUTING.md](https://github.com/yaniv-shulman/chunkey-bert/tree/main/CONTRIBUTING.md) for further details.
@@ -0,0 +1,140 @@
1
+ [tool.poetry]
2
+ authors = ["Yaniv Shulman <yaniv@shulman.info>"]
3
+ classifiers = [
4
+ "Intended Audience :: Developers",
5
+ "Intended Audience :: Science/Research",
6
+ "License :: OSI Approved :: MIT License",
7
+ "Programming Language :: Python :: 3.9",
8
+ "Programming Language :: Python :: 3.10",
9
+ "Programming Language :: Python :: 3.11",
10
+ "Topic :: Scientific/Engineering",
11
+ "Topic :: Scientific/Engineering :: Artificial Intelligence"
12
+ ]
13
+ description = "Modification of the KeyBERT method to extract keywords and keyphrases using chunks. This provides better results, especially when handling long documents."
14
+ homepage = "https://github.com/yaniv-shulman/chunkey-bert"
15
+ keywords = [
16
+ "machine learning",
17
+ ]
18
+ name = "chunkey-bert"
19
+ packages = [
20
+ { include = "chunkey_bert", from = "src" }
21
+ ]
22
+ readme = "README.md"
23
+ repository = "https://github.com/yaniv-shulman/chunkey-bert"
24
+ version = "0.1.0"
25
+
26
+ [tool.poetry.group.experiments]
27
+ optional = true
28
+
29
+ [tool.poetry.group.dev]
30
+ optional = true
31
+
32
+ [tool.poetry.dependencies]
33
+ python = ">=3.9,<4.0"
34
+ keybert = "^0.8.4"
35
+
36
+
37
+ [tool.poetry.group.experiments.dependencies]
38
+ notebook = "^7.1.3"
39
+ ipywidgets = "^8.1.2"
40
+ spacy = "^3.7.4"
41
+ cupy-cuda12x = "^13.1.0"
42
+ keyphrase-vectorizers = "^0.0.13"
43
+ sentence-transformers = "^2.7.0"
44
+ en-core-web-trf = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3.tar.gz"}
45
+ datasets = "^2.19.1"
46
+
47
+
48
+ [tool.poetry.group.dev.dependencies]
49
+ black = {extras = ["jupyter"], version = "^24.4.2"}
50
+ mypy = "^1.10.0"
51
+ flake8 = "^7.0.0"
52
+ ruff = "^0.4.3"
53
+ pytest = "^8.2.0"
54
+ pytest-mock = "^3.14.0"
55
+ coverage = {extras = ["toml"], version = "^7.5.1"}
56
+ pytest-xdist = "^3.6.1"
57
+ pytest-cov = "^5.0.0"
58
+
59
+
60
+ [build-system]
61
+ requires = ["poetry-core"]
62
+ build-backend = "poetry.core.masonry.api"
63
+
64
+
65
+ [tool.black]
66
+ line-length = 120
67
+ target-version = ["py39"]
68
+
69
+
70
+ [tool.ruff]
71
+ # Exclude a variety of commonly ignored directories.
72
+ exclude = [
73
+ ".bzr",
74
+ ".direnv",
75
+ ".eggs",
76
+ ".git",
77
+ ".git-rewrite",
78
+ ".hg",
79
+ ".idea",
80
+ ".ipynb_checkpoints",
81
+ ".mypy_cache",
82
+ ".nox",
83
+ ".pants.d",
84
+ ".pytype",
85
+ ".ruff_cache",
86
+ ".svn",
87
+ ".tox",
88
+ ".venv",
89
+ "__pycache__",
90
+ "__pypackages__",
91
+ "_build",
92
+ "buck-out",
93
+ "build",
94
+ "dist",
95
+ "node_modules",
96
+ "paper",
97
+ "venv",
98
+ ]
99
+
100
+ # Same as Black.
101
+ line-length = 120
102
+ indent-width = 4
103
+ target-version = "py39"
104
+
105
+
106
+ [tool.ruff.lint]
107
+ # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default.
108
+ # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or
109
+ # McCabe complexity (`C901`) by default.
110
+ select = ["E4", "E7", "E9", "F"]
111
+ ignore = []
112
+
113
+ # Allow fix for all enabled rules (when `--fix`) is provided.
114
+ fixable = ["ALL"]
115
+ unfixable = []
116
+
117
+ # Allow unused variables when underscore-prefixed.
118
+ dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
119
+
120
+
121
+ [tool.mypy]
122
+ python_version = "3.9"
123
+ warn_return_any = true
124
+ warn_unused_configs = true
125
+ ignore_missing_imports = true
126
+
127
+
128
+ [tool.pytest.ini_options]
129
+ addopts = "-ra -q"
130
+ minversion = "6.0"
131
+ testpaths = ["tests"]
132
+
133
+
134
+ [tool.coverage.run]
135
+ branch = true
136
+ omit = ["tests/*", "src/experiments/*"]
137
+
138
+
139
+ [tool.coverage.report]
140
+ show_missing=true
File without changes
@@ -0,0 +1,306 @@
1
+ import warnings
2
+ from typing import Tuple, List, Optional, Union, Callable
3
+
4
+ import numpy as np
5
+ from keybert import KeyBERT
6
+ from keybert.backend import BaseEmbedder
7
+ from sklearn.feature_extraction.text import CountVectorizer
8
+
9
+
10
def _calculate_top_similar_keywords_for_doc(
    embeddings_doc: np.ndarray,
    counts_doc: Optional[np.ndarray],
    top_k: Optional[int] = None,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Given embeddings to all keywords extracted from all chunks comprising a document, this method determines the top k
    most similar keywords across all chunks. The method assumes the embeddings are normalized and uses dot product
    similarity (cosine similarity). The score is then normalized to the range [0,1].

    Args:
        embeddings_doc: embeddings of all keywords extracted from a single document.
        counts_doc: If provided, the multiplicity of keywords extracted from the document to use in calculating
            similarity weighting.
        top_k: the number of top keywords to return. If unspecified, returns all keywords sorted by decreasing score.

    Returns:
        The top k most similar keywords across all chunks and their score in [0,1].

    Raises:
        ValueError: If top_k is negative.
    """
    if top_k is not None and top_k < 0:
        raise ValueError("top_k must be greater than or equal to 0, or None.")

    sim: np.ndarray

    if embeddings_doc.shape[0] == 1:
        # A single keyword has no peers to average against; without this guard the
        # nanmean below would average an all-NaN row, emitting a RuntimeWarning and
        # producing a NaN score. Assign the maximal shifted similarity (1.0 + 1.0)
        # so the sole keyword scores 1.0 after the final normalization.
        sim = np.full(shape=1, fill_value=2.0)
    else:
        sim = embeddings_doc @ embeddings_doc.T
        # Exclude self-similarity from the per-keyword average.
        np.fill_diagonal(a=sim, val=np.nan)
        sim = np.nanmean(sim, axis=1)
        # Shift cosine similarity from [-1, 1] into [0, 2]; halved to [0, 1] below.
        sim = np.clip(sim, a_min=-1, a_max=1) + 1.0

    if counts_doc is not None:
        # Log-scaled multiplicity weighting, normalized so the most frequent keyword
        # keeps weight 1 and the rest are attenuated.
        weights: np.ndarray = np.log2(counts_doc + 1)
        weights /= np.max(weights)
        sim *= weights

    sim /= 2
    top_idx: np.ndarray = np.argsort(sim)

    if top_k is None:
        top_idx = np.flip(top_idx)
    else:
        # Take the top_k highest scores in decreasing order.
        top_idx = top_idx[-1 : -top_k - 1 : -1]

    return top_idx, sim[top_idx]
51
+
52
+
53
def _get_unique_keywords_by_doc_idx(
    all_keywords_chunks: List[List[Tuple[str, float]]],
    docs_idx_list: List[int],
    doc_idx: int,
    use_count_weights: bool,
) -> Tuple[np.ndarray, Optional[np.ndarray]]:
    """
    Extract all unique keywords (case sensitive) for a single doc from the list of keywords for all chunks.

    Args:
        all_keywords_chunks: Keywords for all chunks of documents.
        docs_idx_list: Mapping chunks keywords indices to documents.
        doc_idx: The document index to extract keywords for.
        use_count_weights: If True, the number of times a keyword is repeated across chunks in the same document is
            returned. If False, None is returned as counts.

    Returns:
        All unique keywords for a single doc from the list of keywords and optionally their counts.

    Raises:
        ValueError: If doc_idx does not index a document described by docs_idx_list.
    """
    # docs_idx_list holds len(docs) + 1 boundary offsets; document i spans
    # [docs_idx_list[i], docs_idx_list[i + 1]). Valid doc indices are therefore
    # 0 .. len(docs_idx_list) - 2; the upper bound must exclude len - 1 or the
    # docs_idx_list[doc_idx + 1] lookup below raises IndexError instead of the
    # intended ValueError.
    if doc_idx < 0 or doc_idx >= len(docs_idx_list) - 1:
        raise ValueError("doc_idx out of range")

    chunk_i_keywords: List[List[Tuple[str, float]]] = all_keywords_chunks[
        docs_idx_list[doc_idx] : docs_idx_list[doc_idx + 1]
    ]

    keywords_doc_unique: Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]

    # Flatten all (keyword, score) tuples for this doc's chunks, dropping empty or
    # whitespace-only keywords, then deduplicate (optionally with multiplicities).
    keywords_doc_unique = np.unique(
        ar=[t[0] for lk in chunk_i_keywords for t in lk if (len(t[0]) > 0 and not t[0].isspace())],  # type: ignore[call-overload]
        return_counts=use_count_weights,
    )

    return (
        (keywords_doc_unique[0].astype(str), keywords_doc_unique[1])  # type: ignore[return-value]
        if use_count_weights
        else (keywords_doc_unique, None)
    )
91
+
92
+
93
+ def _extract_chunks_from_docs(
94
+ docs: Union[str, List[str]],
95
+ chunker: Optional[Callable[[str], List[str]]],
96
+ ) -> Tuple[List[str], List[int]]:
97
+ """
98
+ Applies the chunker to the docs to create a flat list of contiguous doc chunks with a list of indices that index the
99
+ beginning and end of each doc's chunks. The chunker is applied to each document independently. These chunks
100
+ represent the document for subsequent processing and each is later provided to KeyBERT to extract keywords. There is
101
+ no need for the chunker to return all the text in the document, and it can apply filtering and sampling to reduce
102
+ downstream processing complexity. If a chunker is not provided then the chunks returned are the input documents.
103
+
104
+ Args:
105
+ docs: The documents to chunk.
106
+ chunker: A callable that takes a string and returns a list of strings. This is applied to each document
107
+ independently
108
+
109
+ Returns:
110
+ A flat list of contiguous doc chunks with a list of indices that index the beginning and end of each doc's
111
+ chunks.
112
+ """
113
+ if len(docs) == 0:
114
+ return [], []
115
+
116
+ if isinstance(docs, str):
117
+ docs = [docs]
118
+
119
+ chunks: List[str]
120
+ idx: List[int]
121
+
122
+ if chunker is not None:
123
+ chunks = []
124
+ idx = [0] * (len(docs) + 1)
125
+ i: int
126
+ doc: str
127
+
128
+ for i, doc in enumerate(docs):
129
+ chunks_doc: List[str] = chunker(doc)
130
+ chunks.extend(chunks_doc)
131
+ idx[i + 1] = len(chunks)
132
+ else:
133
+ chunks = docs
134
+ idx = list(range(len(docs) + 1))
135
+
136
+ return chunks, idx
137
+
138
+
139
class ChunkeyBert:
    """Keyword extraction over chunked documents using a wrapped KeyBERT model."""

    def __init__(
        self,
        keybert: KeyBERT,
    ) -> None:
        """
        Args:
            keybert: The configured KeyBERT instance used to extract candidate keywords from document chunks.

        Raises:
            ValueError: If the embedder backing the provided KeyBERT instance fails a smoke-test embedding call.
        """
        self._keybert: KeyBERT = keybert
        self._embedder: BaseEmbedder = keybert.model
        try:
            # Probe the embedder once so a misconfigured model fails fast at construction time.
            self._embedding_dim: int = self._embedder.embed(documents=["Determining model embedding dim"]).shape[-1]
        except Exception as exc:
            # Chain the original exception so the root cause stays visible in the traceback.
            raise ValueError(
                f"The provided embedder model is not working as expected. Original exception {exc}"
            ) from exc

    def extract_keywords(
        self,
        docs: Union[str, List[str]],
        num_keywords: int,
        chunker: Optional[Callable[[str], List[str]]] = None,
        return_keywords_embeddings: bool = False,
        use_count_weights: bool = True,
        candidates: Optional[List[str]] = None,
        keyphrase_ngram_range: Tuple[int, int] = (1, 1),
        stop_words: Union[str, List[str]] = "english",
        top_n: int = 3,
        min_df: int = 1,
        use_maxsum: bool = False,
        use_mmr: bool = False,
        diversity: float = 0.5,
        nr_candidates: int = 20,
        vectorizer: Optional[CountVectorizer] = None,
        highlight: bool = False,
        seed_keywords: Optional[Union[List[str], List[List[str]]]] = None,
        doc_embeddings: Optional[np.ndarray] = None,
        word_embeddings: Optional[np.ndarray] = None,
        threshold: Optional[float] = None,
    ) -> List[Optional[Union[List[Tuple[str, np.float32]], List[Tuple[str, np.float32, np.ndarray]]]]]:
        """
        Extract the unique keywords/keyphrases for the provided documents. The method uses the chunker if provided to
        chunk the document and then use the KeyBERT model to extract keywords from each chunk. Finally, it merges all
        the results and finds the most similar keywords across all chunks. If a chunker is not provided the documents
        are provided as inputs to KeyBERT in their entirety and the similarity is calculated for the keywords extracted
        of each complete document.

        Args:
            docs: The documents to extract keywords for.
            num_keywords: The maximum number of keywords to extract.
            chunker: Chunks the documents. The chunker can be any callable that takes a string and returns a list of
                strings. There are no constraints on the chunks, their length or order, e.g. chunks may be disjoint or
                overlap and can be filtered or even sampled from the document.
            return_keywords_embeddings: True to include the keywords embeddings in the returned list.
            use_count_weights: If True, the number of times a keyword is repeated across chunks in the same document is
                considered in scoring. If False it has no impact. Seems to work best when a small KeyBERT top_n value is
                specified.
            candidates: A KeyBert.extract_keywords parameter. Candidate keywords/keyphrases to use instead of extracting
                them from the document(s). NOTE: This is not used if you passed a `vectorizer`.
            keyphrase_ngram_range: A KeyBert.extract_keywords parameter. Length, in words, of the extracted
                keywords/keyphrases. NOTE: This is not used if you passed a `vectorizer`.
            stop_words: A KeyBert.extract_keywords parameter. Stopwords to remove from the document. NOTE: This is not
                used if you passed a `vectorizer`.
            top_n: A KeyBert.extract_keywords parameter. Return the top n keywords/keyphrases
            min_df: A KeyBert.extract_keywords parameter. Minimum document frequency of a word across all documents if
                keywords for multiple documents need to be extracted. NOTE: This is not used if you passed a
                `vectorizer`.
            use_maxsum: A KeyBert.extract_keywords parameter. Whether to use Max Sum Distance for the selection of
                keywords/keyphrases.
            use_mmr: A KeyBert.extract_keywords parameter. Whether to use Maximal Marginal Relevance (MMR) for the
                selection of keywords/keyphrases.
            diversity: A KeyBert.extract_keywords parameter. The diversity of the results between 0 and 1 if `use_mmr`
                is set to True.
            nr_candidates: A KeyBert.extract_keywords parameter. The number of candidates to consider if `use_maxsum` is
                set to True.
            vectorizer: A KeyBert.extract_keywords parameter. Pass in your own `CountVectorizer` from
                `sklearn.feature_extraction.text.CountVectorizer`
            highlight: A KeyBert.extract_keywords parameter. Whether to print the document and highlight its
                keywords/keyphrases. NOTE: This does not work if multiple documents are passed.
            seed_keywords: A KeyBert.extract_keywords parameter. Seed keywords that may guide the extraction of keywords
                by steering the similarities towards the seeded keywords. NOTE: when multiple documents are passed,
                `seed_keywords` functions in either of the two ways:
                - globally: when a flat list of str is passed, keywords are shared by all documents,
                - locally: when a nested list of str is passed, keywords differs among documents.
            doc_embeddings: A KeyBert.extract_keywords parameter. The embeddings of each document.
            word_embeddings: A KeyBert.extract_keywords parameter. The embeddings of each potential keyword/keyphrase
                across the vocabulary of the set of input documents. NOTE: The `word_embeddings` should be generated
                through `.extract_embeddings` as the order of these embeddings depend on the vectorizer that was used to
                generate its vocabulary.
            threshold: Used by KeyBERT but is undocumented. Seems to be given to community_detection in
                sentence_transformers.utils to determine clusters.

        Returns:
            The top keywords/keyphrases for each corresponding document and their score or None if no keywords are
            available for a document. Optionally the embeddings for each keyword/keyphrase are returned if specified.
        """
        if chunker is None and num_keywords > top_n:
            warnings.warn(
                message="Setting num_keywords higher than top_n without a chunker will result in at most top_n "
                "keywords/keyphrases returned."
            )

        if isinstance(docs, str):
            docs = [docs]

        chunks: List[str]
        docs_idx_list: List[int]

        chunks, docs_idx_list = _extract_chunks_from_docs(docs=docs, chunker=chunker)

        if len(chunks) == 0:
            return []

        # The last offset must equal the number of chunks; a mismatch indicates the
        # chunk-to-document index bookkeeping is broken.
        if len(chunks) != docs_idx_list[-1]:
            raise RuntimeError("Chunk indices do not map back to the documents, this is likely a chunker issue.")

        all_keywords_chunks: List[List[Tuple[str, float]]] = self._keybert.extract_keywords(
            docs=chunks,
            candidates=candidates,
            keyphrase_ngram_range=keyphrase_ngram_range,
            stop_words=stop_words,
            top_n=top_n,
            min_df=min_df,
            use_maxsum=use_maxsum,
            use_mmr=use_mmr,
            diversity=diversity,
            nr_candidates=nr_candidates,
            vectorizer=vectorizer,
            highlight=highlight,
            seed_keywords=seed_keywords,
            doc_embeddings=doc_embeddings,
            word_embeddings=word_embeddings,
            threshold=threshold,
        )

        # KeyBERT returns a flat list of tuples for a single doc and a list of lists
        # for multiple docs; chunks is always a list here so expect the latter.
        # Note: isinstance checks must use the builtin list, not typing.List.
        if len(all_keywords_chunks) > 0 and not isinstance(all_keywords_chunks[0], list):
            raise ValueError("Unexpected type returned by keybert.extract_keywords().")

        keywords: List[Optional[Union[List[Tuple[str, np.float32]], List[Tuple[str, np.float32, np.ndarray]]]]] = []
        doc_idx: int

        for doc_idx in range(len(docs_idx_list) - 1):
            keywords_doc: np.ndarray
            counts_doc: Optional[np.ndarray]

            keywords_doc, counts_doc = _get_unique_keywords_by_doc_idx(
                all_keywords_chunks=all_keywords_chunks,
                docs_idx_list=docs_idx_list,
                doc_idx=doc_idx,
                use_count_weights=use_count_weights,
            )

            if keywords_doc.shape[0] > 0:
                # Embed the deduplicated keywords so cross-chunk similarity can be scored.
                embeddings_doc: np.ndarray = self._embedder.embed(documents=keywords_doc)

                top_idx: np.ndarray
                score: np.ndarray

                top_idx, score = _calculate_top_similar_keywords_for_doc(
                    embeddings_doc=embeddings_doc,
                    counts_doc=counts_doc,
                    top_k=num_keywords,
                )

                if not return_keywords_embeddings:
                    keywords.append(list(zip(keywords_doc[top_idx], score)))
                else:
                    keywords.append(list(zip(keywords_doc[top_idx], score, embeddings_doc[top_idx])))

            else:
                # No usable keywords were extracted for this document.
                keywords.append(None)

        return keywords