cbrkit 0.6.2__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cbrkit
3
- Version: 0.6.2
3
+ Version: 0.7.0
4
4
  Summary: Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI.
5
5
  Home-page: https://wi2trier.github.io/cbrkit/
6
6
  License: MIT
@@ -3,8 +3,10 @@
3
3
  """
4
4
 
5
5
  import csv
6
+ import fnmatch
6
7
  import itertools
7
- from collections.abc import Sequence
8
+ import re
9
+ from collections.abc import Callable, Sequence
8
10
  from pathlib import Path
9
11
  from typing import cast
10
12
 
@@ -187,6 +189,80 @@ def jaro_winkler(
187
189
  return wrapped_func
188
190
 
189
191
 
192
def ngram(
    n: int,
    case_sensitive: bool = False,
    tokenizer: Callable[[str], Sequence[str]] | None = None,
) -> SimPairFunc[str, float]:
    """N-gram similarity function to compute [similarity](https://procake.pages.gitlab.rlp.net/procake-wiki/sim/strings/#n-gram) between two strings.

    The similarity is the Jaccard index of the two n-gram sets, i.e. the number
    of shared n-grams divided by the number of distinct n-grams overall.

    Args:
        n: Length of the n-gram, must be at least 1
        case_sensitive: If True, the comparison is case-sensitive
        tokenizer: Tokenizer function to split the input strings into tokens. If None, the input strings are split into characters.

    Returns:
        A function mapping two strings to a similarity in the range [0.0, 1.0].
        If neither string yields an n-gram (both have fewer than ``n`` tokens),
        the result is 1.0 when their token sequences are equal and 0.0 otherwise.

    Raises:
        ValueError: If ``n`` is smaller than 1.

    Examples:
        >>> sim = ngram(3, case_sensitive=False)
        >>> sim("kitten", "sitting")
        0.125
        >>> sim("ab", "ab")
        1.0
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    def tokenize(text: str) -> tuple[str, ...]:
        # Materialize the tokens so they can be sliced and compared.
        return tuple(tokenizer(text)) if tokenizer is not None else tuple(text)

    def ngram_set(tokens: tuple[str, ...]) -> set[tuple[str, ...]]:
        # Zipping n progressively shifted views yields every window of
        # n consecutive tokens; inputs shorter than n yield no window.
        return set(zip(*(tokens[offset:] for offset in range(n))))

    def wrapped_func(x: str, y: str) -> float:
        if not case_sensitive:
            x = x.lower()
            y = y.lower()

        x_tokens = tokenize(x)
        y_tokens = tokenize(y)

        x_ngrams = ngram_set(x_tokens)
        y_ngrams = ngram_set(y_tokens)
        all_ngrams = x_ngrams | y_ngrams

        if not all_ngrams:
            # Both inputs are too short to form a single n-gram. Dividing by
            # the empty union would raise ZeroDivisionError, so fall back to
            # comparing the token sequences directly.
            return 1.0 if x_tokens == y_tokens else 0.0

        return len(x_ngrams & y_ngrams) / len(all_ngrams)

    return wrapped_func
225
+
226
+
227
def regex() -> SimPairFunc[str, float]:
    """Binary similarity that treats the query y as a regular expression and tests the case x against it.

    The pattern is applied with `re.match`, so it has to match starting at the
    first character of x; it does not have to consume the whole string.

    Returns:
        A function yielding 1.0 if x matches the pattern y and 0.0 otherwise.

    Examples:
        >>> sim = regex()
        >>> sim("Test1", "T.st[0-9]")
        1.0
        >>> sim("Test2", "T.st[3-6]")
        0.0
    """

    def wrapped_func(x: str, y: str) -> float:
        # y is the pattern, x is the string being tested.
        found = re.match(y, x)

        if found is None:
            return 0.0

        return 1.0

    return wrapped_func
243
+
244
+
245
def glob(case_sensitive: bool = False) -> SimPairFunc[str, float]:
    """Compares a case x to a query y, written as a glob pattern, which can contain wildcards. If the case matches the query, the similarity is 1.0, otherwise 0.0.

    Args:
        case_sensitive: If True, the comparison is case-sensitive. If False,
            both the case and the pattern are lowercased before matching, so
            the result does not depend on the operating system.

    Returns:
        A function yielding 1.0 if x matches the glob pattern y and 0.0 otherwise.

    Examples:
        >>> sim = glob()
        >>> sim("Test1", "Test?")
        1.0
        >>> sim("Test2", "Test[3-9]")
        0.0
        >>> sim("TEST1", "test?")
        1.0
        >>> glob(case_sensitive=True)("TEST1", "test?")
        0.0
    """

    def wrapped_func(x: str, y: str) -> float:
        if not case_sensitive:
            # fnmatch.fnmatch only folds case through os.path.normcase, which
            # leaves strings untouched on POSIX. Lowercase explicitly and use
            # fnmatchcase so case-insensitive matching works on every platform.
            x = x.lower()
            y = y.lower()

        return 1.0 if fnmatch.fnmatchcase(x, y) else 0.0

    return wrapped_func
264
+
265
+
190
266
  def table(
191
267
  entries: Sequence[tuple[str, str, float]] | FilePath,
192
268
  symmetric: bool = True,
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "cbrkit"
3
- version = "0.6.2"
3
+ version = "0.7.0"
4
4
  description = "Customizable Case-Based Reasoning (CBR) toolkit for Python with a built-in API and CLI."
5
5
  authors = ["Mirko Lenz <mirko@mirkolenz.com>"]
6
6
  license = "MIT"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes