veridica 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- veridica-0.1.0/LICENCE +7 -0
- veridica-0.1.0/PKG-INFO +97 -0
- veridica-0.1.0/README.md +82 -0
- veridica-0.1.0/pyproject.toml +19 -0
- veridica-0.1.0/src/veridica/__init__.py +0 -0
- veridica-0.1.0/src/veridica/batch.py +18 -0
- veridica-0.1.0/src/veridica/normalisation.py +1 -0
- veridica-0.1.0/src/veridica/similarity.py +107 -0
- veridica-0.1.0/src/veridica/timing.py +26 -0
veridica-0.1.0/LICENCE
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright 2025 Sebastiaan Swanenberg
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
veridica-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: veridica
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Fuzzy matching for the use in gherkin based automated tests.
|
|
5
|
+
License: MIT
|
|
6
|
+
Author: Sebastiaan Swanenberg
|
|
7
|
+
Author-email: Sebastiaan@staeble.it
|
|
8
|
+
Requires-Python: >=3.12,<4.0
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# Veridica
|
|
16
|
+
|
|
17
|
+
This project provides fuzzy string comparison algorithms for use in Gherkin-based test automation.
|
|
18
|
+
|
|
19
|
+
We found that it was required to have full control over the system to limit dependencies. However that does not mean that others cannot use the code that I have developed and will use.
|
|
20
|
+
|
|
21
|
+
## Overview
|
|
22
|
+
|
|
23
|
+
Veridica provides a set of pure Python algorithms to compare strings reliably based on fixed math based algorithms.
|
|
24
|
+
|
|
25
|
+
It includes:
|
|
26
|
+
|
|
27
|
+
- **Classic string similarity metrics**: string-to-string matching based on algorithms such as Levenshtein.
|
|
28
|
+
|
|
29
|
+
- **One-to-many batch comparisons**: while one-to-one comparisons can give you a sense of what you are looking for, most use cases involve comparing one item to a set. Therefore an implementation was made using classic and recursive techniques that can be used directly.
|
|
30
|
+
|
|
31
|
+
- **Timing utilities for measuring comparison performance**: for now limited to a wrapper function that reports the time a call took to complete, for comparative data.
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
You can install the package via **PyPI** or from **source**.
|
|
36
|
+
|
|
37
|
+
### Install from PyPI
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install veridica
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Install from Source (GitHub)
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
git clone https://github.com/sebastiaanswanenberg/veridica.git
|
|
48
|
+
cd veridica
|
|
49
|
+
pip install .
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Usage
|
|
53
|
+
|
|
54
|
+
After installation, you can use `veridica` to compare strings one to one or compare a string to a set.
|
|
55
|
+
|
|
56
|
+
### Example: Comparing strings one-to-one.
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from veridica.similarity import levenshtein_distance
|
|
60
|
+
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Example: Comparing string to a set of strings.
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
from veridica.similarity import levenshtein_distance
|
|
67
|
+
from veridica.batch import compare_many
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Example: Time your string to set calculations.
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from veridica.similarity import levenshtein_distance
|
|
76
|
+
from veridica.batch import compare_many
|
|
77
|
+
from veridica.timing import timed
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Testing
|
|
82
|
+
|
|
83
|
+
This project includes a test suite. You can run tests using `pytest`:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
pip install pytest
|
|
87
|
+
pytest tests
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## License
|
|
91
|
+
|
|
92
|
+
This project is licensed under the MIT License. See the [LICENCE](LICENCE) file for more details.
|
|
93
|
+
|
|
94
|
+
## Contributing
|
|
95
|
+
|
|
96
|
+
Contributions are welcome! Please feel free to submit pull requests or open issues if you have any suggestions or find bugs.
|
|
97
|
+
|
veridica-0.1.0/README.md
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# Veridica
|
|
2
|
+
|
|
3
|
+
This project provides fuzzy string comparison algorithms for use in Gherkin-based test automation.
|
|
4
|
+
|
|
5
|
+
We found that it was required to have full control over the system to limit dependencies. However that does not mean that others cannot use the code that I have developed and will use.
|
|
6
|
+
|
|
7
|
+
## Overview
|
|
8
|
+
|
|
9
|
+
Veridica provides a set of pure Python algorithms to compare strings reliably based on fixed math based algorithms.
|
|
10
|
+
|
|
11
|
+
It includes:
|
|
12
|
+
|
|
13
|
+
- **Classic string similarity metrics**: string-to-string matching based on algorithms such as Levenshtein.
|
|
14
|
+
|
|
15
|
+
- **One-to-many batch comparisons**: while one-to-one comparisons can give you a sense of what you are looking for, most use cases involve comparing one item to a set. Therefore an implementation was made using classic and recursive techniques that can be used directly.
|
|
16
|
+
|
|
17
|
+
- **Timing utilities for measuring comparison performance**: for now limited to a wrapper function that reports the time a call took to complete, for comparative data.
|
|
18
|
+
|
|
19
|
+
## Installation
|
|
20
|
+
|
|
21
|
+
You can install the package via **PyPI** or from **source**.
|
|
22
|
+
|
|
23
|
+
### Install from PyPI
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install veridica
|
|
27
|
+
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
### Install from Source (GitHub)
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
git clone https://github.com/sebastiaanswanenberg/veridica.git
|
|
34
|
+
cd veridica
|
|
35
|
+
pip install .
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Usage
|
|
39
|
+
|
|
40
|
+
After installation, you can use `veridica` to compare strings one to one or compare a string to a set.
|
|
41
|
+
|
|
42
|
+
### Example: Comparing strings one-to-one.
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from veridica.similarity import levenshtein_distance
|
|
46
|
+
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Example: Comparing string to a set of strings.
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from veridica.similarity import levenshtein_distance
|
|
53
|
+
from veridica.batch import compare_many
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Example: Time your string to set calculations.
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from veridica.similarity import levenshtein_distance
|
|
62
|
+
from veridica.batch import compare_many
|
|
63
|
+
from veridica.timing import timed
|
|
64
|
+
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Testing
|
|
68
|
+
|
|
69
|
+
This project includes a test suite. You can run tests using `pytest`:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
pip install pytest
|
|
73
|
+
pytest tests
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## License
|
|
77
|
+
|
|
78
|
+
This project is licensed under the MIT License. See the [LICENCE](LICENCE) file for more details.
|
|
79
|
+
|
|
80
|
+
## Contributing
|
|
81
|
+
|
|
82
|
+
Contributions are welcome! Please feel free to submit pull requests or open issues if you have any suggestions or find bugs.
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "veridica"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Fuzzy matching for the use in gherkin based automated tests."
|
|
5
|
+
authors = ["Sebastiaan Swanenberg <Sebastiaan@staeble.it>"]
|
|
6
|
+
license = "MIT"
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
|
|
9
|
+
[tool.poetry.dependencies]
|
|
10
|
+
python = "^3.12"
|
|
11
|
+
|
|
12
|
+
[tool.poetry.dev-dependencies]
|
|
13
|
+
pytest = "^7.4"
|
|
14
|
+
black = "^24.9"
|
|
15
|
+
isort = "^6.0"
|
|
16
|
+
|
|
17
|
+
[build-system]
|
|
18
|
+
requires = ["poetry-core"]
|
|
19
|
+
build-backend = "poetry.core.masonry.api"
|
|
File without changes
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""One-to-many string comparison helpers."""
|
|
2
|
+
|
|
3
|
+
from typing import Iterable, Callable
|
|
4
|
+
|
|
5
|
+
def compare_many(
    target: str,
    candidates: Iterable[str],
    comparator: Callable[[str, str], float],
    *,
    higher_is_better: bool = True,
) -> list[tuple[str, float]]:
    """
    Score every candidate against a single target string and rank the results.

    Args:
        target (str): string to compare
        candidates (Iterable[str]): strings to compare against; iterated exactly
            once, so a generator is fine
        comparator (Callable[[str, str], float]): called as
            ``comparator(target, candidate)``; its result is converted with
            ``float()``
        higher_is_better (bool): ranking direction. ``True`` (the default) puts
            the largest score first, which suits similarity measures. Pass
            ``False`` for distance measures, where the smallest value is the
            best match.

    Returns:
        list[tuple[str, float]]: (candidate, score) pairs, best match first.
        Candidates with equal scores keep their input order.
    """
    scores = [(candidate, float(comparator(target, candidate))) for candidate in candidates]
    # sorted() is stable in both directions, so ties stay in input order.
    return sorted(scores, key=lambda pair: pair[1], reverse=higher_is_better)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Helper functions for the normalisation of scores when using multiple algorithms."""
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""String similarity measurement functions."""
|
|
2
|
+
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
|
|
5
|
+
def levenshtein_distance(s1: str, s2: str) -> int:
    """Return the Levenshtein edit distance between ``s1`` and ``s2``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.
    Only the previous and the current row of the table are kept in memory.
    """
    if s1 == s2:
        return 0

    len1 = len(s1)
    len2 = len(s2)

    # Against an empty string the distance is the other string's length.
    if not s1:
        return len2
    if not s2:
        return len1

    # Row 0: building s2[:c] from the empty prefix of s1 takes c insertions.
    row = range(len2 + 1)
    for r, ch1 in enumerate(s1, start=1):
        above = row
        # Column 0: reducing s1[:r] to the empty string takes r deletions.
        row = [r]
        for c, ch2 in enumerate(s2, start=1):
            substitution = above[c - 1] + (0 if ch1 == ch2 else 1)
            deletion = above[c] + 1
            insertion = row[c - 1] + 1
            row.append(min(substitution, deletion, insertion))

    return row[-1]
|
|
29
|
+
|
|
30
|
+
def jaccard_distance(s1: str, s2: str) -> float:
    """Calculate the Jaccard distance between the word sets of two strings.

    Each string is split on whitespace and reduced to its set of distinct
    tokens, so word order and repeated words do not affect the result.

    Args:
        s1 (str): first string
        s2 (str): second string

    Returns:
        float: one minus (shared tokens / distinct tokens overall). ``0.0``
        when both token sets are equal, including when neither string
        contains a token; ``1.0`` when they have no token in common.
    """
    tokens1 = set(s1.split())
    tokens2 = set(s2.split())

    # Two strings without any token (empty or whitespace only) count as
    # identical. This is also the only case in which the union below would be
    # empty, so no separate division-by-zero guard is required.
    if not tokens1 and not tokens2:
        return 0.0

    shared = len(tokens1 & tokens2)
    total = len(tokens1 | tokens2)
    return 1.0 - (shared / total)
|
|
48
|
+
|
|
49
|
+
def hamming_distance(s1: str, s2: str) -> int:
    """Calculate the Hamming distance between two strings.

    Strings of unequal length are accepted: characters are compared position
    by position over the length of the shorter string, and every surplus
    character of the longer string counts as one additional mismatch.

    Args:
        s1 (str): first string
        s2 (str): second string

    Returns:
        int: number of differing positions plus the difference in length
    """
    # zip() stops at the shorter string, so only the shared positions are compared.
    mismatches = sum(a != b for a, b in zip(s1, s2))
    return abs(len(s1) - len(s2)) + mismatches
|
|
62
|
+
|
|
63
|
+
def damerau_levenshtein_distance(s1: str, s2: str) -> int:
    """Return the Damerau-Levenshtein distance between ``s1`` and ``s2``.

    Counts insertions, deletions, substitutions and transpositions of
    characters. The table carries one extra leading row and column filled
    with a sentinel value so that the transposition lookup never needs a
    bounds check.

    Note: if either argument is not a ``str`` the function returns 0 instead
    of raising.
    """
    if not (isinstance(s1, str) and isinstance(s2, str)):
        return 0

    n = len(s1)
    m = len(s2)
    # No real distance exceeds n + m, so this value acts as "infinity".
    sentinel = n + m

    # Row 0 is all sentinel; row 1 holds the cost of building s2[:j] from "";
    # each later row starts with the sentinel and the cost of deleting s1[:i].
    table = [[sentinel] * (m + 2), [sentinel, *range(m + 1)]]
    for i in range(1, n + 1):
        table.append([sentinel, i] + [0] * m)

    # Last (1-based) position in s1 at which each character was seen; 0 = never.
    last_seen_in_s1 = {}

    for i, ch1 in enumerate(s1, start=1):
        # Last (1-based) column in this row where ch1 matched s2; 0 = none yet.
        last_match_col = 0
        for j, ch2 in enumerate(s2, start=1):
            i1 = last_seen_in_s1.get(ch2, 0)
            j1 = last_match_col  # taken before a match at j updates it
            if ch1 == ch2:
                cost = 0
                last_match_col = j
            else:
                cost = 1

            substitution = table[i][j] + cost
            insertion = table[i + 1][j] + 1
            deletion = table[i][j + 1] + 1
            transposition = table[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1)
            table[i + 1][j + 1] = min(substitution, insertion, deletion, transposition)
        last_seen_in_s1[ch1] = i

    return table[n + 1][m + 1]
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Timing decorators and utilities for measuring comparison performance."""
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from functools import wraps
|
|
5
|
+
from typing import Callable, Any
|
|
6
|
+
|
|
7
|
+
def timed(func: Callable) -> Callable:
    """
    Decorator to measure the execution time of a function.

    Usage:
        @timed
        def my_func(...):
            ...

        elapsed, result = my_func(...)

    Args:
        func (Callable): the function to time

    Returns:
        function: wrapped function that forwards all arguments to ``func`` and
        returns ``(elapsed, result)``, where ``elapsed`` is the duration of
        the call in seconds as measured with ``time.perf_counter`` and
        ``result`` is the value ``func`` returned. An exception raised by
        ``func`` propagates unchanged and no timing is returned.
    """
    @wraps(func)
    def wrapper(*args, **kwargs) -> tuple[float, Any]:
        start = time.perf_counter()
        # The call must sit between the two clock readings to be measured.
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        return elapsed, result

    return wrapper
|