smith-utils 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smith_utils-0.1.0/LICENSE +21 -0
- smith_utils-0.1.0/PKG-INFO +106 -0
- smith_utils-0.1.0/README.md +62 -0
- smith_utils-0.1.0/pyproject.toml +47 -0
- smith_utils-0.1.0/setup.cfg +4 -0
- smith_utils-0.1.0/src/smith_utils/__init__.py +14 -0
- smith_utils-0.1.0/src/smith_utils/datetime/__init__.py +7 -0
- smith_utils-0.1.0/src/smith_utils/datetime/date_utils.py +111 -0
- smith_utils-0.1.0/src/smith_utils/numeric/__init__.py +6 -0
- smith_utils-0.1.0/src/smith_utils/numeric/refinement.py +55 -0
- smith_utils-0.1.0/src/smith_utils/text/__init__.py +10 -0
- smith_utils-0.1.0/src/smith_utils/text/metrics.py +203 -0
- smith_utils-0.1.0/src/smith_utils/text/normalization.py +43 -0
- smith_utils-0.1.0/src/smith_utils.egg-info/PKG-INFO +106 -0
- smith_utils-0.1.0/src/smith_utils.egg-info/SOURCES.txt +20 -0
- smith_utils-0.1.0/src/smith_utils.egg-info/dependency_links.txt +1 -0
- smith_utils-0.1.0/src/smith_utils.egg-info/requires.txt +4 -0
- smith_utils-0.1.0/src/smith_utils.egg-info/top_level.txt +1 -0
- smith_utils-0.1.0/tests/test_date_utils.py +68 -0
- smith_utils-0.1.0/tests/test_metrics.py +74 -0
- smith_utils-0.1.0/tests/test_normalization.py +43 -0
- smith_utils-0.1.0/tests/test_refinement.py +37 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Eiichi YAMAMOTO
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
20
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
21
|
+
IN THE SOFTWARE.
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: smith-utils
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A utility library for data cleaning and parsing.
|
|
5
|
+
Author-email: Eiichi YAMAMOTO <info@yeiichi.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Eiichi YAMAMOTO
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in
|
|
18
|
+
all copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
25
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
26
|
+
IN THE SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/yeiichi/smith-utils
|
|
29
|
+
Project-URL: Repository, https://github.com/yeiichi/smith-utils
|
|
30
|
+
Keywords: csv,deduplication,data-filtering,file-organization,filtering
|
|
31
|
+
Classifier: Programming Language :: Python :: 3
|
|
32
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
33
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
34
|
+
Classifier: Intended Audience :: Developers
|
|
35
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
36
|
+
Classifier: Topic :: Utilities
|
|
37
|
+
Requires-Python: >=3.10
|
|
38
|
+
Description-Content-Type: text/markdown
|
|
39
|
+
License-File: LICENSE
|
|
40
|
+
Provides-Extra: docs
|
|
41
|
+
Requires-Dist: sphinx<9,>=8; extra == "docs"
|
|
42
|
+
Requires-Dist: furo>=2024.8.6; extra == "docs"
|
|
43
|
+
Dynamic: license-file
|
|
44
|
+
|
|
45
|
+
# smith-utils
|
|
46
|
+
[](https://pypi.org/project/smith-utils/)
|
|
47
|
+

|
|
48
|
+

|
|
49
|
+

|
|
50
|
+

|
|
51
|
+
[](https://smith-utils.readthedocs.io/en/latest/?badge=latest)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
**Smith Utils** is a central hub for data cleaning and parsing scripts.
|
|
55
|
+
This package consolidates distributed utility functions to improve code reuse and maintenance efficiency across all yeiichi projects.
|
|
56
|
+
|
|
57
|
+
## Key Features
|
|
58
|
+
|
|
59
|
+
### 📅 Datetime Utilities (`smith_utils.datetime`)
|
|
60
|
+
Robust date parsing and formatting.
|
|
61
|
+
- `ensure_date`: Flexible conversion of strings, `datetime.date` objects, or `None` (returns today) into a `date` object.
|
|
62
|
+
- `parse_strict_date`: Strict parsing for `YYYYMMDD` or `YYYY-MM-DD` formats, rejecting ambiguous inputs.
|
|
63
|
+
- `format_ordinal`: Converts integers to ordinal strings (e.g., `1` → `"1st"`, `22` → `"22nd"`).
|
|
64
|
+
|
|
65
|
+
### 🔢 Numeric Refinement (`smith_utils.numeric`)
|
|
66
|
+
Clean and parse messy numeric data.
|
|
67
|
+
- `parse_numeric_value`: Handles custom separators, decimals, and negative formats like `(1,234.56)`.
|
|
68
|
+
- `parse_currency_value`: Alias for numeric parsing, specifically for currency strings.
|
|
69
|
+
|
|
70
|
+
### 📝 Text Normalization & Metrics (`smith_utils.text`)
|
|
71
|
+
Standardize text and compare string similarity.
|
|
72
|
+
- `normalize_text`: Unicode NFKC normalization, case folding, and whitespace handling.
|
|
73
|
+
- `StringDistance`: Implementation of Damerau-Levenshtein and Jaro-Winkler algorithms for fuzzy matching.
|
|
74
|
+
|
|
75
|
+
## Installation
|
|
76
|
+
|
|
77
|
+
Install via pip:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pip install smith-utils
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Quick Start
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from smith_utils.datetime.date_utils import ensure_date
|
|
87
|
+
from smith_utils.numeric.refinement import parse_numeric_value
|
|
88
|
+
from smith_utils.text.normalization import normalize_text
|
|
89
|
+
|
|
90
|
+
# Datetime
|
|
91
|
+
date = ensure_date("20231225") # datetime.date(2023, 12, 25)
|
|
92
|
+
|
|
93
|
+
# Numeric
|
|
94
|
+
value = parse_numeric_value("(1,250.50)") # -1250.5
|
|
95
|
+
|
|
96
|
+
# Text
|
|
97
|
+
clean_text = normalize_text(" Smith Utils ") # "smith utils"
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Directory Structure
|
|
101
|
+
- `src/smith_utils/`: Main package source.
|
|
102
|
+
- `legacy/`: Legacy scripts and templates (not included in distribution).
|
|
103
|
+
- `tests/`: Comprehensive test suite.
|
|
104
|
+
|
|
105
|
+
## License
|
|
106
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# smith-utils
|
|
2
|
+
[PyPI](https://pypi.org/project/smith-utils/)
|
|
3
|
+

|
|
4
|
+

|
|
5
|
+

|
|
6
|
+

|
|
7
|
+
[Documentation](https://smith-utils.readthedocs.io/en/latest/?badge=latest)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
**Smith Utils** is a central hub for data cleaning and parsing scripts.
|
|
11
|
+
This package consolidates distributed utility functions to improve code reuse and maintenance efficiency across all yeiichi projects.
|
|
12
|
+
|
|
13
|
+
## Key Features
|
|
14
|
+
|
|
15
|
+
### 📅 Datetime Utilities (`smith_utils.datetime`)
|
|
16
|
+
Robust date parsing and formatting.
|
|
17
|
+
- `ensure_date`: Flexible conversion of strings, `datetime.date` objects, or `None` (returns today) into a `date` object.
|
|
18
|
+
- `parse_strict_date`: Strict parsing for `YYYYMMDD` or `YYYY-MM-DD` formats, rejecting ambiguous inputs.
|
|
19
|
+
- `format_ordinal`: Converts integers to ordinal strings (e.g., `1` → `"1st"`, `22` → `"22nd"`).
|
|
20
|
+
|
|
21
|
+
### 🔢 Numeric Refinement (`smith_utils.numeric`)
|
|
22
|
+
Clean and parse messy numeric data.
|
|
23
|
+
- `parse_numeric_value`: Handles custom separators, decimals, and negative formats like `(1,234.56)`.
|
|
24
|
+
- `parse_currency_value`: Alias for numeric parsing, specifically for currency strings.
|
|
25
|
+
|
|
26
|
+
### 📝 Text Normalization & Metrics (`smith_utils.text`)
|
|
27
|
+
Standardize text and compare string similarity.
|
|
28
|
+
- `normalize_text`: Unicode NFKC normalization, case folding, and whitespace handling.
|
|
29
|
+
- `StringDistance`: Implementation of Damerau-Levenshtein and Jaro-Winkler algorithms for fuzzy matching.
|
|
30
|
+
|
|
31
|
+
## Installation
|
|
32
|
+
|
|
33
|
+
Install via pip:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install smith-utils
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Quick Start
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from smith_utils.datetime.date_utils import ensure_date
|
|
43
|
+
from smith_utils.numeric.refinement import parse_numeric_value
|
|
44
|
+
from smith_utils.text.normalization import normalize_text
|
|
45
|
+
|
|
46
|
+
# Datetime
|
|
47
|
+
date = ensure_date("20231225") # datetime.date(2023, 12, 25)
|
|
48
|
+
|
|
49
|
+
# Numeric
|
|
50
|
+
value = parse_numeric_value("(1,250.50)") # -1250.5
|
|
51
|
+
|
|
52
|
+
# Text
|
|
53
|
+
clean_text = normalize_text(" Smith Utils ") # "smith utils"
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Directory Structure
|
|
57
|
+
- `src/smith_utils/`: Main package source.
|
|
58
|
+
- `legacy/`: Legacy scripts and templates (not included in distribution).
|
|
59
|
+
- `tests/`: Comprehensive test suite.
|
|
60
|
+
|
|
61
|
+
## License
|
|
62
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "smith-utils"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A utility library for data cleaning and parsing."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { file = "LICENSE" }
|
|
12
|
+
|
|
13
|
+
authors = [
|
|
14
|
+
{ name = "Eiichi YAMAMOTO", email = "info@yeiichi.com" }
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
keywords = [
|
|
18
|
+
"csv",
|
|
19
|
+
"deduplication",
|
|
20
|
+
"data-filtering",
|
|
21
|
+
"file-organization",
|
|
22
|
+
"filtering",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
classifiers = [
|
|
26
|
+
"Programming Language :: Python :: 3",
|
|
27
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
28
|
+
"License :: OSI Approved :: MIT License",
|
|
29
|
+
"Intended Audience :: Developers",
|
|
30
|
+
"Topic :: Software Development :: Libraries",
|
|
31
|
+
"Topic :: Utilities",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
dependencies = []
|
|
35
|
+
|
|
36
|
+
[project.optional-dependencies]
|
|
37
|
+
docs = [
|
|
38
|
+
"sphinx>=8,<9",
|
|
39
|
+
"furo>=2024.8.6",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[project.urls]
|
|
43
|
+
Homepage = "https://github.com/yeiichi/smith-utils"
|
|
44
|
+
Repository = "https://github.com/yeiichi/smith-utils"
|
|
45
|
+
|
|
46
|
+
[tool.setuptools.packages.find]
|
|
47
|
+
where = ["src"]
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from .numeric import parse_numeric_value, parse_currency_value
|
|
2
|
+
from .text import StringDistance, analyze_pair, normalize_text
|
|
3
|
+
from .datetime import format_ordinal, parse_strict_date, ensure_date
|
|
4
|
+
|
|
5
|
+
__all__ = [
|
|
6
|
+
"parse_numeric_value",
|
|
7
|
+
"parse_currency_value",
|
|
8
|
+
"StringDistance",
|
|
9
|
+
"analyze_pair",
|
|
10
|
+
"normalize_text",
|
|
11
|
+
"format_ordinal",
|
|
12
|
+
"parse_strict_date",
|
|
13
|
+
"ensure_date",
|
|
14
|
+
]
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# src/smith_utils/datetime/date_utils.py
|
|
2
|
+
import datetime
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
DATE_FORMAT_BASIC = "%Y%m%d"
|
|
6
|
+
DATE_FORMAT_EXTENDED = "%Y-%m-%d"
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def format_ordinal(n: int) -> str:
    """
    Return ``n`` followed by its English ordinal suffix.

    Numbers whose last two digits are 11, 12 or 13 always take "th"
    ("11th", "112th"). Otherwise the last digit selects "st", "nd" or "rd",
    and every other digit takes "th". The suffix is chosen from the absolute
    value, so a negative number keeps its sign in the output ("-1st").

    Args:
        n (int): The integer to convert.

    Returns:
        str: ``n`` rendered as text with its ordinal suffix appended.
    """
    magnitude = abs(n)
    # The "teens" are irregular: 11th/12th/13th, not 11st/12nd/13rd.
    is_teen = 11 <= (magnitude % 100) <= 13
    ending = "th" if is_teen else {1: "st", 2: "nd", 3: "rd"}.get(magnitude % 10, "th")
    return f"{n}{ending}"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def parse_strict_date(date_str: str) -> datetime.date:
    """
    Parse a date string in basic (YYYYMMDD) or extended (YYYY-MM-DD) form.

    Surrounding whitespace is ignored. Only the text before the first 'T' or
    space is examined, so a trailing time or time zone component is dropped
    without any time zone conversion. The returned date is therefore the
    calendar date exactly as written in the input.

    The basic form must consist of exactly eight ASCII digits and nothing
    else. Inputs with six or seven digits, or of the shape ``D-D`` / ``DD-DD``,
    are rejected as ambiguous rather than guessed at.

    Parameters:
        date_str (str): The date string to parse.

    Returns:
        datetime.date: The parsed calendar date.

    Raises:
        ValueError: If the input is ambiguous, malformed, or names a date
            that does not exist.
    """
    # Keep only the calendar-date portion; strip first so that leading
    # whitespace does not leave an empty date part.
    date_part = date_str.strip().split('T')[0].split(' ')[0]
    digits_only = re.sub(r'[^0-9]', '', date_part)

    # Basic format: the whole date part must be the eight ASCII digits.
    # Comparing against digits_only (rather than str.isdigit) rejects inputs
    # containing non-ASCII digits, which the substitution above has already
    # dropped and which would otherwise be ignored silently.
    if len(digits_only) == 8 and date_part == digits_only:
        return datetime.datetime.strptime(digits_only, DATE_FORMAT_BASIC).date()

    if '-' in date_part:
        try:
            return datetime.datetime.strptime(date_part, DATE_FORMAT_EXTENDED).date()
        except ValueError:
            # Deliberate fall-through: the checks below raise a more specific
            # message than strptime's.
            pass

    if len(digits_only) in (6, 7) or re.match(r'^\d{1,2}-\d{1,2}$', date_part):
        raise ValueError(
            f"Ambiguous date '{date_str}' rejected. "
            "Please use 8-digit YYYYMMDD or delimiters (YYYY-MM-DD)."
        )
    raise ValueError(f"Could not parse '{date_str}'. Expected YYYY-MM-DD or YYYYMMDD.")
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def ensure_date(date_input: str | datetime.date | None) -> datetime.date:
|
|
79
|
+
"""
|
|
80
|
+
Converts various date input formats into a `datetime.date` object.
|
|
81
|
+
|
|
82
|
+
This function accepts a string, a `datetime.date` object, or a `None`
|
|
83
|
+
value to produce a `datetime.date` object. If the input is `None`, the
|
|
84
|
+
current date is returned. String inputs are first stripped of leading
|
|
85
|
+
and trailing whitespace, and the function attempts to parse the string
|
|
86
|
+
as an ISO 8601 formatted date. If the ISO 8601 parsing fails, the input
|
|
87
|
+
is passed to a fallback parsing function for stricter date parsing.
|
|
88
|
+
|
|
89
|
+
Raises:
|
|
90
|
+
ValueError: If the input string cannot be parsed into a valid date
|
|
91
|
+
via both ISO 8601 and the fallback parsing mechanism.
|
|
92
|
+
|
|
93
|
+
Parameters:
|
|
94
|
+
date_input (str | datetime.date | None): The input to be converted
|
|
95
|
+
to a `datetime.date` object. This can be a string representing a
|
|
96
|
+
date, a `datetime.date` object itself, or `None`.
|
|
97
|
+
|
|
98
|
+
Returns:
|
|
99
|
+
datetime.date: A valid `datetime.date` object corresponding to the
|
|
100
|
+
provided input, or the current date if the input is `None`.
|
|
101
|
+
"""
|
|
102
|
+
if not date_input:
|
|
103
|
+
return datetime.date.today()
|
|
104
|
+
if isinstance(date_input, datetime.date):
|
|
105
|
+
return date_input
|
|
106
|
+
|
|
107
|
+
clean_input = str(date_input).strip()
|
|
108
|
+
try:
|
|
109
|
+
return datetime.date.fromisoformat(clean_input)
|
|
110
|
+
except ValueError:
|
|
111
|
+
return parse_strict_date(clean_input)
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
def parse_numeric_value(val: str | float | None, sep: str = ",", decimal: str = ".", relaxed: bool = False) -> float | str:
|
|
2
|
+
if val is None:
|
|
3
|
+
return 0.0
|
|
4
|
+
|
|
5
|
+
s = str(val).strip()
|
|
6
|
+
is_negative = False
|
|
7
|
+
|
|
8
|
+
if s.startswith("-"):
|
|
9
|
+
is_negative = True
|
|
10
|
+
s = s[1:].strip()
|
|
11
|
+
elif s.startswith("(") and s.endswith(")"):
|
|
12
|
+
is_negative = True
|
|
13
|
+
s = s[1:-1].strip()
|
|
14
|
+
|
|
15
|
+
# Split by decimal to validate groups
|
|
16
|
+
parts = s.split(decimal)
|
|
17
|
+
if len(parts) > 2:
|
|
18
|
+
if relaxed:
|
|
19
|
+
return val
|
|
20
|
+
raise ValueError(f"Invalid numeric value: '{val}'")
|
|
21
|
+
|
|
22
|
+
# Validate integer part groups if 'sep' is used
|
|
23
|
+
int_part = parts[0]
|
|
24
|
+
if sep in int_part:
|
|
25
|
+
groups = int_part.split(sep)
|
|
26
|
+
# All groups except the first must be exactly 3 digits
|
|
27
|
+
# This is a common rule for thousand separators
|
|
28
|
+
if any(len(g) != 3 for g in groups[1:]):
|
|
29
|
+
if relaxed:
|
|
30
|
+
return val
|
|
31
|
+
raise ValueError(f"Invalid numeric value: '{val}'")
|
|
32
|
+
|
|
33
|
+
s_cleaned = s.replace(sep, "")
|
|
34
|
+
if decimal != ".":
|
|
35
|
+
s_cleaned = s_cleaned.replace(decimal, ".")
|
|
36
|
+
|
|
37
|
+
try:
|
|
38
|
+
# Check if separator appears after decimal
|
|
39
|
+
if sep in s and decimal in s and s.rfind(sep) > s.find(decimal):
|
|
40
|
+
raise ValueError
|
|
41
|
+
|
|
42
|
+
num = float(s_cleaned)
|
|
43
|
+
# Check if it was purely digits + decimal
|
|
44
|
+
if not s_cleaned.replace(".", "", 1).isdigit():
|
|
45
|
+
raise ValueError
|
|
46
|
+
|
|
47
|
+
return -num if is_negative else num
|
|
48
|
+
except ValueError:
|
|
49
|
+
if relaxed:
|
|
50
|
+
return val
|
|
51
|
+
raise ValueError(f"Invalid numeric value: '{val}'")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def parse_currency_value(
    val: str | float | None,
    sep: str = ",",
    decimal: str = ".",
    relaxed: bool = False,
) -> float | str:
    """
    Parse a currency amount by delegating to ``parse_numeric_value``.

    This is a straight alias: all arguments are forwarded unchanged and no
    currency-specific processing is done here.

    Parameters:
        val (str | float | None): The value to parse.
        sep (str): Thousands separator.
        decimal (str): Decimal mark.
        relaxed (bool): Forwarded to ``parse_numeric_value``.

    Returns:
        float | str: Whatever ``parse_numeric_value`` returns for the same
        arguments.
    """
    options = {"sep": sep, "decimal": decimal, "relaxed": relaxed}
    return parse_numeric_value(val, **options)
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from enum import Enum, auto
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Relation(Enum):
    """Structural relationship between two strings, as reported by a comparison."""

    # Values are spelled out explicitly; they match what ``auto()`` assigns
    # to an ``Enum`` (1, 2, 3, ... in definition order).
    EXACT_MATCH = 1
    CASE_INSENSITIVE_MATCH = 2
    WHITESPACE_TRIMMED_MATCH = 3
    NORMALIZED_SPACE_MATCH = 4
    NO_STRUCTURAL_MATCH = 5
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class Result:
    """
    Outcome of comparing two strings.

    Bundles the structural classification of the pair with the distance and
    similarity figures computed for it, and can render the classification as
    a short human-readable label.
    """
    # Structural relationship between the two strings.
    classification: Relation
    # Edit distance from the Damerau-Levenshtein calculation.
    damerau_levenshtein_distance: int
    # Similarity score from the Jaro-Winkler calculation.
    jaro_winkler_score: float
    # Similarity expressed as a percentage.
    similarity_percentage: float

    def get_relation_string(self) -> str:
        """Return the display label for ``classification`` ("Different" if none applies)."""
        labelled_relations = (
            (Relation.EXACT_MATCH, "Identical"),
            (Relation.CASE_INSENSITIVE_MATCH, "Case-Insensitive Match"),
            (Relation.WHITESPACE_TRIMMED_MATCH, "Similar (Trimmed)"),
            (Relation.NORMALIZED_SPACE_MATCH, "Synonymous (No Spaces)"),
        )
        for member, label in labelled_relations:
            if self.classification is member:
                return label
        return "Different"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class StringDistance:
    """
    Static helpers for comparing two strings.

    ``classify`` reports the structural relationship between two strings
    (exact, case-insensitive, trimmed, whitespace-free, or none).
    ``calculate_damerau_levenshtein`` and ``calculate_jaro_winkler`` compute
    an edit distance and a similarity score. ``analyze`` combines all three
    into a ``Result``.

    Caseless comparison uses ``str.casefold``, so pairs such as "Straße" and
    "STRASSE" are treated as equal when case is ignored.
    """

    @staticmethod
    def analyze(a: str, b: str, ignore_case: bool = False) -> Result:
        """
        Compare ``a`` and ``b`` and return classification plus metrics.

        Both inputs are converted with ``str()``. The classification is always
        computed on the text as given. When ``ignore_case`` is true the
        distance and similarity metrics are computed on the case-folded text.

        The similarity percentage is ``(1 - distance / longer_length) * 100``,
        and is 100.0 when both strings are empty.
        """
        sa = str(a)
        sb = str(b)

        # Classify before any case folding so that a case-only difference is
        # still reported as CASE_INSENSITIVE_MATCH.
        relation = StringDistance.classify(sa, sb)

        if ignore_case:
            sa = sa.casefold()
            sb = sb.casefold()

        d_dist = StringDistance.calculate_damerau_levenshtein(sa, sb)
        jw_score = StringDistance.calculate_jaro_winkler(sa, sb)

        max_len = max(len(sa), len(sb))
        similarity = 100.0 if max_len == 0 else (1.0 - d_dist / max_len) * 100.0

        return Result(
            classification=relation,
            damerau_levenshtein_distance=d_dist,
            jaro_winkler_score=jw_score,
            similarity_percentage=similarity,
        )

    @staticmethod
    def trim(s: str) -> str:
        """Return ``s`` without leading and trailing whitespace."""
        return s.strip()

    @staticmethod
    def strip_all(s: str) -> str:
        """
        Remove all whitespace characters from a string.

        Uses split/join logic so any whitespace character acts as a separator,
        including spaces, tabs, and newlines.
        """
        return "".join(s.split())

    @staticmethod
    def equals_ignore_case(a: str, b: str) -> bool:
        """Return True if ``a`` and ``b`` are equal after Unicode case folding."""
        # casefold(), not lower(): lower() leaves e.g. "ß" untouched, so
        # "Straße" and "STRASSE" would never compare equal.
        return a.casefold() == b.casefold()

    @staticmethod
    def classify(a: str, b: str) -> Relation:
        """
        Return the first relationship that holds between ``a`` and ``b``.

        Checks run in this order: exact equality, equality ignoring case,
        equality after trimming outer whitespace, equality after removing all
        whitespace. The trimmed and whitespace-free checks are case-sensitive.
        """
        if a == b:
            return Relation.EXACT_MATCH
        if StringDistance.equals_ignore_case(a, b):
            return Relation.CASE_INSENSITIVE_MATCH
        if StringDistance.trim(a) == StringDistance.trim(b):
            return Relation.WHITESPACE_TRIMMED_MATCH
        if StringDistance.strip_all(a) == StringDistance.strip_all(b):
            return Relation.NORMALIZED_SPACE_MATCH
        return Relation.NO_STRUCTURAL_MATCH

    @staticmethod
    def calculate_damerau_levenshtein(s1: str, s2: str) -> int:
        """
        Restricted Damerau-Levenshtein distance:
        insertion, deletion, substitution, adjacent transposition.

        Returns the minimum number of those operations that turns ``s1`` into
        ``s2``, computed with a full ``(len(s1) + 1) x (len(s2) + 1)`` table.
        """
        m = len(s1)
        n = len(s2)

        # d[i][j] = distance between the first i chars of s1 and the first j of s2.
        d = [[0] * (n + 1) for _ in range(m + 1)]

        for i in range(m + 1):
            d[i][0] = i
        for j in range(n + 1):
            d[0][j] = j

        for i in range(1, m + 1):
            for j in range(1, n + 1):
                cost = 0 if s1[i - 1] == s2[j - 1] else 1

                d[i][j] = min(
                    d[i - 1][j] + 1,  # deletion
                    d[i][j - 1] + 1,  # insertion
                    d[i - 1][j - 1] + cost,  # substitution
                )

                # Adjacent transposition: the last two characters are swapped.
                if (
                    i > 1
                    and j > 1
                    and s1[i - 1] == s2[j - 2]
                    and s1[i - 2] == s2[j - 1]
                ):
                    d[i][j] = min(d[i][j], d[i - 2][j - 2] + cost)

        return d[m][n]

    @staticmethod
    def calculate_jaro_winkler(s1: str, s2: str) -> float:
        """
        Jaro-Winkler similarity of ``s1`` and ``s2``.

        Returns 1.0 when both strings are empty, and 0.0 when exactly one is
        empty or no characters match. Otherwise returns the Jaro score
        boosted for a common prefix of up to four characters, with a scaling
        factor of 0.1.
        """
        len1 = len(s1)
        len2 = len(s2)

        if len1 == 0 and len2 == 0:
            return 1.0
        if len1 == 0 or len2 == 0:
            return 0.0

        # Characters count as matching only within this many positions of each other.
        match_distance = max(len1, len2) // 2 - 1
        if match_distance < 0:
            match_distance = 0

        s1_matches = [False] * len1
        s2_matches = [False] * len2

        matches = 0
        for i in range(len1):
            start = max(0, i - match_distance)
            end = min(i + match_distance + 1, len2)

            for j in range(start, end):
                if not s2_matches[j] and s1[i] == s2[j]:
                    s1_matches[i] = True
                    s2_matches[j] = True
                    matches += 1
                    break

        if matches == 0:
            return 0.0

        # Walk the matched characters of both strings in order and count the
        # positions where they disagree; every two such positions make one
        # transposition.
        transpositions = 0
        k = 0
        for i in range(len1):
            if s1_matches[i]:
                while not s2_matches[k]:
                    k += 1
                if s1[i] != s2[k]:
                    transpositions += 1
                k += 1

        jaro = (
            matches / len1
            + matches / len2
            + (matches - transpositions / 2.0) / matches
        ) / 3.0

        prefix_scale = 0.1
        max_prefix = 4
        prefix_len = 0
        while prefix_len < min(len1, len2, max_prefix) and s1[prefix_len] == s2[prefix_len]:
            prefix_len += 1

        return jaro + (prefix_len * prefix_scale * (1.0 - jaro))
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def analyze_pair(a: str, b: str, ignore_case: bool = False) -> Result:
    """
    Module-level shortcut for ``StringDistance.analyze``.

    Forwards ``a``, ``b`` and ``ignore_case`` unchanged and returns the
    resulting ``Result``.
    """
    return StringDistance.analyze(a, b, ignore_case=ignore_case)
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import unicodedata
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def normalize_text(text, ignore_case=True, remove_all_whitespace=False, nfkc=True):
    """
    Normalize text for comparison: Unicode form, whitespace, then case.

    Steps, in order:
      1. ``None`` becomes ``""``; any other value is converted with ``str()``.
      2. If ``nfkc`` is true, apply Unicode NFKC normalization.
      3. Drop leading and trailing whitespace, then either collapse each
         internal whitespace run to a single space or, when
         ``remove_all_whitespace`` is true, remove whitespace entirely.
      4. If ``ignore_case`` is true, lowercase the result with ``str.lower``.

    Parameters:
        text (str or None): The value to normalize. Non-string values are
            converted with ``str()``.
        ignore_case (bool, optional): Whether to lowercase the text.
            Defaults to True.
        remove_all_whitespace (bool, optional): Whether to remove all
            whitespace rather than collapsing internal runs to one space.
            Defaults to False.
        nfkc (bool, optional): Whether to apply NFKC normalization.
            Defaults to True.

    Returns:
        str: The normalized text.
    """
    if text is None:
        return ""

    # str() lets non-string values (e.g. numeric cells) pass through safely.
    normalized = str(text)

    # NFKC maps compatibility characters such as full-width letters and
    # ligatures onto their standard equivalents.
    if nfkc:
        normalized = unicodedata.normalize('NFKC', normalized)

    # str.split() with no argument ignores leading/trailing whitespace and
    # splits on any whitespace run, so one split/join both trims and collapses.
    joiner = "" if remove_all_whitespace else " "
    normalized = joiner.join(normalized.split())

    return normalized.lower() if ignore_case else normalized
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: smith-utils
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A utility library for data cleaning and parsing.
|
|
5
|
+
Author-email: Eiichi YAMAMOTO <info@yeiichi.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Eiichi YAMAMOTO
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in
|
|
18
|
+
all copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
25
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
26
|
+
IN THE SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/yeiichi/smith-utils
|
|
29
|
+
Project-URL: Repository, https://github.com/yeiichi/smith-utils
|
|
30
|
+
Keywords: csv,deduplication,data-filtering,file-organization,filtering
|
|
31
|
+
Classifier: Programming Language :: Python :: 3
|
|
32
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
33
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
34
|
+
Classifier: Intended Audience :: Developers
|
|
35
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
36
|
+
Classifier: Topic :: Utilities
|
|
37
|
+
Requires-Python: >=3.10
|
|
38
|
+
Description-Content-Type: text/markdown
|
|
39
|
+
License-File: LICENSE
|
|
40
|
+
Provides-Extra: docs
|
|
41
|
+
Requires-Dist: sphinx<9,>=8; extra == "docs"
|
|
42
|
+
Requires-Dist: furo>=2024.8.6; extra == "docs"
|
|
43
|
+
Dynamic: license-file
|
|
44
|
+
|
|
45
|
+
# smith-utils
|
|
46
|
+
[](https://pypi.org/project/smith-utils/)
|
|
47
|
+

|
|
48
|
+

|
|
49
|
+

|
|
50
|
+

|
|
51
|
+
[](https://smith-utils.readthedocs.io/en/latest/?badge=latest)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
**Smith Utils** is a central hub for data cleaning and parsing scripts.
|
|
55
|
+
This package consolidates distributed utility functions to improve code reuse and maintenance efficiency across all yeiichi projects.
|
|
56
|
+
|
|
57
|
+
## Key Features
|
|
58
|
+
|
|
59
|
+
### 📅 Datetime Utilities (`smith_utils.datetime`)
|
|
60
|
+
Robust date parsing and formatting.
|
|
61
|
+
- `ensure_date`: Flexible conversion of strings, `datetime.date` objects, or `None` (returns today) into a `date` object.
|
|
62
|
+
- `parse_strict_date`: Strict parsing for `YYYYMMDD` or `YYYY-MM-DD` formats, rejecting ambiguous inputs.
|
|
63
|
+
- `format_ordinal`: Converts integers to ordinal strings (e.g., `1` → `"1st"`, `22` → `"22nd"`).
|
|
64
|
+
|
|
65
|
+
### 🔢 Numeric Refinement (`smith_utils.numeric`)
|
|
66
|
+
Clean and parse messy numeric data.
|
|
67
|
+
- `parse_numeric_value`: Handles custom separators, decimals, and negative formats like `(1,234.56)`.
|
|
68
|
+
- `parse_currency_value`: Alias for numeric parsing, specifically for currency strings.
|
|
69
|
+
|
|
70
|
+
### 📝 Text Normalization & Metrics (`smith_utils.text`)
|
|
71
|
+
Standardize text and compare string similarity.
|
|
72
|
+
- `normalize_text`: Unicode NFKC normalization, case folding, and whitespace handling.
|
|
73
|
+
- `StringDistance`: Implementation of Damerau-Levenshtein and Jaro-Winkler algorithms for fuzzy matching.
|
|
74
|
+
|
|
75
|
+
## Installation
|
|
76
|
+
|
|
77
|
+
Install via pip:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pip install smith-utils
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Quick Start
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from smith_utils.datetime.date_utils import ensure_date
|
|
87
|
+
from smith_utils.numeric.refinement import parse_numeric_value
|
|
88
|
+
from smith_utils.text.normalization import normalize_text
|
|
89
|
+
|
|
90
|
+
# Datetime
|
|
91
|
+
date = ensure_date("20231225") # datetime.date(2023, 12, 25)
|
|
92
|
+
|
|
93
|
+
# Numeric
|
|
94
|
+
value = parse_numeric_value("(1,250.50)") # -1250.5
|
|
95
|
+
|
|
96
|
+
# Text
|
|
97
|
+
clean_text = normalize_text(" Smith Utils ") # "smith utils"
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Directory Structure
|
|
101
|
+
- `src/smith_utils/`: Main package source.
|
|
102
|
+
- `legacy/`: Legacy scripts and templates (not included in distribution).
|
|
103
|
+
- `tests/`: Comprehensive test suite.
|
|
104
|
+
|
|
105
|
+
## License
|
|
106
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/smith_utils/__init__.py
|
|
5
|
+
src/smith_utils.egg-info/PKG-INFO
|
|
6
|
+
src/smith_utils.egg-info/SOURCES.txt
|
|
7
|
+
src/smith_utils.egg-info/dependency_links.txt
|
|
8
|
+
src/smith_utils.egg-info/requires.txt
|
|
9
|
+
src/smith_utils.egg-info/top_level.txt
|
|
10
|
+
src/smith_utils/datetime/__init__.py
|
|
11
|
+
src/smith_utils/datetime/date_utils.py
|
|
12
|
+
src/smith_utils/numeric/__init__.py
|
|
13
|
+
src/smith_utils/numeric/refinement.py
|
|
14
|
+
src/smith_utils/text/__init__.py
|
|
15
|
+
src/smith_utils/text/metrics.py
|
|
16
|
+
src/smith_utils/text/normalization.py
|
|
17
|
+
tests/test_date_utils.py
|
|
18
|
+
tests/test_metrics.py
|
|
19
|
+
tests/test_normalization.py
|
|
20
|
+
tests/test_refinement.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
smith_utils
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from src.smith_utils.datetime.date_utils import format_ordinal, parse_strict_date
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_format_ordinal_single_digits():
|
|
9
|
+
assert format_ordinal(1) == "1st"
|
|
10
|
+
assert format_ordinal(2) == "2nd"
|
|
11
|
+
assert format_ordinal(3) == "3rd"
|
|
12
|
+
assert format_ordinal(4) == "4th"
|
|
13
|
+
assert format_ordinal(9) == "9th"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_format_ordinal_double_digits():
|
|
17
|
+
assert format_ordinal(11) == "11th"
|
|
18
|
+
assert format_ordinal(12) == "12th"
|
|
19
|
+
assert format_ordinal(13) == "13th"
|
|
20
|
+
assert format_ordinal(21) == "21st"
|
|
21
|
+
assert format_ordinal(22) == "22nd"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def test_format_ordinal_large_numbers():
|
|
25
|
+
assert format_ordinal(100) == "100th"
|
|
26
|
+
assert format_ordinal(111) == "111th"
|
|
27
|
+
assert format_ordinal(112) == "112th"
|
|
28
|
+
assert format_ordinal(113) == "113th"
|
|
29
|
+
assert format_ordinal(121) == "121st"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def test_format_ordinal_negative_numbers():
|
|
33
|
+
assert format_ordinal(-1) == "-1st"
|
|
34
|
+
assert format_ordinal(-2) == "-2nd"
|
|
35
|
+
assert format_ordinal(-3) == "-3rd"
|
|
36
|
+
assert format_ordinal(-11) == "-11th"
|
|
37
|
+
assert format_ordinal(-22) == "-22nd"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_format_ordinal_edge_cases():
|
|
41
|
+
assert format_ordinal(0) == "0th"
|
|
42
|
+
assert format_ordinal(1000) == "1000th"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def test_parse_strict_date_basic_format():
|
|
46
|
+
assert parse_strict_date("20260413") == datetime.date(2026, 4, 13)
|
|
47
|
+
assert parse_strict_date("19991231") == datetime.date(1999, 12, 31)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def test_parse_strict_date_extended_format():
|
|
51
|
+
assert parse_strict_date("2026-04-13") == datetime.date(2026, 4, 13)
|
|
52
|
+
assert parse_strict_date("1999-12-31") == datetime.date(1999, 12, 31)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_parse_strict_date_invalid_formats():
|
|
56
|
+
with pytest.raises(ValueError, match="Could not parse '13-04-2026'. Expected YYYY-MM-DD or YYYYMMDD."):
|
|
57
|
+
parse_strict_date("13-04-2026")
|
|
58
|
+
|
|
59
|
+
with pytest.raises(ValueError, match="Could not parse '2026/04/13'. Expected YYYY-MM-DD or YYYYMMDD."):
|
|
60
|
+
parse_strict_date("2026/04/13")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def test_parse_strict_date_ambiguous_inputs():
|
|
64
|
+
with pytest.raises(ValueError, match=r"Ambiguous date '2026413' rejected\. .*"):
|
|
65
|
+
parse_strict_date("2026413")
|
|
66
|
+
|
|
67
|
+
with pytest.raises(ValueError, match="Ambiguous date '06-13' rejected. .*"):
|
|
68
|
+
parse_strict_date("06-13")
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# tests/test_metrics.py
|
|
2
|
+
|
|
3
|
+
from src.smith_utils.text.metrics import StringDistance, Relation, Result
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def test_equals_ignore_case():
|
|
7
|
+
assert StringDistance.equals_ignore_case("Test", "test")
|
|
8
|
+
assert StringDistance.equals_ignore_case("TEST", "test")
|
|
9
|
+
assert not StringDistance.equals_ignore_case("Test", "Another")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_trim():
|
|
13
|
+
assert StringDistance.trim(" hello ") == "hello"
|
|
14
|
+
assert StringDistance.trim("\t\ttest\t") == "test"
|
|
15
|
+
assert StringDistance.trim("no_spaces") == "no_spaces"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_strip_all():
|
|
19
|
+
assert StringDistance.strip_all(" he llo ") == "hello"
|
|
20
|
+
assert StringDistance.strip_all("\t\tt e\ts\tt\t") == "test"
|
|
21
|
+
assert StringDistance.strip_all("no spaces\t") == "nospaces"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def test_classify_exact_match():
|
|
25
|
+
result = StringDistance.classify("hello", "hello")
|
|
26
|
+
assert result == Relation.EXACT_MATCH
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def test_classify_case_insensitive_match():
|
|
30
|
+
result = StringDistance.classify("HELLO", "hello")
|
|
31
|
+
assert result == Relation.CASE_INSENSITIVE_MATCH
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_classify_whitespace_trimmed_match():
|
|
35
|
+
result = StringDistance.classify(" hello ", "hello")
|
|
36
|
+
assert result == Relation.WHITESPACE_TRIMMED_MATCH
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_classify_normalized_space_match():
|
|
40
|
+
result = StringDistance.classify("he llo", "hel lo")
|
|
41
|
+
assert result == Relation.NORMALIZED_SPACE_MATCH
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def test_classify_no_structural_match():
|
|
45
|
+
result = StringDistance.classify("hello", "world")
|
|
46
|
+
assert result == Relation.NO_STRUCTURAL_MATCH
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_analyze_similarity_metrics_case_insensitive():
|
|
50
|
+
result = StringDistance.analyze("HELLO", "hello", ignore_case=True)
|
|
51
|
+
assert isinstance(result, Result)
|
|
52
|
+
assert result.classification == Relation.CASE_INSENSITIVE_MATCH
|
|
53
|
+
assert result.damerau_levenshtein_distance == 0
|
|
54
|
+
assert result.jaro_winkler_score > 0.9
|
|
55
|
+
assert result.similarity_percentage == 100.0
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def test_calculate_damerau_levenshtein():
|
|
59
|
+
distance = StringDistance.calculate_damerau_levenshtein("kitten", "sitting")
|
|
60
|
+
assert distance == 3
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def test_calculate_jaro_winkler():
|
|
64
|
+
score = StringDistance.calculate_jaro_winkler("MARTHA", "MARHTA")
|
|
65
|
+
assert score > 0.9
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def test_analyze_empty_strings():
|
|
69
|
+
result = StringDistance.analyze("", "", ignore_case=False)
|
|
70
|
+
assert isinstance(result, Result)
|
|
71
|
+
assert result.classification == Relation.EXACT_MATCH
|
|
72
|
+
assert result.damerau_levenshtein_distance == 0
|
|
73
|
+
assert result.jaro_winkler_score == 1.0
|
|
74
|
+
assert result.similarity_percentage == 100.0
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from src.smith_utils.text.normalization import normalize_text
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def test_normalize_none_input():
|
|
5
|
+
assert normalize_text(None) == ""
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_normalize_empty_string():
|
|
9
|
+
assert normalize_text("") == ""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_normalize_whitespaces_only():
|
|
13
|
+
assert normalize_text(" ") == ""
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_normalize_lowercase_conversion():
|
|
17
|
+
result = normalize_text("Hello World")
|
|
18
|
+
assert result == "hello world"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_normalize_unicodedata_nfkc():
|
|
22
|
+
result = normalize_text(u"① Ⅱ Ⅲ")
|
|
23
|
+
assert result == "1 ii iii"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_normalize_remove_all_whitespace():
|
|
27
|
+
result = normalize_text(" Hello World ", ignore_case=False, remove_all_whitespace=True)
|
|
28
|
+
assert result == "HelloWorld"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_normalize_trim_whitespace():
|
|
32
|
+
result = normalize_text(" Hello World ")
|
|
33
|
+
assert result == "hello world"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_normalize_nfkc_and_case_fold():
|
|
37
|
+
result = normalize_text(u"①ABC def")
|
|
38
|
+
assert result == "1abc def"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_normalize_case_sensitive_option():
|
|
42
|
+
result = normalize_text("Hello World", ignore_case=False)
|
|
43
|
+
assert result == "Hello World"
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# tests/test_refinement.py
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
from src.smith_utils.numeric.refinement import parse_numeric_value
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def test_clean_numeric_valid_input():
|
|
8
|
+
assert parse_numeric_value("1,234.56") == 1234.56
|
|
9
|
+
assert parse_numeric_value("1234.56") == 1234.56
|
|
10
|
+
assert parse_numeric_value("1 234.56", sep=" ") == 1234.56
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def test_clean_numeric_invalid_input_raises_error():
|
|
14
|
+
with pytest.raises(ValueError):
|
|
15
|
+
parse_numeric_value("invalid")
|
|
16
|
+
with pytest.raises(ValueError):
|
|
17
|
+
parse_numeric_value("1234,56.78")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def test_clean_numeric_relaxed_mode_invalid_input():
|
|
21
|
+
assert parse_numeric_value("invalid", relaxed=True) == "invalid"
|
|
22
|
+
assert parse_numeric_value("1234,56.78", relaxed=True) == "1234,56.78"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_clean_numeric_none_value():
|
|
26
|
+
assert parse_numeric_value(None) == 0.0
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def test_clean_numeric_with_custom_separators():
|
|
30
|
+
assert parse_numeric_value("1.234,56", sep=".", decimal=",") == 1234.56
|
|
31
|
+
assert parse_numeric_value("1 234,56", sep=" ", decimal=",") == 1234.56
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_clean_numeric_negative_number():
|
|
35
|
+
assert parse_numeric_value("-1234.56") == -1234.56
|
|
36
|
+
assert parse_numeric_value("(1234.56)") == -1234.56
|
|
37
|
+
assert parse_numeric_value("(1,234.56)") == -1234.56
|