smith-utils 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Eiichi YAMAMOTO
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21
+ IN THE SOFTWARE.
@@ -0,0 +1,106 @@
1
+ Metadata-Version: 2.4
2
+ Name: smith-utils
3
+ Version: 0.1.0
4
+ Summary: A utility library for data cleaning and parsing.
5
+ Author-email: Eiichi YAMAMOTO <info@yeiichi.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 Eiichi YAMAMOTO
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in
18
+ all copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
26
+ IN THE SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/yeiichi/smith-utils
29
+ Project-URL: Repository, https://github.com/yeiichi/smith-utils
30
+ Keywords: csv,deduplication,data-filtering,file-organization,filtering
31
+ Classifier: Programming Language :: Python :: 3
32
+ Classifier: Programming Language :: Python :: 3 :: Only
33
+ Classifier: License :: OSI Approved :: MIT License
34
+ Classifier: Intended Audience :: Developers
35
+ Classifier: Topic :: Software Development :: Libraries
36
+ Classifier: Topic :: Utilities
37
+ Requires-Python: >=3.10
38
+ Description-Content-Type: text/markdown
39
+ License-File: LICENSE
40
+ Provides-Extra: docs
41
+ Requires-Dist: sphinx<9,>=8; extra == "docs"
42
+ Requires-Dist: furo>=2024.8.6; extra == "docs"
43
+ Dynamic: license-file
44
+
45
+ # smith-utils
46
+ [![PyPI version](https://img.shields.io/pypi/v/smith-utils.svg)](https://pypi.org/project/smith-utils/)
47
+ ![Python versions](https://img.shields.io/pypi/pyversions/smith-utils.svg)
48
+ ![Status](https://img.shields.io/badge/status-Alpha-orange.svg)
49
+ ![License](https://img.shields.io/badge/license-MIT-blue.svg)
50
+ ![Tests](https://img.shields.io/badge/tests-passing-brightgreen.svg)
51
+ [![Documentation Status](https://readthedocs.org/projects/smith-utils/badge/?version=latest)](https://smith-utils.readthedocs.io/en/latest/?badge=latest)
52
+
53
+
54
+ **Smith Utils** is a central hub for data cleaning and parsing scripts.
55
+ This package consolidates distributed utility functions to improve code reuse and maintenance efficiency across all yeiichi projects.
56
+
57
+ ## Key Features
58
+
59
+ ### 📅 Datetime Utilities (`smith_utils.datetime`)
60
+ Robust date parsing and formatting.
61
+ - `ensure_date`: Flexible conversion of strings, `datetime.date` objects, or `None` (returns today) into a `date` object.
62
+ - `parse_strict_date`: Strict parsing for `YYYYMMDD` or `YYYY-MM-DD` formats, rejecting ambiguous inputs.
63
+ - `format_ordinal`: Converts integers to ordinal strings (e.g., `1` → `"1st"`, `22` → `"22nd"`).
64
+
65
+ ### 🔢 Numeric Refinement (`smith_utils.numeric`)
66
+ Clean and parse messy numeric data.
67
+ - `parse_numeric_value`: Handles custom separators, decimals, and negative formats like `(1,234.56)`.
68
+ - `parse_currency_value`: Alias for numeric parsing, specifically for currency strings.
69
+
70
+ ### 📝 Text Normalization & Metrics (`smith_utils.text`)
71
+ Standardize text and compare string similarity.
72
+ - `normalize_text`: Unicode NFKC normalization, case folding, and whitespace handling.
73
+ - `StringDistance`: Implementation of Damerau-Levenshtein and Jaro-Winkler algorithms for fuzzy matching.
74
+
75
+ ## Installation
76
+
77
+ Install via pip:
78
+
79
+ ```bash
80
+ pip install smith-utils
81
+ ```
82
+
83
+ ## Quick Start
84
+
85
+ ```python
86
+ from smith_utils.datetime.date_utils import ensure_date
87
+ from smith_utils.numeric.refinement import parse_numeric_value
88
+ from smith_utils.text.normalization import normalize_text
89
+
90
+ # Datetime
91
+ date = ensure_date("20231225") # datetime.date(2023, 12, 25)
92
+
93
+ # Numeric
94
+ value = parse_numeric_value("(1,250.50)") # -1250.5
95
+
96
+ # Text
97
+ clean_text = normalize_text(" Smith Utils ") # "smith utils"
98
+ ```
99
+
100
+ ## Directory Structure
101
+ - `src/smith_utils/`: Main package source.
102
+ - `legacy/`: Legacy scripts and templates (not included in distribution).
103
+ - `tests/`: Comprehensive test suite.
104
+
105
+ ## License
106
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,62 @@
1
+ # smith-utils
2
+ [![PyPI version](https://img.shields.io/pypi/v/smith-utils.svg)](https://pypi.org/project/smith-utils/)
3
+ ![Python versions](https://img.shields.io/pypi/pyversions/smith-utils.svg)
4
+ ![Status](https://img.shields.io/badge/status-Alpha-orange.svg)
5
+ ![License](https://img.shields.io/badge/license-MIT-blue.svg)
6
+ ![Tests](https://img.shields.io/badge/tests-passing-brightgreen.svg)
7
+ [![Documentation Status](https://readthedocs.org/projects/smith-utils/badge/?version=latest)](https://smith-utils.readthedocs.io/en/latest/?badge=latest)
8
+
9
+
10
+ **Smith Utils** is a central hub for data cleaning and parsing scripts.
11
+ This package consolidates distributed utility functions to improve code reuse and maintenance efficiency across all yeiichi projects.
12
+
13
+ ## Key Features
14
+
15
+ ### 📅 Datetime Utilities (`smith_utils.datetime`)
16
+ Robust date parsing and formatting.
17
+ - `ensure_date`: Flexible conversion of strings, `datetime.date` objects, or `None` (returns today) into a `date` object.
18
+ - `parse_strict_date`: Strict parsing for `YYYYMMDD` or `YYYY-MM-DD` formats, rejecting ambiguous inputs.
19
+ - `format_ordinal`: Converts integers to ordinal strings (e.g., `1` → `"1st"`, `22` → `"22nd"`).
20
+
21
+ ### 🔢 Numeric Refinement (`smith_utils.numeric`)
22
+ Clean and parse messy numeric data.
23
+ - `parse_numeric_value`: Handles custom separators, decimals, and negative formats like `(1,234.56)`.
24
+ - `parse_currency_value`: Alias of `parse_numeric_value` for call sites that handle monetary amounts; it applies the same parsing rules and does not strip currency symbols.
25
+
26
+ ### 📝 Text Normalization & Metrics (`smith_utils.text`)
27
+ Standardize text and compare string similarity.
28
+ - `normalize_text`: Unicode NFKC normalization, case folding, and whitespace handling.
29
+ - `StringDistance`: Implementation of Damerau-Levenshtein and Jaro-Winkler algorithms for fuzzy matching.
30
+
31
+ ## Installation
32
+
33
+ Install via pip:
34
+
35
+ ```bash
36
+ pip install smith-utils
37
+ ```
38
+
39
+ ## Quick Start
40
+
41
+ ```python
42
+ from smith_utils.datetime.date_utils import ensure_date
43
+ from smith_utils.numeric.refinement import parse_numeric_value
44
+ from smith_utils.text.normalization import normalize_text
45
+
46
+ # Datetime
47
+ date = ensure_date("20231225") # datetime.date(2023, 12, 25)
48
+
49
+ # Numeric
50
+ value = parse_numeric_value("(1,250.50)") # -1250.5
51
+
52
+ # Text
53
+ clean_text = normalize_text(" Smith Utils ") # "smith utils"
54
+ ```
55
+
56
+ ## Directory Structure
57
+ - `src/smith_utils/`: Main package source.
58
+ - `legacy/`: Legacy scripts and templates (not included in distribution).
59
+ - `tests/`: Comprehensive test suite.
60
+
61
+ ## License
62
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,47 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "smith-utils"
7
+ version = "0.1.0"
8
+ description = "A utility library for data cleaning and parsing."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { file = "LICENSE" }
12
+
13
+ authors = [
14
+ { name = "Eiichi YAMAMOTO", email = "info@yeiichi.com" }
15
+ ]
16
+
17
+ keywords = [
18
+ "csv",
19
+ "deduplication",
20
+ "data-filtering",
21
+ "file-organization",
22
+ "filtering",
23
+ ]
24
+
25
+ classifiers = [
26
+ "Programming Language :: Python :: 3",
27
+ "Programming Language :: Python :: 3 :: Only",
28
+ "License :: OSI Approved :: MIT License",
29
+ "Intended Audience :: Developers",
30
+ "Topic :: Software Development :: Libraries",
31
+ "Topic :: Utilities",
32
+ ]
33
+
34
+ dependencies = []
35
+
36
+ [project.optional-dependencies]
37
+ docs = [
38
+ "sphinx>=8,<9",
39
+ "furo>=2024.8.6",
40
+ ]
41
+
42
+ [project.urls]
43
+ Homepage = "https://github.com/yeiichi/smith-utils"
44
+ Repository = "https://github.com/yeiichi/smith-utils"
45
+
46
+ [tool.setuptools.packages.find]
47
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,14 @@
1
+ from .numeric import parse_numeric_value, parse_currency_value
2
+ from .text import StringDistance, analyze_pair, normalize_text
3
+ from .datetime import format_ordinal, parse_strict_date, ensure_date
4
+
5
+ __all__ = [
6
+ "parse_numeric_value",
7
+ "parse_currency_value",
8
+ "StringDistance",
9
+ "analyze_pair",
10
+ "normalize_text",
11
+ "format_ordinal",
12
+ "parse_strict_date",
13
+ "ensure_date",
14
+ ]
@@ -0,0 +1,7 @@
1
+ from .date_utils import format_ordinal, parse_strict_date, ensure_date
2
+
3
+ __all__ = [
4
+ "format_ordinal",
5
+ "parse_strict_date",
6
+ "ensure_date",
7
+ ]
@@ -0,0 +1,111 @@
1
+ # src/smith_utils/datetime/date_utils.py
2
+ import datetime
3
+ import re
4
+
5
+ DATE_FORMAT_BASIC = "%Y%m%d"
6
+ DATE_FORMAT_EXTENDED = "%Y-%m-%d"
7
+
8
+
9
def format_ordinal(n: int) -> str:
    """
    Return ``n`` followed by its English ordinal suffix.

    The suffix is chosen from the absolute value of ``n``, while ``n`` itself
    (sign included) is what gets formatted, so ``-1`` becomes ``"-1st"``.

    Args:
        n (int): The number to render as an ordinal.

    Returns:
        str: ``n`` with "st", "nd", "rd" or "th" appended.
    """
    magnitude = abs(n)

    # Values whose last two digits are 11, 12 or 13 always take "th",
    # even though they end in 1, 2 or 3.
    is_teen = 11 <= (magnitude % 100) <= 13
    if is_teen:
        return f"{n}th"

    last_digit_suffixes = {1: 'st', 2: 'nd', 3: 'rd'}
    ending = last_digit_suffixes.get(magnitude % 10, 'th')
    return f"{n}{ending}"
29
+
30
+
31
def parse_strict_date(date_str: str) -> datetime.date:
    """
    Parse a ``YYYYMMDD`` or ``YYYY-MM-DD`` string into a ``datetime.date``.

    Only the text before the first ``'T'`` and, within that, before the first
    space is examined; anything after it (a time or time-zone suffix) is
    dropped without any time-zone conversion, so the result is simply the
    calendar date as written.

    Note:
        Parsing is delegated to ``datetime.datetime.strptime``. An 8-digit
        input that is not a real calendar date therefore raises the
        ``ValueError`` produced by ``strptime`` itself rather than one of the
        messages below. ``strptime`` also treats leading zeros in ``%m`` and
        ``%d`` as optional, so a dashed input such as ``2026-4-3`` is accepted.

    Parameters:
        date_str (str): The text to parse.

    Returns:
        datetime.date: The parsed calendar date.

    Raises:
        ValueError: If the input looks ambiguous (6 or 7 digits in total, or a
            bare ``N-N`` pair) or does not match either supported format.
    """
    leading_token = date_str.split('T')[0]
    date_part = leading_token.split(' ')[0]
    digits_only = re.sub(r'[^0-9]', '', date_part)
    digit_count = len(digits_only)

    # Basic format: exactly eight digits and nothing else.
    if digit_count == 8 and date_part.isdigit():
        parsed = datetime.datetime.strptime(digits_only, DATE_FORMAT_BASIC)
        return parsed.date()

    # Extended format: only attempted when a dash is present; a failed
    # attempt falls through to the error classification below.
    if '-' in date_part:
        try:
            parsed = datetime.datetime.strptime(date_part, DATE_FORMAT_EXTENDED)
        except ValueError:
            parsed = None
        if parsed is not None:
            return parsed.date()

    looks_ambiguous = digit_count in (6, 7) or (
        re.match(r'^\d{1,2}-\d{1,2}$', date_part)
    )
    if looks_ambiguous:
        raise ValueError(
            f"Ambiguous date '{date_str}' rejected. "
            "Please use 8-digit YYYYMMDD or delimiters (YYYY-MM-DD)."
        )
    raise ValueError(f"Could not parse '{date_str}'. Expected YYYY-MM-DD or YYYYMMDD.")
76
+
77
+
78
+ def ensure_date(date_input: str | datetime.date | None) -> datetime.date:
79
+ """
80
+ Converts various date input formats into a `datetime.date` object.
81
+
82
+ This function accepts a string, a `datetime.date` object, or a `None`
83
+ value to produce a `datetime.date` object. If the input is `None`, the
84
+ current date is returned. String inputs are first stripped of leading
85
+ and trailing whitespace, and the function attempts to parse the string
86
+ as an ISO 8601 formatted date. If the ISO 8601 parsing fails, the input
87
+ is passed to a fallback parsing function for stricter date parsing.
88
+
89
+ Raises:
90
+ ValueError: If the input string cannot be parsed into a valid date
91
+ via both ISO 8601 and the fallback parsing mechanism.
92
+
93
+ Parameters:
94
+ date_input (str | datetime.date | None): The input to be converted
95
+ to a `datetime.date` object. This can be a string representing a
96
+ date, a `datetime.date` object itself, or `None`.
97
+
98
+ Returns:
99
+ datetime.date: A valid `datetime.date` object corresponding to the
100
+ provided input, or the current date if the input is `None`.
101
+ """
102
+ if not date_input:
103
+ return datetime.date.today()
104
+ if isinstance(date_input, datetime.date):
105
+ return date_input
106
+
107
+ clean_input = str(date_input).strip()
108
+ try:
109
+ return datetime.date.fromisoformat(clean_input)
110
+ except ValueError:
111
+ return parse_strict_date(clean_input)
@@ -0,0 +1,6 @@
1
+ from .refinement import parse_numeric_value, parse_currency_value
2
+
3
+ __all__ = [
4
+ "parse_numeric_value",
5
+ "parse_currency_value",
6
+ ]
@@ -0,0 +1,55 @@
1
+ def parse_numeric_value(val: str | float | None, sep: str = ",", decimal: str = ".", relaxed: bool = False) -> float | str:
2
+ if val is None:
3
+ return 0.0
4
+
5
+ s = str(val).strip()
6
+ is_negative = False
7
+
8
+ if s.startswith("-"):
9
+ is_negative = True
10
+ s = s[1:].strip()
11
+ elif s.startswith("(") and s.endswith(")"):
12
+ is_negative = True
13
+ s = s[1:-1].strip()
14
+
15
+ # Split by decimal to validate groups
16
+ parts = s.split(decimal)
17
+ if len(parts) > 2:
18
+ if relaxed:
19
+ return val
20
+ raise ValueError(f"Invalid numeric value: '{val}'")
21
+
22
+ # Validate integer part groups if 'sep' is used
23
+ int_part = parts[0]
24
+ if sep in int_part:
25
+ groups = int_part.split(sep)
26
+ # All groups except the first must be exactly 3 digits
27
+ # This is a common rule for thousand separators
28
+ if any(len(g) != 3 for g in groups[1:]):
29
+ if relaxed:
30
+ return val
31
+ raise ValueError(f"Invalid numeric value: '{val}'")
32
+
33
+ s_cleaned = s.replace(sep, "")
34
+ if decimal != ".":
35
+ s_cleaned = s_cleaned.replace(decimal, ".")
36
+
37
+ try:
38
+ # Check if separator appears after decimal
39
+ if sep in s and decimal in s and s.rfind(sep) > s.find(decimal):
40
+ raise ValueError
41
+
42
+ num = float(s_cleaned)
43
+ # Check if it was purely digits + decimal
44
+ if not s_cleaned.replace(".", "", 1).isdigit():
45
+ raise ValueError
46
+
47
+ return -num if is_negative else num
48
+ except ValueError:
49
+ if relaxed:
50
+ return val
51
+ raise ValueError(f"Invalid numeric value: '{val}'")
52
+
53
+
54
def parse_currency_value(val: str | float | None, sep: str = ",", decimal: str = ".", relaxed: bool = False) -> float | str:
    """
    Parse a monetary amount by delegating to ``parse_numeric_value``.

    All four arguments are forwarded unchanged and the delegate's result is
    returned as-is; no currency-specific processing happens here.
    """
    return parse_numeric_value(val, sep, decimal, relaxed)
@@ -0,0 +1,10 @@
1
+ from .metrics import StringDistance, analyze_pair, Result, Relation
2
+ from .normalization import normalize_text
3
+
4
+ __all__ = [
5
+ "StringDistance",
6
+ "analyze_pair",
7
+ "Result",
8
+ "Relation",
9
+ "normalize_text",
10
+ ]
@@ -0,0 +1,203 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from enum import Enum, auto
5
+
6
+
7
class Relation(Enum):
    """Structural relationship between two strings."""

    # Values are the integers ``auto()`` would assign, in declaration order.
    EXACT_MATCH = 1
    CASE_INSENSITIVE_MATCH = 2
    WHITESPACE_TRIMMED_MATCH = 3
    NORMALIZED_SPACE_MATCH = 4
    NO_STRUCTURAL_MATCH = 5
13
+
14
+
15
@dataclass
class Result:
    """
    Outcome of comparing two strings.

    Bundles the structural classification with the numeric similarity
    metrics, and offers a human-readable label for the classification.
    """
    # Structural relationship between the two strings.
    classification: Relation
    # Edit distance from the Damerau-Levenshtein calculation.
    damerau_levenshtein_distance: int
    # Jaro-Winkler similarity score.
    jaro_winkler_score: float
    # Similarity expressed as a percentage.
    similarity_percentage: float

    def get_relation_string(self) -> str:
        """Return the display label for ``classification`` ("Different" if none applies)."""
        labels = (
            (Relation.EXACT_MATCH, "Identical"),
            (Relation.CASE_INSENSITIVE_MATCH, "Case-Insensitive Match"),
            (Relation.WHITESPACE_TRIMMED_MATCH, "Similar (Trimmed)"),
            (Relation.NORMALIZED_SPACE_MATCH, "Synonymous (No Spaces)"),
        )
        for relation, label in labels:
            if self.classification is relation:
                return label
        return "Different"
39
+
40
+
41
class StringDistance:
    """
    Static helpers for comparing two strings.

    Offers a structural classification (exact, case-insensitive,
    trimmed-whitespace or whitespace-free match), the restricted
    Damerau-Levenshtein edit distance, and the Jaro-Winkler similarity score.
    ``analyze`` combines all of them into a ``Result``.

    Caseless comparison uses ``str.casefold()`` so that pairs such as
    "Straße" / "STRASSE" are recognised as equal ignoring case.
    """

    @staticmethod
    def analyze(a: str, b: str, ignore_case: bool = False) -> Result:
        """
        Compare ``a`` and ``b`` and return classification plus metrics.

        Both inputs are converted with ``str()``. The classification is always
        computed on the text as given; ``ignore_case`` only affects the
        distance, score and percentage, which are then computed on the
        case-folded strings.

        The percentage is ``(1 - distance / max_len) * 100`` where ``max_len``
        is the longer length, or ``100.0`` when both strings are empty.
        """
        sa = str(a)
        sb = str(b)

        relation = StringDistance.classify(sa, sb)

        if ignore_case:
            sa = sa.casefold()
            sb = sb.casefold()

        d_dist = StringDistance.calculate_damerau_levenshtein(sa, sb)
        jw_score = StringDistance.calculate_jaro_winkler(sa, sb)

        max_len = max(len(sa), len(sb))
        similarity = 100.0 if max_len == 0 else (1.0 - d_dist / max_len) * 100.0

        return Result(
            classification=relation,
            damerau_levenshtein_distance=d_dist,
            jaro_winkler_score=jw_score,
            similarity_percentage=similarity,
        )

    @staticmethod
    def trim(s: str) -> str:
        """Return ``s`` without leading and trailing whitespace."""
        return s.strip()

    @staticmethod
    def strip_all(s: str) -> str:
        """
        Remove all whitespace characters from a string.

        Uses split/join logic so any whitespace character acts as a separator,
        including spaces, tabs, and newlines.
        """
        return "".join(s.split())

    @staticmethod
    def equals_ignore_case(a: str, b: str) -> bool:
        """Return True when ``a`` and ``b`` are equal after case folding."""
        return a.casefold() == b.casefold()

    @staticmethod
    def classify(a: str, b: str) -> Relation:
        """
        Return the first relation that holds, checked in this order:
        exact equality, equality ignoring case, equality after trimming outer
        whitespace, equality after removing all whitespace; otherwise
        ``NO_STRUCTURAL_MATCH``.
        """
        if a == b:
            return Relation.EXACT_MATCH
        if StringDistance.equals_ignore_case(a, b):
            return Relation.CASE_INSENSITIVE_MATCH
        if StringDistance.trim(a) == StringDistance.trim(b):
            return Relation.WHITESPACE_TRIMMED_MATCH
        if StringDistance.strip_all(a) == StringDistance.strip_all(b):
            return Relation.NORMALIZED_SPACE_MATCH
        return Relation.NO_STRUCTURAL_MATCH

    @staticmethod
    def calculate_damerau_levenshtein(s1: str, s2: str) -> int:
        """
        Restricted Damerau-Levenshtein distance:
        insertion, deletion, substitution, adjacent transposition.
        """
        m = len(s1)
        n = len(s2)

        # d[i][j] is the distance between s1[:i] and s2[:j].
        d = [[0] * (n + 1) for _ in range(m + 1)]

        # Turning a prefix into the empty string (or back) costs its length.
        for i in range(m + 1):
            d[i][0] = i
        for j in range(n + 1):
            d[0][j] = j

        for i in range(1, m + 1):
            for j in range(1, n + 1):
                cost = 0 if s1[i - 1] == s2[j - 1] else 1

                d[i][j] = min(
                    d[i - 1][j] + 1,  # deletion
                    d[i][j - 1] + 1,  # insertion
                    d[i - 1][j - 1] + cost,  # substitution
                )

                # The last two characters of each prefix are swapped.
                if (
                    i > 1
                    and j > 1
                    and s1[i - 1] == s2[j - 2]
                    and s1[i - 2] == s2[j - 1]
                ):
                    d[i][j] = min(d[i][j], d[i - 2][j - 2] + cost)

        return d[m][n]

    @staticmethod
    def calculate_jaro_winkler(s1: str, s2: str) -> float:
        """
        Return the Jaro-Winkler similarity of ``s1`` and ``s2``.

        Two empty strings score ``1.0``; exactly one empty string, or no
        matching characters at all, scores ``0.0``. The common-prefix bonus
        uses a scaling factor of 0.1 over at most 4 leading characters.
        """
        len1 = len(s1)
        len2 = len(s2)

        if len1 == 0 and len2 == 0:
            return 1.0
        if len1 == 0 or len2 == 0:
            return 0.0

        # Characters match only if they sit within this many positions.
        match_distance = max(max(len1, len2) // 2 - 1, 0)

        s1_matches = [False] * len1
        s2_matches = [False] * len2

        matches = 0
        for i in range(len1):
            start = max(0, i - match_distance)
            end = min(i + match_distance + 1, len2)

            for j in range(start, end):
                if not s2_matches[j] and s1[i] == s2[j]:
                    s1_matches[i] = True
                    s2_matches[j] = True
                    matches += 1
                    break

        if matches == 0:
            return 0.0

        # Walk the matched characters of both strings in order; every
        # position where they differ counts as half a transposition.
        transpositions = 0
        k = 0
        for i in range(len1):
            if not s1_matches[i]:
                continue
            while not s2_matches[k]:
                k += 1
            if s1[i] != s2[k]:
                transpositions += 1
            k += 1

        jaro = (
            matches / len1
            + matches / len2
            + (matches - transpositions / 2.0) / matches
        ) / 3.0

        scaling = 0.1
        max_prefix = 4
        prefix_len = 0
        while (
            prefix_len < min(len1, len2, max_prefix)
            and s1[prefix_len] == s2[prefix_len]
        ):
            prefix_len += 1

        return jaro + (prefix_len * scaling * (1.0 - jaro))
200
+
201
+
202
def analyze_pair(a: str, b: str, ignore_case: bool = False) -> Result:
    """Module-level shortcut that forwards its arguments to ``StringDistance.analyze``."""
    return StringDistance.analyze(a, b, ignore_case=ignore_case)
@@ -0,0 +1,43 @@
1
+ import unicodedata
2
+
3
+
4
def normalize_text(text, ignore_case=True, remove_all_whitespace=False, nfkc=True):
    """
    Normalize text for comparison: Unicode form, whitespace, then letter case.

    Parameters:
        text (str or None): The value to normalize. ``None`` yields an empty
            string; any other value is converted with ``str()`` first.
        ignore_case (bool, optional): Lowercase the result. Defaults to True.
        remove_all_whitespace (bool, optional): Remove every whitespace
            character. When False, outer whitespace is still trimmed and each
            internal run of whitespace becomes a single space. Defaults to
            False.
        nfkc (bool, optional): Apply Unicode NFKC normalization. Defaults to
            True.

    Returns:
        str: The normalized text.
    """
    if text is None:
        return ""

    # str() lets non-string values (e.g. numeric cells) pass through safely.
    normalized = str(text)

    # 1. Unicode compatibility form (full-width characters, ligatures).
    if nfkc:
        normalized = unicodedata.normalize('NFKC', normalized)

    # 2. Whitespace: split() discards leading/trailing whitespace and treats
    #    every internal run as one separator; the joiner decides whether the
    #    words end up glued together or separated by a single space.
    joiner = "" if remove_all_whitespace else " "
    normalized = joiner.join(normalized.split())

    # 3. Letter case.
    return normalized.lower() if ignore_case else normalized
@@ -0,0 +1,106 @@
1
+ Metadata-Version: 2.4
2
+ Name: smith-utils
3
+ Version: 0.1.0
4
+ Summary: A utility library for data cleaning and parsing.
5
+ Author-email: Eiichi YAMAMOTO <info@yeiichi.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 Eiichi YAMAMOTO
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in
18
+ all copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
26
+ IN THE SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/yeiichi/smith-utils
29
+ Project-URL: Repository, https://github.com/yeiichi/smith-utils
30
+ Keywords: csv,deduplication,data-filtering,file-organization,filtering
31
+ Classifier: Programming Language :: Python :: 3
32
+ Classifier: Programming Language :: Python :: 3 :: Only
33
+ Classifier: License :: OSI Approved :: MIT License
34
+ Classifier: Intended Audience :: Developers
35
+ Classifier: Topic :: Software Development :: Libraries
36
+ Classifier: Topic :: Utilities
37
+ Requires-Python: >=3.10
38
+ Description-Content-Type: text/markdown
39
+ License-File: LICENSE
40
+ Provides-Extra: docs
41
+ Requires-Dist: sphinx<9,>=8; extra == "docs"
42
+ Requires-Dist: furo>=2024.8.6; extra == "docs"
43
+ Dynamic: license-file
44
+
45
+ # smith-utils
46
+ [![PyPI version](https://img.shields.io/pypi/v/smith-utils.svg)](https://pypi.org/project/smith-utils/)
47
+ ![Python versions](https://img.shields.io/pypi/pyversions/smith-utils.svg)
48
+ ![Status](https://img.shields.io/badge/status-Alpha-orange.svg)
49
+ ![License](https://img.shields.io/badge/license-MIT-blue.svg)
50
+ ![Tests](https://img.shields.io/badge/tests-passing-brightgreen.svg)
51
+ [![Documentation Status](https://readthedocs.org/projects/smith-utils/badge/?version=latest)](https://smith-utils.readthedocs.io/en/latest/?badge=latest)
52
+
53
+
54
+ **Smith Utils** is a central hub for data cleaning and parsing scripts.
55
+ This package consolidates distributed utility functions to improve code reuse and maintenance efficiency across all yeiichi projects.
56
+
57
+ ## Key Features
58
+
59
+ ### 📅 Datetime Utilities (`smith_utils.datetime`)
60
+ Robust date parsing and formatting.
61
+ - `ensure_date`: Flexible conversion of strings, `datetime.date` objects, or `None` (returns today) into a `date` object.
62
+ - `parse_strict_date`: Strict parsing for `YYYYMMDD` or `YYYY-MM-DD` formats, rejecting ambiguous inputs.
63
+ - `format_ordinal`: Converts integers to ordinal strings (e.g., `1` → `"1st"`, `22` → `"22nd"`).
64
+
65
+ ### 🔢 Numeric Refinement (`smith_utils.numeric`)
66
+ Clean and parse messy numeric data.
67
+ - `parse_numeric_value`: Handles custom separators, decimals, and negative formats like `(1,234.56)`.
68
+ - `parse_currency_value`: Alias for numeric parsing, specifically for currency strings.
69
+
70
+ ### 📝 Text Normalization & Metrics (`smith_utils.text`)
71
+ Standardize text and compare string similarity.
72
+ - `normalize_text`: Unicode NFKC normalization, case folding, and whitespace handling.
73
+ - `StringDistance`: Implementation of Damerau-Levenshtein and Jaro-Winkler algorithms for fuzzy matching.
74
+
75
+ ## Installation
76
+
77
+ Install via pip:
78
+
79
+ ```bash
80
+ pip install smith-utils
81
+ ```
82
+
83
+ ## Quick Start
84
+
85
+ ```python
86
+ from smith_utils.datetime.date_utils import ensure_date
87
+ from smith_utils.numeric.refinement import parse_numeric_value
88
+ from smith_utils.text.normalization import normalize_text
89
+
90
+ # Datetime
91
+ date = ensure_date("20231225") # datetime.date(2023, 12, 25)
92
+
93
+ # Numeric
94
+ value = parse_numeric_value("(1,250.50)") # -1250.5
95
+
96
+ # Text
97
+ clean_text = normalize_text(" Smith Utils ") # "smith utils"
98
+ ```
99
+
100
+ ## Directory Structure
101
+ - `src/smith_utils/`: Main package source.
102
+ - `legacy/`: Legacy scripts and templates (not included in distribution).
103
+ - `tests/`: Comprehensive test suite.
104
+
105
+ ## License
106
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,20 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/smith_utils/__init__.py
5
+ src/smith_utils.egg-info/PKG-INFO
6
+ src/smith_utils.egg-info/SOURCES.txt
7
+ src/smith_utils.egg-info/dependency_links.txt
8
+ src/smith_utils.egg-info/requires.txt
9
+ src/smith_utils.egg-info/top_level.txt
10
+ src/smith_utils/datetime/__init__.py
11
+ src/smith_utils/datetime/date_utils.py
12
+ src/smith_utils/numeric/__init__.py
13
+ src/smith_utils/numeric/refinement.py
14
+ src/smith_utils/text/__init__.py
15
+ src/smith_utils/text/metrics.py
16
+ src/smith_utils/text/normalization.py
17
+ tests/test_date_utils.py
18
+ tests/test_metrics.py
19
+ tests/test_normalization.py
20
+ tests/test_refinement.py
@@ -0,0 +1,4 @@
1
+
2
+ [docs]
3
+ sphinx<9,>=8
4
+ furo>=2024.8.6
@@ -0,0 +1 @@
1
+ smith_utils
@@ -0,0 +1,68 @@
"""Tests for ``format_ordinal`` and ``parse_strict_date`` from ``date_utils``."""

import datetime
import re

import pytest

from src.smith_utils.datetime.date_utils import format_ordinal, parse_strict_date


def test_format_ordinal_single_digits():
    """Single-digit values receive the st/nd/rd/th suffix for their digit."""
    assert format_ordinal(1) == "1st"
    assert format_ordinal(2) == "2nd"
    assert format_ordinal(3) == "3rd"
    assert format_ordinal(4) == "4th"
    assert format_ordinal(9) == "9th"


def test_format_ordinal_double_digits():
    """11, 12 and 13 always take "th"; 21 and 22 follow their last digit."""
    assert format_ordinal(11) == "11th"
    assert format_ordinal(12) == "12th"
    assert format_ordinal(13) == "13th"
    assert format_ordinal(21) == "21st"
    assert format_ordinal(22) == "22nd"


def test_format_ordinal_large_numbers():
    """Three-digit values ending in 11-13 take "th"; 121 takes "st"."""
    assert format_ordinal(100) == "100th"
    assert format_ordinal(111) == "111th"
    assert format_ordinal(112) == "112th"
    assert format_ordinal(113) == "113th"
    assert format_ordinal(121) == "121st"


def test_format_ordinal_negative_numbers():
    """Negative values keep their sign and use the suffix of the magnitude."""
    assert format_ordinal(-1) == "-1st"
    assert format_ordinal(-2) == "-2nd"
    assert format_ordinal(-3) == "-3rd"
    assert format_ordinal(-11) == "-11th"
    assert format_ordinal(-22) == "-22nd"


def test_format_ordinal_edge_cases():
    """Zero and round thousands both take "th"."""
    assert format_ordinal(0) == "0th"
    assert format_ordinal(1000) == "1000th"


def test_parse_strict_date_basic_format():
    """Compact ``YYYYMMDD`` strings parse to the matching ``datetime.date``."""
    assert parse_strict_date("20260413") == datetime.date(2026, 4, 13)
    assert parse_strict_date("19991231") == datetime.date(1999, 12, 31)


def test_parse_strict_date_extended_format():
    """Hyphenated ``YYYY-MM-DD`` strings parse to the matching ``datetime.date``."""
    assert parse_strict_date("2026-04-13") == datetime.date(2026, 4, 13)
    assert parse_strict_date("1999-12-31") == datetime.date(1999, 12, 31)


def test_parse_strict_date_invalid_formats():
    """Unsupported layouts raise ``ValueError`` carrying the expected message."""
    for raw in ("13-04-2026", "2026/04/13"):
        expected = f"Could not parse '{raw}'. Expected YYYY-MM-DD or YYYYMMDD."
        # ``match`` is applied with ``re.search``; escaping keeps the dots in
        # the message literal instead of acting as "any character" wildcards.
        with pytest.raises(ValueError, match=re.escape(expected)):
            parse_strict_date(raw)


def test_parse_strict_date_ambiguous_inputs():
    """Inputs such as '2026413' and '06-13' are rejected as ambiguous."""
    for raw in ("2026413", "06-13"):
        # Only the start of the message is pinned; whatever follows
        # "rejected. " is left unconstrained, as in the original patterns.
        prefix = f"Ambiguous date '{raw}' rejected. "
        with pytest.raises(ValueError, match=re.escape(prefix)):
            parse_strict_date(raw)
@@ -0,0 +1,74 @@
# tests/test_metrics.py
"""Tests for ``StringDistance``, ``Relation`` and ``Result`` in ``text.metrics``."""

from src.smith_utils.text.metrics import StringDistance, Relation, Result


def test_equals_ignore_case():
    """Strings that differ only by case are equal; different words are not."""
    for left, right in (("Test", "test"), ("TEST", "test")):
        assert StringDistance.equals_ignore_case(left, right)
    assert not StringDistance.equals_ignore_case("Test", "Another")


def test_trim():
    """``trim`` removes surrounding spaces and tabs and keeps inner text."""
    cases = (
        (" hello ", "hello"),
        ("\t\ttest\t", "test"),
        ("no_spaces", "no_spaces"),
    )
    for raw, expected in cases:
        assert StringDistance.trim(raw) == expected


def test_strip_all():
    """``strip_all`` removes every space and tab, including interior ones."""
    cases = (
        (" he llo ", "hello"),
        ("\t\tt e\ts\tt\t", "test"),
        ("no spaces\t", "nospaces"),
    )
    for raw, expected in cases:
        assert StringDistance.strip_all(raw) == expected


def test_classify_exact_match():
    """Identical strings classify as ``EXACT_MATCH``."""
    assert StringDistance.classify("hello", "hello") == Relation.EXACT_MATCH


def test_classify_case_insensitive_match():
    """Strings differing only by case classify as ``CASE_INSENSITIVE_MATCH``."""
    relation = StringDistance.classify("HELLO", "hello")
    assert relation == Relation.CASE_INSENSITIVE_MATCH


def test_classify_whitespace_trimmed_match():
    """Strings differing only by outer whitespace classify as trimmed matches."""
    relation = StringDistance.classify(" hello ", "hello")
    assert relation == Relation.WHITESPACE_TRIMMED_MATCH


def test_classify_normalized_space_match():
    """Strings differing only by interior spacing classify as normalized-space."""
    relation = StringDistance.classify("he llo", "hel lo")
    assert relation == Relation.NORMALIZED_SPACE_MATCH


def test_classify_no_structural_match():
    """Unrelated words classify as ``NO_STRUCTURAL_MATCH``."""
    relation = StringDistance.classify("hello", "world")
    assert relation == Relation.NO_STRUCTURAL_MATCH


def test_analyze_similarity_metrics_case_insensitive():
    """``analyze`` with ``ignore_case=True`` reports a perfect case-only match."""
    report = StringDistance.analyze("HELLO", "hello", ignore_case=True)
    assert isinstance(report, Result)
    assert report.classification == Relation.CASE_INSENSITIVE_MATCH
    assert report.damerau_levenshtein_distance == 0
    assert report.jaro_winkler_score > 0.9
    assert report.similarity_percentage == 100.0


def test_calculate_damerau_levenshtein():
    """The classic kitten/sitting pair has an edit distance of 3."""
    assert StringDistance.calculate_damerau_levenshtein("kitten", "sitting") == 3


def test_calculate_jaro_winkler():
    """The transposed pair MARTHA/MARHTA scores above 0.9."""
    assert StringDistance.calculate_jaro_winkler("MARTHA", "MARHTA") > 0.9


def test_analyze_empty_strings():
    """Two empty strings are an exact match with perfect scores."""
    report = StringDistance.analyze("", "", ignore_case=False)
    assert isinstance(report, Result)
    assert report.classification == Relation.EXACT_MATCH
    assert report.damerau_levenshtein_distance == 0
    assert report.jaro_winkler_score == 1.0
    assert report.similarity_percentage == 100.0
@@ -0,0 +1,43 @@
"""Tests for ``normalize_text`` from ``smith_utils.text.normalization``."""

from src.smith_utils.text.normalization import normalize_text


def test_normalize_none_input():
    """``None`` is normalized to the empty string."""
    assert normalize_text(None) == ""


def test_normalize_empty_string():
    """The empty string stays empty."""
    assert normalize_text("") == ""


def test_normalize_whitespaces_only():
    """Whitespace-only input collapses to the empty string."""
    assert normalize_text(" ") == ""


def test_normalize_lowercase_conversion():
    """With default options the text is lowercased."""
    result = normalize_text("Hello World")
    assert result == "hello world"


def test_normalize_unicodedata_nfkc():
    """Compatibility characters (circled digit, Roman numerals) are expanded."""
    # Plain literals: the Python 2 era ``u`` prefix is a no-op in Python 3.
    result = normalize_text("① Ⅱ Ⅲ")
    assert result == "1 ii iii"


def test_normalize_remove_all_whitespace():
    """``remove_all_whitespace=True`` drops interior whitespace as well."""
    result = normalize_text(" Hello World ", ignore_case=False, remove_all_whitespace=True)
    assert result == "HelloWorld"


def test_normalize_trim_whitespace():
    """Surrounding whitespace is trimmed with default options."""
    result = normalize_text(" Hello World ")
    assert result == "hello world"


def test_normalize_nfkc_and_case_fold():
    """Compatibility expansion and lowercasing are applied together."""
    result = normalize_text("①ABC def")
    assert result == "1abc def"


def test_normalize_case_sensitive_option():
    """``ignore_case=False`` preserves the original casing."""
    result = normalize_text("Hello World", ignore_case=False)
    assert result == "Hello World"
@@ -0,0 +1,37 @@
# tests/test_refinement.py
"""Tests for ``parse_numeric_value`` from ``smith_utils.numeric.refinement``."""

import pytest
from src.smith_utils.numeric.refinement import parse_numeric_value


def test_clean_numeric_valid_input():
    """Well-formed numbers parse with or without a thousands separator."""
    cases = (
        ("1,234.56", {}),
        ("1234.56", {}),
        ("1 234.56", {"sep": " "}),
    )
    for text, options in cases:
        assert parse_numeric_value(text, **options) == 1234.56


def test_clean_numeric_invalid_input_raises_error():
    """Unparseable input raises ``ValueError`` in the default (strict) mode."""
    for text in ("invalid", "1234,56.78"):
        with pytest.raises(ValueError):
            parse_numeric_value(text)


def test_clean_numeric_relaxed_mode_invalid_input():
    """With ``relaxed=True`` unparseable input is returned unchanged."""
    for text in ("invalid", "1234,56.78"):
        assert parse_numeric_value(text, relaxed=True) == text


def test_clean_numeric_none_value():
    """``None`` parses to ``0.0``."""
    parsed = parse_numeric_value(None)
    assert parsed == 0.0


def test_clean_numeric_with_custom_separators():
    """Custom thousands and decimal separators are honoured."""
    cases = (
        ("1.234,56", "."),
        ("1 234,56", " "),
    )
    for text, thousands in cases:
        assert parse_numeric_value(text, sep=thousands, decimal=",") == 1234.56


def test_clean_numeric_negative_number():
    """A leading minus sign and surrounding parentheses both mean negative."""
    for text in ("-1234.56", "(1234.56)", "(1,234.56)"):
        assert parse_numeric_value(text) == -1234.56