datacompose 0.2.9__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datacompose-0.2.9 → datacompose-0.4.0}/CHANGELOG.md +37 -0
- {datacompose-0.2.9/datacompose.egg-info → datacompose-0.4.0}/PKG-INFO +6 -6
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/operators/primitives.py +30 -27
- datacompose-0.4.0/datacompose/transformers/analytics/__init__.py +1 -0
- datacompose-0.4.0/datacompose/transformers/analytics/fuzzy_matching/__init__.py +1 -0
- datacompose-0.4.0/datacompose/transformers/analytics/fuzzy_matching/pyspark/__init__.py +2 -0
- datacompose-0.4.0/datacompose/transformers/analytics/fuzzy_matching/pyspark/pyspark_primitives.py +453 -0
- datacompose-0.4.0/datacompose/transformers/text/text/__init__.py +5 -0
- datacompose-0.4.0/datacompose/transformers/text/text/pyspark/__init__.py +5 -0
- datacompose-0.4.0/datacompose/transformers/text/text/pyspark/pyspark_primitives.py +1449 -0
- {datacompose-0.2.9 → datacompose-0.4.0/datacompose.egg-info}/PKG-INFO +6 -6
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose.egg-info/SOURCES.txt +12 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/pyproject.toml +6 -6
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/operators/test_primitives_complete.py +75 -4
- datacompose-0.4.0/tests/unit/transformers/analytics/test_fuzzy_matching.py +417 -0
- datacompose-0.4.0/tests/unit/transformers/text/test_text/__init__.py +1 -0
- datacompose-0.4.0/tests/unit/transformers/text/test_text/test_string_cleaning.py +533 -0
- datacompose-0.4.0/tests/unit/transformers/text/test_text/test_string_transformation.py +546 -0
- datacompose-0.4.0/tests/unit/transformers/text/test_text/test_string_validation.py +439 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/LICENSE +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/MANIFEST.in +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/README.md +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/__init__.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/cli/__init__.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/cli/colors.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/cli/commands/__init__.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/cli/commands/add.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/cli/commands/init.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/cli/commands/list.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/cli/config.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/cli/main.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/cli/validation.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/generators/__init__.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/generators/base.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/generators/pyspark/__init__.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/generators/pyspark/generator.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/operators/__init__.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/transformers/__init__.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/transformers/discovery.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/transformers/text/__init__.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/transformers/text/addresses/__init__.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/transformers/text/addresses/pyspark/pyspark_primitives.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/transformers/text/datetimes/pyspark/pyspark_primitives.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/transformers/text/emails/__init__.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/transformers/text/emails/pyspark/pyspark_primitives.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/transformers/text/phone_numbers/__init__.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/transformers/text/phone_numbers/pyspark/pyspark_primitives.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose.egg-info/dependency_links.txt +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose.egg-info/entry_points.txt +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose.egg-info/requires.txt +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/datacompose.egg-info/top_level.txt +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/setup.cfg +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/__init__.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/conftest.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/integration/__init__.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/integration/test_end_to_end.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/integration/test_full_workflow.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/integration/test_generated_imports.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/.venv/bin/activate_this.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/.venv/lib/python3.12/site-packages/_virtualenv.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/__init__.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/build/__init__.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/build/postgres/__init__.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/build/postgres/clean_emails/__init__.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/build/postgres/clean_emails/email_cleaner_udf_spec.yaml +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/build/postgres/clean_emails/test_email_cleaner_udf.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/build/spark/__init__.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/build/spark/clean_emails/__init__.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/build/spark/clean_emails/email_cleaner_udf.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/build/spark/clean_emails/email_cleaner_udf_spec.yaml +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/build/spark/clean_emails/test_email_cleaner_udf.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_add_command.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_add_command_complete.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_add_default_target.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_add_validation.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_config.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_init_command.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_init_command_complete.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_list_command.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_main.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_main_complete.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_validation_complete.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/generators/__init__.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/generators/test_base_generator.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/generators/test_spark_generator.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/operators/test_compose_conditions.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/operators/test_conditional_auto_detection.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/operators/test_conditional_core.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/operators/test_conditional_real_world.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/operators/test_operators.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/__init__.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/test_discovery.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/common/test_common.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_addresses/test_building_unit_extraction.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_addresses/test_city_state_extraction.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_addresses/test_clean_addresses.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_addresses/test_country_extraction.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_addresses/test_data_addresses.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_addresses/test_po_box_extraction.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_addresses/test_street_extraction.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_addresses/test_zip_code_extraction.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_datetimes/test_datetime_data_quality.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_datetimes/test_datetime_extraction.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_datetimes/test_datetime_integration.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_datetimes/test_datetime_performance.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_datetimes/test_datetime_regression.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_datetimes/test_datetime_timezones.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_emails/test_debug_long_emails.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_emails/test_email_extraction.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_emails/test_email_optimized.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_phone_numbers/test_phone_extraction.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_phone_numbers/test_phone_formatting.py +0 -0
- {datacompose-0.2.9 → datacompose-0.4.0}/tests/yaml_specs/__init__.py +0 -0
{datacompose-0.2.9 → datacompose-0.4.0}/CHANGELOG.md

@@ -7,6 +7,43 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
 
+## [0.4.0] - 2026-02-02
+
+### Added
+- **Fuzzy Matching Primitives**: New analytics module for string similarity and comparison
+  - **Distance functions**: `levenshtein`, `levenshtein_normalized`, `levenshtein_threshold`
+  - **Phonetic functions**: `soundex`, `soundex_match`
+  - **Token-based functions**: `jaccard_similarity`, `token_overlap`
+  - **N-gram functions**: `ngram_similarity`, `ngram_distance`
+  - **Utility functions**: `exact_match`, `contains_match`, `prefix_match`
+  - **Advanced**: `cosine_similarity` for term-frequency based comparison
+  - Multi-column support for row-wise comparisons
+  - All functions use native PySpark SQL functions (no UDFs) for optimal performance
+
+### Changed
+- **Primitives Module**: Updated to handle multi-column operations
+
+## [0.3.0] - 2026-01-01
+
+### Added
+- **Text Transformation Primitives**: New comprehensive text manipulation module (`text`) with 57 functions
+  - **Validation functions (14)**: `is_valid_hex`, `is_valid_base64`, `is_valid_url_encoded`, `has_control_characters`, `has_zero_width_characters`, `has_non_ascii`, `has_escape_sequences`, `has_url_encoding`, `has_html_entities`, `has_ansi_codes`, `has_non_printable`, `has_accents`, `has_unicode_issues`, `has_whitespace_issues`
+  - **Transformation functions (23)**: `hex_to_text`, `text_to_hex`, `clean_hex`, `extract_hex`, `decode_base64`, `encode_base64`, `clean_base64`, `extract_base64`, `decode_url`, `encode_url`, `decode_html_entities`, `encode_html_entities`, `unescape_string`, `escape_string`, `normalize_line_endings`, `to_ascii`, `to_codepoints`, `from_codepoints`, `reverse_string`, `truncate`, `pad_left`, `pad_right`
+  - **Cleaning functions (20)**: `remove_control_characters`, `remove_zero_width_characters`, `remove_non_printable`, `remove_ansi_codes`, `strip_invisible`, `remove_bom`, `normalize_unicode`, `remove_accents`, `normalize_whitespace`, `remove_html_tags`, `remove_urls`, `remove_emojis`, `remove_punctuation`, `remove_digits`, `remove_letters`, `remove_escape_sequences`, `strip_to_alphanumeric`, `clean_for_comparison`, `slugify`, `collapse_repeats`, `clean_string`
+  - All functions use native PySpark SQL functions (no UDFs) for optimal performance
+  - Comprehensive null and empty string handling
+  - 508 unit tests with full coverage
+
+### Fixed
+- **Text Primitives**: Various fixes to text transformation functions
+  - `decode_url`: Fixed %2B decoding to properly preserve literal plus signs vs form-encoded spaces
+  - `extract_hex`: Improved pattern to require `0x`/`#` prefix or MAC address format, avoiding false matches
+  - `extract_base64`: Improved pattern to require `=` padding or `base64,` prefix for reliable extraction
+  - `unescape_string`: Fixed backslash escape handling with placeholder approach
+  - `collapse_repeats`: Added working implementation for `max_repeat=2`
+  - `has_unicode_issues`: Added combining character detection (U+0300-U+036F range)
+  - `clean_string`: Fixed ANSI code removal order (must run before control char removal)
+
 ## [0.2.7.0] - 2025-09-11
 
 ### Fixed
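Both new releases repeat the same claim: "All functions use native PySpark SQL functions (no UDFs) for optimal performance." A minimal sketch (not from the package; `edit_distance` is a hypothetical stand-in) of what that buys: the native expression stays in the JVM and is visible to the Catalyst optimizer, while a Python UDF serializes every row out to a Python worker and back.

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.appName("udf-vs-native").getOrCreate()
df = spark.createDataFrame([("smith", "smyth")], ["a", "b"])

def edit_distance(s1: str, s2: str) -> int:
    # Plain dynamic-programming Levenshtein, for illustration only.
    prev = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1, 1):
        cur = [i]
        for j, c2 in enumerate(s2, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (c1 != c2)))
        prev = cur
    return prev[-1]

# UDF route: row-at-a-time Python execution, opaque to the optimizer.
dist_udf = F.udf(edit_distance, IntegerType())

# Native route: the approach the fuzzy primitives take below.
df.select(
    dist_udf("a", "b").alias("udf_distance"),
    F.levenshtein("a", "b").alias("native_distance"),
).show()
```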
{datacompose-0.2.9/datacompose.egg-info → datacompose-0.4.0}/PKG-INFO

@@ -1,15 +1,15 @@
 Metadata-Version: 2.4
 Name: datacompose
-Version: 0.2.9
+Version: 0.4.0
 Summary: Copy-pasteable data transformation primitives for PySpark. Inspired by shadcn-svelte.
 Author: Datacompose Contributors
 Maintainer: Datacompose Contributors
 License: MIT
-Project-URL: Homepage, https://github.com/…
-Project-URL: Documentation, https://…
-Project-URL: Repository, https://github.com/…
-Project-URL: Issues, https://github.com/…
-Project-URL: Changelog, https://github.com/…
+Project-URL: Homepage, https://github.com/datacompose/datacompose
+Project-URL: Documentation, https://datacompose.io
+Project-URL: Repository, https://github.com/datacompose/datacompose.git
+Project-URL: Issues, https://github.com/datacompose/datacompose/issues
+Project-URL: Changelog, https://github.com/datacompose/datacompose/blob/main/CHANGELOG.md
 Keywords: data-cleaning,data-quality,udf,spark,postgres,code-generation,data-pipeline,etl
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
{datacompose-0.2.9 → datacompose-0.4.0}/datacompose/operators/primitives.py

@@ -28,28 +28,31 @@ logger = logging.getLogger(__name__)
 class SmartPrimitive:
     """Wraps a PySpark column transformation function to enable partial application.
 
-    SmartPrimitive …
-    1. …
-    2. …
+    SmartPrimitive handles both single-column and multi-column transformations:
+    1. Single column: `primitive(col)` or `primitive(param=value)(col)`
+    2. Multi column: `primitive(col1, col2)` or `primitive(param=value)(col1, col2)`
 
-    …
-    into data pipelines.
+    The behavior is auto-detected based on the number of arguments passed.
 
-    Example:
+    Example (single column):
+        >>> @registry.register()
         >>> def trim_spaces(col, chars=' '):
-        ...     return …
-        >>> …
-        >>> trim = SmartPrimitive(trim_spaces)
+        ...     return F.trim(col, chars)
         >>>
-        >>> …
-        >>> …
+        >>> df.select(registry.trim_spaces(F.col("text")))
+        >>> configured = registry.trim_spaces(chars='\\t')
+        >>> df.select(configured(F.col("text")))
+
+    Example (multi column):
+        >>> @registry.register()
+        >>> def levenshtein(col1, col2, normalize=False):
+        ...     return F.levenshtein(col1, col2)
         >>>
-        >>> …
-        >>> …
-        >>> df.…
+        >>> df.withColumn("score", registry.levenshtein(F.col("a"), F.col("b")))
+        >>> configured = registry.levenshtein(normalize=True)
+        >>> df.withColumn("score", configured(F.col("a"), F.col("b")))
 
-    …
-    Please note that you will not use this directly. It will be used in the PrimitiveRegistry class
+    Please note that you will not use this directly. It will be used in the PrimitiveRegistry class.
     """
 
     def __init__(self, func: Callable, name: Optional[str] = None):

@@ -63,25 +66,25 @@ class SmartPrimitive:
         self.name = name or func.__name__
         self.__doc__ = func.__doc__
 
-    def __call__(self, …
+    def __call__(self, *cols, **kwargs):  # type: ignore
         """Apply the transformation or return a configured version.
 
-        …
-        …
-        …
-        …
+        Auto-detects single vs multi-column based on argument count:
+        - 0 args: returns configured function (partial application)
+        - 1 arg: single-column call
+        - 2+ args: multi-column call
 
         Returns:
-            If …
-            If …
+            If columns provided: The transformed Column
+            If no columns: A configured function that takes Column(s)
         """
-        if …
-            return self.func(…
+        if cols:
+            return self.func(*cols, **kwargs)
         else:
 
             @wraps(self.func)
-            def configured(…
-                return self.func(…
+            def configured(*c):  # type: ignore
+                return self.func(*c, **kwargs)
 
             configured.__name__ = (
                 f"{self.name}({', '.join(f'{k}={v}' for k, v in kwargs.items())})"
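The change above replaces the fixed single-column `__call__` signature with `*cols`, so one wrapper now serves both call shapes. A condensed, dependency-free sketch (not the package source; `TinyPrimitive` and `join_cols` are made up for illustration) of the dispatch logic:

```python
from functools import wraps

class TinyPrimitive:
    def __init__(self, func):
        self.func = func
        self.name = func.__name__

    def __call__(self, *cols, **kwargs):
        if cols:  # 1 arg -> single column, 2+ args -> multi column
            return self.func(*cols, **kwargs)

        @wraps(self.func)
        def configured(*c):  # 0 args -> partial application
            return self.func(*c, **kwargs)

        return configured

@TinyPrimitive
def join_cols(*cols, sep="-"):
    return sep.join(cols)

print(join_cols("a", "b"))           # direct multi-column call -> "a-b"
print(join_cols(sep="/")("a", "b"))  # configured, then applied -> "a/b"
```

The design choice is that positional arguments always mean "columns" and keywords always mean "configuration", which is what lets the same registry entry back both `fuzzy.levenshtein(col1, col2)` and `fuzzy.levenshtein_threshold(threshold=0.9)(col1, col2)`.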
datacompose-0.4.0/datacompose/transformers/analytics/__init__.py ADDED

@@ -0,0 +1 @@
+"""Analytics transformers for row-wise operations."""

datacompose-0.4.0/datacompose/transformers/analytics/fuzzy_matching/__init__.py ADDED

@@ -0,0 +1 @@
+"""Fuzzy matching transformer."""
datacompose-0.4.0/datacompose/transformers/analytics/fuzzy_matching/pyspark/pyspark_primitives.py ADDED

@@ -0,0 +1,453 @@
+"""
+Fuzzy matching primitives for PySpark.
+
+Provides string similarity and comparison functions for row-wise operations
+that compare two or more columns.
+
+Preview Output:
+    +----------+----------+------------+----------------+---------------+
+    |name_a    |name_b    |levenshtein |levenshtein_norm|soundex_match  |
+    +----------+----------+------------+----------------+---------------+
+    |john      |jon       |1           |0.75            |true           |
+    |smith     |smyth     |1           |0.80            |true           |
+    |acme corp |acme inc  |4           |0.56            |false          |
+    |robert    |bob       |5           |0.17            |false          |
+    +----------+----------+------------+----------------+---------------+
+
+Usage Example:
+    from pyspark.sql import SparkSession
+    from pyspark.sql import functions as F
+    from transformers.pyspark.fuzzy_matching import fuzzy
+
+    # Initialize Spark
+    spark = SparkSession.builder.appName("FuzzyMatching").getOrCreate()
+
+    # Create sample data
+    data = [
+        ("john", "jon"),
+        ("smith", "smyth"),
+        ("acme corp", "acme inc"),
+    ]
+    df = spark.createDataFrame(data, ["name_a", "name_b"])
+
+    # Compare strings
+    result_df = df.select(
+        F.col("name_a"),
+        F.col("name_b"),
+        fuzzy.levenshtein(F.col("name_a"), F.col("name_b")).alias("distance"),
+        fuzzy.levenshtein_normalized(F.col("name_a"), F.col("name_b")).alias("similarity"),
+        fuzzy.soundex_match(F.col("name_a"), F.col("name_b")).alias("soundex_match")
+    )
+
+    # Filter to similar matches
+    similar = result_df.filter(F.col("similarity") >= 0.8)
+
+Installation:
+    datacompose add fuzzy_matching
+"""
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from pyspark.sql import Column
+    from pyspark.sql import functions as F
+else:
+    try:
+        from pyspark.sql import Column
+        from pyspark.sql import functions as F
+    except ImportError:
+        pass
+
+try:
+    from utils.primitives import PrimitiveRegistry  # type: ignore
+except ImportError:
+    from datacompose.operators.primitives import PrimitiveRegistry
+
+fuzzy = PrimitiveRegistry("fuzzy")
+
+
+# =============================================================================
+# Distance Functions
+# =============================================================================
+
+
+@fuzzy.register()
+def levenshtein(col1: "Column", col2: "Column") -> "Column":
+    """Calculate Levenshtein edit distance between two strings.
+
+    The Levenshtein distance is the minimum number of single-character edits
+    (insertions, deletions, substitutions) required to transform one string
+    into another.
+
+    Args:
+        col1: First string column
+        col2: Second string column
+
+    Returns:
+        Column with integer edit distance (0 = identical)
+
+    Example:
+        >>> df.withColumn("dist", fuzzy.levenshtein(F.col("a"), F.col("b")))
+    """
+    return F.levenshtein(col1, col2)
+
+
+@fuzzy.register()
+def levenshtein_normalized(col1: "Column", col2: "Column") -> "Column":
+    """Calculate normalized Levenshtein similarity (0.0 to 1.0).
+
+    Returns a similarity score where 1.0 means identical strings and
+    0.0 means completely different. Calculated as:
+        1 - (levenshtein_distance / max(len(str1), len(str2)))
+
+    Args:
+        col1: First string column
+        col2: Second string column
+
+    Returns:
+        Column with float similarity score between 0.0 and 1.0
+
+    Example:
+        >>> df.withColumn("sim", fuzzy.levenshtein_normalized(F.col("a"), F.col("b")))
+    """
+    distance = F.levenshtein(col1, col2)
+    max_len = F.greatest(F.length(col1), F.length(col2))
+    return F.when(max_len == 0, F.lit(1.0)).otherwise(F.lit(1.0) - (distance / max_len))
+
+
+@fuzzy.register()
+def levenshtein_threshold(
+    col1: "Column", col2: "Column", threshold: float = 0.8
+) -> "Column":
+    """Check if normalized Levenshtein similarity meets threshold.
+
+    Args:
+        col1: First string column
+        col2: Second string column
+        threshold: Minimum similarity score (default 0.8)
+
+    Returns:
+        Column with boolean indicating if similarity >= threshold
+
+    Example:
+        >>> df.withColumn("is_match", fuzzy.levenshtein_threshold(F.col("a"), F.col("b"), threshold=0.9))
+    """
+    distance = F.levenshtein(col1, col2)
+    max_len = F.greatest(F.length(col1), F.length(col2))
+    similarity = F.when(max_len == 0, F.lit(1.0)).otherwise(
+        F.lit(1.0) - (distance / max_len)
+    )
+    return similarity >= F.lit(threshold)
+
+
+# =============================================================================
+# Phonetic Functions
+# =============================================================================
+
+
+@fuzzy.register()
+def soundex(col: "Column") -> "Column":
+    """Calculate Soundex phonetic encoding of a string.
+
+    Soundex encodes a string into a letter followed by three digits,
+    representing how the word sounds in English.
+
+    Args:
+        col: String column to encode
+
+    Returns:
+        Column with Soundex code (e.g., "Robert" -> "R163")
+
+    Example:
+        >>> df.withColumn("code", fuzzy.soundex(F.col("name")))
+    """
+    return F.soundex(col)
+
+
+@fuzzy.register()
+def soundex_match(col1: "Column", col2: "Column") -> "Column":
+    """Check if two strings have the same Soundex encoding.
+
+    Useful for matching names that sound alike but are spelled differently
+    (e.g., "Smith" and "Smyth").
+
+    Args:
+        col1: First string column
+        col2: Second string column
+
+    Returns:
+        Column with boolean indicating if Soundex codes match
+
+    Example:
+        >>> df.withColumn("sounds_alike", fuzzy.soundex_match(F.col("a"), F.col("b")))
+    """
+    return F.soundex(col1) == F.soundex(col2)
+
+
+# =============================================================================
+# Token-based Functions
+# =============================================================================
+
+
+@fuzzy.register()
+def jaccard_similarity(
+    col1: "Column", col2: "Column", delimiter: str = " "
+) -> "Column":
+    """Calculate Jaccard similarity between tokenized strings.
+
+    Splits both strings into tokens and calculates:
+        |intersection| / |union|
+
+    Useful for comparing multi-word strings where word order doesn't matter.
+
+    Args:
+        col1: First string column
+        col2: Second string column
+        delimiter: Token delimiter (default: space)
+
+    Returns:
+        Column with float similarity score between 0.0 and 1.0
+
+    Example:
+        >>> df.withColumn("sim", fuzzy.jaccard_similarity(F.col("a"), F.col("b")))
+    """
+    tokens1 = F.split(F.lower(col1), delimiter)
+    tokens2 = F.split(F.lower(col2), delimiter)
+    intersection = F.size(F.array_intersect(tokens1, tokens2))
+    union = F.size(F.array_union(tokens1, tokens2))
+    return F.when(union == 0, F.lit(1.0)).otherwise(intersection / union)
+
+
+@fuzzy.register()
+def token_overlap(col1: "Column", col2: "Column", delimiter: str = " ") -> "Column":
+    """Count number of overlapping tokens between two strings.
+
+    Args:
+        col1: First string column
+        col2: Second string column
+        delimiter: Token delimiter (default: space)
+
+    Returns:
+        Column with integer count of shared tokens
+
+    Example:
+        >>> df.withColumn("overlap", fuzzy.token_overlap(F.col("a"), F.col("b")))
+    """
+    tokens1 = F.split(F.lower(col1), delimiter)
+    tokens2 = F.split(F.lower(col2), delimiter)
+    return F.size(F.array_intersect(tokens1, tokens2))
+
+
+# =============================================================================
+# Utility Functions
+# =============================================================================
+
+
+@fuzzy.register()
+def exact_match(col1: "Column", col2: "Column", ignore_case: bool = True) -> "Column":
+    """Check if two strings match exactly.
+
+    Args:
+        col1: First string column
+        col2: Second string column
+        ignore_case: If True, comparison is case-insensitive (default: True)
+
+    Returns:
+        Column with boolean indicating exact match
+
+    Example:
+        >>> df.withColumn("match", fuzzy.exact_match(F.col("a"), F.col("b")))
+    """
+    if ignore_case:
+        return F.lower(col1) == F.lower(col2)
+    return col1 == col2
+
+
+@fuzzy.register()
+def contains_match(
+    col1: "Column", col2: "Column", ignore_case: bool = True
+) -> "Column":
+    """Check if one string contains the other.
+
+    Returns True if col1 contains col2 OR col2 contains col1.
+
+    Args:
+        col1: First string column
+        col2: Second string column
+        ignore_case: If True, comparison is case-insensitive (default: True)
+
+    Returns:
+        Column with boolean indicating containment
+
+    Example:
+        >>> df.withColumn("contains", fuzzy.contains_match(F.col("a"), F.col("b")))
+    """
+    if ignore_case:
+        c1, c2 = F.lower(col1), F.lower(col2)
+    else:
+        c1, c2 = col1, col2
+    return F.contains(c1, c2) | F.contains(c2, c1)
+
+
+@fuzzy.register()
+def prefix_match(col1: "Column", col2: "Column", length: int = 3) -> "Column":
+    """Check if two strings share the same prefix.
+
+    Args:
+        col1: First string column
+        col2: Second string column
+        length: Number of characters to compare (default: 3)
+
+    Returns:
+        Column with boolean indicating prefix match
+
+    Example:
+        >>> df.withColumn("same_prefix", fuzzy.prefix_match(F.col("a"), F.col("b"), length=4))
+    """
+    return F.left(F.lower(col1), F.lit(length)) == F.left(F.lower(col2), F.lit(length))
+
+
+# =============================================================================
+# N-gram Functions
+# =============================================================================
+
+
+@fuzzy.register()
+def ngram_similarity(col1: "Column", col2: "Column", n: int = 2) -> "Column":
+    """Calculate n-gram (character-level) similarity between two strings.
+
+    Breaks strings into overlapping character sequences of length n,
+    then calculates Jaccard similarity on the n-gram sets.
+
+    Good for catching typos and character-level variations.
+
+    Args:
+        col1: First string column
+        col2: Second string column
+        n: Size of n-grams (default: 2 for bigrams)
+
+    Returns:
+        Column with float similarity score between 0.0 and 1.0
+
+    Example:
+        >>> df.withColumn("sim", fuzzy.ngram_similarity(F.col("a"), F.col("b"), n=2))
+    """
+
+    # Generate n-grams using transform to create array of substrings
+    # For a string of length L, we get L-n+1 n-grams
+    def make_ngrams(col: "Column", n: int) -> "Column":
+        # Pad the string to handle short strings
+        padded = F.lower(col)
+        length = F.length(padded)
+        # Generate indices from 0 to length-n
+        indices = F.sequence(F.lit(0), F.greatest(length - F.lit(n), F.lit(0)))
+        # Extract substring at each index
+        return F.transform(indices, lambda i: F.substring(padded, i + 1, n))
+
+    ngrams1 = make_ngrams(col1, n)
+    ngrams2 = make_ngrams(col2, n)
+
+    intersection = F.size(F.array_intersect(ngrams1, ngrams2))
+    union = F.size(F.array_union(ngrams1, ngrams2))
+
+    return F.when(union == 0, F.lit(1.0)).otherwise(intersection / union)
+
+
+@fuzzy.register()
+def ngram_distance(col1: "Column", col2: "Column", n: int = 2) -> "Column":
+    """Calculate n-gram distance (1 - similarity) between two strings.
+
+    Args:
+        col1: First string column
+        col2: Second string column
+        n: Size of n-grams (default: 2 for bigrams)
+
+    Returns:
+        Column with float distance between 0.0 and 1.0
+
+    Example:
+        >>> df.withColumn("dist", fuzzy.ngram_distance(F.col("a"), F.col("b")))
+    """
+
+    def make_ngrams(col: "Column", n: int) -> "Column":
+        padded = F.lower(col)
+        length = F.length(padded)
+        indices = F.sequence(F.lit(0), F.greatest(length - F.lit(n), F.lit(0)))
+        return F.transform(indices, lambda i: F.substring(padded, i + 1, n))
+
+    ngrams1 = make_ngrams(col1, n)
+    ngrams2 = make_ngrams(col2, n)
+
+    intersection = F.size(F.array_intersect(ngrams1, ngrams2))
+    union = F.size(F.array_union(ngrams1, ngrams2))
+
+    similarity = F.when(union == 0, F.lit(1.0)).otherwise(intersection / union)
+    return F.lit(1.0) - similarity
+
+
+# =============================================================================
+# Cosine Similarity
+# =============================================================================
+
+
+@fuzzy.register()
+def cosine_similarity(col1: "Column", col2: "Column", delimiter: str = " ") -> "Column":
+    """Calculate cosine similarity between tokenized strings.
+
+    Treats each string as a bag of words and computes cosine similarity
+    based on term frequency. Good for comparing longer text.
+
+    Args:
+        col1: First string column
+        col2: Second string column
+        delimiter: Token delimiter (default: space)
+
+    Returns:
+        Column with float similarity score between 0.0 and 1.0
+
+    Example:
+        >>> df.withColumn("sim", fuzzy.cosine_similarity(F.col("a"), F.col("b")))
+    """
+    # Tokenize
+    tokens1 = F.split(F.lower(col1), delimiter)
+    tokens2 = F.split(F.lower(col2), delimiter)
+
+    # Get all unique tokens
+    all_tokens = F.array_union(tokens1, tokens2)
+
+    # Calculate dot product and magnitudes
+    # dot_product = sum(tf1[t] * tf2[t] for t in all_tokens)
+    # magnitude1 = sqrt(sum(tf1[t]^2 for t in all_tokens))
+    # magnitude2 = sqrt(sum(tf2[t]^2 for t in all_tokens))
+
+    dot_product = F.aggregate(
+        all_tokens,
+        F.lit(0.0),
+        lambda acc, token: acc
+        + (
+            F.size(F.filter(tokens1, lambda t: t == token)).cast("double")
+            * F.size(F.filter(tokens2, lambda t: t == token)).cast("double")
+        ),
+    )
+
+    magnitude1 = F.sqrt(
+        F.aggregate(
+            all_tokens,
+            F.lit(0.0),
+            lambda acc, token: acc
+            + F.pow(F.size(F.filter(tokens1, lambda t: t == token)).cast("double"), 2),
+        )
+    )
+
+    magnitude2 = F.sqrt(
+        F.aggregate(
+            all_tokens,
+            F.lit(0.0),
+            lambda acc, token: acc
+            + F.pow(F.size(F.filter(tokens2, lambda t: t == token)).cast("double"), 2),
+        )
+    )
+
+    denominator = magnitude1 * magnitude2
+
+    return F.when(denominator == 0, F.lit(0.0)).otherwise(dot_product / denominator)
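Since `array_intersect` and `array_union` return distinct elements, both `jaccard_similarity` and `ngram_similarity` above are effectively set-based Jaccard scores. A quick pure-Python cross-check of that arithmetic (illustrative only, not part of the package):

```python
def jaccard(a: str, b: str, delimiter: str = " ") -> float:
    # Mirrors the array_intersect/array_union logic with Python sets.
    t1, t2 = set(a.lower().split(delimiter)), set(b.lower().split(delimiter))
    union = t1 | t2
    return 1.0 if not union else len(t1 & t2) / len(union)

def ngrams(s: str, n: int = 2) -> set:
    s = s.lower()
    return {s[i:i + n] for i in range(max(len(s) - n + 1, 1))}

def ngram_sim(a: str, b: str, n: int = 2) -> float:
    g1, g2 = ngrams(a, n), ngrams(b, n)
    union = g1 | g2
    return 1.0 if not union else len(g1 & g2) / len(union)

# {"acme", "corp"} vs {"acme", "inc"}: intersection 1, union 3 -> 0.33
print(jaccard("acme corp", "acme inc"))
# "smith"/"smyth" share bigrams {"sm", "th"} out of a union of 6 -> 0.33
print(ngram_sim("smith", "smyth"))
```

Note that `cosine_similarity` nests an `F.filter` scan over each token list inside an `F.aggregate` over the union, so its per-row cost grows roughly quadratically with token count; it is aimed at the "longer text" case its docstring describes, not at very long documents.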