datacompose 0.2.9__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. {datacompose-0.2.9 → datacompose-0.4.0}/CHANGELOG.md +37 -0
  2. {datacompose-0.2.9/datacompose.egg-info → datacompose-0.4.0}/PKG-INFO +6 -6
  3. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/operators/primitives.py +30 -27
  4. datacompose-0.4.0/datacompose/transformers/analytics/__init__.py +1 -0
  5. datacompose-0.4.0/datacompose/transformers/analytics/fuzzy_matching/__init__.py +1 -0
  6. datacompose-0.4.0/datacompose/transformers/analytics/fuzzy_matching/pyspark/__init__.py +2 -0
  7. datacompose-0.4.0/datacompose/transformers/analytics/fuzzy_matching/pyspark/pyspark_primitives.py +453 -0
  8. datacompose-0.4.0/datacompose/transformers/text/text/__init__.py +5 -0
  9. datacompose-0.4.0/datacompose/transformers/text/text/pyspark/__init__.py +5 -0
  10. datacompose-0.4.0/datacompose/transformers/text/text/pyspark/pyspark_primitives.py +1449 -0
  11. {datacompose-0.2.9 → datacompose-0.4.0/datacompose.egg-info}/PKG-INFO +6 -6
  12. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose.egg-info/SOURCES.txt +12 -0
  13. {datacompose-0.2.9 → datacompose-0.4.0}/pyproject.toml +6 -6
  14. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/operators/test_primitives_complete.py +75 -4
  15. datacompose-0.4.0/tests/unit/transformers/analytics/test_fuzzy_matching.py +417 -0
  16. datacompose-0.4.0/tests/unit/transformers/text/test_text/__init__.py +1 -0
  17. datacompose-0.4.0/tests/unit/transformers/text/test_text/test_string_cleaning.py +533 -0
  18. datacompose-0.4.0/tests/unit/transformers/text/test_text/test_string_transformation.py +546 -0
  19. datacompose-0.4.0/tests/unit/transformers/text/test_text/test_string_validation.py +439 -0
  20. {datacompose-0.2.9 → datacompose-0.4.0}/LICENSE +0 -0
  21. {datacompose-0.2.9 → datacompose-0.4.0}/MANIFEST.in +0 -0
  22. {datacompose-0.2.9 → datacompose-0.4.0}/README.md +0 -0
  23. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/__init__.py +0 -0
  24. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/cli/__init__.py +0 -0
  25. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/cli/colors.py +0 -0
  26. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/cli/commands/__init__.py +0 -0
  27. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/cli/commands/add.py +0 -0
  28. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/cli/commands/init.py +0 -0
  29. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/cli/commands/list.py +0 -0
  30. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/cli/config.py +0 -0
  31. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/cli/main.py +0 -0
  32. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/cli/validation.py +0 -0
  33. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/generators/__init__.py +0 -0
  34. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/generators/base.py +0 -0
  35. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/generators/pyspark/__init__.py +0 -0
  36. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/generators/pyspark/generator.py +0 -0
  37. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/operators/__init__.py +0 -0
  38. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/transformers/__init__.py +0 -0
  39. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/transformers/discovery.py +0 -0
  40. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/transformers/text/__init__.py +0 -0
  41. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/transformers/text/addresses/__init__.py +0 -0
  42. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/transformers/text/addresses/pyspark/pyspark_primitives.py +0 -0
  43. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/transformers/text/datetimes/pyspark/pyspark_primitives.py +0 -0
  44. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/transformers/text/emails/__init__.py +0 -0
  45. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/transformers/text/emails/pyspark/pyspark_primitives.py +0 -0
  46. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/transformers/text/phone_numbers/__init__.py +0 -0
  47. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose/transformers/text/phone_numbers/pyspark/pyspark_primitives.py +0 -0
  48. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose.egg-info/dependency_links.txt +0 -0
  49. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose.egg-info/entry_points.txt +0 -0
  50. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose.egg-info/requires.txt +0 -0
  51. {datacompose-0.2.9 → datacompose-0.4.0}/datacompose.egg-info/top_level.txt +0 -0
  52. {datacompose-0.2.9 → datacompose-0.4.0}/setup.cfg +0 -0
  53. {datacompose-0.2.9 → datacompose-0.4.0}/tests/__init__.py +0 -0
  54. {datacompose-0.2.9 → datacompose-0.4.0}/tests/conftest.py +0 -0
  55. {datacompose-0.2.9 → datacompose-0.4.0}/tests/integration/__init__.py +0 -0
  56. {datacompose-0.2.9 → datacompose-0.4.0}/tests/integration/test_end_to_end.py +0 -0
  57. {datacompose-0.2.9 → datacompose-0.4.0}/tests/integration/test_full_workflow.py +0 -0
  58. {datacompose-0.2.9 → datacompose-0.4.0}/tests/integration/test_generated_imports.py +0 -0
  59. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/.venv/bin/activate_this.py +0 -0
  60. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/.venv/lib/python3.12/site-packages/_virtualenv.py +0 -0
  61. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/__init__.py +0 -0
  62. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/build/__init__.py +0 -0
  63. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/build/postgres/__init__.py +0 -0
  64. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/build/postgres/clean_emails/__init__.py +0 -0
  65. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/build/postgres/clean_emails/email_cleaner_udf_spec.yaml +0 -0
  66. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/build/postgres/clean_emails/test_email_cleaner_udf.py +0 -0
  67. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/build/spark/__init__.py +0 -0
  68. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/build/spark/clean_emails/__init__.py +0 -0
  69. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/build/spark/clean_emails/email_cleaner_udf.py +0 -0
  70. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/build/spark/clean_emails/email_cleaner_udf_spec.yaml +0 -0
  71. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/build/spark/clean_emails/test_email_cleaner_udf.py +0 -0
  72. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_add_command.py +0 -0
  73. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_add_command_complete.py +0 -0
  74. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_add_default_target.py +0 -0
  75. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_add_validation.py +0 -0
  76. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_config.py +0 -0
  77. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_init_command.py +0 -0
  78. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_init_command_complete.py +0 -0
  79. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_list_command.py +0 -0
  80. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_main.py +0 -0
  81. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_main_complete.py +0 -0
  82. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/cli/test_validation_complete.py +0 -0
  83. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/generators/__init__.py +0 -0
  84. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/generators/test_base_generator.py +0 -0
  85. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/generators/test_spark_generator.py +0 -0
  86. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/operators/test_compose_conditions.py +0 -0
  87. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/operators/test_conditional_auto_detection.py +0 -0
  88. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/operators/test_conditional_core.py +0 -0
  89. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/operators/test_conditional_real_world.py +0 -0
  90. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/operators/test_operators.py +0 -0
  91. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/__init__.py +0 -0
  92. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/test_discovery.py +0 -0
  93. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/common/test_common.py +0 -0
  94. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_addresses/test_building_unit_extraction.py +0 -0
  95. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_addresses/test_city_state_extraction.py +0 -0
  96. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_addresses/test_clean_addresses.py +0 -0
  97. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_addresses/test_country_extraction.py +0 -0
  98. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_addresses/test_data_addresses.py +0 -0
  99. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_addresses/test_po_box_extraction.py +0 -0
  100. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_addresses/test_street_extraction.py +0 -0
  101. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_addresses/test_zip_code_extraction.py +0 -0
  102. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_datetimes/test_datetime_data_quality.py +0 -0
  103. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_datetimes/test_datetime_extraction.py +0 -0
  104. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_datetimes/test_datetime_integration.py +0 -0
  105. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_datetimes/test_datetime_performance.py +0 -0
  106. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_datetimes/test_datetime_regression.py +0 -0
  107. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_datetimes/test_datetime_timezones.py +0 -0
  108. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_emails/test_debug_long_emails.py +0 -0
  109. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_emails/test_email_extraction.py +0 -0
  110. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_emails/test_email_optimized.py +0 -0
  111. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_phone_numbers/test_phone_extraction.py +0 -0
  112. {datacompose-0.2.9 → datacompose-0.4.0}/tests/unit/transformers/text/test_phone_numbers/test_phone_formatting.py +0 -0
  113. {datacompose-0.2.9 → datacompose-0.4.0}/tests/yaml_specs/__init__.py +0 -0
@@ -7,6 +7,43 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.4.0] - 2026-02-02
11
+
12
+ ### Added
13
+ - **Fuzzy Matching Primitives**: New analytics module for string similarity and comparison
14
+ - **Distance functions**: `levenshtein`, `levenshtein_normalized`, `levenshtein_threshold`
15
+ - **Phonetic functions**: `soundex`, `soundex_match`
16
+ - **Token-based functions**: `jaccard_similarity`, `token_overlap`
17
+ - **N-gram functions**: `ngram_similarity`, `ngram_distance`
18
+ - **Utility functions**: `exact_match`, `contains_match`, `prefix_match`
19
+ - **Advanced**: `cosine_similarity` for term-frequency based comparison
20
+ - Multi-column support for row-wise comparisons
21
+ - All functions use native PySpark SQL functions (no UDFs) for optimal performance
22
+
23
+ ### Changed
24
+ - **Primitives Module**: Updated to handle multi-column operations
25
+
26
+ ## [0.3.0] - 2026-01-01
27
+
28
+ ### Added
29
+ - **Text Transformation Primitives**: New comprehensive text manipulation module (`text`) with 57 functions
30
+ - **Validation functions (14)**: `is_valid_hex`, `is_valid_base64`, `is_valid_url_encoded`, `has_control_characters`, `has_zero_width_characters`, `has_non_ascii`, `has_escape_sequences`, `has_url_encoding`, `has_html_entities`, `has_ansi_codes`, `has_non_printable`, `has_accents`, `has_unicode_issues`, `has_whitespace_issues`
31
+ - **Transformation functions (22)**: `hex_to_text`, `text_to_hex`, `clean_hex`, `extract_hex`, `decode_base64`, `encode_base64`, `clean_base64`, `extract_base64`, `decode_url`, `encode_url`, `decode_html_entities`, `encode_html_entities`, `unescape_string`, `escape_string`, `normalize_line_endings`, `to_ascii`, `to_codepoints`, `from_codepoints`, `reverse_string`, `truncate`, `pad_left`, `pad_right`
32
+ - **Cleaning functions (21)**: `remove_control_characters`, `remove_zero_width_characters`, `remove_non_printable`, `remove_ansi_codes`, `strip_invisible`, `remove_bom`, `normalize_unicode`, `remove_accents`, `normalize_whitespace`, `remove_html_tags`, `remove_urls`, `remove_emojis`, `remove_punctuation`, `remove_digits`, `remove_letters`, `remove_escape_sequences`, `strip_to_alphanumeric`, `clean_for_comparison`, `slugify`, `collapse_repeats`, `clean_string`
33
+ - All functions use native PySpark SQL functions (no UDFs) for optimal performance
34
+ - Comprehensive null and empty string handling
35
+ - 508 unit tests with full coverage
36
+
37
+ ### Fixed
38
+ - **Text Primitives**: Various fixes to text transformation functions
39
+ - `decode_url`: Fixed %2B decoding to properly preserve literal plus signs vs form-encoded spaces
40
+ - `extract_hex`: Improved pattern to require `0x`/`#` prefix or MAC address format, avoiding false matches
41
+ - `extract_base64`: Improved pattern to require `=` padding or `base64,` prefix for reliable extraction
42
+ - `unescape_string`: Fixed backslash escape handling with placeholder approach
43
+ - `collapse_repeats`: Added working implementation for `max_repeat=2`
44
+ - `has_unicode_issues`: Added combining character detection (U+0300-U+036F range)
45
+ - `clean_string`: Fixed ANSI code removal order (must run before control char removal)
46
+
10
47
  ## [0.2.7.0] - 2025-09-11
11
48
 
12
49
  ### Fixed
@@ -1,15 +1,15 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datacompose
3
- Version: 0.2.9
3
+ Version: 0.4.0
4
4
  Summary: Copy-pasteable data transformation primitives for PySpark. Inspired by shadcn-svelte.
5
5
  Author: Datacompose Contributors
6
6
  Maintainer: Datacompose Contributors
7
7
  License: MIT
8
- Project-URL: Homepage, https://github.com/tc-cole/datacompose
9
- Project-URL: Documentation, https://github.com/tc-cole/datacompose/tree/main/docs
10
- Project-URL: Repository, https://github.com/tc-cole/datacompose.git
11
- Project-URL: Issues, https://github.com/tc-cole/datacompose/issues
12
- Project-URL: Changelog, https://github.com/tc-cole/datacompose/blob/main/CHANGELOG.md
8
+ Project-URL: Homepage, https://github.com/datacompose/datacompose
9
+ Project-URL: Documentation, https://datacompose.io
10
+ Project-URL: Repository, https://github.com/datacompose/datacompose.git
11
+ Project-URL: Issues, https://github.com/datacompose/datacompose/issues
12
+ Project-URL: Changelog, https://github.com/datacompose/datacompose/blob/main/CHANGELOG.md
13
13
  Keywords: data-cleaning,data-quality,udf,spark,postgres,code-generation,data-pipeline,etl
14
14
  Classifier: Development Status :: 4 - Beta
15
15
  Classifier: Intended Audience :: Developers
@@ -28,28 +28,31 @@ logger = logging.getLogger(__name__)
28
28
  class SmartPrimitive:
29
29
  """Wraps a PySpark column transformation function to enable partial application.
30
30
 
31
- SmartPrimitive allows column transformation functions to be:
32
- 1. Called directly with a column: `primitive(col)`
33
- 2. Pre-configured with parameters: `primitive(param=value)` returns a configured function
31
+ SmartPrimitive handles both single-column and multi-column transformations:
32
+ 1. Single column: `primitive(col)` or `primitive(param=value)(col)`
33
+ 2. Multi column: `primitive(col1, col2)` or `primitive(param=value)(col1, col2)`
34
34
 
35
- This enables building reusable, parameterized transformations that can be composed
36
- into data pipelines.
35
+ The behavior is auto-detected based on the number of arguments passed.
37
36
 
38
- Example:
37
+ Example (single column):
38
+ >>> @registry.register()
39
39
  >>> def trim_spaces(col, chars=' '):
40
- ... return f.trim(col, chars)
41
- >>>
42
- >>> trim = SmartPrimitive(trim_spaces)
40
+ ... return F.trim(col, chars)
43
41
  >>>
44
- >>> # Direct usage
45
- >>> df.select(trim(f.col("text")))
42
+ >>> df.select(registry.trim_spaces(F.col("text")))
43
+ >>> configured = registry.trim_spaces(chars='\\t')
44
+ >>> df.select(configured(F.col("text")))
45
+
46
+ Example (multi column):
47
+ >>> @registry.register()
48
+ >>> def levenshtein(col1, col2, normalize=False):
49
+ ... return F.levenshtein(col1, col2)
46
50
  >>>
47
- >>> # Pre-configured usage
48
- >>> trim_tabs = trim(chars='\t')
49
- >>> df.select(trim_tabs(f.col("text")))
51
+ >>> df.withColumn("score", registry.levenshtein(F.col("a"), F.col("b")))
52
+ >>> configured = registry.levenshtein(normalize=True)
53
+ >>> df.withColumn("score", configured(F.col("a"), F.col("b")))
50
54
 
51
-
52
- Please note that you will not use this directly. It will be used in the PrimitiveRegistry class
55
+ Please note that you will not use this directly. It will be used in the PrimitiveRegistry class.
53
56
  """
54
57
 
55
58
  def __init__(self, func: Callable, name: Optional[str] = None):
@@ -63,25 +66,25 @@ class SmartPrimitive:
63
66
  self.name = name or func.__name__
64
67
  self.__doc__ = func.__doc__
65
68
 
66
- def __call__(self, col: Optional[Column] = None, **kwargs): # type: ignore
69
+ def __call__(self, *cols, **kwargs): # type: ignore
67
70
  """Apply the transformation or return a configured version.
68
71
 
69
- Args:
70
- col: Optional PySpark Column to transform. If provided, applies the
71
- transformation immediately. If None, returns a configured function.
72
- **kwargs: Parameters to pass to the transformation function
72
+ Auto-detects single vs multi-column based on argument count:
73
+ - 0 args: returns configured function (partial application)
74
+ - 1 arg: single-column call
75
+ - 2+ args: multi-column call
73
76
 
74
77
  Returns:
75
- If col is provided: The transformed Column
76
- If col is None: A configured function that takes a Column
78
+ If columns provided: The transformed Column
79
+ If no columns: A configured function that takes Column(s)
77
80
  """
78
- if col is not None: # type: ignore
79
- return self.func(col, **kwargs) # type: ignore
81
+ if cols:
82
+ return self.func(*cols, **kwargs)
80
83
  else:
81
84
 
82
85
  @wraps(self.func)
83
- def configured(col: Column): # type: ignore
84
- return self.func(col, **kwargs) # type: ignore
86
+ def configured(*c): # type: ignore
87
+ return self.func(*c, **kwargs)
85
88
 
86
89
  configured.__name__ = (
87
90
  f"{self.name}({', '.join(f'{k}={v}' for k, v in kwargs.items())})"
@@ -0,0 +1 @@
1
+ """Analytics transformers for row-wise operations."""
@@ -0,0 +1 @@
1
+ """Fuzzy matching transformer."""
@@ -0,0 +1,2 @@
1
+ """PySpark fuzzy matching primitives."""
2
+ from .pyspark_primitives import fuzzy
@@ -0,0 +1,453 @@
1
+ """
2
+ Fuzzy matching primitives for PySpark.
3
+
4
+ Provides string similarity and comparison functions for row-wise operations
5
+ that compare two or more columns.
6
+
7
+ Preview Output:
8
+ +----------+----------+------------+----------------+---------------+
9
+ |name_a |name_b |levenshtein |levenshtein_norm|soundex_match |
10
+ +----------+----------+------------+----------------+---------------+
11
+ |john |jon |1 |0.75 |true |
12
+ |smith |smyth |1 |0.80 |true |
13
+ |acme corp |acme inc |4 |0.56 |false |
14
+ |robert |bob |5 |0.17 |false |
15
+ +----------+----------+------------+----------------+---------------+
16
+
17
+ Usage Example:
18
+ from pyspark.sql import SparkSession
19
+ from pyspark.sql import functions as F
20
+ from transformers.pyspark.fuzzy_matching import fuzzy
21
+
22
+ # Initialize Spark
23
+ spark = SparkSession.builder.appName("FuzzyMatching").getOrCreate()
24
+
25
+ # Create sample data
26
+ data = [
27
+ ("john", "jon"),
28
+ ("smith", "smyth"),
29
+ ("acme corp", "acme inc"),
30
+ ]
31
+ df = spark.createDataFrame(data, ["name_a", "name_b"])
32
+
33
+ # Compare strings
34
+ result_df = df.select(
35
+ F.col("name_a"),
36
+ F.col("name_b"),
37
+ fuzzy.levenshtein(F.col("name_a"), F.col("name_b")).alias("distance"),
38
+ fuzzy.levenshtein_normalized(F.col("name_a"), F.col("name_b")).alias("similarity"),
39
+ fuzzy.soundex_match(F.col("name_a"), F.col("name_b")).alias("soundex_match")
40
+ )
41
+
42
+ # Filter to similar matches
43
+ similar = result_df.filter(F.col("similarity") >= 0.8)
44
+
45
+ Installation:
46
+ datacompose add fuzzy_matching
47
+ """
48
+
49
+ from typing import TYPE_CHECKING
50
+
51
+ if TYPE_CHECKING:
52
+ from pyspark.sql import Column
53
+ from pyspark.sql import functions as F
54
+ else:
55
+ try:
56
+ from pyspark.sql import Column
57
+ from pyspark.sql import functions as F
58
+ except ImportError:
59
+ pass
60
+
61
+ try:
62
+ from utils.primitives import PrimitiveRegistry # type: ignore
63
+ except ImportError:
64
+ from datacompose.operators.primitives import PrimitiveRegistry
65
+
66
+ fuzzy = PrimitiveRegistry("fuzzy")
67
+
68
+
69
+ # =============================================================================
70
+ # Distance Functions
71
+ # =============================================================================
72
+
73
+
74
@fuzzy.register()
def levenshtein(col1: "Column", col2: "Column") -> "Column":
    """Return the Levenshtein edit distance between two string columns.

    The distance is the minimum number of single-character edits
    (insertions, deletions, substitutions) that turn one string into the
    other; 0 means the strings are identical.

    Args:
        col1: First string column
        col2: Second string column

    Returns:
        Column with integer edit distance (0 = identical)

    Example:
        >>> df.withColumn("dist", fuzzy.levenshtein(F.col("a"), F.col("b")))
    """
    # Delegate to Spark's built-in implementation — no UDF involved.
    edit_distance = F.levenshtein(col1, col2)
    return edit_distance
93
+
94
+
95
@fuzzy.register()
def levenshtein_normalized(col1: "Column", col2: "Column") -> "Column":
    """Return a normalized Levenshtein similarity in [0.0, 1.0].

    Computed as ``1 - distance / max(len(s1), len(s2))``, so 1.0 means the
    strings are identical and 0.0 means they share no characters in common
    positions. Two empty strings are treated as identical (1.0).

    Args:
        col1: First string column
        col2: Second string column

    Returns:
        Column with float similarity score between 0.0 and 1.0

    Example:
        >>> df.withColumn("sim", fuzzy.levenshtein_normalized(F.col("a"), F.col("b")))
    """
    longest = F.greatest(F.length(col1), F.length(col2))
    score = F.lit(1.0) - F.levenshtein(col1, col2) / longest
    # Guard the empty/empty case, where the max length is 0 (avoid dividing by zero).
    return F.when(longest == 0, F.lit(1.0)).otherwise(score)
116
+
117
+
118
@fuzzy.register()
def levenshtein_threshold(
    col1: "Column", col2: "Column", threshold: float = 0.8
) -> "Column":
    """Flag pairs whose normalized Levenshtein similarity meets a threshold.

    The similarity is ``1 - distance / max(len(s1), len(s2))``, with two
    empty strings counted as identical (1.0).

    Args:
        col1: First string column
        col2: Second string column
        threshold: Minimum similarity score (default 0.8)

    Returns:
        Column with boolean indicating if similarity >= threshold

    Example:
        >>> df.withColumn("is_match", fuzzy.levenshtein_threshold(F.col("a"), F.col("b"), threshold=0.9))
    """
    longest = F.greatest(F.length(col1), F.length(col2))
    # Same formula as levenshtein_normalized, inlined here.
    normalized = F.when(longest == 0, F.lit(1.0)).otherwise(
        F.lit(1.0) - F.levenshtein(col1, col2) / longest
    )
    return normalized >= F.lit(threshold)
141
+
142
+
143
+ # =============================================================================
144
+ # Phonetic Functions
145
+ # =============================================================================
146
+
147
+
148
@fuzzy.register()
def soundex(col: "Column") -> "Column":
    """Encode a string column with the Soundex phonetic algorithm.

    Soundex maps a word to a letter followed by three digits that describe
    how it sounds in English, so similar-sounding words share a code.

    Args:
        col: String column to encode

    Returns:
        Column with Soundex code (e.g., "Robert" -> "R163")

    Example:
        >>> df.withColumn("code", fuzzy.soundex(F.col("name")))
    """
    # Spark ships a native soundex implementation; just expose it.
    return F.soundex(col)
165
+
166
+
167
@fuzzy.register()
def soundex_match(col1: "Column", col2: "Column") -> "Column":
    """Return whether two strings share the same Soundex encoding.

    Handy for matching names that sound alike but are spelled differently
    (e.g., "Smith" and "Smyth").

    Args:
        col1: First string column
        col2: Second string column

    Returns:
        Column with boolean indicating if Soundex codes match

    Example:
        >>> df.withColumn("sounds_alike", fuzzy.soundex_match(F.col("a"), F.col("b")))
    """
    code1 = F.soundex(col1)
    code2 = F.soundex(col2)
    return code1 == code2
185
+
186
+
187
+ # =============================================================================
188
+ # Token-based Functions
189
+ # =============================================================================
190
+
191
+
192
@fuzzy.register()
def jaccard_similarity(
    col1: "Column", col2: "Column", delimiter: str = " "
) -> "Column":
    """Jaccard similarity between the token sets of two strings.

    Lowercases both strings, splits them on ``delimiter``, and returns
    ``|intersection| / |union|`` of the resulting token sets. Word order
    does not matter, which suits multi-word strings.

    Args:
        col1: First string column
        col2: Second string column
        delimiter: Token delimiter, interpreted as a regex pattern by
            ``F.split`` (default: space)

    Returns:
        Column with float similarity score between 0.0 and 1.0

    Example:
        >>> df.withColumn("sim", fuzzy.jaccard_similarity(F.col("a"), F.col("b")))
    """
    words1 = F.split(F.lower(col1), delimiter)
    words2 = F.split(F.lower(col2), delimiter)
    shared = F.size(F.array_intersect(words1, words2))
    combined = F.size(F.array_union(words1, words2))
    # Empty union means nothing to compare; define that as a perfect match.
    return F.when(combined == 0, F.lit(1.0)).otherwise(shared / combined)
219
+
220
+
221
@fuzzy.register()
def token_overlap(col1: "Column", col2: "Column", delimiter: str = " ") -> "Column":
    """Count the distinct tokens shared by two strings.

    Both strings are lowercased and split on ``delimiter`` before the
    intersection is taken.

    Args:
        col1: First string column
        col2: Second string column
        delimiter: Token delimiter, interpreted as a regex pattern by
            ``F.split`` (default: space)

    Returns:
        Column with integer count of shared tokens

    Example:
        >>> df.withColumn("overlap", fuzzy.token_overlap(F.col("a"), F.col("b")))
    """
    words1 = F.split(F.lower(col1), delimiter)
    words2 = F.split(F.lower(col2), delimiter)
    return F.size(F.array_intersect(words1, words2))
239
+
240
+
241
+ # =============================================================================
242
+ # Utility Functions
243
+ # =============================================================================
244
+
245
+
246
@fuzzy.register()
def exact_match(col1: "Column", col2: "Column", ignore_case: bool = True) -> "Column":
    """Return whether two string columns are exactly equal.

    Args:
        col1: First string column
        col2: Second string column
        ignore_case: If True, comparison is case-insensitive (default: True)

    Returns:
        Column with boolean indicating exact match

    Example:
        >>> df.withColumn("match", fuzzy.exact_match(F.col("a"), F.col("b")))
    """
    # Case-sensitive path: plain column equality.
    if not ignore_case:
        return col1 == col2
    return F.lower(col1) == F.lower(col2)
264
+
265
+
266
@fuzzy.register()
def contains_match(
    col1: "Column", col2: "Column", ignore_case: bool = True
) -> "Column":
    """Return whether either string contains the other.

    True if col1 contains col2 OR col2 contains col1.

    Args:
        col1: First string column
        col2: Second string column
        ignore_case: If True, comparison is case-insensitive (default: True)

    Returns:
        Column with boolean indicating containment

    Example:
        >>> df.withColumn("contains", fuzzy.contains_match(F.col("a"), F.col("b")))
    """
    left = F.lower(col1) if ignore_case else col1
    right = F.lower(col2) if ignore_case else col2
    # Column.contains is equivalent to F.contains(left, right).
    return left.contains(right) | right.contains(left)
290
+
291
+
292
@fuzzy.register()
def prefix_match(col1: "Column", col2: "Column", length: int = 3) -> "Column":
    """Return whether two strings share the same case-insensitive prefix.

    Args:
        col1: First string column
        col2: Second string column
        length: Number of characters to compare (default: 3)

    Returns:
        Column with boolean indicating prefix match

    Example:
        >>> df.withColumn("same_prefix", fuzzy.prefix_match(F.col("a"), F.col("b"), length=4))
    """
    head1 = F.left(F.lower(col1), F.lit(length))
    head2 = F.left(F.lower(col2), F.lit(length))
    return head1 == head2
308
+
309
+
310
+ # =============================================================================
311
+ # N-gram Functions
312
+ # =============================================================================
313
+
314
+
315
@fuzzy.register()
def ngram_similarity(col1: "Column", col2: "Column", n: int = 2) -> "Column":
    """Calculate n-gram (character-level) similarity between two strings.

    Breaks each lowercased string into overlapping character sequences of
    length ``n`` and returns the Jaccard similarity of the two n-gram sets:
    ``|intersection| / |union|``. Good for catching typos and other
    character-level variations.

    Strings shorter than ``n`` contribute a single (shorter) gram — the
    whole string — so two identical short strings still score 1.0.

    Args:
        col1: First string column
        col2: Second string column
        n: Size of n-grams (default: 2 for bigrams)

    Returns:
        Column with float similarity score between 0.0 and 1.0

    Example:
        >>> df.withColumn("sim", fuzzy.ngram_similarity(F.col("a"), F.col("b"), n=2))
    """

    def make_ngrams(col: "Column", size: int) -> "Column":
        # Lowercase once so the comparison is case-insensitive.
        lowered = F.lower(col)
        # Start offsets 0 .. len-size, clamped to 0 so short/empty strings
        # still yield exactly one gram instead of an empty sequence.
        starts = F.sequence(
            F.lit(0), F.greatest(F.length(lowered) - F.lit(size), F.lit(0))
        )
        # Use Column.substr, which accepts Column-typed position/length on
        # all PySpark versions; F.substring only accepts Column arguments
        # on Spark >= 3.5, so the previous F.substring(padded, i + 1, n)
        # form failed on older clusters.
        return F.transform(
            starts, lambda i: lowered.substr(i + F.lit(1), F.lit(size))
        )

    grams1 = make_ngrams(col1, n)
    grams2 = make_ngrams(col2, n)

    intersection = F.size(F.array_intersect(grams1, grams2))
    union = F.size(F.array_union(grams1, grams2))

    # Guard against a zero-sized union to avoid division by zero.
    return F.when(union == 0, F.lit(1.0)).otherwise(intersection / union)
354
+
355
+
356
@fuzzy.register()
def ngram_distance(col1: "Column", col2: "Column", n: int = 2) -> "Column":
    """Calculate n-gram distance (1 - similarity) between two strings.

    The similarity is the Jaccard similarity of the two strings'
    lowercased character n-gram sets; this returns its complement.

    Args:
        col1: First string column
        col2: Second string column
        n: Size of n-grams (default: 2 for bigrams)

    Returns:
        Column with float distance between 0.0 and 1.0

    Example:
        >>> df.withColumn("dist", fuzzy.ngram_distance(F.col("a"), F.col("b")))
    """

    def make_ngrams(col: "Column", size: int) -> "Column":
        lowered = F.lower(col)
        # Offsets 0 .. len-size, clamped to 0 so short strings yield one gram.
        starts = F.sequence(
            F.lit(0), F.greatest(F.length(lowered) - F.lit(size), F.lit(0))
        )
        # Column.substr accepts Column-typed position/length on all PySpark
        # versions; F.substring only accepts Column arguments on Spark >= 3.5,
        # so the previous F.substring(padded, i + 1, n) form failed on older
        # clusters.
        return F.transform(
            starts, lambda i: lowered.substr(i + F.lit(1), F.lit(size))
        )

    grams1 = make_ngrams(col1, n)
    grams2 = make_ngrams(col2, n)

    intersection = F.size(F.array_intersect(grams1, grams2))
    union = F.size(F.array_union(grams1, grams2))

    # Empty union counts as identical (similarity 1.0 -> distance 0.0).
    similarity = F.when(union == 0, F.lit(1.0)).otherwise(intersection / union)
    return F.lit(1.0) - similarity
386
+
387
+
388
+ # =============================================================================
389
+ # Cosine Similarity
390
+ # =============================================================================
391
+
392
+
393
@fuzzy.register()
def cosine_similarity(col1: "Column", col2: "Column", delimiter: str = " ") -> "Column":
    """Cosine similarity between the term-frequency vectors of two strings.

    Treats each lowercased string as a bag of words (split on ``delimiter``)
    and computes the cosine of the angle between their term-frequency
    vectors. Good for comparing longer text.

    Args:
        col1: First string column
        col2: Second string column
        delimiter: Token delimiter, interpreted as a regex pattern by
            ``F.split`` (default: space)

    Returns:
        Column with float similarity score between 0.0 and 1.0

    Example:
        >>> df.withColumn("sim", fuzzy.cosine_similarity(F.col("a"), F.col("b")))
    """
    bag1 = F.split(F.lower(col1), delimiter)
    bag2 = F.split(F.lower(col2), delimiter)

    # Vocabulary: every distinct token appearing in either string.
    vocabulary = F.array_union(bag1, bag2)

    def term_frequency(bag: "Column", token: "Column") -> "Column":
        # How many times `token` occurs in `bag`, as a double.
        return F.size(F.filter(bag, lambda t: t == token)).cast("double")

    # dot(v1, v2) = sum over the vocabulary of tf1 * tf2
    dot_product = F.aggregate(
        vocabulary,
        F.lit(0.0),
        lambda acc, token: acc
        + term_frequency(bag1, token) * term_frequency(bag2, token),
    )

    # ||v|| = sqrt(sum of tf^2) for each string.
    norm1 = F.sqrt(
        F.aggregate(
            vocabulary,
            F.lit(0.0),
            lambda acc, token: acc + F.pow(term_frequency(bag1, token), 2),
        )
    )
    norm2 = F.sqrt(
        F.aggregate(
            vocabulary,
            F.lit(0.0),
            lambda acc, token: acc + F.pow(term_frequency(bag2, token), 2),
        )
    )

    magnitude = norm1 * norm2

    # Zero magnitude (no tokens on one side) is defined as similarity 0.
    return F.when(magnitude == 0, F.lit(0.0)).otherwise(dot_product / magnitude)
@@ -0,0 +1,5 @@
1
+ """Text cleaning, validation, and transformation primitives."""
2
+
3
+ from datacompose.transformers.text.text.pyspark.pyspark_primitives import text
4
+
5
+ __all__ = ["text"]
@@ -0,0 +1,5 @@
1
+ """PySpark text primitives."""
2
+
3
+ from datacompose.transformers.text.text.pyspark.pyspark_primitives import text
4
+
5
+ __all__ = ["text"]