datacompose 0.2.4.1 (py3-none-any.whl) → 0.2.6.0 (py3-none-any.whl)

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datacompose might be problematic. Click here for more details.

@@ -1,3 +1,54 @@
1
+ """
2
+ Address transformation primitives for PySpark.
3
+
4
+ Preview Output:
5
+ +----------------------------------------------+-------------+-----------+-----------+-----+-------+
6
+ |address |street_number|street_name|city |state|zip |
7
+ +----------------------------------------------+-------------+-----------+-----------+-----+-------+
8
+ | 123 Main St, New York, NY 10001 |123 |Main |New York |NY |10001 |
9
+ |456 oak ave apt 5b, los angeles, ca 90001 |456 |Oak |Los Angeles|CA |90001 |
10
+ |789 ELM STREET CHICAGO IL 60601 |789 |Elm |Chicago |IL |60601 |
11
+ |321 pine rd. suite 100,, boston massachusetts|321 |Pine |Boston |MA |null |
12
+ |PO Box 789, Atlanta, GA 30301 |null |null |Atlanta |GA |30301 |
13
+ +----------------------------------------------+-------------+-----------+-----------+-----+-------+
14
+
15
+ Usage Example:
16
+ from pyspark.sql import SparkSession
17
+ from pyspark.sql import functions as F
18
+ from transformers.pyspark.addresses import addresses
19
+
20
+ # Initialize Spark
21
+ spark = SparkSession.builder.appName("DataCleaning").getOrCreate()
22
+
23
+ # Create sample data
24
+ data = [
25
+ ("123 Main St, New York, NY 10001",),
26
+ ("456 Oak Ave Apt 5B, Los Angeles, CA 90001",),
27
+ ("789 Elm Street, Chicago, IL 60601",),
28
+ ("321 Pine Road Suite 100, Boston, MA 02101",),
29
+ ]
30
+ df = spark.createDataFrame(data, ["address"])
31
+
32
+ # Extract and standardize address components
33
+ result_df = df.select(
34
+ F.col("address"),
35
+ addresses.extract_street_number(F.col("address")).alias("street_number"),
36
+ addresses.extract_street_name(F.col("address")).alias("street_name"),
37
+ addresses.extract_city(F.col("address")).alias("city"),
38
+ addresses.extract_state(F.col("address")).alias("state"),
39
+ addresses.extract_zip_code(F.col("address")).alias("zip")
40
+ )
41
+
42
+ # Show results
43
+ result_df.show(truncate=False)
44
+
45
+ # Filter to valid addresses
46
+ valid_addresses = result_df.filter(addresses.validate_zip_code(F.col("zip")))
47
+
48
+ Installation:
49
+ datacompose add addresses
50
+ """
51
+
1
52
  import re
2
53
  from typing import TYPE_CHECKING, Dict, List, Optional
3
54
 
@@ -16,7 +67,7 @@ else:
16
67
 
17
68
  try:
18
69
  # Try local utils import first (for generated code)
19
- from utils.primitives import PrimitiveRegistry
70
+ from utils.primitives import PrimitiveRegistry # type: ignore
20
71
  except ImportError:
21
72
  # Fall back to installed datacompose package
22
73
  from datacompose.operators.primitives import PrimitiveRegistry
@@ -345,8 +396,10 @@ def extract_street_name(col: Column) -> Column:
345
396
  trimmed_col = F.trim(col)
346
397
  without_number = F.when(
347
398
  # If it's just a numbered street (e.g., "5th Avenue", "1st Street")
348
- trimmed_col.rlike(r"^(?i)\d+(?:st|nd|rd|th)\s+(?:" + "|".join(suffixes) + r")$"),
349
- trimmed_col # Keep as is - it's a numbered street name
399
+ trimmed_col.rlike(
400
+ r"^(?i)\d+(?:st|nd|rd|th)\s+(?:" + "|".join(suffixes) + r")$"
401
+ ),
402
+ trimmed_col, # Keep as is - it's a numbered street name
350
403
  ).otherwise(
351
404
  # Otherwise remove the house number
352
405
  F.regexp_replace(trimmed_col, r"^\d+[\w\-/]*\s+", "")
@@ -354,9 +407,7 @@ def extract_street_name(col: Column) -> Column:
354
407
 
355
408
  # Remove directional prefix - case insensitive
356
409
  # Include full directional words and abbreviations
357
- prefix_pattern = (
358
- r"^(?i)(?:North|South|East|West|Northeast|Northwest|Southeast|Southwest|N\.?|S\.?|E\.?|W\.?|NE\.?|NW\.?|SE\.?|SW\.?)\s+"
359
- )
410
+ prefix_pattern = r"^(?i)(?:North|South|East|West|Northeast|Northwest|Southeast|Southwest|N\.?|S\.?|E\.?|W\.?|NE\.?|NW\.?|SE\.?|SW\.?)\s+"
360
411
  without_prefix = F.regexp_replace(without_number, prefix_pattern, "")
361
412
 
362
413
  # Extract everything before the street suffix - case insensitive
@@ -434,8 +485,10 @@ def extract_street_suffix(col: Column) -> Column:
434
485
 
435
486
  # Build pattern to match the LAST suffix in the string
436
487
  # This handles cases like "St. James Place" where we want "Place" not "St"
437
- suffix_pattern = r"\b(" + "|".join(suffixes) + r")\b(?!.*\b(?:" + "|".join(suffixes) + r")\b)"
438
-
488
+ suffix_pattern = (
489
+ r"\b(" + "|".join(suffixes) + r")\b(?!.*\b(?:" + "|".join(suffixes) + r")\b)"
490
+ )
491
+
439
492
  # Extract the last matching suffix - case insensitive
440
493
  suffix_pattern_ci = r"(?i)" + suffix_pattern
441
494
  result = F.regexp_extract(col, suffix_pattern_ci, 1)
@@ -653,25 +706,27 @@ def standardize_street_suffix(
653
706
  if col is None:
654
707
  return F.lit("")
655
708
  col = F.when(col.isNull(), F.lit("")).otherwise(col)
656
-
709
+
657
710
  # Convert to uppercase for matching
658
711
  upper_col = F.upper(F.trim(col))
659
712
 
660
713
  # Start with the original column
661
714
  result = col
662
-
715
+
663
716
  # Apply custom mappings first if provided (they take precedence)
664
717
  if custom_mappings:
665
718
  for original, standard in custom_mappings.items():
666
719
  result = F.when(
667
720
  upper_col == F.upper(F.lit(original)), F.lit(standard)
668
721
  ).otherwise(result)
669
-
722
+
670
723
  # Then apply standard mappings for anything not already mapped
671
724
  # Need to check if result has changed to avoid overwriting custom mappings
672
725
  for original, standard in suffix_map.items():
673
726
  # Only apply if not already mapped by custom mappings
674
- if custom_mappings and original.upper() in [k.upper() for k in custom_mappings.keys()]:
727
+ if custom_mappings and original.upper() in [
728
+ k.upper() for k in custom_mappings.keys()
729
+ ]:
675
730
  continue
676
731
  result = F.when(upper_col == original, F.lit(standard)).otherwise(result)
677
732
 
@@ -1005,7 +1060,7 @@ def format_secondary_address(unit_type: Column, unit_number: Column) -> Column:
1005
1060
  Column with formatted secondary address
1006
1061
 
1007
1062
  Example:
1008
- from datacompose.transformers.text.clean_addresses.pyspark.pyspark_udf import format_secondary_address
1063
+ from datacompose.transformers.text.addresses.pyspark.pyspark_udf import format_secondary_address
1009
1064
  df.select(format_secondary_address(F.lit("Apartment"), F.lit("5B")))
1010
1065
  # -> "Apt 5B"
1011
1066
  """
@@ -1,3 +1,55 @@
1
+ """
2
+ Email transformation primitives for PySpark.
3
+
4
+ Preview Output:
5
+ +---------------------------+----------------------+-------------+----------------+--------+
6
+ |email |standardized |username |domain |is_valid|
7
+ +---------------------------+----------------------+-------------+----------------+--------+
8
+ | John.Doe@Gmail.COM |john.doe@gmail.com |john.doe |gmail.com |true |
9
+ |JANE.SMITH@OUTLOOK.COM |jane.smith@outlook.com|jane.smith |outlook.com |true |
10
+ | info@company-name.org |info@company-name.org |info |company-name.org|true |
11
+ |invalid.email@ |null |null |null |false |
12
+ |user+tag@domain.co.uk |user+tag@domain.co.uk |user+tag |domain.co.uk |true |
13
+ |bad email@test.com |null |null |null |false |
14
+ +---------------------------+----------------------+-------------+----------------+--------+
15
+
16
+ Usage Example:
17
+ from pyspark.sql import SparkSession
18
+ from pyspark.sql import functions as F
19
+ from transformers.pyspark.emails import emails
20
+
21
+ # Initialize Spark
22
+ spark = SparkSession.builder.appName("EmailCleaning").getOrCreate()
23
+
24
+ # Create sample data
25
+ data = [
26
+ ("john.doe@gmail.com",),
27
+ ("JANE.SMITH@OUTLOOK.COM",),
28
+ ("info@company-name.org",),
29
+ ("invalid.email@",),
30
+ ("user+tag@domain.co.uk",),
31
+ ]
32
+ df = spark.createDataFrame(data, ["email"])
33
+
34
+ # Extract and validate email components
35
+ result_df = df.select(
36
+ F.col("email"),
37
+ emails.standardize_email(F.col("email")).alias("standardized"),
38
+ emails.extract_username(F.col("email")).alias("username"),
39
+ emails.extract_domain(F.col("email")).alias("domain"),
40
+ emails.is_valid_email(F.col("email")).alias("is_valid")
41
+ )
42
+
43
+ # Show results
44
+ result_df.show(truncate=False)
45
+
46
+ # Filter to valid emails only
47
+ valid_emails = result_df.filter(F.col("is_valid") == True)
48
+
49
+ Installation:
50
+ datacompose add emails
51
+ """
52
+
1
53
  import re
2
54
  from typing import TYPE_CHECKING, Dict, List, Optional
3
55
 
@@ -16,7 +68,7 @@ else:
16
68
 
17
69
  try:
18
70
  # Try local utils import first (for generated code)
19
- from utils.primitives import PrimitiveRegistry
71
+ from utils.primitives import PrimitiveRegistry # type: ignore
20
72
  except ImportError:
21
73
  # Fall back to installed datacompose package
22
74
  from datacompose.operators.primitives import PrimitiveRegistry