datacompose 0.2.4.1__py3-none-any.whl → 0.2.6.0__py3-none-any.whl
This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Potentially problematic release.
This version of datacompose might be problematic. Click here for more details.
- datacompose/cli/__init__.py +1 -1
- datacompose/cli/commands/add.py +49 -21
- datacompose/cli/commands/init.py +35 -9
- datacompose/cli/commands/list.py +2 -2
- datacompose/cli/config.py +80 -0
- datacompose/cli/main.py +3 -3
- datacompose/generators/base.py +15 -14
- datacompose/generators/pyspark/generator.py +5 -10
- datacompose/operators/__init__.py +1 -1
- datacompose/operators/primitives.py +57 -19
- datacompose/transformers/text/{clean_addresses → addresses}/pyspark/pyspark_primitives.py +68 -13
- datacompose/transformers/text/{clean_emails → emails}/pyspark/pyspark_primitives.py +53 -1
- datacompose/transformers/text/{clean_phone_numbers → phone_numbers}/pyspark/pyspark_primitives.py +416 -366
- datacompose-0.2.6.0.dist-info/METADATA +94 -0
- datacompose-0.2.6.0.dist-info/RECORD +31 -0
- datacompose-0.2.4.1.dist-info/METADATA +0 -449
- datacompose-0.2.4.1.dist-info/RECORD +0 -30
- /datacompose/transformers/text/{clean_addresses → addresses}/__init__.py +0 -0
- /datacompose/transformers/text/{clean_emails → emails}/__init__.py +0 -0
- /datacompose/transformers/text/{clean_phone_numbers → phone_numbers}/__init__.py +0 -0
- {datacompose-0.2.4.1.dist-info → datacompose-0.2.6.0.dist-info}/WHEEL +0 -0
- {datacompose-0.2.4.1.dist-info → datacompose-0.2.6.0.dist-info}/entry_points.txt +0 -0
- {datacompose-0.2.4.1.dist-info → datacompose-0.2.6.0.dist-info}/licenses/LICENSE +0 -0
- {datacompose-0.2.4.1.dist-info → datacompose-0.2.6.0.dist-info}/top_level.txt +0 -0
|
@@ -1,3 +1,54 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Address transformation primitives for PySpark.
|
|
3
|
+
|
|
4
|
+
Preview Output:
|
|
5
|
+
+----------------------------------------------+-------------+-----------+-----------+-----+-------+
|
|
6
|
+
|address |street_number|street_name|city |state|zip |
|
|
7
|
+
+----------------------------------------------+-------------+-----------+-----------+-----+-------+
|
|
8
|
+
| 123 Main St, New York, NY 10001 |123 |Main |New York |NY |10001 |
|
|
9
|
+
|456 oak ave apt 5b, los angeles, ca 90001 |456 |Oak |Los Angeles|CA |90001 |
|
|
10
|
+
|789 ELM STREET CHICAGO IL 60601 |789 |Elm |Chicago |IL |60601 |
|
|
11
|
+
|321 pine rd. suite 100,, boston massachusetts|321 |Pine |Boston |MA |null |
|
|
12
|
+
|PO Box 789, Atlanta, GA 30301 |null |null |Atlanta |GA |30301 |
|
|
13
|
+
+----------------------------------------------+-------------+-----------+-----------+-----+-------+
|
|
14
|
+
|
|
15
|
+
Usage Example:
|
|
16
|
+
from pyspark.sql import SparkSession
|
|
17
|
+
from pyspark.sql import functions as F
|
|
18
|
+
from transformers.pyspark.addresses import addresses
|
|
19
|
+
|
|
20
|
+
# Initialize Spark
|
|
21
|
+
spark = SparkSession.builder.appName("DataCleaning").getOrCreate()
|
|
22
|
+
|
|
23
|
+
# Create sample data
|
|
24
|
+
data = [
|
|
25
|
+
("123 Main St, New York, NY 10001",),
|
|
26
|
+
("456 Oak Ave Apt 5B, Los Angeles, CA 90001",),
|
|
27
|
+
("789 Elm Street, Chicago, IL 60601",),
|
|
28
|
+
("321 Pine Road Suite 100, Boston, MA 02101",),
|
|
29
|
+
]
|
|
30
|
+
df = spark.createDataFrame(data, ["address"])
|
|
31
|
+
|
|
32
|
+
# Extract and standardize address components
|
|
33
|
+
result_df = df.select(
|
|
34
|
+
F.col("address"),
|
|
35
|
+
addresses.extract_street_number(F.col("address")).alias("street_number"),
|
|
36
|
+
addresses.extract_street_name(F.col("address")).alias("street_name"),
|
|
37
|
+
addresses.extract_city(F.col("address")).alias("city"),
|
|
38
|
+
addresses.extract_state(F.col("address")).alias("state"),
|
|
39
|
+
addresses.extract_zip_code(F.col("address")).alias("zip")
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
# Show results
|
|
43
|
+
result_df.show(truncate=False)
|
|
44
|
+
|
|
45
|
+
# Filter to valid addresses
|
|
46
|
+
valid_addresses = result_df.filter(addresses.validate_zip_code(F.col("zip")))
|
|
47
|
+
|
|
48
|
+
Installation:
|
|
49
|
+
datacompose add addresses
|
|
50
|
+
"""
|
|
51
|
+
|
|
1
52
|
import re
|
|
2
53
|
from typing import TYPE_CHECKING, Dict, List, Optional
|
|
3
54
|
|
|
@@ -16,7 +67,7 @@ else:
|
|
|
16
67
|
|
|
17
68
|
try:
|
|
18
69
|
# Try local utils import first (for generated code)
|
|
19
|
-
from utils.primitives import PrimitiveRegistry
|
|
70
|
+
from utils.primitives import PrimitiveRegistry # type: ignore
|
|
20
71
|
except ImportError:
|
|
21
72
|
# Fall back to installed datacompose package
|
|
22
73
|
from datacompose.operators.primitives import PrimitiveRegistry
|
|
@@ -345,8 +396,10 @@ def extract_street_name(col: Column) -> Column:
|
|
|
345
396
|
trimmed_col = F.trim(col)
|
|
346
397
|
without_number = F.when(
|
|
347
398
|
# If it's just a numbered street (e.g., "5th Avenue", "1st Street")
|
|
348
|
-
trimmed_col.rlike(
|
|
349
|
-
|
|
399
|
+
trimmed_col.rlike(
|
|
400
|
+
r"^(?i)\d+(?:st|nd|rd|th)\s+(?:" + "|".join(suffixes) + r")$"
|
|
401
|
+
),
|
|
402
|
+
trimmed_col, # Keep as is - it's a numbered street name
|
|
350
403
|
).otherwise(
|
|
351
404
|
# Otherwise remove the house number
|
|
352
405
|
F.regexp_replace(trimmed_col, r"^\d+[\w\-/]*\s+", "")
|
|
@@ -354,9 +407,7 @@ def extract_street_name(col: Column) -> Column:
|
|
|
354
407
|
|
|
355
408
|
# Remove directional prefix - case insensitive
|
|
356
409
|
# Include full directional words and abbreviations
|
|
357
|
-
prefix_pattern = (
|
|
358
|
-
r"^(?i)(?:North|South|East|West|Northeast|Northwest|Southeast|Southwest|N\.?|S\.?|E\.?|W\.?|NE\.?|NW\.?|SE\.?|SW\.?)\s+"
|
|
359
|
-
)
|
|
410
|
+
prefix_pattern = r"^(?i)(?:North|South|East|West|Northeast|Northwest|Southeast|Southwest|N\.?|S\.?|E\.?|W\.?|NE\.?|NW\.?|SE\.?|SW\.?)\s+"
|
|
360
411
|
without_prefix = F.regexp_replace(without_number, prefix_pattern, "")
|
|
361
412
|
|
|
362
413
|
# Extract everything before the street suffix - case insensitive
|
|
@@ -434,8 +485,10 @@ def extract_street_suffix(col: Column) -> Column:
|
|
|
434
485
|
|
|
435
486
|
# Build pattern to match the LAST suffix in the string
|
|
436
487
|
# This handles cases like "St. James Place" where we want "Place" not "St"
|
|
437
|
-
suffix_pattern =
|
|
438
|
-
|
|
488
|
+
suffix_pattern = (
|
|
489
|
+
r"\b(" + "|".join(suffixes) + r")\b(?!.*\b(?:" + "|".join(suffixes) + r")\b)"
|
|
490
|
+
)
|
|
491
|
+
|
|
439
492
|
# Extract the last matching suffix - case insensitive
|
|
440
493
|
suffix_pattern_ci = r"(?i)" + suffix_pattern
|
|
441
494
|
result = F.regexp_extract(col, suffix_pattern_ci, 1)
|
|
@@ -653,25 +706,27 @@ def standardize_street_suffix(
|
|
|
653
706
|
if col is None:
|
|
654
707
|
return F.lit("")
|
|
655
708
|
col = F.when(col.isNull(), F.lit("")).otherwise(col)
|
|
656
|
-
|
|
709
|
+
|
|
657
710
|
# Convert to uppercase for matching
|
|
658
711
|
upper_col = F.upper(F.trim(col))
|
|
659
712
|
|
|
660
713
|
# Start with the original column
|
|
661
714
|
result = col
|
|
662
|
-
|
|
715
|
+
|
|
663
716
|
# Apply custom mappings first if provided (they take precedence)
|
|
664
717
|
if custom_mappings:
|
|
665
718
|
for original, standard in custom_mappings.items():
|
|
666
719
|
result = F.when(
|
|
667
720
|
upper_col == F.upper(F.lit(original)), F.lit(standard)
|
|
668
721
|
).otherwise(result)
|
|
669
|
-
|
|
722
|
+
|
|
670
723
|
# Then apply standard mappings for anything not already mapped
|
|
671
724
|
# Need to check if result has changed to avoid overwriting custom mappings
|
|
672
725
|
for original, standard in suffix_map.items():
|
|
673
726
|
# Only apply if not already mapped by custom mappings
|
|
674
|
-
if custom_mappings and original.upper() in [
|
|
727
|
+
if custom_mappings and original.upper() in [
|
|
728
|
+
k.upper() for k in custom_mappings.keys()
|
|
729
|
+
]:
|
|
675
730
|
continue
|
|
676
731
|
result = F.when(upper_col == original, F.lit(standard)).otherwise(result)
|
|
677
732
|
|
|
@@ -1005,7 +1060,7 @@ def format_secondary_address(unit_type: Column, unit_number: Column) -> Column:
|
|
|
1005
1060
|
Column with formatted secondary address
|
|
1006
1061
|
|
|
1007
1062
|
Example:
|
|
1008
|
-
from datacompose.transformers.text.
|
|
1063
|
+
from datacompose.transformers.text.addresses.pyspark.pyspark_udf import format_secondary_address
|
|
1009
1064
|
df.select(format_secondary_address(F.lit("Apartment"), F.lit("5B")))
|
|
1010
1065
|
# -> "Apt 5B"
|
|
1011
1066
|
"""
|
|
@@ -1,3 +1,55 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Email transformation primitives for PySpark.
|
|
3
|
+
|
|
4
|
+
Preview Output:
|
|
5
|
+
+---------------------------+----------------------+-------------+----------------+--------+
|
|
6
|
+
|email |standardized |username |domain |is_valid|
|
|
7
|
+
+---------------------------+----------------------+-------------+----------------+--------+
|
|
8
|
+
| John.Doe@Gmail.COM |john.doe@gmail.com |john.doe |gmail.com |true |
|
|
9
|
+
|JANE.SMITH@OUTLOOK.COM |jane.smith@outlook.com|jane.smith |outlook.com |true |
|
|
10
|
+
| info@company-name.org |info@company-name.org |info |company-name.org|true |
|
|
11
|
+
|invalid.email@ |null |null |null |false |
|
|
12
|
+
|user+tag@domain.co.uk |user+tag@domain.co.uk |user+tag |domain.co.uk |true |
|
|
13
|
+
|bad email@test.com |null |null |null |false |
|
|
14
|
+
+---------------------------+----------------------+-------------+----------------+--------+
|
|
15
|
+
|
|
16
|
+
Usage Example:
|
|
17
|
+
from pyspark.sql import SparkSession
|
|
18
|
+
from pyspark.sql import functions as F
|
|
19
|
+
from transformers.pyspark.emails import emails
|
|
20
|
+
|
|
21
|
+
# Initialize Spark
|
|
22
|
+
spark = SparkSession.builder.appName("EmailCleaning").getOrCreate()
|
|
23
|
+
|
|
24
|
+
# Create sample data
|
|
25
|
+
data = [
|
|
26
|
+
("john.doe@gmail.com",),
|
|
27
|
+
("JANE.SMITH@OUTLOOK.COM",),
|
|
28
|
+
("info@company-name.org",),
|
|
29
|
+
("invalid.email@",),
|
|
30
|
+
("user+tag@domain.co.uk",),
|
|
31
|
+
]
|
|
32
|
+
df = spark.createDataFrame(data, ["email"])
|
|
33
|
+
|
|
34
|
+
# Extract and validate email components
|
|
35
|
+
result_df = df.select(
|
|
36
|
+
F.col("email"),
|
|
37
|
+
emails.standardize_email(F.col("email")).alias("standardized"),
|
|
38
|
+
emails.extract_username(F.col("email")).alias("username"),
|
|
39
|
+
emails.extract_domain(F.col("email")).alias("domain"),
|
|
40
|
+
emails.is_valid_email(F.col("email")).alias("is_valid")
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
# Show results
|
|
44
|
+
result_df.show(truncate=False)
|
|
45
|
+
|
|
46
|
+
# Filter to valid emails only
|
|
47
|
+
valid_emails = result_df.filter(F.col("is_valid") == True)
|
|
48
|
+
|
|
49
|
+
Installation:
|
|
50
|
+
datacompose add emails
|
|
51
|
+
"""
|
|
52
|
+
|
|
1
53
|
import re
|
|
2
54
|
from typing import TYPE_CHECKING, Dict, List, Optional
|
|
3
55
|
|
|
@@ -16,7 +68,7 @@ else:
|
|
|
16
68
|
|
|
17
69
|
try:
|
|
18
70
|
# Try local utils import first (for generated code)
|
|
19
|
-
from utils.primitives import PrimitiveRegistry
|
|
71
|
+
from utils.primitives import PrimitiveRegistry # type: ignore
|
|
20
72
|
except ImportError:
|
|
21
73
|
# Fall back to installed datacompose package
|
|
22
74
|
from datacompose.operators.primitives import PrimitiveRegistry
|