dhisana 0.0.1.dev243__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dhisana/__init__.py +1 -0
- dhisana/cli/__init__.py +1 -0
- dhisana/cli/cli.py +20 -0
- dhisana/cli/datasets.py +27 -0
- dhisana/cli/models.py +26 -0
- dhisana/cli/predictions.py +20 -0
- dhisana/schemas/__init__.py +1 -0
- dhisana/schemas/common.py +399 -0
- dhisana/schemas/sales.py +965 -0
- dhisana/ui/__init__.py +1 -0
- dhisana/ui/components.py +472 -0
- dhisana/utils/__init__.py +1 -0
- dhisana/utils/add_mapping.py +352 -0
- dhisana/utils/agent_tools.py +51 -0
- dhisana/utils/apollo_tools.py +1597 -0
- dhisana/utils/assistant_tool_tag.py +4 -0
- dhisana/utils/built_with_api_tools.py +282 -0
- dhisana/utils/cache_output_tools.py +98 -0
- dhisana/utils/cache_output_tools_local.py +78 -0
- dhisana/utils/check_email_validity_tools.py +717 -0
- dhisana/utils/check_for_intent_signal.py +107 -0
- dhisana/utils/check_linkedin_url_validity.py +209 -0
- dhisana/utils/clay_tools.py +43 -0
- dhisana/utils/clean_properties.py +135 -0
- dhisana/utils/company_utils.py +60 -0
- dhisana/utils/compose_salesnav_query.py +259 -0
- dhisana/utils/compose_search_query.py +759 -0
- dhisana/utils/compose_three_step_workflow.py +234 -0
- dhisana/utils/composite_tools.py +137 -0
- dhisana/utils/dataframe_tools.py +237 -0
- dhisana/utils/domain_parser.py +45 -0
- dhisana/utils/email_body_utils.py +72 -0
- dhisana/utils/email_parse_helpers.py +132 -0
- dhisana/utils/email_provider.py +375 -0
- dhisana/utils/enrich_lead_information.py +933 -0
- dhisana/utils/extract_email_content_for_llm.py +101 -0
- dhisana/utils/fetch_openai_config.py +129 -0
- dhisana/utils/field_validators.py +426 -0
- dhisana/utils/g2_tools.py +104 -0
- dhisana/utils/generate_content.py +41 -0
- dhisana/utils/generate_custom_message.py +271 -0
- dhisana/utils/generate_email.py +278 -0
- dhisana/utils/generate_email_response.py +465 -0
- dhisana/utils/generate_flow.py +102 -0
- dhisana/utils/generate_leads_salesnav.py +303 -0
- dhisana/utils/generate_linkedin_connect_message.py +224 -0
- dhisana/utils/generate_linkedin_response_message.py +317 -0
- dhisana/utils/generate_structured_output_internal.py +462 -0
- dhisana/utils/google_custom_search.py +267 -0
- dhisana/utils/google_oauth_tools.py +727 -0
- dhisana/utils/google_workspace_tools.py +1294 -0
- dhisana/utils/hubspot_clearbit.py +96 -0
- dhisana/utils/hubspot_crm_tools.py +2440 -0
- dhisana/utils/instantly_tools.py +149 -0
- dhisana/utils/linkedin_crawler.py +168 -0
- dhisana/utils/lusha_tools.py +333 -0
- dhisana/utils/mailgun_tools.py +156 -0
- dhisana/utils/mailreach_tools.py +123 -0
- dhisana/utils/microsoft365_tools.py +455 -0
- dhisana/utils/openai_assistant_and_file_utils.py +267 -0
- dhisana/utils/openai_helpers.py +977 -0
- dhisana/utils/openapi_spec_to_tools.py +45 -0
- dhisana/utils/openapi_tool/__init__.py +1 -0
- dhisana/utils/openapi_tool/api_models.py +633 -0
- dhisana/utils/openapi_tool/convert_openai_spec_to_tool.py +271 -0
- dhisana/utils/openapi_tool/openapi_tool.py +319 -0
- dhisana/utils/parse_linkedin_messages_txt.py +100 -0
- dhisana/utils/profile.py +37 -0
- dhisana/utils/proxy_curl_tools.py +1226 -0
- dhisana/utils/proxycurl_search_leads.py +426 -0
- dhisana/utils/python_function_to_tools.py +83 -0
- dhisana/utils/research_lead.py +176 -0
- dhisana/utils/sales_navigator_crawler.py +1103 -0
- dhisana/utils/salesforce_crm_tools.py +477 -0
- dhisana/utils/search_router.py +131 -0
- dhisana/utils/search_router_jobs.py +51 -0
- dhisana/utils/sendgrid_tools.py +162 -0
- dhisana/utils/serarch_router_local_business.py +75 -0
- dhisana/utils/serpapi_additional_tools.py +290 -0
- dhisana/utils/serpapi_google_jobs.py +117 -0
- dhisana/utils/serpapi_google_search.py +188 -0
- dhisana/utils/serpapi_local_business_search.py +129 -0
- dhisana/utils/serpapi_search_tools.py +852 -0
- dhisana/utils/serperdev_google_jobs.py +125 -0
- dhisana/utils/serperdev_local_business.py +154 -0
- dhisana/utils/serperdev_search.py +233 -0
- dhisana/utils/smtp_email_tools.py +582 -0
- dhisana/utils/test_connect.py +2087 -0
- dhisana/utils/trasform_json.py +173 -0
- dhisana/utils/web_download_parse_tools.py +189 -0
- dhisana/utils/workflow_code_model.py +5 -0
- dhisana/utils/zoominfo_tools.py +357 -0
- dhisana/workflow/__init__.py +1 -0
- dhisana/workflow/agent.py +18 -0
- dhisana/workflow/flow.py +44 -0
- dhisana/workflow/task.py +43 -0
- dhisana/workflow/test.py +90 -0
- dhisana-0.0.1.dev243.dist-info/METADATA +43 -0
- dhisana-0.0.1.dev243.dist-info/RECORD +102 -0
- dhisana-0.0.1.dev243.dist-info/WHEEL +5 -0
- dhisana-0.0.1.dev243.dist-info/entry_points.txt +2 -0
- dhisana-0.0.1.dev243.dist-info/top_level.txt +1 -0
dhisana/utils/extract_email_content_for_llm.py
@@ -0,0 +1,101 @@
+from typing import Any, Dict
+import base64
+import re
+
+def decode_base64(data: str) -> str:
+    """
+    Decode a base64- or web-safe-base64-encoded string.
+    """
+    if not data:
+        return ""
+    missing_padding = len(data) % 4
+    if missing_padding:
+        data += '=' * (4 - missing_padding)
+    return base64.urlsafe_b64decode(data).decode('utf-8', errors='ignore')
+
+def html_to_text(html: str) -> str:
+    """
+    (Optional) Convert HTML to plain text using a simple regex.
+    This is not bulletproof for all HTML, but often fine for short email bodies.
+    """
+    text = re.sub(r'<br\s*/?>', '\n', html, flags=re.IGNORECASE)  # <br> to newline
+    text = re.sub(r'<.*?>', '', text)  # remove remaining tags
+    return text.strip()
+
+def get_text_content(payload: Dict[str, Any]) -> str:
+    """
+    Recursively extract 'text/plain' content from the Gmail message payload.
+    If no text/plain is found, fall back to 'text/html' (converted to plain text).
+    """
+    # If there's a direct 'parts' list, we may need to walk each part.
+    if 'parts' in payload:
+        extracted = []
+        for part in payload['parts']:
+            extracted.append(get_text_content(part))
+        return "\n".join(filter(None, extracted))
+
+    # If this part has a mimeType and a body, try to decode.
+    mime_type = payload.get('mimeType', '')
+    body_data = payload.get('body', {}).get('data', '')
+
+    # If it's text/plain, decode base64 and return
+    if mime_type == 'text/plain' and body_data:
+        return decode_base64(body_data)
+
+    # If it's text/html (and we haven't returned text/plain yet), fall back
+    if mime_type == 'text/html' and body_data:
+        html_content = decode_base64(body_data)
+        return html_to_text(html_content)
+
+    return ""
+
+def trim_repeated_quoted_lines(text: str) -> str:
+    """
+    (Optional) Try to remove repeated or quoted content from replies.
+    This is a naive approach; real-world heuristics can get quite complicated.
+    """
+    # Common patterns: lines starting with ">"
+    # or lines starting with "On <date>, <someone> wrote:"
+    lines = text.splitlines()
+    filtered_lines = []
+    for line in lines:
+        if line.startswith(">"):
+            continue
+        # You can add more heuristics for removing signature blocks or repeated disclaimers
+        filtered_lines.append(line)
+    return "\n".join(filtered_lines).strip()
+
+def extract_email_content_for_llm(email_details: Dict[str, Any]) -> str:
+    """
+    Cleans up, extracts, and formats the relevant text content from a single Gmail message.
+    If you want the entire thread, call the Gmail API for all messages in the thread and
+    combine them. This function handles one message in detail, recursively extracting
+    text from multiple MIME parts.
+    """
+    if not email_details or 'payload' not in email_details:
+        return "No valid email details found."
+
+    # Extract basic headers
+    headers_map = {h['name']: h['value'] for h in email_details['payload'].get('headers', [])}
+
+    sender = headers_map.get('From', 'Unknown Sender')
+    receiver = headers_map.get('To', 'Unknown Receiver')
+    date = headers_map.get('Date', 'Unknown Date')
+    subject = headers_map.get('Subject', 'No Subject')
+
+    # Recursively get text from payload
+    body = get_text_content(email_details['payload'])
+
+    # Optionally remove repeated quoted lines
+    body = trim_repeated_quoted_lines(body)
+
+    # Format final string
+    formatted_content = (
+        f"From: {sender}\n"
+        f"To: {receiver}\n"
+        f"Date: {date}\n"
+        f"Subject: {subject}\n\n"
+        f"{body}"
+    )
+
+    return formatted_content
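For reference, a minimal usage sketch. The message dict below is a hand-built illustration of the Gmail API payload shape this module expects (not real API output), the `_b64` helper is hypothetical, and the functions above are assumed to be in scope:

    import base64

    def _b64(s: str) -> str:
        # Gmail bodies are web-safe base64; encode the sample the same way.
        return base64.urlsafe_b64encode(s.encode("utf-8")).decode("ascii")

    # Hypothetical single-part Gmail-style message, for illustration only.
    message = {
        "payload": {
            "headers": [
                {"name": "From", "value": "alice@example.com"},
                {"name": "To", "value": "bob@example.com"},
                {"name": "Date", "value": "Mon, 6 Jan 2025 10:00:00 +0000"},
                {"name": "Subject", "value": "Quick question"},
            ],
            "mimeType": "multipart/alternative",
            "parts": [
                {
                    "mimeType": "text/plain",
                    "body": {"data": _b64("Hi Bob,\n> older quoted line\nThanks!")},
                },
            ],
        }
    }

    print(extract_email_content_for_llm(message))
    # Prints the From/To/Date/Subject header block followed by the body;
    # the "> older quoted line" is dropped by trim_repeated_quoted_lines.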
dhisana/utils/fetch_openai_config.py
@@ -0,0 +1,129 @@
+"""
+Unified OpenAI / Azure OpenAI helper (no env fallback for secrets)
+==================================================================
+
+Resolution order
+----------------
+1. If `tool_config` has an **"openai"** block → public OpenAI
+2. Else if it has an **"azure_openai"** block → Azure OpenAI
+3. Otherwise → raise ValueError
+
+`api_key` **and** `endpoint` (for Azure) must therefore be supplied in
+`tool_config`. They will never be read from the host environment.
+
+Optional:
+• `AZURE_OPENAI_API_VERSION` – defaults to 2025-03-01-preview
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Dict, List, Optional, Tuple, Union
+
+from openai import AsyncOpenAI, OpenAI, AzureOpenAI, AsyncAzureOpenAI
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# 1. Helpers: config parsing
+# ─────────────────────────────────────────────────────────────────────────────
+
+def _extract_config(
+    tool_config: Optional[List[Dict]], provider_name: str
+) -> Dict[str, str]:
+    """Return the config map for the requested provider name, else {}."""
+    if not tool_config:
+        return {}
+    block = next((b for b in tool_config if b.get("name") == provider_name), {})
+    return {entry["name"]: entry["value"] for entry in block.get("configuration", []) if entry}
+
+
+def _discover_credentials(
+    tool_config: Optional[List[Dict]] = None,
+) -> Tuple[str, str, Optional[str]]:
+    """
+    Return (provider, api_key, endpoint_or_None).
+
+    provider ∈ {"public", "azure"}
+    """
+    # 1️⃣ Public OpenAI
+    openai_cfg = _extract_config(tool_config, "openai")
+    if openai_cfg:
+        key = openai_cfg.get("apiKey")
+        if not key:
+            raise ValueError(
+                "OpenAI integration is not configured. Please configure the connection to OpenAI in Integrations."
+            )
+        return "public", key, None
+
+    # 2️⃣ Azure OpenAI
+    azure_cfg = _extract_config(tool_config, "azure_openai")
+    if azure_cfg:
+        key = azure_cfg.get("apiKey")
+        endpoint = azure_cfg.get("endpoint")
+        if not key or not endpoint:
+            raise ValueError(
+                "Azure OpenAI integration is not configured. Please configure the connection to Azure OpenAI in Integrations."
+            )
+        return "azure", key, endpoint
+
+    # 3️⃣ Neither block present → error
+    raise ValueError(
+        "OpenAI integration is not configured. Please configure the connection to OpenAI in Integrations."
+    )
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# 2. Client factories
+# ─────────────────────────────────────────────────────────────────────────────
+
+def _api_version() -> str:
+    """Return the Azure API version (env-controlled, no secret)."""
+    return os.getenv("AZURE_OPENAI_API_VERSION", "2025-03-01-preview")
+
+
+def create_openai_client(
+    tool_config: Optional[List[Dict]] = None,
+) -> Union[OpenAI, AzureOpenAI]:
+    """
+    Return a *synchronous* client:
+      • openai.OpenAI      – public service
+      • openai.AzureOpenAI – Azure
+    """
+    provider, key, endpoint = _discover_credentials(tool_config)
+
+    if provider == "public":
+        return OpenAI(api_key=key)
+
+    # Azure
+    return AzureOpenAI(api_key=key, azure_endpoint=endpoint, api_version=_api_version())
+
+
+def create_async_openai_client(
+    tool_config: Optional[List[Dict]] = None,
+) -> Union[AsyncOpenAI, AsyncAzureOpenAI]:
+    """
+    Return an *async* client (AsyncOpenAI or AsyncAzureOpenAI).
+
+    For Azure we pass both `azure_endpoint` and `api_version`.
+    """
+    provider, key, endpoint = _discover_credentials(tool_config)
+
+    if provider == "public":
+        return AsyncOpenAI(api_key=key)
+
+    return AsyncAzureOpenAI(
+        api_key=key,
+        azure_endpoint=endpoint,
+        api_version=_api_version(),
+    )
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# 3. Convenience helper (legacy)
+# ─────────────────────────────────────────────────────────────────────────────
+
+def get_openai_access_token(tool_config: Optional[List[Dict]] = None) -> str:
+    """Return just the API key (legacy helper)."""
+    _, key, _ = _discover_credentials(tool_config)
+    return key
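For reference, a minimal sketch of the `tool_config` shape these factories expect, inferred from `_extract_config` (a list of provider blocks, each carrying a `configuration` list of name/value entries). The keys and endpoint below are placeholders:

    # Public OpenAI: only an apiKey entry is required (placeholder value).
    tool_config = [
        {
            "name": "openai",
            "configuration": [
                {"name": "apiKey", "value": "<openai-api-key>"},
            ],
        }
    ]
    client = create_openai_client(tool_config)  # -> openai.OpenAI

    # Azure OpenAI: both apiKey and endpoint entries are required (placeholders).
    azure_tool_config = [
        {
            "name": "azure_openai",
            "configuration": [
                {"name": "apiKey", "value": "<azure-api-key>"},
                {"name": "endpoint", "value": "https://<resource>.openai.azure.com"},
            ],
        }
    ]
    async_client = create_async_openai_client(azure_tool_config)  # -> AsyncAzureOpenAI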
dhisana/utils/field_validators.py
@@ -0,0 +1,426 @@
+import logging
+from urllib.parse import urlparse
+import urllib.parse
+import re
+
+from email_validator import validate_email, EmailNotValidError
+from fqdn import FQDN
+
+logger = logging.getLogger(__name__)
+
+# -------------------------------------------------------------------------------------
+# Utility sets and patterns
+# -------------------------------------------------------------------------------------
+PLACEHOLDER_EMAILS = {
+    "test@example.com",
+    "test@test.com",
+    "test@domain.com",
+    "user@domain.com",
+    "user@example.com",
+    "user@yourdomain.com",
+    "no-reply@example.com",
+    "no-reply@domain.com",
+    "no-reply@yourdomain.com",
+    "admin@domain.com",
+    "contact@domain.com",
+    "info@domain.com",
+    "none@none.com",
+    "none@domain.com",
+    "noemail@noemail.com",
+    "test@fake.com",
+    "test@demo.com",
+    "test@testing.com",
+    "test@local.com",
+    "fake@fake.com",
+    "email@email.com",
+    "asdf@asdf.com",
+    "qwerty@qwerty.com",
+    "xxx@xxx.com",
+    "aaaa@aaaa.com",
+    "nomail@nomail.com",
+    "dontreply@dontreply.com",
+    "asdasd@asdasd.com",
+    "abcdefg@abcdefg.com",
+    "123@123.com",
+    "test123@test.com",
+}
+
+DISPOSABLE_DOMAINS = {
+    "mailinator.com",
+    "10minutemail.com",
+    "yopmail.com",
+    "guerrillamail.com",
+    "tempmail.com",
+    "fakemailgenerator.com",
+    "mytrashmail.com",
+    "getnada.com",
+    "throwawaymail.com",
+    "sharklasers.com",
+    "maildrop.cc",
+    "discard.email",
+    "temporaryemail.com",
+    "trashmail.com",
+    "mohmal.com",
+    "mail.tm",
+    "mailsac.com",
+    "mailcatch.com",
+    "temp-mail.org",
+    "emailondeck.com",
+    "mailinabox.email",
+    "spambog.com",
+    "mintemail.com",
+    "spam4.me",
+    "spambox.us",
+    "edumail.rocks",
+    "getairmail.com",
+    "mailnesia.com",
+    "spoofmail.de",
+    "dropmail.me",
+    "tempmailaddress.com",
+    "33mail.com",
+    "incognitomail.com",
+    "tempemail.co",
+    "trbvm.com",
+    "online.ms",
+    "20mail.in",
+    "wavee.net",
+    "ephemeral.email",
+    "bccto.me",
+    "cuvox.de",
+    "dispostable.com",
+    "easytrashmail.com",
+    "email-fake.org",
+    "emailtemporario.com.br",
+    "fleckens.hu",
+    "lroid.com",
+    "mail-temporaire.fr",
+    "mailate.com",
+    "mailfever.com",
+    "mailforspam.com",
+    "mailfreeonline.com",
+    "mailhazard.com",
+    "mailimate.com",
+    "mailin8r.com",
+    "mailincubator.com",
+    "mailmoat.com",
+    "mailzilla.org",
+    "notsharingmy.info",
+    "objectmail.com",
+    "proxymail.eu",
+    "spamdecoy.net",
+    "spamfree24.org",
+    "spamgourmet.com",
+    "spamify.com",
+    "spamomatic.com",
+    "spamspot.com",
+    "superrito.com",
+    "teleworm.us",
+    "trash-amil.com",
+    "trashmail.me",
+    "trashmail.net",
+    "wegwerfemail.de",
+    "wh4f.org",
+    "zmail.ru",
+    # Add more as needed
+}
+
+SPAMMY_PATTERN = re.compile(r"(.)\1{3,}")  # e.g., 4+ repeated characters
+
+BLOCKED_DOMAINS = {
+    # URL shorteners and link forwarders
+    "bit.ly",
+    "tinyurl.com",
+    "t.co",
+    "ow.ly",
+    "is.gd",
+    "cutt.ly",
+    "bit.do",
+    "buff.ly",
+    "rebrand.ly",
+    "rebrandly.com",
+    "snip.ly",
+    "shorte.st",
+    "soo.gd",
+    "shorturl.at",
+    "adf.ly",
+    # Bio / profile link aggregators
+    "linktr.ee",
+    "linktree.com",
+    "linkin.bio",
+    "campsite.bio",
+    "bio.link",
+    "bio.site",
+    "bio.fm",
+    "milkshake.app",
+    "lnk.bio",
+    "withkoji.com",
+    "about.me",
+    "carrd.co",
+    # Large social platforms (block if used as "organization domain")
+    "facebook.com",
+    "instagram.com",
+    "linkedin.com",
+    "youtube.com",
+    "yelp.com",
+    "twitter.com",
+    "tiktok.com",
+    "pinterest.com",
+    "reddit.com",
+    "snapchat.com",
+    "tumblr.com",
+    "vimeo.com",
+    "flickr.com",
+    "wechat.com",
+    "qq.com",
+}
+
+
+# -------------------------------------------------------------------------------------
+# Helper: Check FQDN validity
+# -------------------------------------------------------------------------------------
+def is_valid_fqdn(domain: str) -> bool:
+    """
+    Returns True if `domain` is a syntactically valid Fully Qualified Domain Name.
+    """
+    try:
+        if not domain or not isinstance(domain, str):
+            return False
+        fqdn_obj = FQDN(domain)
+        return fqdn_obj.is_valid
+    except Exception:
+        return False
+
+
+# -------------------------------------------------------------------------------------
+# Domain validation for organizations
+# -------------------------------------------------------------------------------------
+def validation_organization_domain(domain: str) -> str:
+    """
+    1. Lowercases/strips the input domain string.
+    2. Checks if the domain is in (or a subdomain of) a blocked set.
+    3. Checks if the domain is a valid FQDN.
+    4. Returns '' if blocked or invalid, otherwise returns the normalized domain.
+    """
+    if not domain or not isinstance(domain, str):
+        return ""
+
+    domain = domain.strip().lower()
+    # If domain exactly matches OR is a subdomain of any blocked domain
+    if any(domain == blocked or domain.endswith(f".{blocked}") for blocked in BLOCKED_DOMAINS):
+        return ""
+
+    # Otherwise, confirm valid FQDN
+    return domain if is_valid_fqdn(domain) else ""
+
+
+# -------------------------------------------------------------------------------------
+# Email validation & cleaning
+# -------------------------------------------------------------------------------------
+def validate_and_clean_email(email: str) -> str:
+    """
+    Return a validated, normalized email string or '' if invalid/unwanted.
+    Checks:
+      1. Syntax / deliverability via email_validator
+      2. Against placeholder/fake emails
+      3. Against disposable email domains
+      4. Spammy repeated character patterns
+    """
+    if not email or not isinstance(email, str):
+        return ""
+
+    try:
+        v = validate_email(email, check_deliverability=True)
+        normalized_email = v["email"]  # canonical form
+        local_part, domain_part = normalized_email.rsplit("@", 1)
+
+        # 1. Check entire address in placeholder set
+        if normalized_email.lower() in PLACEHOLDER_EMAILS:
+            return ""
+
+        # 2. Check domain in disposable set
+        if domain_part.lower() in DISPOSABLE_DOMAINS:
+            return ""
+
+        # 3. Check repeated/spammy pattern
+        if SPAMMY_PATTERN.search(normalized_email):
+            return ""
+
+        return normalized_email
+    except EmailNotValidError:
+        return ""
+
+
+# -------------------------------------------------------------------------------------
+# Website URL validation
+# -------------------------------------------------------------------------------------
+def validate_website_url(raw_url: str) -> str:
+    """
+    Validate a website URL (must be http/https or a raw domain).
+    If no scheme is provided but the input is a valid FQDN-like string
+    (e.g. www.google.com), automatically prefix https://.
+    Return the normalized URL without query/fragment, or '' if invalid.
+    """
+    if not raw_url or not isinstance(raw_url, str):
+        return ""
+
+    # Clean input
+    raw_url = raw_url.strip().lower()
+
+    try:
+        parsed = urllib.parse.urlparse(raw_url)
+
+        # If there's no scheme, try prefixing https://
+        if not parsed.scheme:
+            # Example: "www.google.com" => "https://www.google.com"
+            potential_url = f"https://{raw_url}"
+            test_parsed = urllib.parse.urlparse(potential_url)
+
+            # If that yields a valid scheme and netloc, use it
+            if test_parsed.scheme in ["http", "https"] and test_parsed.netloc:
+                parsed = test_parsed
+            else:
+                return ""
+
+        # Check we now have a valid scheme and netloc
+        if parsed.scheme not in ["http", "https"] or not parsed.netloc:
+            return ""
+
+        # Normalize by removing query and fragment parts
+        normalized = urllib.parse.urlunparse(
+            (parsed.scheme, parsed.netloc, parsed.path, "", "", "")
+        )
+        return normalized
+
+    except Exception:
+        return ""
+
+
+# -------------------------------------------------------------------------------------
+# LinkedIn URL Normalizers
+# -------------------------------------------------------------------------------------
+def normalize_linkedin_url(raw_url: str) -> str:
+    """
+    Normalize a personal LinkedIn URL to the form: https://www.linkedin.com/in/<something>
+    Must contain '/in/'. Otherwise, return ''.
+    """
+    if not raw_url or not isinstance(raw_url, str):
+        return ""
+
+    try:
+        raw_url = raw_url.strip()
+        parsed = urlparse(raw_url)
+
+        if not parsed.scheme or not parsed.netloc:
+            return ""
+
+        if "linkedin.com" not in parsed.netloc.lower():
+            return ""
+
+        url = raw_url.rstrip("/")
+        parsed = urlparse(url)
+
+        if "/in/" not in parsed.path.lower():
+            return ""
+
+        path = parsed.path.lstrip("/")
+        return f"https://www.linkedin.com/{path}".rstrip("/")
+    except Exception:
+        return ""
+
+
+def normalize_linkedin_company_url(raw_url: str) -> str:
+    """
+    Normalize a company LinkedIn URL to the form: https://www.linkedin.com/company/<something>
+    Must contain '/company/'. Otherwise, return ''.
+    """
+    if not raw_url or not isinstance(raw_url, str):
+        return ""
+
+    try:
+        raw_url = raw_url.strip()
+        parsed = urlparse(raw_url)
+
+        if not parsed.scheme or not parsed.netloc:
+            return ""
+
+        if "linkedin.com" not in parsed.netloc.lower():
+            return ""
+
+        url = raw_url.rstrip("/")
+        parsed = urlparse(url)
+
+        if "/company/" not in parsed.path.lower():
+            return ""
+
+        path = parsed.path.lstrip("/")
+        return f"https://www.linkedin.com/{path}".rstrip("/")
+    except Exception:
+        return ""
+
+
+def normalize_linkedin_company_salesnav_url(raw_url: str) -> str:
+    """
+    Normalize a company Sales Navigator URL to: https://www.linkedin.com/sales/company/<something>
+    Must contain '/sales/company/'. Otherwise, return ''.
+    """
+    if not raw_url or not isinstance(raw_url, str):
+        return ""
+
+    try:
+        raw_url = raw_url.strip()
+        parsed = urlparse(raw_url)
+
+        if not parsed.scheme or not parsed.netloc:
+            return ""
+
+        if "linkedin.com" not in parsed.netloc.lower():
+            return ""
+
+        url = raw_url.rstrip("/")
+        parsed = urlparse(url)
+
+        if "/sales/company/" not in parsed.path.lower():
+            return ""
+
+        path = parsed.path.lstrip("/")
+        return f"https://www.linkedin.com/{path}".rstrip("/")
+    except Exception:
+        return ""
+
+
+def normalize_salesnav_url(raw_url: str) -> str:
+    """
+    Normalize a Sales Navigator URL to: https://www.linkedin.com/sales/lead/<something>
+    Must contain '/sales/lead/'. Otherwise, return ''.
+    Strips anything after a comma in the URL.
+    """
+    if not raw_url or not isinstance(raw_url, str):
+        return ""
+
+    try:
+        raw_url = raw_url.strip()
+        parsed_initial = urlparse(raw_url)
+
+        if not parsed_initial.scheme or not parsed_initial.netloc:
+            return ""
+
+        if "linkedin.com" not in parsed_initial.netloc.lower():
+            return ""
+
+        # Remove trailing slash
+        url = raw_url.rstrip("/")
+
+        # Strip anything after the first comma
+        comma_idx = url.find(",")
+        if comma_idx != -1:
+            url = url[:comma_idx]
+
+        parsed = urlparse(url)
+
+        if "/sales/lead/" not in parsed.path.lower():
+            return ""
+
+        path = parsed.path.lstrip("/")
+        return f"https://www.linkedin.com/{path}".rstrip("/")
+    except Exception:
+        return ""
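For reference, a few illustrative calls against the validators above. The inputs are made up, the expected results follow from the rules in the code, and note that validate_and_clean_email performs DNS deliverability checks, so it needs network access:

    validate_and_clean_email("test@mailinator.com")   # "" (disposable domain)
    validate_and_clean_email("aaaa@aaaa.com")         # "" (placeholder / spammy pattern)
    validate_website_url("www.google.com")            # "https://www.google.com"
    validation_organization_domain("bit.ly")          # "" (blocked URL shortener)
    normalize_linkedin_url("https://linkedin.com/in/someone/")
    # -> "https://www.linkedin.com/in/someone"
    normalize_salesnav_url("https://www.linkedin.com/sales/lead/ABC123,NAME,xyz")
    # -> "https://www.linkedin.com/sales/lead/ABC123" (comma suffix stripped)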