dhisana 0.0.1.dev116__py3-none-any.whl → 0.0.1.dev236__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dhisana/schemas/common.py +10 -1
- dhisana/schemas/sales.py +203 -22
- dhisana/utils/add_mapping.py +0 -2
- dhisana/utils/apollo_tools.py +739 -119
- dhisana/utils/built_with_api_tools.py +4 -2
- dhisana/utils/check_email_validity_tools.py +35 -18
- dhisana/utils/check_for_intent_signal.py +1 -2
- dhisana/utils/check_linkedin_url_validity.py +34 -8
- dhisana/utils/clay_tools.py +3 -2
- dhisana/utils/clean_properties.py +1 -4
- dhisana/utils/compose_salesnav_query.py +0 -1
- dhisana/utils/compose_search_query.py +7 -3
- dhisana/utils/composite_tools.py +0 -1
- dhisana/utils/dataframe_tools.py +2 -2
- dhisana/utils/email_body_utils.py +72 -0
- dhisana/utils/email_provider.py +174 -35
- dhisana/utils/enrich_lead_information.py +183 -53
- dhisana/utils/fetch_openai_config.py +129 -0
- dhisana/utils/field_validators.py +1 -1
- dhisana/utils/g2_tools.py +0 -1
- dhisana/utils/generate_content.py +0 -1
- dhisana/utils/generate_email.py +68 -23
- dhisana/utils/generate_email_response.py +294 -46
- dhisana/utils/generate_flow.py +0 -1
- dhisana/utils/generate_linkedin_connect_message.py +9 -2
- dhisana/utils/generate_linkedin_response_message.py +137 -66
- dhisana/utils/generate_structured_output_internal.py +317 -164
- dhisana/utils/google_custom_search.py +150 -44
- dhisana/utils/google_oauth_tools.py +721 -0
- dhisana/utils/google_workspace_tools.py +278 -54
- dhisana/utils/hubspot_clearbit.py +3 -1
- dhisana/utils/hubspot_crm_tools.py +718 -272
- dhisana/utils/instantly_tools.py +3 -1
- dhisana/utils/lusha_tools.py +10 -7
- dhisana/utils/mailgun_tools.py +150 -0
- dhisana/utils/microsoft365_tools.py +447 -0
- dhisana/utils/openai_assistant_and_file_utils.py +121 -177
- dhisana/utils/openai_helpers.py +8 -6
- dhisana/utils/parse_linkedin_messages_txt.py +1 -3
- dhisana/utils/profile.py +37 -0
- dhisana/utils/proxy_curl_tools.py +377 -76
- dhisana/utils/proxycurl_search_leads.py +426 -0
- dhisana/utils/research_lead.py +3 -3
- dhisana/utils/sales_navigator_crawler.py +1 -6
- dhisana/utils/salesforce_crm_tools.py +323 -50
- dhisana/utils/search_router.py +131 -0
- dhisana/utils/search_router_jobs.py +51 -0
- dhisana/utils/sendgrid_tools.py +126 -91
- dhisana/utils/serarch_router_local_business.py +75 -0
- dhisana/utils/serpapi_additional_tools.py +290 -0
- dhisana/utils/serpapi_google_jobs.py +117 -0
- dhisana/utils/serpapi_google_search.py +188 -0
- dhisana/utils/serpapi_local_business_search.py +129 -0
- dhisana/utils/serpapi_search_tools.py +360 -432
- dhisana/utils/serperdev_google_jobs.py +125 -0
- dhisana/utils/serperdev_local_business.py +154 -0
- dhisana/utils/serperdev_search.py +233 -0
- dhisana/utils/smtp_email_tools.py +178 -18
- dhisana/utils/test_connect.py +1603 -130
- dhisana/utils/trasform_json.py +3 -3
- dhisana/utils/web_download_parse_tools.py +0 -1
- dhisana/utils/zoominfo_tools.py +2 -3
- dhisana/workflow/test.py +1 -1
- {dhisana-0.0.1.dev116.dist-info → dhisana-0.0.1.dev236.dist-info}/METADATA +1 -1
- dhisana-0.0.1.dev236.dist-info/RECORD +100 -0
- {dhisana-0.0.1.dev116.dist-info → dhisana-0.0.1.dev236.dist-info}/WHEEL +1 -1
- dhisana-0.0.1.dev116.dist-info/RECORD +0 -83
- {dhisana-0.0.1.dev116.dist-info → dhisana-0.0.1.dev236.dist-info}/entry_points.txt +0 -0
- {dhisana-0.0.1.dev116.dist-info → dhisana-0.0.1.dev236.dist-info}/top_level.txt +0 -0
|
@@ -23,7 +23,7 @@ def get_builtwith_api_key(tool_config: Optional[List[Dict]] = None) -> str:
|
|
|
23
23
|
str: The BUILTWITH_API_KEY access token.
|
|
24
24
|
|
|
25
25
|
Raises:
|
|
26
|
-
ValueError: If the
|
|
26
|
+
ValueError: If the BuiltWith integration has not been configured.
|
|
27
27
|
"""
|
|
28
28
|
if tool_config:
|
|
29
29
|
builtwith_config = next(
|
|
@@ -43,7 +43,9 @@ def get_builtwith_api_key(tool_config: Optional[List[Dict]] = None) -> str:
|
|
|
43
43
|
|
|
44
44
|
BUILTWITH_API_KEY = BUILTWITH_API_KEY or os.getenv("BUILTWITH_API_KEY")
|
|
45
45
|
if not BUILTWITH_API_KEY:
|
|
46
|
-
raise ValueError(
|
|
46
|
+
raise ValueError(
|
|
47
|
+
"BuiltWith integration is not configured. Please configure the connection to BuiltWith in Integrations."
|
|
48
|
+
)
|
|
47
49
|
return BUILTWITH_API_KEY
|
|
48
50
|
|
|
49
51
|
# Use BuiltWith API to find tech stack and financials of a company
|
|
@@ -31,7 +31,6 @@ import aiohttp
|
|
|
31
31
|
# ────────────────────────────────────────────────────────────────────────────
|
|
32
32
|
from dhisana.schemas.sales import HubSpotLeadInformation
|
|
33
33
|
from dhisana.utils.field_validators import validate_and_clean_email
|
|
34
|
-
from dhisana.utils.hubspot_crm_tools import lookup_contact_by_name_and_domain
|
|
35
34
|
from dhisana.utils.apollo_tools import enrich_user_info_with_apollo
|
|
36
35
|
from dhisana.utils.assistant_tool_tag import assistant_tool
|
|
37
36
|
from dhisana.utils.cache_output_tools import cache_output, retrieve_output
|
|
@@ -71,7 +70,9 @@ def get_findymail_access_token(tool_config: Optional[List[Dict]] = None) -> str:
|
|
|
71
70
|
|
|
72
71
|
api_key = api_key or os.getenv("FINDYMAIL_API_KEY")
|
|
73
72
|
if not api_key:
|
|
74
|
-
logger.warning(
|
|
73
|
+
logger.warning(
|
|
74
|
+
"Findymail integration is not configured. Please configure the connection to Findymail in Integrations."
|
|
75
|
+
)
|
|
75
76
|
return ""
|
|
76
77
|
return api_key
|
|
77
78
|
|
|
@@ -99,7 +100,9 @@ def get_zero_bounce_access_token(tool_config: Optional[List[Dict]] = None) -> st
|
|
|
99
100
|
|
|
100
101
|
api_key = api_key or os.getenv("ZERO_BOUNCE_API_KEY")
|
|
101
102
|
if not api_key:
|
|
102
|
-
logger.warning(
|
|
103
|
+
logger.warning(
|
|
104
|
+
"ZeroBounce integration is not configured. Please configure the connection to ZeroBounce in Integrations."
|
|
105
|
+
)
|
|
103
106
|
return ""
|
|
104
107
|
return api_key
|
|
105
108
|
|
|
@@ -122,7 +125,9 @@ def get_hunter_access_token(tool_config: Optional[List[Dict]] = None) -> str:
|
|
|
122
125
|
|
|
123
126
|
api_key = api_key or os.getenv("HUNTER_API_KEY")
|
|
124
127
|
if not api_key:
|
|
125
|
-
logger.warning(
|
|
128
|
+
logger.warning(
|
|
129
|
+
"Hunter integration is not configured. Please configure the connection to Hunter in Integrations."
|
|
130
|
+
)
|
|
126
131
|
return ""
|
|
127
132
|
return api_key
|
|
128
133
|
|
|
@@ -304,46 +309,58 @@ async def guess_email_with_findymail(
|
|
|
304
309
|
first_name: str,
|
|
305
310
|
last_name: str,
|
|
306
311
|
domain: str,
|
|
307
|
-
user_linkedin_url: Optional[str] = None,
|
|
312
|
+
user_linkedin_url: Optional[str] = None,
|
|
308
313
|
middle_name: Optional[str] = None,
|
|
309
314
|
tool_config: Optional[List[Dict]] = None,
|
|
310
315
|
) -> Dict[str, Any]:
|
|
311
|
-
"""
|
|
312
|
-
|
|
316
|
+
"""Use Findymail to guess an email.
|
|
317
|
+
|
|
318
|
+
If ``user_linkedin_url`` is provided, the function queries ``/search/linkedin``.
|
|
319
|
+
Otherwise it falls back to ``/search/name`` with ``first_name``/``last_name``
|
|
320
|
+
and ``domain``. Only verified emails are returned and therefore considered
|
|
321
|
+
high confidence.
|
|
313
322
|
"""
|
|
314
323
|
logger.info("Entering guess_email_with_findymail")
|
|
315
|
-
|
|
316
|
-
|
|
324
|
+
|
|
325
|
+
if user_linkedin_url:
|
|
326
|
+
cache_key = f"findymail:{user_linkedin_url}"
|
|
327
|
+
else:
|
|
328
|
+
if not first_name or not last_name or not domain:
|
|
329
|
+
return {"email": "", "email_confidence": "low"}
|
|
330
|
+
cache_key = f"findymail:{first_name}_{last_name}_{domain}"
|
|
317
331
|
|
|
318
332
|
api_key = get_findymail_access_token(tool_config)
|
|
319
333
|
if not api_key:
|
|
320
334
|
return {"email": "", "email_confidence": "low"}
|
|
321
335
|
|
|
322
|
-
cache_key = f"findymail:{first_name}_{last_name}_{domain}"
|
|
323
336
|
cached = retrieve_output("findymail_guess", cache_key)
|
|
324
337
|
if cached:
|
|
325
338
|
return json.loads(cached[0])
|
|
326
339
|
|
|
327
|
-
|
|
340
|
+
if user_linkedin_url:
|
|
341
|
+
url = f"{FINDYMAIL_BASE_URL}/search/linkedin"
|
|
342
|
+
payload = {"linkedin_url": user_linkedin_url, "webhook_url": None}
|
|
343
|
+
else:
|
|
344
|
+
url = f"{FINDYMAIL_BASE_URL}/search/name"
|
|
345
|
+
full_name = " ".join(filter(None, [first_name, middle_name, last_name]))
|
|
346
|
+
payload = {"name": full_name, "domain": domain}
|
|
347
|
+
|
|
328
348
|
headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
|
|
329
|
-
full_name = " ".join(filter(None, [first_name, middle_name, last_name]))
|
|
330
349
|
|
|
331
350
|
try:
|
|
332
351
|
async with aiohttp.ClientSession() as session:
|
|
333
|
-
async with session.post(
|
|
334
|
-
url, headers=headers, json={"name": full_name, "domain": domain}
|
|
335
|
-
) as r:
|
|
352
|
+
async with session.post(url, headers=headers, json=payload) as r:
|
|
336
353
|
if r.status != 200:
|
|
337
|
-
logger.warning("[Findymail] search
|
|
354
|
+
logger.warning("[Findymail] search non‑200: %s", r.status)
|
|
338
355
|
result = {"email": "", "email_confidence": "low"}
|
|
339
356
|
else:
|
|
340
357
|
data = await r.json()
|
|
341
|
-
contact = data.get(
|
|
358
|
+
contact = data.get("contact")
|
|
342
359
|
found = contact.get("email", "") if contact else ""
|
|
343
360
|
if found:
|
|
344
361
|
result = {
|
|
345
362
|
"email": found,
|
|
346
|
-
"email_confidence": "high"
|
|
363
|
+
"email_confidence": "high",
|
|
347
364
|
"contact_info": json.dumps(contact) if contact else "",
|
|
348
365
|
}
|
|
349
366
|
else:
|
|
@@ -5,7 +5,6 @@ from typing import Any, Dict, List, Optional, cast
|
|
|
5
5
|
from pydantic import BaseModel
|
|
6
6
|
from dhisana.utils.generate_structured_output_internal import get_structured_output_internal
|
|
7
7
|
from dhisana.utils.compose_search_query import (
|
|
8
|
-
generate_google_search_queries,
|
|
9
8
|
get_search_results_for_insights
|
|
10
9
|
)
|
|
11
10
|
|
|
@@ -49,7 +48,7 @@ async def check_for_intent_signal(
|
|
|
49
48
|
logger.info("Search query: %s", query_str)
|
|
50
49
|
logger.info("Search results snippet: %s", results_str[:100]) # Show partial snippet
|
|
51
50
|
search_results_text += f"Query: {query_str}\nResults: {results_str}\n\n"
|
|
52
|
-
|
|
51
|
+
datetime.datetime.now().isoformat()
|
|
53
52
|
user_prompt = f"""
|
|
54
53
|
Hi AI Assistant,
|
|
55
54
|
You are an expert in scoring leads based on intent signals.
|
|
@@ -1,6 +1,5 @@
|
|
|
1
|
-
import
|
|
1
|
+
import re
|
|
2
2
|
from typing import Dict, List, Optional, Any
|
|
3
|
-
import aiohttp
|
|
4
3
|
from pydantic import BaseModel
|
|
5
4
|
from dhisana.utils.apollo_tools import enrich_person_info_from_apollo
|
|
6
5
|
from dhisana.utils.assistant_tool_tag import assistant_tool
|
|
@@ -28,6 +27,7 @@ def compare_field(
|
|
|
28
27
|
person_key: str
|
|
29
28
|
) -> bool:
|
|
30
29
|
if not lead_properties.get(lead_key):
|
|
30
|
+
# If the lead doesn't have the field at all, let's consider it "matched" by default
|
|
31
31
|
return True
|
|
32
32
|
|
|
33
33
|
lead_value = lead_properties.get(lead_key, "")
|
|
@@ -72,8 +72,7 @@ async def validate_linkedin_url_with_apollo(
|
|
|
72
72
|
linkedin_url=linkedin_url,
|
|
73
73
|
tool_config=tool_config
|
|
74
74
|
)
|
|
75
|
-
# If no data is returned from Apollo, return defaults
|
|
76
|
-
# the logic in compare_field where no input -> True).
|
|
75
|
+
# If no data is returned from Apollo, return defaults
|
|
77
76
|
if not linkedin_data:
|
|
78
77
|
return match_result.model_dump()
|
|
79
78
|
|
|
@@ -120,8 +119,7 @@ async def validate_linkedin_url_with_proxy_curl(
|
|
|
120
119
|
linkedin_url=linkedin_url,
|
|
121
120
|
tool_config=tool_config
|
|
122
121
|
)
|
|
123
|
-
# If no data is returned from
|
|
124
|
-
# the logic in compare_field where no input -> True).
|
|
122
|
+
# If no data is returned from Proxycurl, return defaults
|
|
125
123
|
if not linkedin_data:
|
|
126
124
|
return match_result.model_dump()
|
|
127
125
|
|
|
@@ -148,6 +146,18 @@ LINKEDIN_VALIDATE_TOOL_NAME_TO_FUNCTION_MAP = {
|
|
|
148
146
|
"proxycurl": validate_linkedin_url_with_proxy_curl
|
|
149
147
|
}
|
|
150
148
|
|
|
149
|
+
def is_proxy_linkedin_url(url: str) -> bool:
    """
    Determine whether a LinkedIn URL is "proxy-like".

    A URL is treated as proxy-like when the ``/in/<profile_id>`` segment
    starts with ``acw`` and the profile id is longer than 10 characters.

    Args:
        url: Candidate LinkedIn profile URL. ``None``, empty, or non-string
            values are tolerated and reported as not proxy-like.

    Returns:
        True if the profile id matches the proxy pattern, otherwise False.
    """
    # The URL is read from a loosely-typed lead dict, so it can be None even
    # when a default is supplied to .get(); re.search would raise TypeError.
    if not url or not isinstance(url, str):
        return False

    match = re.search(r"linkedin\.com/in/([^/]+)", url, re.IGNORECASE)
    if not match:
        return False

    profile_id = match.group(1).strip()
    # NOTE(review): the prefix check is case-sensitive while the domain match
    # is not -- confirm whether upstream lower-cases URLs before this call.
    return profile_id.startswith("acw") and len(profile_id) > 10
|
|
160
|
+
|
|
151
161
|
@assistant_tool
|
|
152
162
|
async def check_linkedin_url_validity(
|
|
153
163
|
lead_properties: Dict[str, Any],
|
|
@@ -155,10 +165,12 @@ async def check_linkedin_url_validity(
|
|
|
155
165
|
) -> Dict[str, bool]:
|
|
156
166
|
"""
|
|
157
167
|
Validates LinkedIn URL (and related fields) by choosing the appropriate tool
|
|
158
|
-
from the tool_config.
|
|
168
|
+
from the tool_config. If the LinkedIn URL is detected as a "proxy" URL,
|
|
169
|
+
we skip calling any external tool and directly return 'linkedin_url_valid' = True.
|
|
159
170
|
|
|
160
171
|
Args:
|
|
161
|
-
lead_properties (dict): Lead info (e.g. first_name, last_name, job_title,
|
|
172
|
+
lead_properties (dict): Lead info (e.g. first_name, last_name, job_title,
|
|
173
|
+
lead_location, user_linkedin_url).
|
|
162
174
|
tool_config (Optional[List[Dict]]): Configuration to identify which tool is available.
|
|
163
175
|
|
|
164
176
|
Returns:
|
|
@@ -170,6 +182,20 @@ async def check_linkedin_url_validity(
|
|
|
170
182
|
if not tool_config:
|
|
171
183
|
raise ValueError("No tool configuration found.")
|
|
172
184
|
|
|
185
|
+
# ---------------------------------------------------------
|
|
186
|
+
# 1) If it’s a "proxy" LinkedIn URL, just return valid = True
|
|
187
|
+
# ---------------------------------------------------------
|
|
188
|
+
linkedin_url = lead_properties.get("user_linkedin_url", "")
|
|
189
|
+
if is_proxy_linkedin_url(linkedin_url):
|
|
190
|
+
match_result = LeadLinkedInMatch()
|
|
191
|
+
match_result.linkedin_url_valid = True
|
|
192
|
+
# The other fields remain their default (False) unless
|
|
193
|
+
# you want to set them otherwise. For now, we just do:
|
|
194
|
+
return match_result.model_dump()
|
|
195
|
+
|
|
196
|
+
# ---------------------------------------------------------
|
|
197
|
+
# 2) Otherwise, pick the correct tool and validate normally
|
|
198
|
+
# ---------------------------------------------------------
|
|
173
199
|
chosen_tool_func = None
|
|
174
200
|
for item in tool_config:
|
|
175
201
|
tool_name = item.get("name")
|
dhisana/utils/clay_tools.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import asyncio
|
|
2
1
|
import aiohttp
|
|
3
2
|
import logging
|
|
4
3
|
from typing import Optional
|
|
@@ -22,7 +21,9 @@ async def push_to_clay_table(
|
|
|
22
21
|
- **dict**: Response message or error.
|
|
23
22
|
"""
|
|
24
23
|
if not api_key:
|
|
25
|
-
return {
|
|
24
|
+
return {
|
|
25
|
+
'error': "Clay integration is not configured. Please configure the connection to Clay in Integrations."
|
|
26
|
+
}
|
|
26
27
|
|
|
27
28
|
if not webhook:
|
|
28
29
|
return {'error': "Webhook URL not provided"}
|
|
@@ -1,11 +1,8 @@
|
|
|
1
|
-
from typing import Any, Dict, List
|
|
1
|
+
from typing import Any, Dict, List
|
|
2
2
|
import copy
|
|
3
3
|
from typing import Any, Dict, List, Optional
|
|
4
4
|
|
|
5
|
-
from pydantic import BaseModel, Field
|
|
6
5
|
|
|
7
|
-
from dhisana.schemas.sales import HubSpotLeadInformation
|
|
8
|
-
from dhisana.utils.generate_structured_output_internal import get_structured_output_internal
|
|
9
6
|
|
|
10
7
|
def remove_empty(data: Any) -> Any:
|
|
11
8
|
"""
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import datetime
|
|
2
1
|
import logging
|
|
3
2
|
import os
|
|
4
3
|
import json
|
|
@@ -352,8 +351,11 @@ async def get_search_results_for_insights(
|
|
|
352
351
|
|
|
353
352
|
def get_serp_api_access_token(tool_config: Optional[List[Dict]] = None) -> str:
|
|
354
353
|
"""
|
|
355
|
-
Retrieves the SERPAPI_KEY access token from the provided tool configuration
|
|
354
|
+
Retrieves the SERPAPI_KEY access token from the provided tool configuration
|
|
356
355
|
or from the environment variable SERPAPI_KEY.
|
|
356
|
+
|
|
357
|
+
Raises:
|
|
358
|
+
ValueError: If the SerpAPI integration has not been configured.
|
|
357
359
|
"""
|
|
358
360
|
serpapi_key = None
|
|
359
361
|
if tool_config:
|
|
@@ -373,7 +375,7 @@ def get_serp_api_access_token(tool_config: Optional[List[Dict]] = None) -> str:
|
|
|
373
375
|
serpapi_key = serpapi_key or os.getenv("SERPAPI_KEY")
|
|
374
376
|
if not serpapi_key:
|
|
375
377
|
raise ValueError(
|
|
376
|
-
"
|
|
378
|
+
"SerpAPI integration is not configured. Please configure the connection to SerpAPI in Integrations."
|
|
377
379
|
)
|
|
378
380
|
return serpapi_key
|
|
379
381
|
|
|
@@ -470,6 +472,7 @@ Output must be valid JSON, e.g.:
|
|
|
470
472
|
prompt=prompt,
|
|
471
473
|
response_format=TechnologyUsedCheck,
|
|
472
474
|
effort="high",
|
|
475
|
+
model="gpt-5.1-chat",
|
|
473
476
|
tool_config=tool_config
|
|
474
477
|
)
|
|
475
478
|
|
|
@@ -531,6 +534,7 @@ Output must be valid JSON, e.g.:
|
|
|
531
534
|
prompt=prompt,
|
|
532
535
|
response_format=TechnologyAndRoleCheck,
|
|
533
536
|
effort="high",
|
|
537
|
+
model="gpt-5.1-chat",
|
|
534
538
|
tool_config=tool_config
|
|
535
539
|
)
|
|
536
540
|
|
dhisana/utils/composite_tools.py
CHANGED
|
@@ -7,7 +7,6 @@ from dhisana.utils.built_with_api_tools import (
|
|
|
7
7
|
)
|
|
8
8
|
from dhisana.utils.dataframe_tools import get_structured_output
|
|
9
9
|
from dhisana.utils.google_custom_search import search_google_custom
|
|
10
|
-
from dhisana.utils.serpapi_search_tools import search_google
|
|
11
10
|
|
|
12
11
|
|
|
13
12
|
class QualifyCompanyBasedOnTechUsage(BaseModel):
|
dhisana/utils/dataframe_tools.py
CHANGED
|
@@ -33,13 +33,13 @@ class PandasQuery(BaseModel):
|
|
|
33
33
|
|
|
34
34
|
|
|
35
35
|
@assistant_tool
|
|
36
|
-
async def get_structured_output(message: str, response_type, model: str = "
|
|
36
|
+
async def get_structured_output(message: str, response_type, model: str = "gpt-5.1-chat"):
|
|
37
37
|
"""
|
|
38
38
|
Asynchronously retrieves structured output from the OpenAI API based on the input message.
|
|
39
39
|
|
|
40
40
|
:param message: The input message to be processed by the OpenAI API.
|
|
41
41
|
:param response_type: The expected format of the response (e.g., JSON).
|
|
42
|
-
:param model: The model to be used for processing the input message. Defaults to "
|
|
42
|
+
:param model: The model to be used for processing the input message. Defaults to "gpt-5.1-chat".
|
|
43
43
|
:return: A tuple containing the parsed response and a status string ('SUCCESS' or 'FAIL').
|
|
44
44
|
"""
|
|
45
45
|
try:
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Small helpers for handling e-mail bodies across providers."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional, Tuple
|
|
4
|
+
import html as html_lib
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def looks_like_html(text: str) -> bool:
    """Return True when ``text`` appears to contain at least one HTML tag.

    The check is a heuristic: it looks for ``<`` immediately followed by an
    ASCII letter and later closed by ``>``. Empty or ``None`` input yields
    False.
    """
    if not text:
        return False
    tag_match = re.search(r"<[a-zA-Z][^>]*>", text)
    return tag_match is not None
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _normalize_format_hint(format_hint: Optional[str]) -> str:
    """
    Map a user-supplied format hint onto one of "html", "text" or "auto".

    Objects exposing a ``value`` attribute (for example enum members) are
    unwrapped first. Matching ignores case and surrounding whitespace.
    "plain", "plain_text" and "plaintext" are accepted as aliases of "text".
    A missing, empty or unrecognized hint resolves to "auto".
    """
    if not format_hint:
        return "auto"

    raw_hint = getattr(format_hint, "value", format_hint)
    normalized = str(raw_hint).strip().lower()

    # Lookup table of every accepted spelling -> canonical format name.
    aliases = {
        "html": "html",
        "text": "text",
        "plain": "text",
        "plain_text": "text",
        "plaintext": "text",
    }
    return aliases.get(normalized, "auto")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def html_to_plain_text(html: str) -> str:
    """
    Produce a very lightweight plain-text version of an HTML fragment.

    Script and style elements are dropped with their contents, ``<br>`` and
    the closing tags of p/div/li/h1-h6 become newlines, all remaining tags
    are stripped and HTML entities are unescaped. Trailing whitespace on
    each line is removed and runs of three or more newlines are reduced to
    a single blank line.

    Args:
        html: HTML fragment to convert. Empty or ``None`` yields "".

    Returns:
        The plain-text rendering with leading/trailing whitespace stripped.
    """
    if not html:
        return ""
    # ``\b`` keeps tags such as <styled-x> from being mistaken for <style>;
    # ``\s*`` tolerates whitespace inside the closing tag.
    text = re.sub(r"(?is)<(script|style)\b[^>]*>.*?</\1\s*>", " ", html)
    text = re.sub(r"(?i)<br\s*/?>", "\n", text)
    text = re.sub(r"(?i)</(p|div|li|h[1-6])\s*>", "\n", text)
    text = re.sub(r"(?is)<.*?>", "", text)
    text = html_lib.unescape(text)
    # Strip only horizontal whitespace (and \r) before a newline. Matching
    # ``\s+`` here would also swallow newlines, flattening every blank line
    # and leaving the ``\n{3,}`` rule below with nothing to match.
    text = re.sub(r"[^\S\n]+\n", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def plain_text_to_html(text: str) -> str:
    """Return ``text`` HTML-escaped inside a ``<div>`` styled with
    ``white-space: pre-wrap``.

    ``None`` yields an empty string; an empty string still yields the
    (empty) wrapper element.
    """
    if text is None:
        return ""
    safe_text = html_lib.escape(text)
    return '<div style="white-space: pre-wrap">{}</div>'.format(safe_text)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def body_variants(body: Optional[str], format_hint: Optional[str]) -> Tuple[str, str, str]:
    """
    Build the plain-text and HTML renderings of an e-mail body.

    The hint is normalized first. An explicit "html" or "text" hint is
    honored as given; otherwise the body is inspected with
    ``looks_like_html`` to decide how to treat it.

    Args:
        body: Raw body content; ``None`` is treated as an empty string.
        format_hint: Optional format hint for the body.

    Returns:
        A ``(plain, html, resolved_format)`` tuple where ``resolved_format``
        is "html" or "text".
    """
    content = body or ""
    hint = _normalize_format_hint(format_hint)

    # Auto-detection only runs when the hint did not settle the format.
    treat_as_html = hint == "html" or (hint != "text" and looks_like_html(content))

    if treat_as_html:
        return html_to_plain_text(content), content, "html"
    return content, plain_text_to_html(content), "text"
|