dhisana 0.0.1.dev243__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dhisana/__init__.py +1 -0
- dhisana/cli/__init__.py +1 -0
- dhisana/cli/cli.py +20 -0
- dhisana/cli/datasets.py +27 -0
- dhisana/cli/models.py +26 -0
- dhisana/cli/predictions.py +20 -0
- dhisana/schemas/__init__.py +1 -0
- dhisana/schemas/common.py +399 -0
- dhisana/schemas/sales.py +965 -0
- dhisana/ui/__init__.py +1 -0
- dhisana/ui/components.py +472 -0
- dhisana/utils/__init__.py +1 -0
- dhisana/utils/add_mapping.py +352 -0
- dhisana/utils/agent_tools.py +51 -0
- dhisana/utils/apollo_tools.py +1597 -0
- dhisana/utils/assistant_tool_tag.py +4 -0
- dhisana/utils/built_with_api_tools.py +282 -0
- dhisana/utils/cache_output_tools.py +98 -0
- dhisana/utils/cache_output_tools_local.py +78 -0
- dhisana/utils/check_email_validity_tools.py +717 -0
- dhisana/utils/check_for_intent_signal.py +107 -0
- dhisana/utils/check_linkedin_url_validity.py +209 -0
- dhisana/utils/clay_tools.py +43 -0
- dhisana/utils/clean_properties.py +135 -0
- dhisana/utils/company_utils.py +60 -0
- dhisana/utils/compose_salesnav_query.py +259 -0
- dhisana/utils/compose_search_query.py +759 -0
- dhisana/utils/compose_three_step_workflow.py +234 -0
- dhisana/utils/composite_tools.py +137 -0
- dhisana/utils/dataframe_tools.py +237 -0
- dhisana/utils/domain_parser.py +45 -0
- dhisana/utils/email_body_utils.py +72 -0
- dhisana/utils/email_parse_helpers.py +132 -0
- dhisana/utils/email_provider.py +375 -0
- dhisana/utils/enrich_lead_information.py +933 -0
- dhisana/utils/extract_email_content_for_llm.py +101 -0
- dhisana/utils/fetch_openai_config.py +129 -0
- dhisana/utils/field_validators.py +426 -0
- dhisana/utils/g2_tools.py +104 -0
- dhisana/utils/generate_content.py +41 -0
- dhisana/utils/generate_custom_message.py +271 -0
- dhisana/utils/generate_email.py +278 -0
- dhisana/utils/generate_email_response.py +465 -0
- dhisana/utils/generate_flow.py +102 -0
- dhisana/utils/generate_leads_salesnav.py +303 -0
- dhisana/utils/generate_linkedin_connect_message.py +224 -0
- dhisana/utils/generate_linkedin_response_message.py +317 -0
- dhisana/utils/generate_structured_output_internal.py +462 -0
- dhisana/utils/google_custom_search.py +267 -0
- dhisana/utils/google_oauth_tools.py +727 -0
- dhisana/utils/google_workspace_tools.py +1294 -0
- dhisana/utils/hubspot_clearbit.py +96 -0
- dhisana/utils/hubspot_crm_tools.py +2440 -0
- dhisana/utils/instantly_tools.py +149 -0
- dhisana/utils/linkedin_crawler.py +168 -0
- dhisana/utils/lusha_tools.py +333 -0
- dhisana/utils/mailgun_tools.py +156 -0
- dhisana/utils/mailreach_tools.py +123 -0
- dhisana/utils/microsoft365_tools.py +455 -0
- dhisana/utils/openai_assistant_and_file_utils.py +267 -0
- dhisana/utils/openai_helpers.py +977 -0
- dhisana/utils/openapi_spec_to_tools.py +45 -0
- dhisana/utils/openapi_tool/__init__.py +1 -0
- dhisana/utils/openapi_tool/api_models.py +633 -0
- dhisana/utils/openapi_tool/convert_openai_spec_to_tool.py +271 -0
- dhisana/utils/openapi_tool/openapi_tool.py +319 -0
- dhisana/utils/parse_linkedin_messages_txt.py +100 -0
- dhisana/utils/profile.py +37 -0
- dhisana/utils/proxy_curl_tools.py +1226 -0
- dhisana/utils/proxycurl_search_leads.py +426 -0
- dhisana/utils/python_function_to_tools.py +83 -0
- dhisana/utils/research_lead.py +176 -0
- dhisana/utils/sales_navigator_crawler.py +1103 -0
- dhisana/utils/salesforce_crm_tools.py +477 -0
- dhisana/utils/search_router.py +131 -0
- dhisana/utils/search_router_jobs.py +51 -0
- dhisana/utils/sendgrid_tools.py +162 -0
- dhisana/utils/serarch_router_local_business.py +75 -0
- dhisana/utils/serpapi_additional_tools.py +290 -0
- dhisana/utils/serpapi_google_jobs.py +117 -0
- dhisana/utils/serpapi_google_search.py +188 -0
- dhisana/utils/serpapi_local_business_search.py +129 -0
- dhisana/utils/serpapi_search_tools.py +852 -0
- dhisana/utils/serperdev_google_jobs.py +125 -0
- dhisana/utils/serperdev_local_business.py +154 -0
- dhisana/utils/serperdev_search.py +233 -0
- dhisana/utils/smtp_email_tools.py +582 -0
- dhisana/utils/test_connect.py +2087 -0
- dhisana/utils/trasform_json.py +173 -0
- dhisana/utils/web_download_parse_tools.py +189 -0
- dhisana/utils/workflow_code_model.py +5 -0
- dhisana/utils/zoominfo_tools.py +357 -0
- dhisana/workflow/__init__.py +1 -0
- dhisana/workflow/agent.py +18 -0
- dhisana/workflow/flow.py +44 -0
- dhisana/workflow/task.py +43 -0
- dhisana/workflow/test.py +90 -0
- dhisana-0.0.1.dev243.dist-info/METADATA +43 -0
- dhisana-0.0.1.dev243.dist-info/RECORD +102 -0
- dhisana-0.0.1.dev243.dist-info/WHEEL +5 -0
- dhisana-0.0.1.dev243.dist-info/entry_points.txt +2 -0
- dhisana-0.0.1.dev243.dist-info/top_level.txt +1 -0
dhisana/utils/compose_search_query.py
@@ -0,0 +1,759 @@
import logging
import os
import json
import re
from typing import Any, Dict, List, Optional

import aiohttp
import asyncio
from bs4 import BeautifulSoup
from pydantic import BaseModel

# If these are your local imports, leave them as is. Otherwise adjust paths as needed.
from dhisana.utils.company_utils import normalize_company_name
from dhisana.utils.generate_structured_output_internal import get_structured_output_internal
from dhisana.utils.cache_output_tools import cache_output, retrieve_output

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


class GoogleSearchQuery(BaseModel):
    """
    Pydantic model representing the three Google search queries generated.
    google_search_queries has a list of 3 search query strings.
    """
    google_search_queries: List[str]


async def generate_google_search_queries(
    lead: Dict[str, Any],
    english_description: str,
    intent_signal_type: str,
    example_query: str = "",
    tool_config: Optional[List[Dict[str, Any]]] = None
) -> Dict[str, Any]:
    """
    Generate three Google search queries based on a plain-English description,
    incorporating the following logic:
      1. First consider searching LinkedIn and the organization's own website for relevant info.
      2. Then consider searching Instagram, Twitter, Github, Yelp, Crunchbase, Bloomberg,
         or reputable news/financial sites for relevant qualification info.
      3. If lead["primary_domain_of_organization"] is not empty, ALWAYS include one query
         that searches the domain with something like:
         site:<primary_domain_of_organization> "about this company"
      4. Make sure lead["organization_name"] is part of every query.

    Args:
        lead: Dictionary containing information about the lead, including 'organization_name'.
        english_description: The user's plain-English description.
        intent_signal_type: A string indicating the intent signal type.
        example_query: Optional user-provided example.
        tool_config: Optional list of dictionaries containing tool configuration.

    Returns:
        A dictionary with a single key: "google_search_queries", mapping to a list of
        exactly three search query strings.
    """
    # Pull out relevant values
    org_name = lead.get("organization_name", "").strip()
    org_name = normalize_company_name(org_name)
    primary_domain = lead.get("primary_domain_of_organization", "").strip()

    system_message = (
        "You are a helpful AI Assistant that converts an English description of search requirements "
        "into valid Google search queries.\n\n"
        "Important instructions:\n"
        "1. Always include the organization name in every query.\n"
        "2. First consider ways to use LinkedIn or the company's own website to gather info.\n"
        "3. Then consider how Google can leverage Instagram, Twitter, Github, Yelp, Crunchbase, Bloomberg, "
        " or reputable news/financial sites to figure out relevant info for qualification.\n"
        "4. You MUST generate exactly three Google search queries. No extra commentary.\n"
        "5. If you're unsure about a filter, make your best guess or omit it.\n"
        f"6. Primary domain of organization is: {primary_domain}\n\n"
        f"7. Organization name is: {org_name}\n"
        "8. In any site:linkedin.com search, make sure intitle:<organization_name> is present.\n\n"
        "Output must be valid JSON with the structure:\n"
        "{\n"
        ' "google_search_queries": ["search query1", "search query2", "search query3"]\n'
        "}"
    )

    few_shot_example_queries_lines = [
        'Examples (like Neo4j used in company):',
        f'- site:linkedin.com/in "{org_name}" "Neo4j" intitle:"{org_name}" -intitle:Neo4j -intitle:"profiles" ',
        'Other examples to search by title, news, etc.:',
        f'- site:linkedin.com/in "{org_name}" "Data Engineer" intitle:"{org_name}" -intitle:"profiles" ',
        f'- site:linkedin.com/jobs/view/ "{org_name}" "hiring" "angular developer" intitle:"{org_name}"',
        f'- site:news.google.com "{org_name}" "funding" OR "acquisition" OR "partnership"',
        f'- site:crunchbase.com "{org_name}" "funding"',
        f'- site:bloomberg.com "{org_name}" "financial news"'
    ]
    if primary_domain:
        few_shot_example_queries_lines.append(f'- site:{primary_domain} Job Openings')
        few_shot_example_queries_lines.append(f'- site:{primary_domain} Case Studies')
    few_shot_example_queries_lines.append(f'- "{org_name}" "competitors" OR "versus" OR "vs" "market share" "compare"')

    few_shot_example_queries = "\n".join(few_shot_example_queries_lines)

    user_prompt = f"""
{system_message}

The user wants to build Google search queries for:
"{english_description}"

Some example queries:
{few_shot_example_queries}

Lead info:
{json.dumps(lead, indent=2)}

Example query (if provided):
{example_query}

Intent signal type:
{intent_signal_type}

Please generate exactly three queries in JSON format as:
{{
  "google_search_queries": ["query1", "query2", "query3"]
}}
Remember to include "{org_name}" in each query.
"""

    logger.info("Generating Google search queries from description: %s", english_description)

    # Call your structured-output helper
    response, status = await get_structured_output_internal(
        user_prompt,
        GoogleSearchQuery,
        tool_config=tool_config
    )

    if status != "SUCCESS" or not response:
        raise Exception("Error generating the Google search queries.")

    queries_dict = response.model_dump()

    # Ensure that each query includes org_name
    fixed_queries = []
    for q in queries_dict["google_search_queries"]:
        if org_name and org_name.lower() not in q.lower() and not q.lower().startswith(f'site:{primary_domain}'):
            q = f'{q} "{org_name}"'
        fixed_queries.append(q.strip())

    queries_dict["google_search_queries"] = fixed_queries

    # Ensure the domain-based query is included if primary_domain is present.
    if primary_domain:
        domain_query = f'site:{primary_domain}'
        if all(domain_query.lower() not in x.lower() for x in queries_dict["google_search_queries"]):
            queries_dict["google_search_queries"].append(domain_query)

    logger.info("Search queries to be returned: %s", queries_dict["google_search_queries"])
    return queries_dict


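# Usage sketch (illustrative only, not part of the released module). The lead values
# below are hypothetical, and a structured-output backend must be configured for the
# call to succeed.
#
# async def _example_generate_queries() -> None:
#     lead = {
#         "organization_name": "Acme Corp",
#         "primary_domain_of_organization": "acme.com",
#     }
#     out = await generate_google_search_queries(
#         lead=lead,
#         english_description="Check if the company is hiring data engineers",
#         intent_signal_type="intent_generic",
#     )
#     # out["google_search_queries"] -> three queries, each mentioning "Acme Corp",
#     # plus a site:acme.com query appended if the model omitted one.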
async def get_search_results_for_insights(
    lead: Dict[str, Any],
    english_description: str,
    intent_signal_type: str,
    example_query: str = "",
    tool_config: Optional[List[Dict[str, Any]]] = None
) -> List[Dict[str, Any]]:
    """
    Uses generate_google_search_queries() to get up to four Google queries,
    then calls search_google() for each query in parallel to fetch results.

    For special intent signals, specialized queries are composed directly
    (e.g., searching LinkedIn for job postings with specific roles/technologies).

    Args:
        lead: Dictionary containing information about the lead.
        english_description: The user's plain-English description.
        intent_signal_type: A string indicating the intent signal type.
        example_query: Optional user-provided example.
        tool_config: Optional list of dictionaries containing tool configuration.

    Returns:
        A list of dictionaries, where each dictionary contains:
        {
            "query": <the google query used>,
            "results": <a JSON string of search results array>
        }
    """
    results_of_queries: List[Dict[str, Any]] = []

    # ---------------------------------------------------------
    # Specialized approach for recognized intent signal types
    # ---------------------------------------------------------
    if intent_signal_type == "intent_find_tech_usage_in_leads_current_company":
        company_name = lead.get("organization_name", "")
        company_name = normalize_company_name(company_name)
        organization_linkedin_url = lead.get("organization_linkedin_url", "")
        if company_name:
            google_queries = await get_google_queries_for_technology_used(
                english_description,
                company_name=company_name,
                tool_config=tool_config
            )
            if google_queries:
                job_posting_links = await find_tech_reference_by_google_search(
                    company_name,
                    google_queries,
                    organization_linkedin_url,
                    tool_config
                )
                results_of_queries.append({
                    "query": f"Find tech usage references by {company_name} using {google_queries} in Google search",
                    "results": json.dumps(job_posting_links)
                })

    elif intent_signal_type == "intent_find_tech_usage_in_leads_previous_company":
        previous_company_name = lead.get("previous_organization_name", "")
        previous_company_name = normalize_company_name(previous_company_name)
        previous_organization_linkedin_url = lead.get("previous_organization_linkedin_url", "")
        if previous_company_name:
            google_queries = await get_google_queries_for_technology_used(
                english_description,
                company_name=previous_company_name,
                tool_config=tool_config
            )
            if google_queries:
                job_posting_links = await find_tech_reference_by_google_search(
                    previous_company_name,
                    google_queries,
                    previous_organization_linkedin_url,
                    tool_config
                )
                results_of_queries.append({
                    "query": f"Find tech usage references by previous {previous_company_name} using {google_queries} in Google search",
                    "results": json.dumps(job_posting_links)
                })

    elif intent_signal_type == "intent_find_champion_changed_job":
        # For current
        company_name = normalize_company_name(lead.get("organization_name", ""))
        organization_linkedin_url = lead.get("organization_linkedin_url", "")
        if company_name:
            google_queries = await get_google_queries_for_technology_used(
                english_description,
                company_name=company_name,
                tool_config=tool_config
            )
            if google_queries:
                current_company_job_posting_links = await find_tech_reference_by_google_search(
                    company_name,
                    google_queries,
                    organization_linkedin_url,
                    tool_config
                )
                results_of_queries.append({
                    "query": f"Find tech usage references by current company {company_name} using {google_queries} in Google search",
                    "results": json.dumps(current_company_job_posting_links)
                })

        # For previous
        previous_company_name = normalize_company_name(lead.get("previous_organization_name", ""))
        previous_organization_linkedin_url = lead.get("previous_organization_linkedin_url", "")
        if previous_company_name:
            google_queries = await get_google_queries_for_technology_used(
                english_description,
                company_name=previous_company_name,
                tool_config=tool_config
            )
            if google_queries:
                prev_company_job_posting_links = await find_tech_reference_by_google_search(
                    previous_company_name,
                    google_queries,
                    previous_organization_linkedin_url,
                    tool_config
                )
                results_of_queries.append({
                    "query": f"Find tech usage references by previous company {previous_company_name} using {google_queries} in Google search",
                    "results": json.dumps(prev_company_job_posting_links)
                })

    elif intent_signal_type == "intent_find_job_opening_with_role_in_company":
        company_name = normalize_company_name(lead.get("organization_name", ""))
        organization_linkedin_url = lead.get("organization_linkedin_url", "")
        if company_name and organization_linkedin_url:
            google_query = await get_google_query_for_specific_role(
                english_description,
                company_name=company_name,
                tool_config=tool_config
            )
            if google_query.strip():
                job_posting_links = await find_job_postings_google_search(
                    company_name,
                    google_query,
                    organization_linkedin_url,
                    tool_config
                )
                results_of_queries.append({
                    "query": f"Find job by role in {company_name} using {google_query} in Google search",
                    "results": json.dumps(job_posting_links)
                })

    elif intent_signal_type == "intent_find_person_with_title_in_company":
        company_name = normalize_company_name(lead.get("organization_name", ""))
        organization_linkedin_url = lead.get("organization_linkedin_url", "")
        if company_name and organization_linkedin_url:
            google_query = await get_google_query_for_specific_title(
                english_description,
                company_name=company_name,
                tool_config=tool_config
            )
            if google_query.strip():
                job_posting_links = await find_job_postings_google_search(
                    company_name,
                    google_query,
                    organization_linkedin_url,
                    tool_config
                )
                results_of_queries.append({
                    "query": f"Find people with specific title in {company_name} using {google_query} in Google search",
                    "results": json.dumps(job_posting_links)
                })

    else:
        # ---------------------------------------------------------
        # Generic approach for unknown or general intent signals
        # ---------------------------------------------------------
        response_dict = await generate_google_search_queries(
            lead=lead,
            english_description=english_description,
            intent_signal_type=intent_signal_type,
            example_query=example_query,
            tool_config=tool_config
        )

        # Extract and limit the queries to a maximum of four
        queries = response_dict.get("google_search_queries", [])
        queries = queries[:4]

        # Execute searches in parallel
        coroutines = [
            search_google(query, number_of_results=3, tool_config=tool_config)
            for query in queries
        ]
        results = await asyncio.gather(*coroutines)

        for query, query_results in zip(queries, results):
            results_of_queries.append({
                "query": query,
                "results": json.dumps(query_results)
            })

    # Return the compiled list of search results
    return results_of_queries


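# Shape of the return value (summary of the code paths above, values illustrative):
# each entry pairs the query that ran with a JSON-encoded list; result JSON strings
# in the generic path, plain links in the specialized intent paths.
#
# [
#     {"query": 'site:crunchbase.com "Acme Corp" "funding"',
#      "results": '["{\\"title\\": \\"...\\", \\"link\\": \\"https://...\\"}"]'},
# ]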
def get_serp_api_access_token(tool_config: Optional[List[Dict]] = None) -> str:
    """
    Retrieves the SERPAPI_KEY access token from the provided tool configuration
    or from the environment variable SERPAPI_KEY.

    Raises:
        ValueError: If the SerpAPI integration has not been configured.
    """
    serpapi_key = None
    if tool_config:
        serpapi_config = next(
            (item for item in tool_config if item.get("name") == "serpapi"),
            None
        )
        if serpapi_config:
            config_map = {
                item["name"]: item["value"]
                for item in serpapi_config.get("configuration", [])
                if item
            }
            serpapi_key = config_map.get("apiKey")

    # Fallback to environment variable if not found in tool_config
    serpapi_key = serpapi_key or os.getenv("SERPAPI_KEY")
    if not serpapi_key:
        raise ValueError(
            "SerpAPI integration is not configured. Please configure the connection to SerpAPI in Integrations."
        )
    return serpapi_key


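# Shape of the tool_config expected by the lookup above (inferred from the code;
# the key value is a placeholder):
#
# tool_config = [
#     {
#         "name": "serpapi",
#         "configuration": [
#             {"name": "apiKey", "value": "<your-serpapi-key>"},
#         ],
#     },
# ]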
async def search_google(
    query: str,
    number_of_results: int = 3,
    tool_config: Optional[List[Dict]] = None
) -> List[str]:
    """
    Search Google using SERP API and return the results as a list of JSON strings.

    Args:
        query: The search query.
        number_of_results: Number of organic results to return.
        tool_config: Optional list of dictionaries containing tool configuration.

    Returns:
        A list of JSON strings, each representing one search result.
        If any error occurs, returns a list with a single JSON-encoded error dict.
    """
    serpapi_key = get_serp_api_access_token(tool_config)

    # Check cache first
    cached_response = retrieve_output("search_google_serp", query)
    if cached_response is not None:
        return cached_response

    params = {
        "q": query,
        "num": number_of_results,
        "api_key": serpapi_key
    }

    url = "https://serpapi.com/search"
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, params=params) as response:
                if response.status != 200:
                    error_data = await response.text()
                    return [json.dumps({"error": error_data})]

                result = await response.json()
                # Serialize each result to a JSON string
                serialized_results = [
                    json.dumps(item) for item in result.get('organic_results', [])
                ]
                # Cache results
                cache_output("search_google_serp", query, serialized_results)
                return serialized_results
    except Exception as exc:
        return [json.dumps({"error": str(exc)})]


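# Usage sketch (illustrative): requires SERPAPI_KEY in the environment or a serpapi
# entry in tool_config; the query string is a made-up example.
#
# async def _example_search() -> None:
#     hits = await search_google('site:crunchbase.com "Acme Corp" "funding"', number_of_results=3)
#     for hit in hits:
#         item = json.loads(hit)
#         print(item.get("title"), item.get("link"))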
class TechnologyUsedCheck(BaseModel):
    """
    Pydantic model representing the technology keywords to look for.
    technologies_used: list of strings describing the sought-after technologies.
    """
    technologies_used: List[str]
    location_to_filter_by: str


async def get_google_queries_for_technology_used(
    english_description: str,
    company_name: str,
    tool_config: Optional[List[Dict[str, Any]]] = None
) -> List[str]:
    """
    Extract technology keywords from the English description to build a LinkedIn job-focused query.

    Args:
        english_description: The user's plain-English description.
        company_name: Name of the company to search around.
        tool_config: Optional tool configuration for structured-output or SERP.

    Returns:
        A list of Google queries that includes the company name and discovered technology keywords.
    """
    prompt = f"""
Given the English description, list any technologies that the user is trying to verify for {company_name}.
Find if there is a location to filter search by and fill location_to_filter_by. If none specified, default is United States.

User input:
{english_description}

Output must be valid JSON, e.g.:
{{
    "technologies_used": ["someTech", "anotherTech"],
    "location_to_filter_by": "United States"
}}
"""
    response, status = await get_structured_output_internal(
        prompt=prompt,
        response_format=TechnologyUsedCheck,
        effort="high",
        model="gpt-5.1-chat",
        tool_config=tool_config
    )

    # Build up to two queries if we have technologies
    if status == "SUCCESS" and response and response.technologies_used:
        queries = []
        tech_used_quoted = " OR ".join([f'"{tech}"' for tech in response.technologies_used])
        queries.append(
            f'site:linkedin.com (({tech_used_quoted}) AND ("{company_name}") AND ("{response.location_to_filter_by}"))'
        )
        queries.append(
            f'site:x.com (({tech_used_quoted}) AND ("{company_name}"))'
        )
        return queries
    else:
        return []


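# Example of the composed queries (illustrative values): for technologies_used=["Neo4j"],
# company_name="Acme Corp", and location "United States", the function returns:
#
#   site:linkedin.com (("Neo4j") AND ("Acme Corp") AND ("United States"))
#   site:x.com (("Neo4j") AND ("Acme Corp"))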
class TechnologyAndRoleCheck(BaseModel):
    """
    Pydantic model representing the technology keywords and role(s) to look for.
    """
    technologies_used: List[str]
    roles_looking_for: List[str]
    location_to_filter_by: str


async def get_google_query_for_specific_role(
    english_description: str,
    company_name: str,
    tool_config: Optional[List[Dict[str, Any]]] = None
) -> str:
    """
    Extract role and technology keywords from the English description to build a LinkedIn jobs query.

    Args:
        english_description: The user's plain-English description.
        company_name: Name of the company to search around.
        tool_config: Optional tool configuration.

    Returns:
        A single Google query string with role(s) and technology keywords.
    """
    prompt = f"""
Given the English description, identify any specific roles and technologies for {company_name}.
Find if there is a location to filter search by and fill location_to_filter_by. If none specified, default is United States.

User input:
{english_description}

Output must be valid JSON, e.g.:
{{
    "technologies_used": ["Angular", "Python"],
    "roles_looking_for": ["Developer", "Team Lead"],
    "location_to_filter_by": "United States"
}}
"""
    response, status = await get_structured_output_internal(
        prompt=prompt,
        response_format=TechnologyAndRoleCheck,
        effort="high",
        model="gpt-5.1-chat",
        tool_config=tool_config
    )

    if status == "SUCCESS" and response:
        tech_used_part = " OR ".join([f'"{tech}"' for tech in response.technologies_used]) if response.technologies_used else ""
        roles_part = " OR ".join([f'"{role}"' for role in response.roles_looking_for]) if response.roles_looking_for else ""
        return (
            f'site:linkedin.com/in ({tech_used_part}) AND ({roles_part}) '
            f'AND ("{company_name}") AND ("{response.location_to_filter_by}") -intitle:"profiles" '
        ).strip()
    else:
        return ""


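# Example of the composed query (illustrative values): for technologies ["Angular"],
# roles ["Developer"], and company "Acme Corp", the function returns roughly:
#
#   site:linkedin.com/in ("Angular") AND ("Developer") AND ("Acme Corp") AND ("United States") -intitle:"profiles"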
class CheckForPeopleWithTitle(BaseModel):
    """
    Pydantic model for extracting job titles from an English description.
    """
    job_titles: List[str]
    location_to_filter_by: str = "United States"


async def get_google_query_for_specific_title(
    english_description: str,
    company_name: str,
    tool_config: Optional[List[Dict[str, Any]]] = None
) -> str:
    """
    Extract job titles (and location) from the English description to build a LinkedIn /in query.

    Args:
        english_description: The user's plain-English description.
        company_name: Name of the company to search around.
        tool_config: Optional tool configuration.

    Returns:
        A single Google query string with job titles and the company name.
    """
    prompt = f"""
Given the English description, identify any specific job titles that the user wants to find at {company_name}.
Find if there is a location to filter search by and fill location_to_filter_by. If none specified, default is United States.

User input:
{english_description}

Output must be valid JSON, e.g.:
{{
    "job_titles": ["CTO", "Head of Engineering"],
    "location_to_filter_by": "United States"
}}
"""
    response, status = await get_structured_output_internal(
        prompt=prompt,
        response_format=CheckForPeopleWithTitle,
        effort="high",
        tool_config=tool_config
    )

    if status == "SUCCESS" and response:
        titles_part = " OR ".join([f'"{title}"' for title in response.job_titles]) if response.job_titles else ""
        return (
            f'site:linkedin.com/in ({titles_part}) AND ("{company_name}") '
            f'AND ("{response.location_to_filter_by}") -intitle:"profiles" '
        ).strip()
    else:
        return ""


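# Example of the composed query (illustrative values): for job_titles ["CTO"] at
# "Acme Corp", the function returns roughly:
#
#   site:linkedin.com/in ("CTO") AND ("Acme Corp") AND ("United States") -intitle:"profiles"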
# TODO: fix with playwright implementation.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Referer": "https://www.google.com/"
}


async def _get_html_content_from_url(url: str) -> str:
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url) as response:
            return await response.text()


async def _clean_html_content(html_content: str) -> BeautifulSoup:
    if not html_content:
        return BeautifulSoup("", 'html.parser')
    soup = BeautifulSoup(html_content, 'html.parser')
    for element in soup(['script', 'style', 'meta', 'code', 'svg']):
        element.decompose()
    return soup


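# Sketch of the helper pair in isolation (illustrative; the HTML literal is made up):
#
# async def _demo_clean() -> None:
#     soup = await _clean_html_content('<a href="https://x.test">x</a><script>1</script>')
#     print([a.get("href") for a in soup.find_all("a", href=True)])  # ['https://x.test']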
async def find_job_postings_google_search(
    company_name: str,
    google_query: str,
    organization_linkedin_url: Optional[str] = None,
    tool_config: Optional[List[Dict[str, Any]]] = None
) -> List[str]:
    """
    Find job postings on LinkedIn for a given company using a Google Search query.

    Args:
        company_name (str): The name of the company.
        google_query (str): The Google query to run.
        organization_linkedin_url (Optional[str]): The LinkedIn URL of the company.
        tool_config: Optional list of dictionaries containing tool configuration.

    Returns:
        A list of discovered job posting links.
    """
    logger.info("Entering find_job_postings_google_search with query: %s", google_query)
    if not google_query.strip():
        return []

    job_posting_links = []

    try:
        results = await search_google(google_query.strip(), number_of_results=10, tool_config=tool_config)
    except Exception:
        logger.exception("Error searching for job postings via Google.")
        return []

    if not isinstance(results, list) or len(results) == 0:
        logger.debug("No results returned for this query.")
        return []

    for result_item in results:
        try:
            result_json = json.loads(result_item)
        except json.JSONDecodeError:
            logger.debug("Failed to parse JSON from the search result.")
            continue

        link = result_json.get('link', '')
        if not link:
            logger.debug("No link found in result JSON.")
            continue

        try:
            page_content = await _get_html_content_from_url(link)
            soup = await _clean_html_content(page_content)
        except Exception:
            logger.exception("Error fetching or parsing the job posting page.")
            continue

        page_links = [a.get('href') for a in soup.find_all('a', href=True)]

        company_match = False
        if organization_linkedin_url:
            partial_url = re.sub(r'^https?:\/\/(www\.)?', '', organization_linkedin_url).rstrip('/')
            for page_link in page_links:
                if (
                    page_link
                    and partial_url in page_link
                    and 'public_jobs_topcard-org-name' in page_link
                ):
                    company_match = True
                    break

        if company_match:
            job_posting_links.append(link)

    logger.info("Found %d job posting links for query '%s'.", len(job_posting_links), google_query)
    return job_posting_links


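# How the match above works: a result link is kept only when the fetched posting page
# contains an anchor that points at the company's LinkedIn URL and carries the
# 'public_jobs_topcard-org-name' fragment, i.e. the posting's company card links back
# to the expected organization. Usage sketch (illustrative values):
#
# async def _demo_jobs() -> None:
#     links = await find_job_postings_google_search(
#         company_name="Acme Corp",
#         google_query='site:linkedin.com/jobs/view/ "Acme Corp" "hiring" "angular developer"',
#         organization_linkedin_url="https://www.linkedin.com/company/acme-corp",
#     )
#     print(links)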
async def find_tech_reference_by_google_search(
    company_name: str,
    google_queries: List[str],
    organization_linkedin_url: Optional[str] = None,
    tool_config: Optional[List[Dict[str, Any]]] = None
) -> List[str]:
    """
    Find pages referencing certain technologies or job postings on LinkedIn for a given company
    using a list of Google queries.

    Args:
        company_name (str): The name of the company.
        google_queries (List[str]): The Google queries to run.
        organization_linkedin_url (Optional[str]): The LinkedIn URL of the company.
        tool_config (Optional[List[Dict[str, Any]]]): Optional list of dictionaries containing tool configuration.

    Returns:
        List[str]: A list of discovered links referencing the technologies/job postings.
    """
    linkedin_reference_links = []
    for google_query in google_queries:
        logger.info("Entering find_tech_reference_by_google_search with query: %s", google_query)
        if not google_query.strip():
            continue

        try:
            results = await search_google(google_query.strip(), number_of_results=10, tool_config=tool_config)
        except Exception:
            logger.exception("Error searching for tech references via Google.")
            continue

        if not isinstance(results, list) or len(results) == 0:
            logger.debug("No results returned for this query.")
            continue

        for result_item in results:
            try:
                result_json = json.loads(result_item)
            except json.JSONDecodeError:
                logger.debug("Failed to parse JSON from the search result.")
                continue

            link = result_json.get('link', '')
            if not link:
                logger.debug("No link found in result JSON.")
                continue

            linkedin_reference_links.append(link)

        logger.info(
            "Accumulated %d links so far for query '%s'.",
            len(linkedin_reference_links),
            google_query
        )

    return linkedin_reference_links
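
# End-to-end usage sketch (illustrative; all values hypothetical, and SerpAPI plus a
# structured-output backend must be configured):
#
# if __name__ == "__main__":
#     lead = {
#         "organization_name": "Acme Corp",
#         "primary_domain_of_organization": "acme.com",
#         "organization_linkedin_url": "https://www.linkedin.com/company/acme-corp",
#     }
#     results = asyncio.run(get_search_results_for_insights(
#         lead=lead,
#         english_description="Check if Acme Corp is hiring Angular developers",
#         intent_signal_type="intent_find_job_opening_with_role_in_company",
#     ))
#     for entry in results:
#         print(entry["query"], json.loads(entry["results"]))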