fusesell-1.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fusesell-1.2.0.dist-info/METADATA +872 -0
- fusesell-1.2.0.dist-info/RECORD +31 -0
- fusesell-1.2.0.dist-info/WHEEL +5 -0
- fusesell-1.2.0.dist-info/entry_points.txt +2 -0
- fusesell-1.2.0.dist-info/licenses/LICENSE +21 -0
- fusesell-1.2.0.dist-info/top_level.txt +2 -0
- fusesell.py +15 -0
- fusesell_local/__init__.py +37 -0
- fusesell_local/api.py +341 -0
- fusesell_local/cli.py +1450 -0
- fusesell_local/config/__init__.py +11 -0
- fusesell_local/config/prompts.py +245 -0
- fusesell_local/config/settings.py +277 -0
- fusesell_local/pipeline.py +932 -0
- fusesell_local/stages/__init__.py +19 -0
- fusesell_local/stages/base_stage.py +602 -0
- fusesell_local/stages/data_acquisition.py +1820 -0
- fusesell_local/stages/data_preparation.py +1231 -0
- fusesell_local/stages/follow_up.py +1590 -0
- fusesell_local/stages/initial_outreach.py +2337 -0
- fusesell_local/stages/lead_scoring.py +1452 -0
- fusesell_local/tests/test_api.py +65 -0
- fusesell_local/tests/test_cli.py +37 -0
- fusesell_local/utils/__init__.py +15 -0
- fusesell_local/utils/birthday_email_manager.py +467 -0
- fusesell_local/utils/data_manager.py +4050 -0
- fusesell_local/utils/event_scheduler.py +618 -0
- fusesell_local/utils/llm_client.py +283 -0
- fusesell_local/utils/logger.py +203 -0
- fusesell_local/utils/timezone_detector.py +914 -0
- fusesell_local/utils/validators.py +416 -0
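
For orientation before the diff: the `DataAcquisitionStage.execute()` method added below reads an `input_data` mapping out of its execution context and recognizes six optional keys. The sketch that follows only illustrates that context shape with placeholder values; how the pipeline constructs the stage object is not visible in this diff, so the commented-out call at the end is an assumption, not the package's documented API.

# Context shape consumed by DataAcquisitionStage.execute() (placeholder values).
context = {
    'execution_id': 'demo-run-001',  # copied into result_data['customer_id']
    'input_data': {
        'input_website': 'https://example.com',                       # scraped directly or via Serper
        'input_description': 'Family-owned bakery in Springfield.',   # appended verbatim
        'input_business_card': 'https://example.com/card.pdf',        # image or PDF, run through OCR
        'input_facebook_url': 'https://facebook.com/example',
        'input_linkedin_url': 'https://www.linkedin.com/company/example',
        'input_freetext': 'Met at a trade fair; interested in follow-up.',
    },
}
# result = stage.execute(context)  # `stage` construction is handled by pipeline.py (not shown here)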
@@ -0,0 +1,1820 @@
"""
Data Acquisition Stage - Extract customer information from multiple sources
Converted from fusesell_data_acquisition.yml
"""

import requests
import json
from typing import Dict, Any, Optional, List
from urllib.parse import urlparse
import time
from .base_stage import BaseStage


class DataAcquisitionStage(BaseStage):
    """
    Data Acquisition stage for extracting customer information from multiple sources.
    Converts YAML workflow logic to Python implementation.
    """

    def execute(self, context: Dict[str, Any]) -> Dict[str, Any]:
        """
        Execute data acquisition stage.

        Args:
            context: Execution context

        Returns:
            Stage execution result
        """
        try:
            input_data = context.get('input_data', {})

            # Collect data from all available sources
            collected_data = []
            data_sources = []

            # 1. Website scraping (matching original YAML: input_website)
            if input_data.get('input_website'):
                website_data = self._scrape_website(
                    input_data['input_website'])
                if website_data:
                    collected_data.append(website_data)
                    data_sources.append('website')
                    self.logger.info("Successfully scraped website data")

            # 2. Customer description (matching original YAML: input_description)
            if input_data.get('input_description'):
                collected_data.append(input_data['input_description'])
                data_sources.append('description')
                self.logger.info("Added customer description")

            # 3. Business card processing (matching original YAML: input_business_card)
            if input_data.get('input_business_card'):
                business_card_data = self._process_business_card(
                    input_data['input_business_card'])
                if business_card_data:
                    collected_data.append(business_card_data)
                    data_sources.append('business_card')
                    self.logger.info("Successfully processed business card")

            # 4. Social media scraping (matching original YAML: input_facebook_url, input_linkedin_url)
            if input_data.get('input_facebook_url'):
                facebook_data = self._scrape_social_media(
                    input_data['input_facebook_url'])
                if facebook_data:
                    collected_data.append(facebook_data)
                    data_sources.append('facebook')
                    self.logger.info("Successfully scraped Facebook data")

            if input_data.get('input_linkedin_url'):
                linkedin_data = self._scrape_social_media(
                    input_data['input_linkedin_url'])
                if linkedin_data:
                    collected_data.append(linkedin_data)
                    data_sources.append('linkedin')
                    self.logger.info("Successfully scraped LinkedIn data")

            # 5. Free text input (matching executor schema: input_freetext)
            if input_data.get('input_freetext'):
                collected_data.append(input_data['input_freetext'])
                data_sources.append('freetext')
                self.logger.info("Added free text input")

            # Combine all collected data
            combined_data = ' '.join(str(data)
                                     for data in collected_data if data)

            if not combined_data:
                raise ValueError("No data could be collected from any source")

            # 6. Extract structured customer information using LLM
            customer_info = self._extract_customer_info(combined_data)

            # 7. Perform additional company research
            research_data = self._perform_company_research(customer_info)

            # 8. Scrape company website if not already done
            website_research_data = self._scrape_company_website(
                customer_info, data_sources)

            # Combine all research data
            mini_research = ' '.join(
                filter(None, [research_data, website_research_data]))

            # Final result
            result_data = {
                **customer_info,
                'company_mini_search': mini_research,
                'research_mini': True,
                'data_sources': data_sources,
                'extraction_status': 'success',
                'customer_id': context.get('execution_id')
            }

            # Save to database
            self.save_stage_result(context, result_data)

            result = self.create_success_result(result_data, context)
            return result

        except Exception as e:
            self.log_stage_error(context, e)
            return self.handle_stage_error(e, context)

    def _scrape_website(self, url: str) -> Optional[str]:
        """
        Scrape website content with enhanced fallback mechanisms.

        Args:
            url: Website URL to scrape

        Returns:
            Scraped text content or None if failed
        """
        try:
            if self.is_dry_run():
                return f"[DRY RUN] Would scrape website: {url}"

            # Step 1: Try direct HTTP request first
            scraped_content = self._direct_website_scrape(url)
            if scraped_content:
                self.logger.info(f"Successfully scraped website directly: {url}")
                return scraped_content

            # Step 2: Try Serper API scraping
            serper_key = self.config.get('serper_api_key')
            if serper_key:
                scraped_content = self._scrape_with_serper(url, serper_key)
                if scraped_content:
                    self.logger.info(f"Successfully scraped website via Serper API: {url}")
                    return scraped_content

            # Step 3: Enhanced fallback - Search-based data recovery
            self.logger.info(f"Direct scraping failed for {url}, attempting search-based fallback")
            company_name = self._extract_company_name_from_url(url)
            if company_name:
                fallback_content = self._search_based_fallback(company_name, serper_key)
                if fallback_content:
                    self.logger.info(f"Successfully recovered company data via search fallback for: {company_name}")
                    return fallback_content

            self.logger.warning(f"All scraping methods failed for {url}")
            return None

        except Exception as e:
            self.logger.error(f"Website scraping failed for {url}: {str(e)}")
            return None

    def _extract_company_name_from_url(self, url: str) -> Optional[str]:
        """
        Extract company name from URL for search fallback.

        Args:
            url: Website URL

        Returns:
            Extracted company name or None if failed
        """
        try:
            from urllib.parse import urlparse

            parsed = urlparse(url)
            domain = parsed.netloc.lower()

            # Remove common prefixes
            if domain.startswith('www.'):
                domain = domain[4:]

            # Extract company name from domain
            # Remove common TLDs
            domain_parts = domain.split('.')
            if len(domain_parts) >= 2:
                company_name = domain_parts[0]

                # Clean up common patterns
                company_name = company_name.replace('-', ' ')
                company_name = company_name.replace('_', ' ')

                # Capitalize words
                company_name = ' '.join(word.capitalize() for word in company_name.split())

                return company_name

            return None

        except Exception as e:
            self.logger.debug(f"Failed to extract company name from URL {url}: {str(e)}")
            return None

    def _search_based_fallback(self, company_name: str, api_key: str) -> Optional[str]:
        """
        Enhanced fallback mechanism using search to recover company data.

        Args:
            company_name: Company name to search for
            api_key: Serper API key

        Returns:
            Company information from search results or None if failed
        """
        try:
            self.logger.info(f"Attempting search-based fallback for company: {company_name}")

            # Try multiple search strategies
            search_queries = [
                f'"{company_name}" company about',
                f'"{company_name}" business services',
                f'{company_name} company profile',
                f'{company_name} official website',
                f'{company_name} contact information'
            ]

            all_results = []

            for query in search_queries:
                try:
                    search_result = self._search_with_serper(query, api_key)
                    if search_result and len(search_result.strip()) > 50:
                        all_results.append(search_result)
                        self.logger.debug(f"Search query '{query}' returned {len(search_result)} characters")

                    # Add small delay between searches to avoid rate limiting
                    time.sleep(1)

                except Exception as e:
                    self.logger.debug(f"Search query '{query}' failed: {str(e)}")
                    continue

            if not all_results:
                self.logger.warning(f"No search results found for company: {company_name}")
                return None

            # Combine and deduplicate results
            combined_results = ' '.join(all_results)

            # Limit length to avoid token limits
            if len(combined_results) > 3000:
                combined_results = combined_results[:3000] + "..."

            # Try to find alternative URLs in search results
            alternative_urls = self._extract_urls_from_search_results(combined_results)

            # If we found alternative URLs, try scraping them
            for alt_url in alternative_urls[:3]:  # Try up to 3 alternative URLs
                try:
                    self.logger.info(f"Trying alternative URL from search: {alt_url}")
                    scraped_content = self._direct_website_scrape(alt_url)
                    if scraped_content and len(scraped_content.strip()) > 100:
                        self.logger.info(f"Successfully scraped alternative URL: {alt_url}")
                        return f"Search Results: {combined_results}\n\nAlternative Website Content: {scraped_content}"
                except Exception as e:
                    self.logger.debug(f"Failed to scrape alternative URL {alt_url}: {str(e)}")
                    continue

            # Return search results even if no alternative URLs worked
            self.logger.info(f"Returning search results for company: {company_name}")
            return f"Search Results: {combined_results}"

        except Exception as e:
            self.logger.error(f"Search-based fallback failed for {company_name}: {str(e)}")
            return None

    def _extract_urls_from_search_results(self, search_text: str) -> List[str]:
        """
        Extract potential website URLs from search results.

        Args:
            search_text: Search results text

        Returns:
            List of extracted URLs
        """
        try:
            import re

            # Pattern to match URLs in search results
            url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
            urls = re.findall(url_pattern, search_text, re.IGNORECASE)

            # Filter out common non-company URLs
            filtered_urls = []
            exclude_domains = [
                'google.com', 'facebook.com', 'linkedin.com', 'twitter.com',
                'instagram.com', 'youtube.com', 'wikipedia.org', 'yelp.com',
                'glassdoor.com', 'indeed.com', 'crunchbase.com'
            ]

            for url in urls:
                # Skip if it's a social media or directory site
                if not any(domain in url.lower() for domain in exclude_domains):
                    # Skip if it's too long (likely not a main company website)
                    if len(url) < 100:
                        filtered_urls.append(url)

            # Remove duplicates while preserving order
            seen = set()
            unique_urls = []
            for url in filtered_urls:
                if url not in seen:
                    seen.add(url)
                    unique_urls.append(url)

            return unique_urls[:5]  # Return top 5 URLs

        except Exception as e:
            self.logger.debug(f"Failed to extract URLs from search results: {str(e)}")
            return []

    def _direct_website_scrape(self, url: str) -> Optional[str]:
        """
        Direct website scraping using requests and basic HTML parsing.

        Args:
            url: Website URL to scrape

        Returns:
            Scraped text content or None if failed
        """
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }

            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()

            # Basic HTML parsing to extract text
            try:
                from bs4 import BeautifulSoup
                soup = BeautifulSoup(response.content, 'html.parser')

                # Remove script and style elements
                for script in soup(["script", "style"]):
                    script.decompose()

                # Get text content
                text = soup.get_text()

                # Clean up text
                lines = (line.strip() for line in text.splitlines())
                chunks = (phrase.strip()
                          for line in lines for phrase in line.split(" "))
                text = ' '.join(chunk for chunk in chunks if chunk)

                # Limit text length to avoid token limits
                if len(text) > 5000:
                    text = text[:5000] + "..."

                return text

            except ImportError:
                self.logger.warning(
                    "BeautifulSoup not available. Install with: pip install beautifulsoup4")
                # Fallback: return raw HTML (limited)
                content = response.text
                if len(content) > 2000:
                    content = content[:2000] + "..."
                return content

        except Exception as e:
            self.logger.error(f"Direct website scraping failed: {str(e)}")
            return None

    def _scrape_with_serper(self, url: str, api_key: str) -> Optional[str]:
        """
        Scrape website using Serper API (original method).

        Args:
            url: Website URL to scrape
            api_key: Serper API key

        Returns:
            Scraped text content or None if failed
        """
        try:
            headers = {
                'X-API-KEY': api_key,
                'Content-Type': 'application/json'
            }

            body = {'url': url}

            response = requests.post(
                'https://scrape.serper.dev',
                json=body,
                headers=headers,
                timeout=300  # 5 minutes
            )

            if response.status_code == 200:
                result = response.json()
                return result.get('text', '')
            else:
                self.logger.warning(
                    f"Serper API returned status {response.status_code}")
                return None

        except Exception as e:
            self.logger.error(f"Serper API scraping failed: {str(e)}")
            return None

    def _process_business_card(self, business_card_url: str) -> Optional[str]:
        """
        Process business card image using OCR.

        Args:
            business_card_url: URL to business card image

        Returns:
            Extracted text from business card or None if failed
        """
        try:
            if self.is_dry_run():
                return f"[DRY RUN] Would process business card: {business_card_url}"

            # Download the image
            image_data = self._download_image(business_card_url)
            if not image_data:
                return None

            # Try OCR processing with different methods
            extracted_text = self._extract_text_from_image(image_data)

            if extracted_text:
                self.logger.info(
                    "Successfully extracted text from business card")
                return extracted_text
            else:
                self.logger.warning(
                    "No text could be extracted from business card")
                return None

        except Exception as e:
            self.logger.error(f"Business card processing failed: {str(e)}")
            return None

    def _download_image(self, image_url: str) -> Optional[bytes]:
        """
        Download image or PDF from URL.

        Args:
            image_url: URL to image or PDF

        Returns:
            Image/PDF data as bytes or None if failed
        """
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }

            response = requests.get(image_url, headers=headers, timeout=30)
            response.raise_for_status()

            # Verify it's an image or PDF
            content_type = response.headers.get('content-type', '').lower()
            if not (content_type.startswith('image/') or content_type == 'application/pdf'):
                self.logger.warning(
                    f"URL does not point to an image or PDF: {content_type}")
                return None

            return response.content

        except Exception as e:
            self.logger.error(
                f"Failed to download file from {image_url}: {str(e)}")
            return None

    def _extract_text_from_image(self, image_data: bytes) -> Optional[str]:
        """
        Extract text from image or PDF using OCR.

        Args:
            image_data: Image or PDF data as bytes

        Returns:
            Extracted text or None if failed
        """
        try:
            # Check if it's a PDF first
            if image_data.startswith(b'%PDF'):
                text = self._extract_text_from_pdf(image_data)
                if text:
                    return text

            # Try Tesseract OCR first (most common)
            text = self._ocr_with_tesseract(image_data)
            if text:
                return text

            # Try EasyOCR as fallback
            text = self._ocr_with_easyocr(image_data)
            if text:
                return text

            # Try cloud OCR services if available
            text = self._ocr_with_cloud_service(image_data)
            if text:
                return text

            return None

        except Exception as e:
            self.logger.error(f"Text extraction failed: {str(e)}")
            return None

    def _extract_text_from_pdf(self, pdf_data: bytes) -> Optional[str]:
        """
        Extract text from PDF business card.

        Args:
            pdf_data: PDF data as bytes

        Returns:
            Extracted text or None if failed
        """
        try:
            import io

            # Try PyPDF2 first for text extraction
            text = self._extract_pdf_text_pypdf2(pdf_data)
            if text and len(text.strip()) > 10:
                structured_text = self._extract_business_card_info(text)
                return structured_text if structured_text else text

            # If no text found, convert PDF to image and use OCR
            text = self._extract_pdf_text_via_ocr(pdf_data)
            if text:
                return text

            return None

        except Exception as e:
            self.logger.debug(f"PDF text extraction failed: {str(e)}")
            return None

    def _extract_pdf_text_pypdf2(self, pdf_data: bytes) -> Optional[str]:
        """
        Extract text from PDF using PyPDF2.

        Args:
            pdf_data: PDF data as bytes

        Returns:
            Extracted text or None if failed
        """
        try:
            import PyPDF2
            import io

            pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_data))

            text_parts = []
            for page in pdf_reader.pages:
                text = page.extract_text()
                if text:
                    text_parts.append(text)

            combined_text = ' '.join(text_parts).strip()
            return combined_text if len(combined_text) > 10 else None

        except ImportError:
            self.logger.debug("PyPDF2 not available. Install with: pip install PyPDF2")
            return None
        except Exception as e:
            self.logger.debug(f"PyPDF2 extraction failed: {str(e)}")
            return None

    def _extract_pdf_text_via_ocr(self, pdf_data: bytes) -> Optional[str]:
        """
        Convert PDF to image and extract text via OCR.

        Args:
            pdf_data: PDF data as bytes

        Returns:
            Extracted text or None if failed
        """
        try:
            import fitz  # PyMuPDF
            import io
            from PIL import Image

            # Open PDF
            pdf_document = fitz.open(stream=pdf_data, filetype="pdf")

            all_text = []

            # Process each page (usually business cards are single page)
            for page_num in range(min(3, len(pdf_document))):  # Max 3 pages
                page = pdf_document[page_num]

                # Convert page to image
                mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for better OCR
                pix = page.get_pixmap(matrix=mat)
                img_data = pix.tobytes("png")

                # Extract text using OCR
                text = self._ocr_with_tesseract(img_data)
                if not text:
                    text = self._ocr_with_easyocr(img_data)

                if text:
                    all_text.append(text)

            pdf_document.close()

            combined_text = ' '.join(all_text).strip()
            if combined_text:
                structured_text = self._extract_business_card_info(combined_text)
                return structured_text if structured_text else combined_text

            return None

        except ImportError:
            self.logger.debug("PyMuPDF not available. Install with: pip install PyMuPDF")
            return None
        except Exception as e:
            self.logger.debug(f"PDF OCR extraction failed: {str(e)}")
            return None

    def _ocr_with_tesseract(self, image_data: bytes) -> Optional[str]:
        """
        Extract text using Tesseract OCR with image preprocessing.

        Args:
            image_data: Image data as bytes

        Returns:
            Extracted text or None if failed
        """
        try:
            import pytesseract
            from PIL import Image, ImageEnhance, ImageFilter
            import io

            # Load image
            image = Image.open(io.BytesIO(image_data))

            # Convert to RGB if necessary
            if image.mode != 'RGB':
                image = image.convert('RGB')

            # Apply image preprocessing for better OCR accuracy
            processed_image = self._preprocess_image_for_ocr(image)

            # Try different OCR configurations for business cards
            ocr_configs = [
                '--psm 6',   # Uniform block of text
                '--psm 4',   # Single column of text
                '--psm 3',   # Fully automatic page segmentation
                '--psm 8',   # Single word
                '--psm 13'   # Raw line. Treat the image as a single text line
            ]

            best_text = ""
            best_confidence = 0

            for config in ocr_configs:
                try:
                    # Extract text with configuration
                    text = pytesseract.image_to_string(processed_image, lang='eng', config=config)

                    # Get confidence score
                    data = pytesseract.image_to_data(processed_image, lang='eng', config=config, output_type=pytesseract.Output.DICT)
                    confidences = [int(conf) for conf in data['conf'] if int(conf) > 0]
                    avg_confidence = sum(confidences) / len(confidences) if confidences else 0

                    # Keep the best result
                    if avg_confidence > best_confidence and len(text.strip()) > 10:
                        best_text = text.strip()
                        best_confidence = avg_confidence

                except Exception as e:
                    self.logger.debug(f"OCR config {config} failed: {str(e)}")
                    continue

            # Also try the original image without preprocessing
            try:
                text = pytesseract.image_to_string(image, lang='eng')
                if len(text.strip()) > len(best_text):
                    best_text = text.strip()
            except:
                pass

            # Extract structured information from business card text
            if best_text:
                structured_text = self._extract_business_card_info(best_text)
                return structured_text if structured_text else best_text

            return None

        except ImportError:
            self.logger.debug(
                "Tesseract OCR not available. Install with: pip install pytesseract pillow")
            return None
        except Exception as e:
            self.logger.debug(f"Tesseract OCR failed: {str(e)}")
            return None

    def _preprocess_image_for_ocr(self, image):
        """
        Preprocess image to improve OCR accuracy.

        Args:
            image: PIL Image object

        Returns:
            Preprocessed PIL Image object
        """
        try:
            # Image is needed here for Image.Resampling.LANCZOS below
            from PIL import Image, ImageEnhance, ImageFilter
            import numpy as np

            # Convert to grayscale for better OCR
            if image.mode != 'L':
                image = image.convert('L')

            # Resize image if too small (OCR works better on larger images)
            width, height = image.size
            if width < 300 or height < 300:
                scale_factor = max(300 / width, 300 / height)
                new_width = int(width * scale_factor)
                new_height = int(height * scale_factor)
                image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

            # Enhance contrast
            enhancer = ImageEnhance.Contrast(image)
            image = enhancer.enhance(1.5)

            # Enhance sharpness
            enhancer = ImageEnhance.Sharpness(image)
            image = enhancer.enhance(2.0)

            # Apply noise reduction
            image = image.filter(ImageFilter.MedianFilter(size=3))

            # Apply slight blur to smooth out noise
            image = image.filter(ImageFilter.GaussianBlur(radius=0.5))

            return image

        except Exception as e:
            self.logger.debug(f"Image preprocessing failed: {str(e)}")
            return image  # Return original image if preprocessing fails

    def _extract_business_card_info(self, raw_text: str) -> Optional[str]:
        """
        Extract structured information from business card OCR text.

        Args:
            raw_text: Raw OCR text from business card

        Returns:
            Structured business card information
        """
        try:
            import re

            # Clean up the text
            lines = [line.strip() for line in raw_text.split('\n') if line.strip()]

            # Initialize extracted info
            extracted_info = {
                'name': None,
                'title': None,
                'company': None,
                'email': None,
                'phone': None,
                'website': None,
                'address': None
            }

            # Email pattern
            email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'

            # Phone pattern (various formats)
            phone_pattern = r'(\+?1?[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})'

            # Website pattern
            website_pattern = r'(?:https?://)?(?:www\.)?([a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}'

            # Extract email
            email_matches = re.findall(email_pattern, raw_text, re.IGNORECASE)
            if email_matches:
                extracted_info['email'] = email_matches[0]

            # Extract phone
            phone_matches = re.findall(phone_pattern, raw_text)
            if phone_matches:
                phone = ''.join(phone_matches[0])
                extracted_info['phone'] = phone

            # Extract website
            website_matches = re.findall(website_pattern, raw_text, re.IGNORECASE)
            if website_matches:
                website = website_matches[0]
                if not website.startswith('http'):
                    website = 'https://' + website
                extracted_info['website'] = website

            # Heuristic extraction for name, title, company
            # Usually name is on the first or second line
            # Title is often after the name
            # Company is often the largest/most prominent text

            if len(lines) >= 1:
                # First line is often the name
                potential_name = lines[0]
                if len(potential_name.split()) <= 4 and not any(char.isdigit() for char in potential_name):
                    extracted_info['name'] = potential_name

            if len(lines) >= 2:
                # Second line might be title
                potential_title = lines[1]
                if len(potential_title.split()) <= 6 and not any(char in potential_title for char in '@.'):
                    extracted_info['title'] = potential_title

            # Look for company name (often contains "Inc", "LLC", "Corp", etc.)
            company_indicators = ['inc', 'llc', 'corp', 'ltd', 'company', 'co.', 'corporation']
            for line in lines:
                if any(indicator in line.lower() for indicator in company_indicators):
                    extracted_info['company'] = line
                    break

            # If no company found with indicators, use the longest line that's not name/title/contact info
            if not extracted_info['company']:
                for line in lines:
                    if (line != extracted_info['name'] and
                            line != extracted_info['title'] and
                            not re.search(email_pattern, line, re.IGNORECASE) and
                            not re.search(phone_pattern, line) and
                            len(line) > 10):
                        extracted_info['company'] = line
                        break

            # Format the structured output
            structured_parts = []
            if extracted_info['name']:
                structured_parts.append(f"Name: {extracted_info['name']}")
            if extracted_info['title']:
                structured_parts.append(f"Title: {extracted_info['title']}")
            if extracted_info['company']:
                structured_parts.append(f"Company: {extracted_info['company']}")
            if extracted_info['email']:
                structured_parts.append(f"Email: {extracted_info['email']}")
            if extracted_info['phone']:
                structured_parts.append(f"Phone: {extracted_info['phone']}")
            if extracted_info['website']:
                structured_parts.append(f"Website: {extracted_info['website']}")

            if structured_parts:
                return " | ".join(structured_parts)
            else:
                return raw_text  # Return raw text if no structured info found

        except Exception as e:
            self.logger.debug(f"Business card info extraction failed: {str(e)}")
            return raw_text  # Return raw text if processing fails

    def _ocr_with_easyocr(self, image_data: bytes) -> Optional[str]:
        """
        Extract text using EasyOCR with preprocessing.

        Args:
            image_data: Image data as bytes

        Returns:
            Extracted text or None if failed
        """
        try:
            import easyocr
            import numpy as np
            from PIL import Image
            import io

            # Load image
            image = Image.open(io.BytesIO(image_data))

            # Apply preprocessing
            processed_image = self._preprocess_image_for_ocr(image)

            # Convert to numpy array for EasyOCR
            image_array = np.array(processed_image)

            # Initialize EasyOCR reader with multiple languages for better detection
            reader = easyocr.Reader(['en'], gpu=False)  # Disable GPU for compatibility

            # Extract text with different confidence thresholds
            results = reader.readtext(image_array, detail=1)

            # Sort results by confidence and position
            high_conf_results = [result for result in results if result[2] > 0.6]
            medium_conf_results = [result for result in results if 0.3 < result[2] <= 0.6]

            # Try high confidence results first
            if high_conf_results:
                text_parts = [result[1] for result in high_conf_results]
                text = ' '.join(text_parts)

                # Extract structured info if it looks like a business card
                structured_text = self._extract_business_card_info(text)
                if structured_text:
                    return structured_text
                elif len(text.strip()) > 10:
                    return text.strip()

            # Fall back to medium confidence results
            if medium_conf_results:
                text_parts = [result[1] for result in medium_conf_results]
                text = ' '.join(text_parts)

                if len(text.strip()) > 10:
                    return text.strip()

            # Try with original image if preprocessing didn't help
            try:
                original_array = np.array(image)
                results = reader.readtext(original_array, detail=1)
                text_parts = [result[1] for result in results if result[2] > 0.4]
                text = ' '.join(text_parts)

                if len(text.strip()) > 10:
                    return text.strip()
            except:
                pass

            return None

        except ImportError:
            self.logger.debug(
                "EasyOCR not available. Install with: pip install easyocr")
            return None
        except Exception as e:
            self.logger.debug(f"EasyOCR failed: {str(e)}")
            return None

    def _ocr_with_cloud_service(self, image_data: bytes) -> Optional[str]:
        """
        Extract text using cloud OCR service (Google Vision API, Azure, etc.).

        Args:
            image_data: Image data as bytes

        Returns:
            Extracted text or None if failed
        """
        try:
            # Try Google Vision API if credentials are available
            google_creds = self.config.get('google_vision_credentials')
            if google_creds:
                return self._ocr_with_google_vision(image_data, google_creds)

            # Try Azure Computer Vision if key is available
            azure_key = self.config.get('azure_vision_key')
            azure_endpoint = self.config.get('azure_vision_endpoint')
            if azure_key and azure_endpoint:
                return self._ocr_with_azure_vision(image_data, azure_key, azure_endpoint)

            self.logger.debug("No cloud OCR service credentials available")
            return None

        except Exception as e:
            self.logger.debug(f"Cloud OCR failed: {str(e)}")
            return None

    def _ocr_with_google_vision(self, image_data: bytes, credentials_path: str) -> Optional[str]:
        """
        Extract text using Google Vision API.

        Args:
            image_data: Image data as bytes
            credentials_path: Path to Google credentials JSON

        Returns:
            Extracted text or None if failed
        """
        try:
            from google.cloud import vision
            import os

            # Set credentials
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path

            # Initialize client
            client = vision.ImageAnnotatorClient()

            # Create image object
            image = vision.Image(content=image_data)

            # Perform text detection
            response = client.text_detection(image=image)
            texts = response.text_annotations

            if texts:
                return texts[0].description.strip()

            return None

        except ImportError:
            self.logger.debug(
                "Google Vision API not available. Install with: pip install google-cloud-vision")
            return None
        except Exception as e:
            self.logger.debug(f"Google Vision API failed: {str(e)}")
            return None

    def _ocr_with_azure_vision(self, image_data: bytes, api_key: str, endpoint: str) -> Optional[str]:
        """
        Extract text using Azure Computer Vision API.

        Args:
            image_data: Image data as bytes
            api_key: Azure API key
            endpoint: Azure endpoint URL

        Returns:
            Extracted text or None if failed
        """
        try:
            import time

            # Submit image for OCR
            headers = {
                'Ocp-Apim-Subscription-Key': api_key,
                'Content-Type': 'application/octet-stream'
            }

            # Start OCR operation
            ocr_url = f"{endpoint}/vision/v3.2/read/analyze"
            response = requests.post(ocr_url, headers=headers, data=image_data)
            response.raise_for_status()

            # Get operation location
            operation_url = response.headers['Operation-Location']

            # Poll for results
            for _ in range(10):  # Max 10 attempts
                time.sleep(1)
                result_response = requests.get(operation_url, headers={
                    'Ocp-Apim-Subscription-Key': api_key})
                result_response.raise_for_status()
                result = result_response.json()

                if result['status'] == 'succeeded':
                    # Extract text from results
                    text_parts = []
                    for read_result in result['analyzeResult']['readResults']:
                        for line in read_result['lines']:
                            text_parts.append(line['text'])

                    return ' '.join(text_parts)
                elif result['status'] == 'failed':
                    break

            return None

        except Exception as e:
            self.logger.debug(f"Azure Vision API failed: {str(e)}")
            return None

    def _scrape_social_media(self, url: str) -> Optional[str]:
        """
        Scrape social media profile data.

        Args:
            url: Social media profile URL

        Returns:
            Scraped profile data or None if failed
        """
        try:
            if self.is_dry_run():
                return f"[DRY RUN] Would scrape social media: {url}"

            # Determine platform and use appropriate scraping method
            if 'linkedin.com' in url.lower():
                return self._scrape_linkedin_profile(url)
            elif 'facebook.com' in url.lower():
                return self._scrape_facebook_profile(url)
            else:
                # Generic social media scraping
                return self._scrape_website(url)

        except Exception as e:
            self.logger.error(
                f"Social media scraping failed for {url}: {str(e)}")
            return None

    def _scrape_linkedin_profile(self, url: str) -> Optional[str]:
        """
        Scrape LinkedIn profile with specific handling for LinkedIn's structure.

        Args:
            url: LinkedIn profile URL

        Returns:
            Scraped LinkedIn profile data or None if failed
        """
        try:
            # LinkedIn has anti-scraping measures, so we'll try different approaches

            # Method 1: Try with Serper API if available (most reliable)
            serper_key = self.config.get('serper_api_key')
            if serper_key:
                linkedin_data = self._scrape_with_serper(url, serper_key)
                if linkedin_data:
                    return linkedin_data

            # Method 2: Try direct scraping with LinkedIn-specific headers
            linkedin_data = self._scrape_linkedin_direct(url)
            if linkedin_data:
                return linkedin_data

            # Method 3: Fallback to generic scraping
            return self._direct_website_scrape(url)

        except Exception as e:
            self.logger.error(f"LinkedIn scraping failed: {str(e)}")
            return None

    def _scrape_linkedin_direct(self, url: str) -> Optional[str]:
        """
        Direct LinkedIn scraping with specific headers and handling.

        Args:
            url: LinkedIn profile URL

        Returns:
            Scraped content or None if failed
        """
        try:
            # LinkedIn-specific headers to avoid blocking
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
            }

            # Add delay to avoid rate limiting
            time.sleep(2)

            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()

            # Parse LinkedIn-specific content
            try:
                from bs4 import BeautifulSoup
                soup = BeautifulSoup(response.content, 'html.parser')

                # Extract LinkedIn-specific elements
                profile_data = []

                # Try to extract name
                name_selectors = [
                    'h1.text-heading-xlarge',
                    '.pv-text-details__left-panel h1',
                    '.ph5 h1'
                ]
                for selector in name_selectors:
                    name_elem = soup.select_one(selector)
                    if name_elem:
                        profile_data.append(
                            f"Name: {name_elem.get_text().strip()}")
                        break

                # Try to extract headline/title
                headline_selectors = [
                    '.text-body-medium.break-words',
                    '.pv-text-details__left-panel .text-body-medium',
                    '.ph5 .text-body-medium'
                ]
                for selector in headline_selectors:
                    headline_elem = soup.select_one(selector)
                    if headline_elem:
                        profile_data.append(
                            f"Title: {headline_elem.get_text().strip()}")
                        break

                # Try to extract company
                company_selectors = [
                    '.pv-text-details__right-panel',
                    '.pv-entity__summary-info h3',
                    '.experience-section .pv-entity__summary-info h3'
                ]
                for selector in company_selectors:
                    company_elem = soup.select_one(selector)
                    if company_elem:
                        profile_data.append(
                            f"Company: {company_elem.get_text().strip()}")
                        break

                if profile_data:
                    return ' | '.join(profile_data)

                # Fallback: extract all text
                text = soup.get_text()
                lines = (line.strip() for line in text.splitlines())
                text = ' '.join(line for line in lines if line)

                if len(text) > 3000:
                    text = text[:3000] + "..."

                return text if len(text) > 50 else None

            except ImportError:
                # Fallback without BeautifulSoup
                content = response.text
                if len(content) > 2000:
                    content = content[:2000] + "..."
                return content

        except Exception as e:
            self.logger.debug(f"Direct LinkedIn scraping failed: {str(e)}")
            return None

    def _scrape_facebook_profile(self, url: str) -> Optional[str]:
        """
        Scrape Facebook profile with specific handling for Facebook's structure.

        Args:
            url: Facebook profile URL

        Returns:
            Scraped Facebook profile data or None if failed
        """
        try:
            # Facebook has strong anti-scraping measures

            # Method 1: Try with Serper API if available (most reliable)
            serper_key = self.config.get('serper_api_key')
            if serper_key:
                facebook_data = self._scrape_with_serper(url, serper_key)
                if facebook_data:
                    return facebook_data

            # Method 2: Try direct scraping with Facebook-specific headers
            facebook_data = self._scrape_facebook_direct(url)
            if facebook_data:
                return facebook_data

            # Method 3: Fallback to generic scraping
            return self._direct_website_scrape(url)

        except Exception as e:
            self.logger.error(f"Facebook scraping failed: {str(e)}")
            return None

    def _scrape_facebook_direct(self, url: str) -> Optional[str]:
        """
        Direct Facebook scraping with specific headers and handling.

        Args:
            url: Facebook profile URL

        Returns:
            Scraped content or None if failed
        """
        try:
            # Facebook-specific headers
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
                'Sec-Fetch-Dest': 'document',
                'Sec-Fetch-Mode': 'navigate',
                'Sec-Fetch-Site': 'none',
            }

            # Add delay to avoid rate limiting
            time.sleep(3)

            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()

            # Parse Facebook-specific content
            try:
                from bs4 import BeautifulSoup
                soup = BeautifulSoup(response.content, 'html.parser')

                # Extract Facebook-specific elements
                profile_data = []

                # Try to extract page/profile name
                name_selectors = [
                    'h1[data-testid="page_title"]',
                    '.x1heor9g.x1qlqyl8.x1pd3egz.x1a2a7pz h1',
                    '#seo_h1_tag',
                    'title'
                ]
                for selector in name_selectors:
                    name_elem = soup.select_one(selector)
                    if name_elem:
                        name_text = name_elem.get_text().strip()
                        if name_text and len(name_text) > 3:
                            profile_data.append(f"Name: {name_text}")
                            break

                # Try to extract description/about
                desc_selectors = [
                    '[data-testid="page_description"]',
                    '.x1i10hfl.xjbqb8w.x6umtig.x1b1mbwd.xaqea5y.xav7gou.x9f619.x1ypdohk.xt0psk2.xe8uvvx.xdj266r.x11i5rnm.xat24cr.x1mh8g0r.xexx8yu.x4uap5.x18d9i69.xkhd6sd.x16tdsg8.x1hl2dhg.xggy1nq.x1a2a7pz.x1sur9pj.xkrqix3.x1fey0fg.x1s688f',
                    '.x1i10hfl.xjbqb8w.x6umtig.x1b1mbwd.xaqea5y.xav7gou.x9f619.x1ypdohk.xt0psk2.xe8uvvx.xdj266r.x11i5rnm.xat24cr.x1mh8g0r.xexx8yu.x4uap5.x18d9i69.xkhd6sd.x16tdsg8.x1hl2dhg.xggy1nq.x1a2a7pz.x1heor9g.xt0b8zv.xo1l8bm'
                ]
                for selector in desc_selectors:
                    desc_elem = soup.select_one(selector)
                    if desc_elem:
                        desc_text = desc_elem.get_text().strip()
                        if desc_text and len(desc_text) > 10:
                            profile_data.append(f"Description: {desc_text}")
                            break

                if profile_data:
                    return ' | '.join(profile_data)

                # Fallback: extract meta description
                meta_desc = soup.find('meta', attrs={'name': 'description'})
                if meta_desc and meta_desc.get('content'):
                    return f"Description: {meta_desc['content']}"

                # Last fallback: extract title
                title = soup.find('title')
                if title:
                    return f"Title: {title.get_text().strip()}"

                return None

            except ImportError:
                # Fallback without BeautifulSoup
                content = response.text
                if 'Facebook' in content and len(content) > 100:
                    # Extract title from HTML
                    title_start = content.find('<title>')
                    title_end = content.find('</title>')
                    if title_start != -1 and title_end != -1:
                        title = content[title_start + 7:title_end].strip()
                        return f"Title: {title}"
                return None

        except Exception as e:
            self.logger.debug(f"Direct Facebook scraping failed: {str(e)}")
            return None

def _extract_customer_info(self, raw_data: str) -> Dict[str, Any]:
|
|
1371
|
+
"""
|
|
1372
|
+
Extract structured customer information using LLM.
|
|
1373
|
+
|
|
1374
|
+
Args:
|
|
1375
|
+
raw_data: Combined raw data from all sources
|
|
1376
|
+
|
|
1377
|
+
Returns:
|
|
1378
|
+
Structured customer information dictionary
|
|
1379
|
+
"""
|
|
1380
|
+
try:
|
|
1381
|
+
if self.is_dry_run():
|
|
1382
|
+
return {
|
|
1383
|
+
'contact_name': 'John Doe',
|
|
1384
|
+
'company_name': 'Example Corp',
|
|
1385
|
+
'customer_phone': '+1-555-0123',
|
|
1386
|
+
'customer_email': 'contact@example.com',
|
|
1387
|
+
'customer_linkedin': 'https://linkedin.com/company/example',
|
|
1388
|
+
'customer_facebook': 'https://facebook.com/example',
|
|
1389
|
+
'company_website': 'https://example.com',
|
|
1390
|
+
'customer_address': '123 Main St, City, State',
|
|
1391
|
+
'company_business': 'Technology solutions',
|
|
1392
|
+
'company_industries': ['Technology', 'Software'],
|
|
1393
|
+
'founders': ['John Doe'],
|
|
1394
|
+
'branches': ['Main Office'],
|
|
1395
|
+
'customer_description': '[DRY RUN] Mock customer description'
|
|
1396
|
+
}
|
|
1397
|
+
|
|
1398
|
+
prompt = f"""This is the customer information: {raw_data}.
|
|
1399
|
+
|
|
1400
|
+
Based on the above data, generate a JSON structure with the following format:
|
|
1401
|
+
|
|
1402
|
+
{{
|
|
1403
|
+
"contact_name": "Name of the contact/representative",
|
|
1404
|
+
"company_name": "Name of the company",
|
|
1405
|
+
"customer_phone": "Company/Contact phone number in the correct format",
|
|
1406
|
+
"customer_email": "Company/Contact email",
|
|
1407
|
+
"customer_linkedin": "LinkedIn profile URL",
|
|
1408
|
+
"customer_facebook": "Facebook profile URL",
|
|
1409
|
+
"company_website": "Company website (valid structure)",
|
|
1410
|
+
"customer_address": "Company/Contact address",
|
|
1411
|
+
"company_business": "Main business activities of the company",
|
|
1412
|
+
"company_industries": ["List of industries or fields of operation"],
|
|
1413
|
+
"founders": ["List of founders"],
|
|
1414
|
+
"branches": ["List of branches"],
|
|
1415
|
+
"customer_description": "All information about the customer"
|
|
1416
|
+
}}
|
|
1417
|
+
|
|
1418
|
+
Rules:
|
|
1419
|
+
1. Ensure `company_website` is correctly structured as a valid URL.
|
|
1420
|
+
2. If `company_name` is an array with multiple values:
|
|
1421
|
+
- Use available data and context to generate a comprehensive, accurate company name.
|
|
1422
|
+
3. Return an empty result if the required information is not available.
|
|
1423
|
+
4. Do not include the word ```JSON in the result.
|
|
1424
|
+
5. Provide the output directly without any explanations or additional text. In JSON response, use double quotes instead of single quotes."""
|
|
1425
|
+
|
|
1426
|
+
response = self.call_llm(prompt, temperature=0.2)
|
|
1427
|
+
|
|
1428
|
+
# Parse the JSON response
|
|
1429
|
+
customer_info = self.parse_json_response(response)
|
|
1430
|
+
|
|
1431
|
+
self.logger.info("Successfully extracted customer information")
|
|
1432
|
+
return customer_info
|
|
1433
|
+
|
|
1434
|
+
except Exception as e:
|
|
1435
|
+
self.logger.error(f"Customer info extraction failed: {str(e)}")
|
|
1436
|
+
# Try basic regex extraction as fallback
|
|
1437
|
+
fallback_info = self._extract_basic_info_fallback(raw_data)
|
|
1438
|
+
self.logger.info("Using basic regex extraction as fallback")
|
|
1439
|
+
return fallback_info
|
|
1440
|
+
    def _extract_basic_info_fallback(self, raw_data: str) -> Dict[str, Any]:
        """
        Extract basic information using regex patterns when LLM is not available.

        Args:
            raw_data: Raw text data to extract from

        Returns:
            Dictionary with extracted basic information
        """
        import re

        # Initialize result with empty values
        result = {
            'contact_name': '',
            'company_name': '',
            'customer_phone': '',
            'customer_email': '',
            'customer_linkedin': '',
            'customer_facebook': '',
            'company_website': '',
            'customer_address': '',
            'company_business': '',
            'company_industries': [],
            'founders': [],
            'branches': [],
            'customer_description': raw_data[:500] + "..." if len(raw_data) > 500 else raw_data
        }

        # Extract email addresses
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
        emails = re.findall(email_pattern, raw_data, re.IGNORECASE)
        if emails:
            result['customer_email'] = emails[0]

        # Extract phone numbers (various formats)
        phone_pattern = r'(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})'
        phones = re.findall(phone_pattern, raw_data)
        if phones:
            result['customer_phone'] = ''.join(phones[0])

        # Extract URLs
        url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
        urls = re.findall(url_pattern, raw_data, re.IGNORECASE)
        for url in urls:
            if 'linkedin.com' in url.lower():
                result['customer_linkedin'] = url
            elif 'facebook.com' in url.lower():
                result['customer_facebook'] = url
            elif not result['company_website']:  # First non-social URL becomes website
                result['company_website'] = url

        # Extract names and companies using common patterns
        # Look for "Customer: Name at Company" pattern
        customer_pattern = r'Customer:\s*([^,\n]+?)(?:\s+at\s+([^,\n]+?))?(?:,|$|\n)'
        customer_match = re.search(customer_pattern, raw_data, re.IGNORECASE)
        if customer_match:
            result['contact_name'] = customer_match.group(1).strip()
            if customer_match.group(2):
                result['company_name'] = customer_match.group(2).strip()

        # Look for "Name: value" patterns
        name_patterns = [
            (r'Name:\s*([^\n,]+)', 'contact_name'),
            (r'Company:\s*([^\n,]+)', 'company_name'),
            (r'Organization:\s*([^\n,]+)', 'company_name'),
            (r'Business:\s*([^\n,]+)', 'company_business'),
            (r'Address:\s*([^\n,]+)', 'customer_address')
        ]

        for pattern, field in name_patterns:
            match = re.search(pattern, raw_data, re.IGNORECASE)
            if match and not result[field]:  # Only set if not already set
                result[field] = match.group(1).strip()

        # If we found an email but no name, try to extract name from email
        if result['customer_email'] and not result['contact_name']:
            email_name = result['customer_email'].split('@')[0]
            # Convert common email formats to names
            if '.' in email_name:
                parts = email_name.split('.')
                result['contact_name'] = ' '.join(part.capitalize() for part in parts)
            else:
                result['contact_name'] = email_name.capitalize()

        # If we found an email but no company, try to extract from email domain
        if result['customer_email'] and not result['company_name']:
            domain = result['customer_email'].split('@')[1]
            # Remove common TLDs and convert to company name
            company_part = domain.split('.')[0]
            result['company_name'] = company_part.upper()

        return result

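    # Illustrative sketch, not part of the packaged source: given raw text such as
    # "Customer: Jane Smith at Acme Corp, jane.smith@acme.com https://acme.com",
    # the regex fallback above would return roughly
    #   {'contact_name': 'Jane Smith', 'company_name': 'Acme Corp',
    #    'customer_email': 'jane.smith@acme.com', 'company_website': 'https://acme.com', ...}
    # with 'customer_description' carrying the raw text itself.
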
    def _perform_company_research(self, customer_info: Dict[str, Any]) -> Optional[str]:
        """
        Perform enhanced company research using multiple search strategies.

        Args:
            customer_info: Extracted customer information

        Returns:
            Research results or None if failed
        """
        try:
            company_name = customer_info.get('company_name', '')
            company_website = customer_info.get('company_website', '')

            if not company_name:
                return None

            if self.is_dry_run():
                return f"[DRY RUN] Would research company: {company_name} {company_website}"

            # Use Serper API for search if available
            serper_key = self.config.get('serper_api_key')
            if not serper_key:
                self.logger.warning("Company research skipped - no Serper API key available")
                return None

            research_results = []

            # Strategy 1: General company search
            general_query = f'"{company_name}" company profile business'
            general_results = self._search_with_serper(general_query, serper_key, 'search')
            if general_results:
                research_results.append(f"General Info: {general_results}")

            # Strategy 2: News search for recent company information
            news_query = f'"{company_name}" company news'
            news_results = self._search_with_serper(news_query, serper_key, 'news')
            if news_results:
                research_results.append(f"Recent News: {news_results}")

            # Strategy 3: Industry-specific search
            if company_website:
                industry_query = f'"{company_name}" industry services products site:{company_website}'
                industry_results = self._search_with_serper(industry_query, serper_key, 'search')
                if industry_results:
                    research_results.append(f"Industry Info: {industry_results}")

            # Strategy 4: Contact and location search
            contact_query = f'"{company_name}" contact address phone location'
            contact_results = self._search_with_serper(contact_query, serper_key, 'search')
            if contact_results:
                research_results.append(f"Contact Info: {contact_results}")

            if research_results:
                combined_research = ' | '.join(research_results)
                # Limit length to avoid token limits
                if len(combined_research) > 4000:
                    combined_research = combined_research[:4000] + "..."
                return combined_research

            return None

        except Exception as e:
            self.logger.error(f"Company research failed: {str(e)}")
            return None

    def _search_with_serper(self, query: str, api_key: str, search_type: str = 'search') -> Optional[str]:
        """
        Enhanced search using Serper API with multiple search types.

        Args:
            query: Search query
            api_key: Serper API key
            search_type: Type of search ('search', 'news', 'images')

        Returns:
            Search results or None if failed
        """
        try:
            headers = {
                'X-API-KEY': api_key,
                'Content-Type': 'application/json'
            }

            body = {
                'q': query,
                'num': 10  # Get more results for better fallback
            }

            # Choose appropriate endpoint
            endpoints = {
                'search': 'https://google.serper.dev/search',
                'news': 'https://google.serper.dev/news',
                'images': 'https://google.serper.dev/images'
            }

            endpoint = endpoints.get(search_type, endpoints['search'])

            response = requests.post(
                endpoint,
                json=body,
                headers=headers,
                timeout=60  # Reduced timeout for faster fallback
            )

            if response.status_code == 200:
                result = response.json()

                # Extract different types of results based on search type
                if search_type == 'search':
                    return self._process_search_results(result)
                elif search_type == 'news':
                    return self._process_news_results(result)
                else:
                    return self._process_search_results(result)

            elif response.status_code == 429:
                self.logger.warning("Serper API rate limit exceeded, waiting before retry")
                time.sleep(2)
                return None
            else:
                self.logger.warning(
                    f"Serper search API returned status {response.status_code}: {response.text}")
                return None

        except requests.exceptions.Timeout:
            self.logger.warning(f"Serper search timed out for query: {query}")
            return None
        except Exception as e:
            self.logger.error(f"Serper search failed: {str(e)}")
            return None

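    # Illustrative sketch, not part of the packaged source: the request built above is
    # equivalent to a standalone POST against the Serper search endpoint, where
    # SERPER_API_KEY and the example query string are placeholder assumptions:
    #
    #   import requests
    #   response = requests.post(
    #       'https://google.serper.dev/search',
    #       json={'q': '"Example Corp" company profile business', 'num': 10},
    #       headers={'X-API-KEY': SERPER_API_KEY, 'Content-Type': 'application/json'},
    #       timeout=60,
    #   )
    #   result = response.json()  # keys consumed above: 'knowledgeGraph', 'organic', 'answerBox'
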
    def _process_search_results(self, result: Dict[str, Any]) -> Optional[str]:
        """
        Process search results from Serper API.

        Args:
            result: JSON response from Serper API

        Returns:
            Processed search results text
        """
        try:
            processed_parts = []

            # Extract knowledge graph info (company info box)
            knowledge_graph = result.get('knowledgeGraph', {})
            if knowledge_graph:
                kg_title = knowledge_graph.get('title', '')
                kg_description = knowledge_graph.get('description', '')
                kg_attributes = knowledge_graph.get('attributes', {})

                if kg_title:
                    processed_parts.append(f"Company: {kg_title}")
                if kg_description:
                    processed_parts.append(f"Description: {kg_description}")

                # Add relevant attributes
                for key, value in kg_attributes.items():
                    if key.lower() in ['founded', 'headquarters', 'ceo', 'industry', 'revenue']:
                        processed_parts.append(f"{key}: {value}")

            # Extract organic results
            organic_results = result.get('organic', [])
            snippets = []

            for item in organic_results:
                title = item.get('title', '')
                snippet = item.get('snippet', '')
                link = item.get('link', '')

                if snippet:
                    # Combine title and snippet for better context
                    if title:
                        snippets.append(f"{title}: {snippet}")
                    else:
                        snippets.append(snippet)

            if snippets:
                processed_parts.extend(snippets[:5])  # Top 5 results

            # Extract answer box if available
            answer_box = result.get('answerBox', {})
            if answer_box:
                answer = answer_box.get('answer', '')
                if answer:
                    processed_parts.insert(0, f"Answer: {answer}")

            return ' | '.join(processed_parts) if processed_parts else None

        except Exception as e:
            self.logger.debug(f"Failed to process search results: {str(e)}")
            # Fallback to simple snippet extraction
            organic_results = result.get('organic', [])
            snippets = [item.get('snippet', '') for item in organic_results if 'snippet' in item]
            return ', '.join(snippets) if snippets else None

    def _process_news_results(self, result: Dict[str, Any]) -> Optional[str]:
        """
        Process news results from Serper API.

        Args:
            result: JSON response from Serper API

        Returns:
            Processed news results text
        """
        try:
            news_results = result.get('news', [])
            news_snippets = []

            for item in news_results[:3]:  # Top 3 news items
                title = item.get('title', '')
                snippet = item.get('snippet', '')
                date = item.get('date', '')

                if snippet:
                    news_item = f"{title}: {snippet}"
                    if date:
                        news_item += f" ({date})"
                    news_snippets.append(news_item)

            return ' | '.join(news_snippets) if news_snippets else None

        except Exception as e:
            self.logger.debug(f"Failed to process news results: {str(e)}")
            return None

    def _scrape_company_website(self, customer_info: Dict[str, Any], data_sources: List[str]) -> Optional[str]:
        """
        Scrape company website if not already scraped.

        Args:
            customer_info: Extracted customer information
            data_sources: List of already processed data sources

        Returns:
            Website content or None if failed/skipped
        """
        try:
            # Only scrape if website wasn't already processed
            if 'website' in data_sources:
                return None

            company_website = customer_info.get('company_website', '')
            if not company_website:
                return None

            return self._scrape_website(company_website)

        except Exception as e:
            self.logger.error(f"Company website research failed: {str(e)}")
            return None

    def validate_input(self, context: Dict[str, Any]) -> bool:
        """
        Validate input data for data acquisition stage.

        Args:
            context: Execution context

        Returns:
            True if input is valid
        """
        input_data = context.get('input_data', {})

        # Check if at least one data source is available (matching executor schema)
        sources = [
            input_data.get('input_website'),
            input_data.get('input_description'),
            input_data.get('input_business_card'),
            input_data.get('input_linkedin_url'),
            input_data.get('input_facebook_url'),
            input_data.get('input_freetext')
        ]

        return any(sources)

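    # Illustrative sketch, not part of the packaged source: validate_input() accepts a
    # context as soon as any one source field is non-empty, e.g.
    #   context = {'input_data': {'input_website': 'https://example.com'}}
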
    def get_required_fields(self) -> List[str]:
        """
        Get list of required input fields for this stage.

        Returns:
            List of required field names (at least one data source required)
        """
        return []  # No strictly required fields, but at least one source needed