fusesell-1.3.42-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. fusesell-1.3.42.dist-info/METADATA +873 -0
  2. fusesell-1.3.42.dist-info/RECORD +35 -0
  3. fusesell-1.3.42.dist-info/WHEEL +5 -0
  4. fusesell-1.3.42.dist-info/entry_points.txt +2 -0
  5. fusesell-1.3.42.dist-info/licenses/LICENSE +21 -0
  6. fusesell-1.3.42.dist-info/top_level.txt +2 -0
  7. fusesell.py +20 -0
  8. fusesell_local/__init__.py +37 -0
  9. fusesell_local/api.py +343 -0
  10. fusesell_local/cli.py +1480 -0
  11. fusesell_local/config/__init__.py +11 -0
  12. fusesell_local/config/default_email_templates.json +34 -0
  13. fusesell_local/config/default_prompts.json +19 -0
  14. fusesell_local/config/default_scoring_criteria.json +154 -0
  15. fusesell_local/config/prompts.py +245 -0
  16. fusesell_local/config/settings.py +277 -0
  17. fusesell_local/pipeline.py +978 -0
  18. fusesell_local/stages/__init__.py +19 -0
  19. fusesell_local/stages/base_stage.py +603 -0
  20. fusesell_local/stages/data_acquisition.py +1820 -0
  21. fusesell_local/stages/data_preparation.py +1238 -0
  22. fusesell_local/stages/follow_up.py +1728 -0
  23. fusesell_local/stages/initial_outreach.py +2972 -0
  24. fusesell_local/stages/lead_scoring.py +1452 -0
  25. fusesell_local/utils/__init__.py +36 -0
  26. fusesell_local/utils/agent_context.py +552 -0
  27. fusesell_local/utils/auto_setup.py +361 -0
  28. fusesell_local/utils/birthday_email_manager.py +467 -0
  29. fusesell_local/utils/data_manager.py +4857 -0
  30. fusesell_local/utils/event_scheduler.py +959 -0
  31. fusesell_local/utils/llm_client.py +342 -0
  32. fusesell_local/utils/logger.py +203 -0
  33. fusesell_local/utils/output_helpers.py +2443 -0
  34. fusesell_local/utils/timezone_detector.py +914 -0
  35. fusesell_local/utils/validators.py +436 -0
fusesell_local/stages/data_acquisition.py
@@ -0,0 +1,1820 @@
1
+ """
2
+ Data Acquisition Stage - Extract customer information from multiple sources
3
+ Converted from fusesell_data_acquisition.yml
4
+ """
5
+
6
+ import requests
7
+ import json
8
+ from typing import Dict, Any, Optional, List
9
+ from urllib.parse import urlparse
10
+ import time
11
+ from .base_stage import BaseStage
12
+
13
+
14
+ class DataAcquisitionStage(BaseStage):
15
+ """
16
+ Data Acquisition stage for extracting customer information from multiple sources.
17
+ Converts YAML workflow logic to Python implementation.
18
+ """
19
+
20
+ def execute(self, context: Dict[str, Any]) -> Dict[str, Any]:
21
+ """
22
+ Execute data acquisition stage.
23
+
24
+ Args:
25
+ context: Execution context
26
+
27
+ Returns:
28
+ Stage execution result
29
+ """
30
+ try:
31
+ input_data = context.get('input_data', {})
32
+
33
+ # Collect data from all available sources
34
+ collected_data = []
35
+ data_sources = []
36
+
37
+ # 1. Website scraping (matching original YAML: input_website)
38
+ if input_data.get('input_website'):
39
+ website_data = self._scrape_website(
40
+ input_data['input_website'])
41
+ if website_data:
42
+ collected_data.append(website_data)
43
+ data_sources.append('website')
44
+ self.logger.info("Successfully scraped website data")
45
+
46
+ # 2. Customer description (matching original YAML: input_description)
47
+ if input_data.get('input_description'):
48
+ collected_data.append(input_data['input_description'])
49
+ data_sources.append('description')
50
+ self.logger.info("Added customer description")
51
+
52
+ # 3. Business card processing (matching original YAML: input_business_card)
53
+ if input_data.get('input_business_card'):
54
+ business_card_data = self._process_business_card(
55
+ input_data['input_business_card'])
56
+ if business_card_data:
57
+ collected_data.append(business_card_data)
58
+ data_sources.append('business_card')
59
+ self.logger.info("Successfully processed business card")
60
+
61
+ # 4. Social media scraping (matching original YAML: input_facebook_url, input_linkedin_url)
62
+ if input_data.get('input_facebook_url'):
63
+ facebook_data = self._scrape_social_media(
64
+ input_data['input_facebook_url'])
65
+ if facebook_data:
66
+ collected_data.append(facebook_data)
67
+ data_sources.append('facebook')
68
+ self.logger.info("Successfully scraped Facebook data")
69
+
70
+ if input_data.get('input_linkedin_url'):
71
+ linkedin_data = self._scrape_social_media(
72
+ input_data['input_linkedin_url'])
73
+ if linkedin_data:
74
+ collected_data.append(linkedin_data)
75
+ data_sources.append('linkedin')
76
+ self.logger.info("Successfully scraped LinkedIn data")
77
+
78
+ # 5. Free text input (matching executor schema: input_freetext)
79
+ if input_data.get('input_freetext'):
80
+ collected_data.append(input_data['input_freetext'])
81
+ data_sources.append('freetext')
82
+ self.logger.info("Added free text input")
83
+
84
+ # Combine all collected data
85
+ combined_data = ' '.join(str(data)
86
+ for data in collected_data if data)
87
+
88
+ if not combined_data:
89
+ raise ValueError("No data could be collected from any source")
90
+
91
+             # 6. Extract structured customer information using LLM
92
+ customer_info = self._extract_customer_info(combined_data)
93
+
94
+             # 7. Perform additional company research
95
+ research_data = self._perform_company_research(customer_info)
96
+
97
+             # 8. Scrape company website if not already done
98
+ website_research_data = self._scrape_company_website(
99
+ customer_info, data_sources)
100
+
101
+ # Combine all research data
102
+ mini_research = ' '.join(
103
+ filter(None, [research_data, website_research_data]))
104
+
105
+ # Final result
106
+ result_data = {
107
+ **customer_info,
108
+ 'company_mini_search': mini_research,
109
+ 'research_mini': True,
110
+ 'data_sources': data_sources,
111
+ 'extraction_status': 'success',
112
+ 'customer_id': context.get('execution_id')
113
+ }
114
+
115
+ # Save to database
116
+ self.save_stage_result(context, result_data)
117
+
118
+ result = self.create_success_result(result_data, context)
119
+ return result
120
+
121
+ except Exception as e:
122
+ self.log_stage_error(context, e)
123
+ return self.handle_stage_error(e, context)
124
+
125
+ def _scrape_website(self, url: str) -> Optional[str]:
126
+ """
127
+ Scrape website content with enhanced fallback mechanisms.
128
+
129
+ Args:
130
+ url: Website URL to scrape
131
+
132
+ Returns:
133
+ Scraped text content or None if failed
134
+ """
135
+ try:
136
+ if self.is_dry_run():
137
+ return f"[DRY RUN] Would scrape website: {url}"
138
+
139
+ # Step 1: Try direct HTTP request first
140
+ scraped_content = self._direct_website_scrape(url)
141
+ if scraped_content:
142
+ self.logger.info(f"Successfully scraped website directly: {url}")
143
+ return scraped_content
144
+
145
+ # Step 2: Try Serper API scraping
146
+ serper_key = self.config.get('serper_api_key')
147
+ if serper_key:
148
+ scraped_content = self._scrape_with_serper(url, serper_key)
149
+ if scraped_content:
150
+ self.logger.info(f"Successfully scraped website via Serper API: {url}")
151
+ return scraped_content
152
+
153
+ # Step 3: Enhanced fallback - Search-based data recovery
154
+ self.logger.info(f"Direct scraping failed for {url}, attempting search-based fallback")
155
+ company_name = self._extract_company_name_from_url(url)
156
+ if company_name:
157
+ fallback_content = self._search_based_fallback(company_name, serper_key)
158
+ if fallback_content:
159
+ self.logger.info(f"Successfully recovered company data via search fallback for: {company_name}")
160
+ return fallback_content
161
+
162
+ self.logger.warning(f"All scraping methods failed for {url}")
163
+ return None
164
+
165
+ except Exception as e:
166
+ self.logger.error(f"Website scraping failed for {url}: {str(e)}")
167
+ return None
168
+
169
+ def _extract_company_name_from_url(self, url: str) -> Optional[str]:
170
+ """
171
+ Extract company name from URL for search fallback.
172
+
173
+ Args:
174
+ url: Website URL
175
+
176
+ Returns:
177
+ Extracted company name or None if failed
178
+ """
179
+ try:
180
+ from urllib.parse import urlparse
181
+
182
+ parsed = urlparse(url)
183
+ domain = parsed.netloc.lower()
184
+
185
+ # Remove common prefixes
186
+ if domain.startswith('www.'):
187
+ domain = domain[4:]
188
+
189
+ # Extract company name from domain
190
+ # Remove common TLDs
191
+ domain_parts = domain.split('.')
192
+ if len(domain_parts) >= 2:
193
+ company_name = domain_parts[0]
194
+
195
+ # Clean up common patterns
196
+ company_name = company_name.replace('-', ' ')
197
+ company_name = company_name.replace('_', ' ')
198
+
199
+ # Capitalize words
200
+ company_name = ' '.join(word.capitalize() for word in company_name.split())
201
+
202
+ return company_name
203
+
204
+ return None
205
+
206
+ except Exception as e:
207
+ self.logger.debug(f"Failed to extract company name from URL {url}: {str(e)}")
208
+ return None
209
+
210
+ def _search_based_fallback(self, company_name: str, api_key: str) -> Optional[str]:
211
+ """
212
+ Enhanced fallback mechanism using search to recover company data.
213
+
214
+ Args:
215
+ company_name: Company name to search for
216
+ api_key: Serper API key
217
+
218
+ Returns:
219
+ Company information from search results or None if failed
220
+ """
221
+ try:
222
+ self.logger.info(f"Attempting search-based fallback for company: {company_name}")
223
+
224
+ # Try multiple search strategies
225
+ search_queries = [
226
+ f'"{company_name}" company about',
227
+ f'"{company_name}" business services',
228
+ f'{company_name} company profile',
229
+ f'{company_name} official website',
230
+ f'{company_name} contact information'
231
+ ]
232
+
233
+ all_results = []
234
+
235
+ for query in search_queries:
236
+ try:
237
+ search_result = self._search_with_serper(query, api_key)
238
+ if search_result and len(search_result.strip()) > 50:
239
+ all_results.append(search_result)
240
+ self.logger.debug(f"Search query '{query}' returned {len(search_result)} characters")
241
+
242
+ # Add small delay between searches to avoid rate limiting
243
+ time.sleep(1)
244
+
245
+ except Exception as e:
246
+ self.logger.debug(f"Search query '{query}' failed: {str(e)}")
247
+ continue
248
+
249
+ if not all_results:
250
+ self.logger.warning(f"No search results found for company: {company_name}")
251
+ return None
252
+
253
+ # Combine and deduplicate results
254
+ combined_results = ' '.join(all_results)
255
+
256
+ # Limit length to avoid token limits
257
+ if len(combined_results) > 3000:
258
+ combined_results = combined_results[:3000] + "..."
259
+
260
+ # Try to find alternative URLs in search results
261
+ alternative_urls = self._extract_urls_from_search_results(combined_results)
262
+
263
+ # If we found alternative URLs, try scraping them
264
+ for alt_url in alternative_urls[:3]: # Try up to 3 alternative URLs
265
+ try:
266
+ self.logger.info(f"Trying alternative URL from search: {alt_url}")
267
+ scraped_content = self._direct_website_scrape(alt_url)
268
+ if scraped_content and len(scraped_content.strip()) > 100:
269
+ self.logger.info(f"Successfully scraped alternative URL: {alt_url}")
270
+ return f"Search Results: {combined_results}\n\nAlternative Website Content: {scraped_content}"
271
+ except Exception as e:
272
+ self.logger.debug(f"Failed to scrape alternative URL {alt_url}: {str(e)}")
273
+ continue
274
+
275
+ # Return search results even if no alternative URLs worked
276
+ self.logger.info(f"Returning search results for company: {company_name}")
277
+ return f"Search Results: {combined_results}"
278
+
279
+ except Exception as e:
280
+ self.logger.error(f"Search-based fallback failed for {company_name}: {str(e)}")
281
+ return None
282
+
283
+ def _extract_urls_from_search_results(self, search_text: str) -> List[str]:
284
+ """
285
+ Extract potential website URLs from search results.
286
+
287
+ Args:
288
+ search_text: Search results text
289
+
290
+ Returns:
291
+ List of extracted URLs
292
+ """
293
+ try:
294
+ import re
295
+
296
+ # Pattern to match URLs in search results
297
+ url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
298
+ urls = re.findall(url_pattern, search_text, re.IGNORECASE)
299
+
300
+ # Filter out common non-company URLs
301
+ filtered_urls = []
302
+ exclude_domains = [
303
+ 'google.com', 'facebook.com', 'linkedin.com', 'twitter.com',
304
+ 'instagram.com', 'youtube.com', 'wikipedia.org', 'yelp.com',
305
+ 'glassdoor.com', 'indeed.com', 'crunchbase.com'
306
+ ]
307
+
308
+ for url in urls:
309
+ # Skip if it's a social media or directory site
310
+ if not any(domain in url.lower() for domain in exclude_domains):
311
+ # Skip if it's too long (likely not a main company website)
312
+ if len(url) < 100:
313
+ filtered_urls.append(url)
314
+
315
+ # Remove duplicates while preserving order
316
+ seen = set()
317
+ unique_urls = []
318
+ for url in filtered_urls:
319
+ if url not in seen:
320
+ seen.add(url)
321
+ unique_urls.append(url)
322
+
323
+ return unique_urls[:5] # Return top 5 URLs
324
+
325
+ except Exception as e:
326
+ self.logger.debug(f"Failed to extract URLs from search results: {str(e)}")
327
+ return []
328
+
329
+ def _direct_website_scrape(self, url: str) -> Optional[str]:
330
+ """
331
+ Direct website scraping using requests and basic HTML parsing.
332
+
333
+ Args:
334
+ url: Website URL to scrape
335
+
336
+ Returns:
337
+ Scraped text content or None if failed
338
+ """
339
+ try:
340
+ headers = {
341
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
342
+ }
343
+
344
+ response = requests.get(url, headers=headers, timeout=30)
345
+ response.raise_for_status()
346
+
347
+ # Basic HTML parsing to extract text
348
+ try:
349
+ from bs4 import BeautifulSoup
350
+ soup = BeautifulSoup(response.content, 'html.parser')
351
+
352
+ # Remove script and style elements
353
+ for script in soup(["script", "style"]):
354
+ script.decompose()
355
+
356
+ # Get text content
357
+ text = soup.get_text()
358
+
359
+ # Clean up text
360
+ lines = (line.strip() for line in text.splitlines())
361
+ chunks = (phrase.strip()
362
+ for line in lines for phrase in line.split(" "))
363
+ text = ' '.join(chunk for chunk in chunks if chunk)
364
+
365
+ # Limit text length to avoid token limits
366
+ if len(text) > 5000:
367
+ text = text[:5000] + "..."
368
+
369
+ return text
370
+
371
+ except ImportError:
372
+ self.logger.warning(
373
+ "BeautifulSoup not available. Install with: pip install beautifulsoup4")
374
+ # Fallback: return raw HTML (limited)
375
+ content = response.text
376
+ if len(content) > 2000:
377
+ content = content[:2000] + "..."
378
+ return content
379
+
380
+ except Exception as e:
381
+ self.logger.error(f"Direct website scraping failed: {str(e)}")
382
+ return None
383
+
384
+ def _scrape_with_serper(self, url: str, api_key: str) -> Optional[str]:
385
+ """
386
+ Scrape website using Serper API (original method).
387
+
388
+ Args:
389
+ url: Website URL to scrape
390
+ api_key: Serper API key
391
+
392
+ Returns:
393
+ Scraped text content or None if failed
394
+ """
395
+ try:
396
+ headers = {
397
+ 'X-API-KEY': api_key,
398
+ 'Content-Type': 'application/json'
399
+ }
400
+
401
+ body = {'url': url}
402
+
403
+ response = requests.post(
404
+ 'https://scrape.serper.dev',
405
+ json=body,
406
+ headers=headers,
407
+ timeout=300 # 5 minutes
408
+ )
409
+
410
+ if response.status_code == 200:
411
+ result = response.json()
412
+ return result.get('text', '')
413
+ else:
414
+ self.logger.warning(
415
+ f"Serper API returned status {response.status_code}")
416
+ return None
417
+
418
+ except Exception as e:
419
+ self.logger.error(f"Serper API scraping failed: {str(e)}")
420
+ return None
421
+
422
+ def _process_business_card(self, business_card_url: str) -> Optional[str]:
423
+ """
424
+ Process business card image using OCR.
425
+
426
+ Args:
427
+ business_card_url: URL to business card image
428
+
429
+ Returns:
430
+ Extracted text from business card or None if failed
431
+ """
432
+ try:
433
+ if self.is_dry_run():
434
+ return f"[DRY RUN] Would process business card: {business_card_url}"
435
+
436
+ # Download the image
437
+ image_data = self._download_image(business_card_url)
438
+ if not image_data:
439
+ return None
440
+
441
+ # Try OCR processing with different methods
442
+ extracted_text = self._extract_text_from_image(image_data)
443
+
444
+ if extracted_text:
445
+ self.logger.info(
446
+ "Successfully extracted text from business card")
447
+ return extracted_text
448
+ else:
449
+ self.logger.warning(
450
+ "No text could be extracted from business card")
451
+ return None
452
+
453
+ except Exception as e:
454
+ self.logger.error(f"Business card processing failed: {str(e)}")
455
+ return None
456
+
457
+ def _download_image(self, image_url: str) -> Optional[bytes]:
458
+ """
459
+ Download image or PDF from URL.
460
+
461
+ Args:
462
+ image_url: URL to image or PDF
463
+
464
+ Returns:
465
+ Image/PDF data as bytes or None if failed
466
+ """
467
+ try:
468
+ headers = {
469
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
470
+ }
471
+
472
+ response = requests.get(image_url, headers=headers, timeout=30)
473
+ response.raise_for_status()
474
+
475
+ # Verify it's an image or PDF
476
+ content_type = response.headers.get('content-type', '').lower()
477
+ if not (content_type.startswith('image/') or content_type == 'application/pdf'):
478
+ self.logger.warning(
479
+ f"URL does not point to an image or PDF: {content_type}")
480
+ return None
481
+
482
+ return response.content
483
+
484
+ except Exception as e:
485
+ self.logger.error(
486
+ f"Failed to download file from {image_url}: {str(e)}")
487
+ return None
488
+
489
+ def _extract_text_from_image(self, image_data: bytes) -> Optional[str]:
490
+ """
491
+ Extract text from image or PDF using OCR.
492
+
493
+ Args:
494
+ image_data: Image or PDF data as bytes
495
+
496
+ Returns:
497
+ Extracted text or None if failed
498
+ """
499
+ try:
500
+ # Check if it's a PDF first
501
+ if image_data.startswith(b'%PDF'):
502
+ text = self._extract_text_from_pdf(image_data)
503
+ if text:
504
+ return text
505
+
506
+ # Try Tesseract OCR first (most common)
507
+ text = self._ocr_with_tesseract(image_data)
508
+ if text:
509
+ return text
510
+
511
+ # Try EasyOCR as fallback
512
+ text = self._ocr_with_easyocr(image_data)
513
+ if text:
514
+ return text
515
+
516
+ # Try cloud OCR services if available
517
+ text = self._ocr_with_cloud_service(image_data)
518
+ if text:
519
+ return text
520
+
521
+ return None
522
+
523
+ except Exception as e:
524
+ self.logger.error(f"Text extraction failed: {str(e)}")
525
+ return None
526
+
527
+ def _extract_text_from_pdf(self, pdf_data: bytes) -> Optional[str]:
528
+ """
529
+ Extract text from PDF business card.
530
+
531
+ Args:
532
+ pdf_data: PDF data as bytes
533
+
534
+ Returns:
535
+ Extracted text or None if failed
536
+ """
537
+ try:
538
+ import io
539
+
540
+ # Try PyPDF2 first for text extraction
541
+ text = self._extract_pdf_text_pypdf2(pdf_data)
542
+ if text and len(text.strip()) > 10:
543
+ structured_text = self._extract_business_card_info(text)
544
+ return structured_text if structured_text else text
545
+
546
+ # If no text found, convert PDF to image and use OCR
547
+ text = self._extract_pdf_text_via_ocr(pdf_data)
548
+ if text:
549
+ return text
550
+
551
+ return None
552
+
553
+ except Exception as e:
554
+ self.logger.debug(f"PDF text extraction failed: {str(e)}")
555
+ return None
556
+
557
+ def _extract_pdf_text_pypdf2(self, pdf_data: bytes) -> Optional[str]:
558
+ """
559
+ Extract text from PDF using PyPDF2.
560
+
561
+ Args:
562
+ pdf_data: PDF data as bytes
563
+
564
+ Returns:
565
+ Extracted text or None if failed
566
+ """
567
+ try:
568
+ import PyPDF2
569
+ import io
570
+
571
+ pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_data))
572
+
573
+ text_parts = []
574
+ for page in pdf_reader.pages:
575
+ text = page.extract_text()
576
+ if text:
577
+ text_parts.append(text)
578
+
579
+ combined_text = ' '.join(text_parts).strip()
580
+ return combined_text if len(combined_text) > 10 else None
581
+
582
+ except ImportError:
583
+ self.logger.debug("PyPDF2 not available. Install with: pip install PyPDF2")
584
+ return None
585
+ except Exception as e:
586
+ self.logger.debug(f"PyPDF2 extraction failed: {str(e)}")
587
+ return None
588
+
589
+ def _extract_pdf_text_via_ocr(self, pdf_data: bytes) -> Optional[str]:
590
+ """
591
+ Convert PDF to image and extract text via OCR.
592
+
593
+ Args:
594
+ pdf_data: PDF data as bytes
595
+
596
+ Returns:
597
+ Extracted text or None if failed
598
+ """
599
+ try:
600
+ import fitz # PyMuPDF
601
+ import io
602
+ from PIL import Image
603
+
604
+ # Open PDF
605
+ pdf_document = fitz.open(stream=pdf_data, filetype="pdf")
606
+
607
+ all_text = []
608
+
609
+ # Process each page (usually business cards are single page)
610
+ for page_num in range(min(3, len(pdf_document))): # Max 3 pages
611
+ page = pdf_document[page_num]
612
+
613
+ # Convert page to image
614
+ mat = fitz.Matrix(2.0, 2.0) # 2x zoom for better OCR
615
+ pix = page.get_pixmap(matrix=mat)
616
+ img_data = pix.tobytes("png")
617
+
618
+ # Extract text using OCR
619
+ text = self._ocr_with_tesseract(img_data)
620
+ if not text:
621
+ text = self._ocr_with_easyocr(img_data)
622
+
623
+ if text:
624
+ all_text.append(text)
625
+
626
+ pdf_document.close()
627
+
628
+ combined_text = ' '.join(all_text).strip()
629
+ if combined_text:
630
+ structured_text = self._extract_business_card_info(combined_text)
631
+ return structured_text if structured_text else combined_text
632
+
633
+ return None
634
+
635
+ except ImportError:
636
+ self.logger.debug("PyMuPDF not available. Install with: pip install PyMuPDF")
637
+ return None
638
+ except Exception as e:
639
+ self.logger.debug(f"PDF OCR extraction failed: {str(e)}")
640
+ return None
641
+
642
+ def _ocr_with_tesseract(self, image_data: bytes) -> Optional[str]:
643
+ """
644
+ Extract text using Tesseract OCR with image preprocessing.
645
+
646
+ Args:
647
+ image_data: Image data as bytes
648
+
649
+ Returns:
650
+ Extracted text or None if failed
651
+ """
652
+ try:
653
+ import pytesseract
654
+ from PIL import Image, ImageEnhance, ImageFilter
655
+ import io
656
+
657
+ # Load image
658
+ image = Image.open(io.BytesIO(image_data))
659
+
660
+ # Convert to RGB if necessary
661
+ if image.mode != 'RGB':
662
+ image = image.convert('RGB')
663
+
664
+ # Apply image preprocessing for better OCR accuracy
665
+ processed_image = self._preprocess_image_for_ocr(image)
666
+
667
+ # Try different OCR configurations for business cards
668
+ ocr_configs = [
669
+ '--psm 6', # Uniform block of text
670
+ '--psm 4', # Single column of text
671
+ '--psm 3', # Fully automatic page segmentation
672
+ '--psm 8', # Single word
673
+ '--psm 13' # Raw line. Treat the image as a single text line
674
+ ]
675
+
676
+ best_text = ""
677
+ best_confidence = 0
678
+
679
+ for config in ocr_configs:
680
+ try:
681
+ # Extract text with configuration
682
+ text = pytesseract.image_to_string(processed_image, lang='eng', config=config)
683
+
684
+ # Get confidence score
685
+ data = pytesseract.image_to_data(processed_image, lang='eng', config=config, output_type=pytesseract.Output.DICT)
686
+ confidences = [int(conf) for conf in data['conf'] if int(conf) > 0]
687
+ avg_confidence = sum(confidences) / len(confidences) if confidences else 0
688
+
689
+ # Keep the best result
690
+ if avg_confidence > best_confidence and len(text.strip()) > 10:
691
+ best_text = text.strip()
692
+ best_confidence = avg_confidence
693
+
694
+ except Exception as e:
695
+ self.logger.debug(f"OCR config {config} failed: {str(e)}")
696
+ continue
697
+
698
+ # Also try the original image without preprocessing
699
+ try:
700
+ text = pytesseract.image_to_string(image, lang='eng')
701
+ if len(text.strip()) > len(best_text):
702
+ best_text = text.strip()
703
+             except Exception:
704
+ pass
705
+
706
+ # Extract structured information from business card text
707
+ if best_text:
708
+ structured_text = self._extract_business_card_info(best_text)
709
+ return structured_text if structured_text else best_text
710
+
711
+ return None
712
+
713
+ except ImportError:
714
+ self.logger.debug(
715
+ "Tesseract OCR not available. Install with: pip install pytesseract pillow")
716
+ return None
717
+ except Exception as e:
718
+ self.logger.debug(f"Tesseract OCR failed: {str(e)}")
719
+ return None
720
+
721
+ def _preprocess_image_for_ocr(self, image):
722
+ """
723
+ Preprocess image to improve OCR accuracy.
724
+
725
+ Args:
726
+ image: PIL Image object
727
+
728
+ Returns:
729
+ Preprocessed PIL Image object
730
+ """
731
+ try:
732
+             from PIL import Image, ImageEnhance, ImageFilter
733
+ import numpy as np
734
+
735
+ # Convert to grayscale for better OCR
736
+ if image.mode != 'L':
737
+ image = image.convert('L')
738
+
739
+ # Resize image if too small (OCR works better on larger images)
740
+ width, height = image.size
741
+ if width < 300 or height < 300:
742
+ scale_factor = max(300 / width, 300 / height)
743
+ new_width = int(width * scale_factor)
744
+ new_height = int(height * scale_factor)
745
+ image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
746
+
747
+ # Enhance contrast
748
+ enhancer = ImageEnhance.Contrast(image)
749
+ image = enhancer.enhance(1.5)
750
+
751
+ # Enhance sharpness
752
+ enhancer = ImageEnhance.Sharpness(image)
753
+ image = enhancer.enhance(2.0)
754
+
755
+ # Apply noise reduction
756
+ image = image.filter(ImageFilter.MedianFilter(size=3))
757
+
758
+ # Apply slight blur to smooth out noise
759
+ image = image.filter(ImageFilter.GaussianBlur(radius=0.5))
760
+
761
+ return image
762
+
763
+ except Exception as e:
764
+ self.logger.debug(f"Image preprocessing failed: {str(e)}")
765
+ return image # Return original image if preprocessing fails
766
+
767
+ def _extract_business_card_info(self, raw_text: str) -> Optional[str]:
768
+ """
769
+ Extract structured information from business card OCR text.
770
+
771
+ Args:
772
+ raw_text: Raw OCR text from business card
773
+
774
+ Returns:
775
+ Structured business card information
776
+ """
777
+ try:
778
+ import re
779
+
780
+ # Clean up the text
781
+ lines = [line.strip() for line in raw_text.split('\n') if line.strip()]
782
+
783
+ # Initialize extracted info
784
+ extracted_info = {
785
+ 'name': None,
786
+ 'title': None,
787
+ 'company': None,
788
+ 'email': None,
789
+ 'phone': None,
790
+ 'website': None,
791
+ 'address': None
792
+ }
793
+
794
+ # Email pattern
795
+             email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
796
+
797
+ # Phone pattern (various formats)
798
+ phone_pattern = r'(\+?1?[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})'
799
+
800
+ # Website pattern
801
+             website_pattern = r'(?:https?://)?(?:www\.)?(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}'  # non-capturing so findall returns full matches
802
+
803
+ # Extract email
804
+ email_matches = re.findall(email_pattern, raw_text, re.IGNORECASE)
805
+ if email_matches:
806
+ extracted_info['email'] = email_matches[0]
807
+
808
+ # Extract phone
809
+ phone_matches = re.findall(phone_pattern, raw_text)
810
+ if phone_matches:
811
+ phone = ''.join(phone_matches[0])
812
+ extracted_info['phone'] = phone
813
+
814
+ # Extract website
815
+ website_matches = re.findall(website_pattern, raw_text, re.IGNORECASE)
816
+ if website_matches:
817
+ website = website_matches[0]
818
+ if not website.startswith('http'):
819
+ website = 'https://' + website
820
+ extracted_info['website'] = website
821
+
822
+ # Heuristic extraction for name, title, company
823
+ # Usually name is on the first or second line
824
+ # Title is often after the name
825
+ # Company is often the largest/most prominent text
826
+
827
+ if len(lines) >= 1:
828
+ # First line is often the name
829
+ potential_name = lines[0]
830
+ if len(potential_name.split()) <= 4 and not any(char.isdigit() for char in potential_name):
831
+ extracted_info['name'] = potential_name
832
+
833
+ if len(lines) >= 2:
834
+ # Second line might be title
835
+ potential_title = lines[1]
836
+ if len(potential_title.split()) <= 6 and not any(char in potential_title for char in '@.'):
837
+ extracted_info['title'] = potential_title
838
+
839
+ # Look for company name (often contains "Inc", "LLC", "Corp", etc.)
840
+ company_indicators = ['inc', 'llc', 'corp', 'ltd', 'company', 'co.', 'corporation']
841
+ for line in lines:
842
+ if any(indicator in line.lower() for indicator in company_indicators):
843
+ extracted_info['company'] = line
844
+ break
845
+
846
+ # If no company found with indicators, use the longest line that's not name/title/contact info
847
+ if not extracted_info['company']:
848
+ for line in lines:
849
+ if (line != extracted_info['name'] and
850
+ line != extracted_info['title'] and
851
+ not re.search(email_pattern, line, re.IGNORECASE) and
852
+ not re.search(phone_pattern, line) and
853
+ len(line) > 10):
854
+ extracted_info['company'] = line
855
+ break
856
+
857
+ # Format the structured output
858
+ structured_parts = []
859
+ if extracted_info['name']:
860
+ structured_parts.append(f"Name: {extracted_info['name']}")
861
+ if extracted_info['title']:
862
+ structured_parts.append(f"Title: {extracted_info['title']}")
863
+ if extracted_info['company']:
864
+ structured_parts.append(f"Company: {extracted_info['company']}")
865
+ if extracted_info['email']:
866
+ structured_parts.append(f"Email: {extracted_info['email']}")
867
+ if extracted_info['phone']:
868
+ structured_parts.append(f"Phone: {extracted_info['phone']}")
869
+ if extracted_info['website']:
870
+ structured_parts.append(f"Website: {extracted_info['website']}")
871
+
872
+ if structured_parts:
873
+ return " | ".join(structured_parts)
874
+ else:
875
+ return raw_text # Return raw text if no structured info found
876
+
877
+ except Exception as e:
878
+ self.logger.debug(f"Business card info extraction failed: {str(e)}")
879
+ return raw_text # Return raw text if processing fails
880
+
881
+ def _ocr_with_easyocr(self, image_data: bytes) -> Optional[str]:
882
+ """
883
+ Extract text using EasyOCR with preprocessing.
884
+
885
+ Args:
886
+ image_data: Image data as bytes
887
+
888
+ Returns:
889
+ Extracted text or None if failed
890
+ """
891
+ try:
892
+ import easyocr
893
+ import numpy as np
894
+ from PIL import Image
895
+ import io
896
+
897
+ # Load image
898
+ image = Image.open(io.BytesIO(image_data))
899
+
900
+ # Apply preprocessing
901
+ processed_image = self._preprocess_image_for_ocr(image)
902
+
903
+ # Convert to numpy array for EasyOCR
904
+ image_array = np.array(processed_image)
905
+
906
+ # Initialize EasyOCR reader with multiple languages for better detection
907
+ reader = easyocr.Reader(['en'], gpu=False) # Disable GPU for compatibility
908
+
909
+ # Extract text with different confidence thresholds
910
+ results = reader.readtext(image_array, detail=1)
911
+
912
+ # Sort results by confidence and position
913
+ high_conf_results = [result for result in results if result[2] > 0.6]
914
+ medium_conf_results = [result for result in results if 0.3 < result[2] <= 0.6]
915
+
916
+ # Try high confidence results first
917
+ if high_conf_results:
918
+ text_parts = [result[1] for result in high_conf_results]
919
+ text = ' '.join(text_parts)
920
+
921
+ # Extract structured info if it looks like a business card
922
+ structured_text = self._extract_business_card_info(text)
923
+ if structured_text:
924
+ return structured_text
925
+ elif len(text.strip()) > 10:
926
+ return text.strip()
927
+
928
+ # Fall back to medium confidence results
929
+ if medium_conf_results:
930
+ text_parts = [result[1] for result in medium_conf_results]
931
+ text = ' '.join(text_parts)
932
+
933
+ if len(text.strip()) > 10:
934
+ return text.strip()
935
+
936
+ # Try with original image if preprocessing didn't help
937
+ try:
938
+ original_array = np.array(image)
939
+ results = reader.readtext(original_array, detail=1)
940
+ text_parts = [result[1] for result in results if result[2] > 0.4]
941
+ text = ' '.join(text_parts)
942
+
943
+ if len(text.strip()) > 10:
944
+ return text.strip()
945
+             except Exception:
946
+ pass
947
+
948
+ return None
949
+
950
+ except ImportError:
951
+ self.logger.debug(
952
+ "EasyOCR not available. Install with: pip install easyocr")
953
+ return None
954
+ except Exception as e:
955
+ self.logger.debug(f"EasyOCR failed: {str(e)}")
956
+ return None
957
+
958
+ def _ocr_with_cloud_service(self, image_data: bytes) -> Optional[str]:
959
+ """
960
+ Extract text using cloud OCR service (Google Vision API, Azure, etc.).
961
+
962
+ Args:
963
+ image_data: Image data as bytes
964
+
965
+ Returns:
966
+ Extracted text or None if failed
967
+ """
968
+ try:
969
+ # Try Google Vision API if credentials are available
970
+ google_creds = self.config.get('google_vision_credentials')
971
+ if google_creds:
972
+ return self._ocr_with_google_vision(image_data, google_creds)
973
+
974
+ # Try Azure Computer Vision if key is available
975
+ azure_key = self.config.get('azure_vision_key')
976
+ azure_endpoint = self.config.get('azure_vision_endpoint')
977
+ if azure_key and azure_endpoint:
978
+ return self._ocr_with_azure_vision(image_data, azure_key, azure_endpoint)
979
+
980
+ self.logger.debug("No cloud OCR service credentials available")
981
+ return None
982
+
983
+ except Exception as e:
984
+ self.logger.debug(f"Cloud OCR failed: {str(e)}")
985
+ return None
986
+
987
+ def _ocr_with_google_vision(self, image_data: bytes, credentials_path: str) -> Optional[str]:
988
+ """
989
+ Extract text using Google Vision API.
990
+
991
+ Args:
992
+ image_data: Image data as bytes
993
+ credentials_path: Path to Google credentials JSON
994
+
995
+ Returns:
996
+ Extracted text or None if failed
997
+ """
998
+ try:
999
+ from google.cloud import vision
1000
+ import os
1001
+
1002
+ # Set credentials
1003
+ os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
1004
+
1005
+ # Initialize client
1006
+ client = vision.ImageAnnotatorClient()
1007
+
1008
+ # Create image object
1009
+ image = vision.Image(content=image_data)
1010
+
1011
+ # Perform text detection
1012
+ response = client.text_detection(image=image)
1013
+ texts = response.text_annotations
1014
+
1015
+ if texts:
1016
+ return texts[0].description.strip()
1017
+
1018
+ return None
1019
+
1020
+ except ImportError:
1021
+ self.logger.debug(
1022
+ "Google Vision API not available. Install with: pip install google-cloud-vision")
1023
+ return None
1024
+ except Exception as e:
1025
+ self.logger.debug(f"Google Vision API failed: {str(e)}")
1026
+ return None
1027
+
1028
+ def _ocr_with_azure_vision(self, image_data: bytes, api_key: str, endpoint: str) -> Optional[str]:
1029
+ """
1030
+ Extract text using Azure Computer Vision API.
1031
+
1032
+ Args:
1033
+ image_data: Image data as bytes
1034
+ api_key: Azure API key
1035
+ endpoint: Azure endpoint URL
1036
+
1037
+ Returns:
1038
+ Extracted text or None if failed
1039
+ """
1040
+ try:
1041
+ import time
1042
+
1043
+ # Submit image for OCR
1044
+ headers = {
1045
+ 'Ocp-Apim-Subscription-Key': api_key,
1046
+ 'Content-Type': 'application/octet-stream'
1047
+ }
1048
+
1049
+ # Start OCR operation
1050
+ ocr_url = f"{endpoint}/vision/v3.2/read/analyze"
1051
+ response = requests.post(ocr_url, headers=headers, data=image_data)
1052
+ response.raise_for_status()
1053
+
1054
+ # Get operation location
1055
+ operation_url = response.headers['Operation-Location']
1056
+
1057
+ # Poll for results
1058
+ for _ in range(10): # Max 10 attempts
1059
+ time.sleep(1)
1060
+ result_response = requests.get(operation_url, headers={
1061
+ 'Ocp-Apim-Subscription-Key': api_key})
1062
+ result_response.raise_for_status()
1063
+ result = result_response.json()
1064
+
1065
+ if result['status'] == 'succeeded':
1066
+ # Extract text from results
1067
+ text_parts = []
1068
+ for read_result in result['analyzeResult']['readResults']:
1069
+ for line in read_result['lines']:
1070
+ text_parts.append(line['text'])
1071
+
1072
+ return ' '.join(text_parts)
1073
+ elif result['status'] == 'failed':
1074
+ break
1075
+
1076
+ return None
1077
+
1078
+ except Exception as e:
1079
+ self.logger.debug(f"Azure Vision API failed: {str(e)}")
1080
+ return None
1081
+
1082
+ def _scrape_social_media(self, url: str) -> Optional[str]:
1083
+ """
1084
+ Scrape social media profile data.
1085
+
1086
+ Args:
1087
+ url: Social media profile URL
1088
+
1089
+ Returns:
1090
+ Scraped profile data or None if failed
1091
+ """
1092
+ try:
1093
+ if self.is_dry_run():
1094
+ return f"[DRY RUN] Would scrape social media: {url}"
1095
+
1096
+ # Determine platform and use appropriate scraping method
1097
+ if 'linkedin.com' in url.lower():
1098
+ return self._scrape_linkedin_profile(url)
1099
+ elif 'facebook.com' in url.lower():
1100
+ return self._scrape_facebook_profile(url)
1101
+ else:
1102
+ # Generic social media scraping
1103
+ return self._scrape_website(url)
1104
+
1105
+ except Exception as e:
1106
+ self.logger.error(
1107
+ f"Social media scraping failed for {url}: {str(e)}")
1108
+ return None
1109
+
1110
+ def _scrape_linkedin_profile(self, url: str) -> Optional[str]:
1111
+ """
1112
+ Scrape LinkedIn profile with specific handling for LinkedIn's structure.
1113
+
1114
+ Args:
1115
+ url: LinkedIn profile URL
1116
+
1117
+ Returns:
1118
+ Scraped LinkedIn profile data or None if failed
1119
+ """
1120
+ try:
1121
+ # LinkedIn has anti-scraping measures, so we'll try different approaches
1122
+
1123
+ # Method 1: Try with Serper API if available (most reliable)
1124
+ serper_key = self.config.get('serper_api_key')
1125
+ if serper_key:
1126
+ linkedin_data = self._scrape_with_serper(url, serper_key)
1127
+ if linkedin_data:
1128
+ return linkedin_data
1129
+
1130
+ # Method 2: Try direct scraping with LinkedIn-specific headers
1131
+ linkedin_data = self._scrape_linkedin_direct(url)
1132
+ if linkedin_data:
1133
+ return linkedin_data
1134
+
1135
+ # Method 3: Fallback to generic scraping
1136
+ return self._direct_website_scrape(url)
1137
+
1138
+ except Exception as e:
1139
+ self.logger.error(f"LinkedIn scraping failed: {str(e)}")
1140
+ return None
1141
+
1142
+ def _scrape_linkedin_direct(self, url: str) -> Optional[str]:
1143
+ """
1144
+ Direct LinkedIn scraping with specific headers and handling.
1145
+
1146
+ Args:
1147
+ url: LinkedIn profile URL
1148
+
1149
+ Returns:
1150
+ Scraped content or None if failed
1151
+ """
1152
+ try:
1153
+ # LinkedIn-specific headers to avoid blocking
1154
+ headers = {
1155
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
1156
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
1157
+ 'Accept-Language': 'en-US,en;q=0.5',
1158
+ 'Accept-Encoding': 'gzip, deflate',
1159
+ 'Connection': 'keep-alive',
1160
+ 'Upgrade-Insecure-Requests': '1',
1161
+ }
1162
+
1163
+ # Add delay to avoid rate limiting
1164
+ time.sleep(2)
1165
+
1166
+ response = requests.get(url, headers=headers, timeout=30)
1167
+ response.raise_for_status()
1168
+
1169
+ # Parse LinkedIn-specific content
1170
+ try:
1171
+ from bs4 import BeautifulSoup
1172
+ soup = BeautifulSoup(response.content, 'html.parser')
1173
+
1174
+ # Extract LinkedIn-specific elements
1175
+ profile_data = []
1176
+
1177
+ # Try to extract name
1178
+ name_selectors = [
1179
+ 'h1.text-heading-xlarge',
1180
+ '.pv-text-details__left-panel h1',
1181
+ '.ph5 h1'
1182
+ ]
1183
+ for selector in name_selectors:
1184
+ name_elem = soup.select_one(selector)
1185
+ if name_elem:
1186
+ profile_data.append(
1187
+ f"Name: {name_elem.get_text().strip()}")
1188
+ break
1189
+
1190
+ # Try to extract headline/title
1191
+ headline_selectors = [
1192
+ '.text-body-medium.break-words',
1193
+ '.pv-text-details__left-panel .text-body-medium',
1194
+ '.ph5 .text-body-medium'
1195
+ ]
1196
+ for selector in headline_selectors:
1197
+ headline_elem = soup.select_one(selector)
1198
+ if headline_elem:
1199
+ profile_data.append(
1200
+ f"Title: {headline_elem.get_text().strip()}")
1201
+ break
1202
+
1203
+ # Try to extract company
1204
+ company_selectors = [
1205
+ '.pv-text-details__right-panel',
1206
+ '.pv-entity__summary-info h3',
1207
+ '.experience-section .pv-entity__summary-info h3'
1208
+ ]
1209
+ for selector in company_selectors:
1210
+ company_elem = soup.select_one(selector)
1211
+ if company_elem:
1212
+ profile_data.append(
1213
+ f"Company: {company_elem.get_text().strip()}")
1214
+ break
1215
+
1216
+ if profile_data:
1217
+ return ' | '.join(profile_data)
1218
+
1219
+ # Fallback: extract all text
1220
+ text = soup.get_text()
1221
+ lines = (line.strip() for line in text.splitlines())
1222
+ text = ' '.join(line for line in lines if line)
1223
+
1224
+ if len(text) > 3000:
1225
+ text = text[:3000] + "..."
1226
+
1227
+ return text if len(text) > 50 else None
1228
+
1229
+ except ImportError:
1230
+ # Fallback without BeautifulSoup
1231
+ content = response.text
1232
+ if len(content) > 2000:
1233
+ content = content[:2000] + "..."
1234
+ return content
1235
+
1236
+ except Exception as e:
1237
+ self.logger.debug(f"Direct LinkedIn scraping failed: {str(e)}")
1238
+ return None
1239
+
1240
+ def _scrape_facebook_profile(self, url: str) -> Optional[str]:
1241
+ """
1242
+ Scrape Facebook profile with specific handling for Facebook's structure.
1243
+
1244
+ Args:
1245
+ url: Facebook profile URL
1246
+
1247
+ Returns:
1248
+ Scraped Facebook profile data or None if failed
1249
+ """
1250
+ try:
1251
+ # Facebook has strong anti-scraping measures
1252
+
1253
+ # Method 1: Try with Serper API if available (most reliable)
1254
+ serper_key = self.config.get('serper_api_key')
1255
+ if serper_key:
1256
+ facebook_data = self._scrape_with_serper(url, serper_key)
1257
+ if facebook_data:
1258
+ return facebook_data
1259
+
1260
+ # Method 2: Try direct scraping with Facebook-specific headers
1261
+ facebook_data = self._scrape_facebook_direct(url)
1262
+ if facebook_data:
1263
+ return facebook_data
1264
+
1265
+ # Method 3: Fallback to generic scraping
1266
+ return self._direct_website_scrape(url)
1267
+
1268
+ except Exception as e:
1269
+ self.logger.error(f"Facebook scraping failed: {str(e)}")
1270
+ return None
1271
+
1272
+ def _scrape_facebook_direct(self, url: str) -> Optional[str]:
1273
+ """
1274
+ Direct Facebook scraping with specific headers and handling.
1275
+
1276
+ Args:
1277
+ url: Facebook profile URL
1278
+
1279
+ Returns:
1280
+ Scraped content or None if failed
1281
+ """
1282
+ try:
1283
+ # Facebook-specific headers
1284
+ headers = {
1285
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
1286
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
1287
+ 'Accept-Language': 'en-US,en;q=0.5',
1288
+ 'Accept-Encoding': 'gzip, deflate',
1289
+ 'Connection': 'keep-alive',
1290
+ 'Upgrade-Insecure-Requests': '1',
1291
+ 'Sec-Fetch-Dest': 'document',
1292
+ 'Sec-Fetch-Mode': 'navigate',
1293
+ 'Sec-Fetch-Site': 'none',
1294
+ }
1295
+
1296
+ # Add delay to avoid rate limiting
1297
+ time.sleep(3)
1298
+
1299
+ response = requests.get(url, headers=headers, timeout=30)
1300
+ response.raise_for_status()
1301
+
1302
+ # Parse Facebook-specific content
1303
+ try:
1304
+ from bs4 import BeautifulSoup
1305
+ soup = BeautifulSoup(response.content, 'html.parser')
1306
+
1307
+ # Extract Facebook-specific elements
1308
+ profile_data = []
1309
+
1310
+ # Try to extract page/profile name
1311
+ name_selectors = [
1312
+ 'h1[data-testid="page_title"]',
1313
+ '.x1heor9g.x1qlqyl8.x1pd3egz.x1a2a7pz h1',
1314
+ '#seo_h1_tag',
1315
+ 'title'
1316
+ ]
1317
+ for selector in name_selectors:
1318
+ name_elem = soup.select_one(selector)
1319
+ if name_elem:
1320
+ name_text = name_elem.get_text().strip()
1321
+ if name_text and len(name_text) > 3:
1322
+ profile_data.append(f"Name: {name_text}")
1323
+ break
1324
+
1325
+ # Try to extract description/about
1326
+ desc_selectors = [
1327
+ '[data-testid="page_description"]',
1328
+ '.x1i10hfl.xjbqb8w.x6umtig.x1b1mbwd.xaqea5y.xav7gou.x9f619.x1ypdohk.xt0psk2.xe8uvvx.xdj266r.x11i5rnm.xat24cr.x1mh8g0r.xexx8yu.x4uap5.x18d9i69.xkhd6sd.x16tdsg8.x1hl2dhg.xggy1nq.x1a2a7pz.x1sur9pj.xkrqix3.x1fey0fg.x1s688f',
1329
+ '.x1i10hfl.xjbqb8w.x6umtig.x1b1mbwd.xaqea5y.xav7gou.x9f619.x1ypdohk.xt0psk2.xe8uvvx.xdj266r.x11i5rnm.xat24cr.x1mh8g0r.xexx8yu.x4uap5.x18d9i69.xkhd6sd.x16tdsg8.x1hl2dhg.xggy1nq.x1a2a7pz.x1heor9g.xt0b8zv.xo1l8bm'
1330
+ ]
1331
+ for selector in desc_selectors:
1332
+ desc_elem = soup.select_one(selector)
1333
+ if desc_elem:
1334
+ desc_text = desc_elem.get_text().strip()
1335
+ if desc_text and len(desc_text) > 10:
1336
+ profile_data.append(f"Description: {desc_text}")
1337
+ break
1338
+
1339
+ if profile_data:
1340
+ return ' | '.join(profile_data)
1341
+
1342
+ # Fallback: extract meta description
1343
+ meta_desc = soup.find('meta', attrs={'name': 'description'})
1344
+ if meta_desc and meta_desc.get('content'):
1345
+ return f"Description: {meta_desc['content']}"
1346
+
1347
+ # Last fallback: extract title
1348
+ title = soup.find('title')
1349
+ if title:
1350
+ return f"Title: {title.get_text().strip()}"
1351
+
1352
+ return None
1353
+
1354
+ except ImportError:
1355
+ # Fallback without BeautifulSoup
1356
+ content = response.text
1357
+ if 'Facebook' in content and len(content) > 100:
1358
+ # Extract title from HTML
1359
+ title_start = content.find('<title>')
1360
+ title_end = content.find('</title>')
1361
+ if title_start != -1 and title_end != -1:
1362
+ title = content[title_start + 7:title_end].strip()
1363
+ return f"Title: {title}"
1364
+ return None
1365
+
1366
+ except Exception as e:
1367
+ self.logger.debug(f"Direct Facebook scraping failed: {str(e)}")
1368
+ return None
1369
+
1370
+ def _extract_customer_info(self, raw_data: str) -> Dict[str, Any]:
1371
+ """
1372
+ Extract structured customer information using LLM.
1373
+
1374
+ Args:
1375
+ raw_data: Combined raw data from all sources
1376
+
1377
+ Returns:
1378
+ Structured customer information dictionary
1379
+ """
1380
+ try:
1381
+ if self.is_dry_run():
1382
+ return {
1383
+ 'contact_name': 'John Doe',
1384
+ 'company_name': 'Example Corp',
1385
+ 'customer_phone': '+1-555-0123',
1386
+ 'customer_email': 'contact@example.com',
1387
+ 'customer_linkedin': 'https://linkedin.com/company/example',
1388
+ 'customer_facebook': 'https://facebook.com/example',
1389
+ 'company_website': 'https://example.com',
1390
+ 'customer_address': '123 Main St, City, State',
1391
+ 'company_business': 'Technology solutions',
1392
+ 'company_industries': ['Technology', 'Software'],
1393
+ 'founders': ['John Doe'],
1394
+ 'branches': ['Main Office'],
1395
+ 'customer_description': '[DRY RUN] Mock customer description'
1396
+ }
1397
+
1398
+ prompt = f"""This is the customer information: {raw_data}.
1399
+
1400
+ Based on the above data, generate a JSON structure with the following format:
1401
+
1402
+ {{
1403
+ "contact_name": "Name of the contact/representative",
1404
+ "company_name": "Name of the company",
1405
+ "customer_phone": "Company/Contact phone number in the correct format",
1406
+ "customer_email": "Company/Contact email",
1407
+ "customer_linkedin": "LinkedIn profile URL",
1408
+ "customer_facebook": "Facebook profile URL",
1409
+ "company_website": "Company website (valid structure)",
1410
+ "customer_address": "Company/Contact address",
1411
+ "company_business": "Main business activities of the company",
1412
+ "company_industries": ["List of industries or fields of operation"],
1413
+ "founders": ["List of founders"],
1414
+ "branches": ["List of branches"],
1415
+ "customer_description": "All information about the customer"
1416
+ }}
1417
+
1418
+ Rules:
1419
+ 1. Ensure `company_website` is correctly structured as a valid URL.
1420
+ 2. If `company_name` is an array with multiple values:
1421
+ - Use available data and context to generate a comprehensive, accurate company name.
1422
+ 3. Return an empty result if the required information is not available.
1423
+ 4. Do not include the word ```JSON in the result.
1424
+ 5. Provide the output directly without any explanations or additional text. In JSON response, use double quotes instead of single quotes."""
1425
+
1426
+ response = self.call_llm(prompt, temperature=0.2)
1427
+
1428
+ # Parse the JSON response
1429
+ customer_info = self.parse_json_response(response)
1430
+
1431
+ self.logger.info("Successfully extracted customer information")
1432
+ return customer_info
1433
+
1434
+ except Exception as e:
1435
+ self.logger.error(f"Customer info extraction failed: {str(e)}")
1436
+ # Try basic regex extraction as fallback
1437
+ fallback_info = self._extract_basic_info_fallback(raw_data)
1438
+ self.logger.info("Using basic regex extraction as fallback")
1439
+ return fallback_info
1440
+
1441
+ def _extract_basic_info_fallback(self, raw_data: str) -> Dict[str, Any]:
1442
+ """
1443
+ Extract basic information using regex patterns when LLM is not available.
1444
+
1445
+ Args:
1446
+ raw_data: Raw text data to extract from
1447
+
1448
+ Returns:
1449
+ Dictionary with extracted basic information
1450
+ """
1451
+ import re
1452
+
1453
+ # Initialize result with empty values
1454
+ result = {
1455
+ 'contact_name': '',
1456
+ 'company_name': '',
1457
+ 'customer_phone': '',
1458
+ 'customer_email': '',
1459
+ 'customer_linkedin': '',
1460
+ 'customer_facebook': '',
1461
+ 'company_website': '',
1462
+ 'customer_address': '',
1463
+ 'company_business': '',
1464
+ 'company_industries': [],
1465
+ 'founders': [],
1466
+ 'branches': [],
1467
+ 'customer_description': raw_data[:500] + "..." if len(raw_data) > 500 else raw_data
1468
+ }
1469
+
1470
+ # Extract email addresses
1471
+         email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
1472
+ emails = re.findall(email_pattern, raw_data, re.IGNORECASE)
1473
+ if emails:
1474
+ result['customer_email'] = emails[0]
1475
+
1476
+ # Extract phone numbers (various formats)
1477
+ phone_pattern = r'(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})'
1478
+ phones = re.findall(phone_pattern, raw_data)
1479
+ if phones:
1480
+ result['customer_phone'] = ''.join(phones[0])
1481
+
1482
+ # Extract URLs
1483
+ url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
1484
+ urls = re.findall(url_pattern, raw_data, re.IGNORECASE)
1485
+ for url in urls:
1486
+ if 'linkedin.com' in url.lower():
1487
+ result['customer_linkedin'] = url
1488
+ elif 'facebook.com' in url.lower():
1489
+ result['customer_facebook'] = url
1490
+ elif not result['company_website']: # First non-social URL becomes website
1491
+ result['company_website'] = url
1492
+
1493
+ # Extract names and companies using common patterns
1494
+ # Look for "Customer: Name at Company" pattern
1495
+ customer_pattern = r'Customer:\s*([^,\n]+?)(?:\s+at\s+([^,\n]+?))?(?:,|$|\n)'
1496
+ customer_match = re.search(customer_pattern, raw_data, re.IGNORECASE)
1497
+ if customer_match:
1498
+ result['contact_name'] = customer_match.group(1).strip()
1499
+ if customer_match.group(2):
1500
+ result['company_name'] = customer_match.group(2).strip()
1501
+
1502
+ # Look for "Name: value" patterns
1503
+ name_patterns = [
1504
+ (r'Name:\s*([^\n,]+)', 'contact_name'),
1505
+ (r'Company:\s*([^\n,]+)', 'company_name'),
1506
+ (r'Organization:\s*([^\n,]+)', 'company_name'),
1507
+ (r'Business:\s*([^\n,]+)', 'company_business'),
1508
+ (r'Address:\s*([^\n,]+)', 'customer_address')
1509
+ ]
1510
+
1511
+ for pattern, field in name_patterns:
1512
+ match = re.search(pattern, raw_data, re.IGNORECASE)
1513
+ if match and not result[field]: # Only set if not already set
1514
+ result[field] = match.group(1).strip()
1515
+
1516
+ # If we found an email but no name, try to extract name from email
1517
+ if result['customer_email'] and not result['contact_name']:
1518
+ email_name = result['customer_email'].split('@')[0]
1519
+ # Convert common email formats to names
1520
+ if '.' in email_name:
1521
+ parts = email_name.split('.')
1522
+ result['contact_name'] = ' '.join(part.capitalize() for part in parts)
1523
+ else:
1524
+ result['contact_name'] = email_name.capitalize()
1525
+
1526
+ # If we found an email but no company, try to extract from email domain
1527
+ if result['customer_email'] and not result['company_name']:
1528
+ domain = result['customer_email'].split('@')[1]
1529
+ # Remove common TLDs and convert to company name
1530
+ company_part = domain.split('.')[0]
1531
+ result['company_name'] = company_part.upper()
1532
+
1533
+ return result
1534
+
1535
+ def _perform_company_research(self, customer_info: Dict[str, Any]) -> Optional[str]:
1536
+ """
1537
+ Perform enhanced company research using multiple search strategies.
1538
+
1539
+ Args:
1540
+ customer_info: Extracted customer information
1541
+
1542
+ Returns:
1543
+ Research results or None if failed
1544
+ """
1545
+ try:
1546
+ company_name = customer_info.get('company_name', '')
1547
+ company_website = customer_info.get('company_website', '')
1548
+
1549
+ if not company_name:
1550
+ return None
1551
+
1552
+ if self.is_dry_run():
1553
+ return f"[DRY RUN] Would research company: {company_name} {company_website}"
1554
+
1555
+ # Use Serper API for search if available
1556
+ serper_key = self.config.get('serper_api_key')
1557
+ if not serper_key:
1558
+ self.logger.warning("Company research skipped - no Serper API key available")
1559
+ return None
1560
+
1561
+ research_results = []
1562
+
1563
+ # Strategy 1: General company search
1564
+ general_query = f'"{company_name}" company profile business'
1565
+ general_results = self._search_with_serper(general_query, serper_key, 'search')
1566
+ if general_results:
1567
+ research_results.append(f"General Info: {general_results}")
1568
+
1569
+ # Strategy 2: News search for recent company information
1570
+ news_query = f'"{company_name}" company news'
1571
+ news_results = self._search_with_serper(news_query, serper_key, 'news')
1572
+ if news_results:
1573
+ research_results.append(f"Recent News: {news_results}")
1574
+
1575
+ # Strategy 3: Industry-specific search
1576
+ if company_website:
1577
+ industry_query = f'"{company_name}" industry services products site:{company_website}'
1578
+ industry_results = self._search_with_serper(industry_query, serper_key, 'search')
1579
+ if industry_results:
1580
+ research_results.append(f"Industry Info: {industry_results}")
1581
+
1582
+ # Strategy 4: Contact and location search
1583
+ contact_query = f'"{company_name}" contact address phone location'
1584
+ contact_results = self._search_with_serper(contact_query, serper_key, 'search')
1585
+ if contact_results:
1586
+ research_results.append(f"Contact Info: {contact_results}")
1587
+
1588
+ if research_results:
1589
+ combined_research = ' | '.join(research_results)
1590
+ # Limit length to avoid token limits
1591
+ if len(combined_research) > 4000:
1592
+ combined_research = combined_research[:4000] + "..."
1593
+ return combined_research
1594
+
1595
+ return None
1596
+
1597
+ except Exception as e:
1598
+ self.logger.error(f"Company research failed: {str(e)}")
1599
+ return None
1600
+
1601
+ def _search_with_serper(self, query: str, api_key: str, search_type: str = 'search') -> Optional[str]:
1602
+ """
1603
+ Enhanced search using Serper API with multiple search types.
1604
+
1605
+ Args:
1606
+ query: Search query
1607
+ api_key: Serper API key
1608
+ search_type: Type of search ('search', 'news', 'images')
1609
+
1610
+ Returns:
1611
+ Search results or None if failed
1612
+ """
1613
+ try:
1614
+ headers = {
1615
+ 'X-API-KEY': api_key,
1616
+ 'Content-Type': 'application/json'
1617
+ }
1618
+
1619
+ body = {
1620
+ 'q': query,
1621
+ 'num': 10 # Get more results for better fallback
1622
+ }
1623
+
1624
+ # Choose appropriate endpoint
1625
+ endpoints = {
1626
+ 'search': 'https://google.serper.dev/search',
1627
+ 'news': 'https://google.serper.dev/news',
1628
+ 'images': 'https://google.serper.dev/images'
1629
+ }
1630
+
1631
+ endpoint = endpoints.get(search_type, endpoints['search'])
1632
+
1633
+ response = requests.post(
1634
+ endpoint,
1635
+ json=body,
1636
+ headers=headers,
1637
+ timeout=60 # Reduced timeout for faster fallback
1638
+ )
1639
+
1640
+ if response.status_code == 200:
1641
+ result = response.json()
1642
+
1643
+ # Extract different types of results based on search type
1644
+ if search_type == 'search':
1645
+ return self._process_search_results(result)
1646
+ elif search_type == 'news':
1647
+ return self._process_news_results(result)
1648
+ else:
1649
+ return self._process_search_results(result)
1650
+
1651
+ elif response.status_code == 429:
1652
+ self.logger.warning("Serper API rate limit exceeded, waiting before retry")
1653
+ time.sleep(2)
1654
+ return None
1655
+ else:
1656
+ self.logger.warning(
1657
+ f"Serper search API returned status {response.status_code}: {response.text}")
1658
+ return None
1659
+
1660
+ except requests.exceptions.Timeout:
1661
+ self.logger.warning(f"Serper search timed out for query: {query}")
1662
+ return None
1663
+ except Exception as e:
1664
+ self.logger.error(f"Serper search failed: {str(e)}")
1665
+ return None
1666
+
1667
+ def _process_search_results(self, result: Dict[str, Any]) -> Optional[str]:
1668
+ """
1669
+ Process search results from Serper API.
1670
+
1671
+ Args:
1672
+ result: JSON response from Serper API
1673
+
1674
+ Returns:
1675
+ Processed search results text
1676
+ """
1677
+ try:
1678
+ processed_parts = []
1679
+
1680
+ # Extract knowledge graph info (company info box)
1681
+ knowledge_graph = result.get('knowledgeGraph', {})
1682
+ if knowledge_graph:
1683
+ kg_title = knowledge_graph.get('title', '')
1684
+ kg_description = knowledge_graph.get('description', '')
1685
+ kg_attributes = knowledge_graph.get('attributes', {})
1686
+
1687
+ if kg_title:
1688
+ processed_parts.append(f"Company: {kg_title}")
1689
+ if kg_description:
1690
+ processed_parts.append(f"Description: {kg_description}")
1691
+
1692
+ # Add relevant attributes
1693
+ for key, value in kg_attributes.items():
1694
+ if key.lower() in ['founded', 'headquarters', 'ceo', 'industry', 'revenue']:
1695
+ processed_parts.append(f"{key}: {value}")
1696
+
1697
+ # Extract organic results
1698
+ organic_results = result.get('organic', [])
1699
+ snippets = []
1700
+
1701
+ for item in organic_results:
1702
+ title = item.get('title', '')
1703
+ snippet = item.get('snippet', '')
1704
+ link = item.get('link', '')
1705
+
1706
+ if snippet:
1707
+ # Combine title and snippet for better context
1708
+ if title:
1709
+ snippets.append(f"{title}: {snippet}")
1710
+ else:
1711
+ snippets.append(snippet)
1712
+
1713
+ if snippets:
1714
+ processed_parts.extend(snippets[:5]) # Top 5 results
1715
+
1716
+ # Extract answer box if available
1717
+ answer_box = result.get('answerBox', {})
1718
+ if answer_box:
1719
+ answer = answer_box.get('answer', '')
1720
+ if answer:
1721
+ processed_parts.insert(0, f"Answer: {answer}")
1722
+
1723
+ return ' | '.join(processed_parts) if processed_parts else None
1724
+
1725
+ except Exception as e:
1726
+ self.logger.debug(f"Failed to process search results: {str(e)}")
1727
+ # Fallback to simple snippet extraction
1728
+ organic_results = result.get('organic', [])
1729
+ snippets = [item.get('snippet', '') for item in organic_results if 'snippet' in item]
1730
+ return ', '.join(snippets) if snippets else None
1731
+
1732
+ def _process_news_results(self, result: Dict[str, Any]) -> Optional[str]:
1733
+ """
1734
+ Process news results from Serper API.
1735
+
1736
+ Args:
1737
+ result: JSON response from Serper API
1738
+
1739
+ Returns:
1740
+ Processed news results text
1741
+ """
1742
+ try:
1743
+ news_results = result.get('news', [])
1744
+ news_snippets = []
1745
+
1746
+ for item in news_results[:3]: # Top 3 news items
1747
+ title = item.get('title', '')
1748
+ snippet = item.get('snippet', '')
1749
+ date = item.get('date', '')
1750
+
1751
+ if snippet:
1752
+ news_item = f"{title}: {snippet}"
1753
+ if date:
1754
+ news_item += f" ({date})"
1755
+ news_snippets.append(news_item)
1756
+
1757
+ return ' | '.join(news_snippets) if news_snippets else None
1758
+
1759
+ except Exception as e:
1760
+ self.logger.debug(f"Failed to process news results: {str(e)}")
1761
+ return None
1762
+
1763
+ def _scrape_company_website(self, customer_info: Dict[str, Any], data_sources: List[str]) -> Optional[str]:
1764
+ """
1765
+ Scrape company website if not already scraped.
1766
+
1767
+ Args:
1768
+ customer_info: Extracted customer information
1769
+ data_sources: List of already processed data sources
1770
+
1771
+ Returns:
1772
+ Website content or None if failed/skipped
1773
+ """
1774
+ try:
1775
+ # Only scrape if website wasn't already processed
1776
+ if 'website' in data_sources:
1777
+ return None
1778
+
1779
+ company_website = customer_info.get('company_website', '')
1780
+ if not company_website:
1781
+ return None
1782
+
1783
+ return self._scrape_website(company_website)
1784
+
1785
+ except Exception as e:
1786
+ self.logger.error(f"Company website research failed: {str(e)}")
1787
+ return None
1788
+
1789
+ def validate_input(self, context: Dict[str, Any]) -> bool:
1790
+ """
1791
+ Validate input data for data acquisition stage.
1792
+
1793
+ Args:
1794
+ context: Execution context
1795
+
1796
+ Returns:
1797
+ True if input is valid
1798
+ """
1799
+ input_data = context.get('input_data', {})
1800
+
1801
+ # Check if at least one data source is available (matching executor schema)
1802
+ sources = [
1803
+ input_data.get('input_website'),
1804
+ input_data.get('input_description'),
1805
+ input_data.get('input_business_card'),
1806
+ input_data.get('input_linkedin_url'),
1807
+ input_data.get('input_facebook_url'),
1808
+ input_data.get('input_freetext')
1809
+ ]
1810
+
1811
+ return any(sources)
1812
+
1813
+ def get_required_fields(self) -> List[str]:
1814
+ """
1815
+ Get list of required input fields for this stage.
1816
+
1817
+ Returns:
1818
+ List of required field names (at least one data source required)
1819
+ """
1820
+ return [] # No strictly required fields, but at least one source needed
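
For orientation, a minimal driver sketch for the stage above follows. It relies only on what this diff shows (the execute and validate_input methods and the context keys they read: input_data and execution_id). The constructor call and the shape of the returned envelope come from BaseStage in fusesell_local/stages/base_stage.py, which is not part of this hunk, so the config keyword and the comments about the result are assumptions for illustration, not the package's documented API.

# Hypothetical usage sketch -- the DataAcquisitionStage constructor is
# inherited from BaseStage (base_stage.py, not shown in this hunk), so the
# config keyword below is an assumed signature, not a documented one.
from fusesell_local.stages.data_acquisition import DataAcquisitionStage

context = {
    "execution_id": "demo-001",
    "input_data": {
        # validate_input() passes as long as at least one input_* source is set
        "input_website": "https://example.com",
        "input_description": "Boutique HVAC contractor based in Austin, TX",
    },
}

stage = DataAcquisitionStage(config={"serper_api_key": None})  # assumed signature

if stage.validate_input(context):
    result = stage.execute(context)
    # On success, the stage's result_data holds the extracted fields
    # (contact_name, company_name, ...), company_mini_search, and the
    # data_sources list; the exact envelope returned here depends on
    # BaseStage.create_success_result.

Without a serper_api_key the website step falls back to direct requests/BeautifulSoup scraping, and if the LLM extraction fails the stage degrades to the regex-based _extract_basic_info_fallback, as implemented above.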