fusesell 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of fusesell might be problematic. Click here for more details.

@@ -0,0 +1,1231 @@
1
+ """
2
+ Data Preparation Stage - Clean and structure customer data using AI
3
+ Converted from fusesell_data_preparation.yml
4
+ """
5
+
6
+ import json
7
+ from typing import Dict, Any, List, Optional
8
+ from datetime import datetime
9
+ from .base_stage import BaseStage
10
+
11
+
12
+ class DataPreparationStage(BaseStage):
13
+ """
14
+ Data Preparation stage for cleaning and structuring customer data using LLM.
15
+ Converts YAML workflow logic to Python implementation.
16
+ """
17
+
18
def execute(self, context: Dict[str, Any]) -> Dict[str, Any]:
    """
    Run the data preparation pipeline for one execution context.

    The acquisition data is flattened to text, structured by the LLM,
    enriched (pain points, financials, research), validated and persisted
    before a success result is built.

    Args:
        context: Execution context

    Returns:
        Stage execution result; when any step raises, the value returned by
        ``handle_stage_error`` instead.
    """
    try:
        # Previous stage output -> single text blob used by every prompt.
        source_data = self._get_acquisition_data(context)
        info_text = self._prepare_customer_info_text(source_data)

        # Initial extraction followed by the three enrichment passes, in order.
        data = self._extract_structured_customer_info(info_text)
        data = self._enhance_pain_point_analysis(data, info_text)
        data = self._enhance_financial_analysis(data, info_text)
        data = self._enhance_research_analysis(data, info_text)

        cleaned = self._validate_and_clean_data(data)

        # Persist the customer record first, then the stage result.
        self._save_customer_data(context, cleaned)
        self.save_stage_result(context, cleaned)

        return self.create_success_result(cleaned, context)

    except Exception as e:
        self.log_stage_error(context, e)
        return self.handle_stage_error(e, context)
62
+
63
+ def _get_acquisition_data(self, context: Dict[str, Any]) -> Dict[str, Any]:
64
+ """
65
+ Get data from the data acquisition stage.
66
+
67
+ Args:
68
+ context: Execution context
69
+
70
+ Returns:
71
+ Data acquisition results
72
+ """
73
+ # Try to get from stage results first
74
+ stage_results = context.get('stage_results', {})
75
+ if 'data_acquisition' in stage_results:
76
+ acquisition_data = stage_results['data_acquisition'].get('data', {})
77
+ # Store for fallback use
78
+ self._current_acquisition_data = acquisition_data
79
+ return acquisition_data
80
+
81
+ # Fallback: try to get from input_data (for testing)
82
+ input_data = context.get('input_data', {})
83
+ fallback_data = {
84
+ 'company_name': input_data.get('customer_name', ''),
85
+ 'company_website': input_data.get('customer_website', ''),
86
+ 'customer_description': input_data.get('customer_description', ''),
87
+ 'company_mini_search': input_data.get('company_mini_search', ''),
88
+ 'contact_name': input_data.get('contact_name', ''),
89
+ 'customer_email': input_data.get('contact_email', ''),
90
+ 'customer_phone': input_data.get('contact_phone', ''),
91
+ 'customer_address': input_data.get('customer_address', ''),
92
+ 'customer_linkedin': input_data.get('linkedin_url', ''),
93
+ 'customer_facebook': input_data.get('facebook_url', ''),
94
+ 'company_business': '',
95
+ 'company_industries': [],
96
+ 'founders': [],
97
+ 'branches': []
98
+ }
99
+ # Store for fallback use
100
+ self._current_acquisition_data = fallback_data
101
+ return fallback_data
102
+
103
+ def _prepare_customer_info_text(self, acquisition_data: Dict[str, Any]) -> str:
104
+ """
105
+ Prepare customer information text for LLM processing.
106
+
107
+ Args:
108
+ acquisition_data: Data from acquisition stage
109
+
110
+ Returns:
111
+ Combined customer information text
112
+ """
113
+ info_parts = []
114
+
115
+ # Add company mini search results
116
+ mini_search = acquisition_data.get('company_mini_search', '')
117
+ if mini_search:
118
+ info_parts.append(f"Company Research: {mini_search}")
119
+
120
+ # Add customer description
121
+ description = acquisition_data.get('customer_description', '')
122
+ if description:
123
+ info_parts.append(f"Customer Description: {description}")
124
+
125
+ # Add basic company info
126
+ company_name = acquisition_data.get('company_name', '')
127
+ if company_name:
128
+ info_parts.append(f"Company Name: {company_name}")
129
+
130
+ website = acquisition_data.get('company_website', '')
131
+ if website:
132
+ info_parts.append(f"Website: {website}")
133
+
134
+ # Add contact information
135
+ contact_name = acquisition_data.get('contact_name', '')
136
+ if contact_name:
137
+ info_parts.append(f"Contact: {contact_name}")
138
+
139
+ # Add business information
140
+ business = acquisition_data.get('company_business', '')
141
+ if business:
142
+ info_parts.append(f"Business: {business}")
143
+
144
+ # Add industries
145
+ industries = acquisition_data.get('company_industries', [])
146
+ if industries:
147
+ info_parts.append(f"Industries: {', '.join(industries)}")
148
+
149
+ return '; '.join(info_parts)
150
+
151
def _extract_structured_customer_info(self, customer_info_text: str) -> Dict[str, Any]:
    """
    Extract structured customer information using LLM.

    In dry-run mode the mock structure is returned without calling the LLM.
    Any failure (LLM error, unparsable reply, or a reply that is not a JSON
    object) is logged and replaced by the fallback structure so the stage
    can continue.

    Args:
        customer_info_text: Combined customer information text

    Returns:
        Structured customer information dictionary
    """
    try:
        if self.is_dry_run():
            return self._get_mock_structured_data()

        instruction = self._get_llm_instruction()
        prompt = f"{instruction}\n\nThe customer information: {customer_info_text}"

        response = self.call_llm(prompt, temperature=0.3)
        structured_data = self.parse_json_response(response)

        # Later stages call ``.get`` on this value; a JSON array or scalar
        # would break each of them, so treat it as an extraction failure.
        if not isinstance(structured_data, dict):
            raise ValueError(
                f"Expected a JSON object, got {type(structured_data).__name__}"
            )

        self.logger.info("Successfully extracted structured customer information")
        return structured_data

    except Exception as e:
        self.logger.error(f"Structured data extraction failed: {str(e)}")
        # Return minimal structure to prevent complete failure
        return self._get_fallback_structured_data(customer_info_text)
184
+
185
+ def _enhance_pain_point_analysis(self, structured_data: Dict[str, Any], customer_info_text: str) -> Dict[str, Any]:
186
+ """
187
+ Enhance pain point identification with additional analysis.
188
+
189
+ Args:
190
+ structured_data: Initial structured data from LLM
191
+ customer_info_text: Original customer information text
192
+
193
+ Returns:
194
+ Enhanced structured data with better pain point analysis
195
+ """
196
+ try:
197
+ current_pain_points = structured_data.get('painPoints', [])
198
+
199
+ # If pain points are insufficient, enhance them
200
+ if len(current_pain_points) < 2 or not self._are_pain_points_detailed(current_pain_points):
201
+ enhanced_pain_points = self._generate_enhanced_pain_points(structured_data, customer_info_text)
202
+ if enhanced_pain_points:
203
+ structured_data['painPoints'] = enhanced_pain_points
204
+
205
+ # Categorize and prioritize pain points
206
+ structured_data['painPoints'] = self._categorize_and_prioritize_pain_points(structured_data['painPoints'])
207
+
208
+ return structured_data
209
+
210
+ except Exception as e:
211
+ self.logger.error(f"Pain point enhancement failed: {str(e)}")
212
+ return structured_data
213
+
214
+ def _are_pain_points_detailed(self, pain_points: List[Dict[str, Any]]) -> bool:
215
+ """
216
+ Check if pain points are detailed enough.
217
+
218
+ Args:
219
+ pain_points: List of pain point dictionaries
220
+
221
+ Returns:
222
+ True if pain points are sufficiently detailed
223
+ """
224
+ if not pain_points:
225
+ return False
226
+
227
+ for pain_point in pain_points:
228
+ description = pain_point.get('description', '')
229
+ if len(description) < 20: # Too short to be meaningful
230
+ return False
231
+
232
+ return True
233
+
234
def _generate_enhanced_pain_points(self, structured_data: Dict[str, Any], customer_info_text: str) -> Optional[List[Dict[str, Any]]]:
    """
    Ask the LLM for a focused list of pain points.

    In dry-run mode the mock pain points are returned and no LLM call is
    made.

    Args:
        structured_data: Current structured data
        customer_info_text: Original customer information

    Returns:
        Non-empty list parsed from the LLM reply, or None when the reply is
        not a non-empty list or any step raises
    """
    try:
        if self.is_dry_run():
            return self._get_mock_pain_points()

        profile = structured_data.get('companyInfo', {})
        industry = profile.get('industry', '')
        company_size = profile.get('size', '')

        # NOTE(review): the listing this was taken from strips leading
        # whitespace; prompt lines are reproduced as shown there - confirm
        # the indentation of the JSON example against the packaged file.
        pain_point_prompt = f"""Analyze the following company information and identify specific, actionable pain points:

Company Information: {customer_info_text}
Industry: {industry}
Company Size: {company_size}

Based on this information, identify 3-5 specific pain points this company likely faces. For each pain point, provide:
1. Category (e.g., "Operational Efficiency", "Technology", "Financial", "Market Competition", "Customer Experience")
2. Detailed description of the specific challenge
3. Impact level and explanation (High/Medium/Low with reasoning)

Return as JSON array:
[
{{
"category": "category name",
"description": "detailed description of the pain point",
"impact": "impact level with explanation"
}}
]

Focus on realistic, industry-specific challenges that would resonate with the company."""

        raw_reply = self.call_llm(pain_point_prompt, temperature=0.4)
        candidates = self.parse_json_response(raw_reply)

        # Anything other than a non-empty list is treated as "no result".
        if not isinstance(candidates, list) or len(candidates) == 0:
            return None

        self.logger.info(f"Generated {len(candidates)} enhanced pain points")
        return candidates

    except Exception as e:
        self.logger.error(f"Enhanced pain point generation failed: {str(e)}")
        return None
287
+
288
+ def _get_mock_pain_points(self) -> List[Dict[str, Any]]:
289
+ """
290
+ Get mock pain points for dry run mode.
291
+
292
+ Returns:
293
+ Mock pain points list
294
+ """
295
+ return [
296
+ {
297
+ 'category': 'Operational Efficiency',
298
+ 'description': 'Manual processes and lack of automation leading to increased operational costs and slower response times',
299
+ 'impact': 'High - directly affects profitability and customer satisfaction'
300
+ },
301
+ {
302
+ 'category': 'Technology Infrastructure',
303
+ 'description': 'Outdated systems and lack of integration between different business tools',
304
+ 'impact': 'Medium - limiting scalability and data-driven decision making'
305
+ },
306
+ {
307
+ 'category': 'Market Competition',
308
+ 'description': 'Increasing competition from digital-first companies with more agile business models',
309
+ 'impact': 'High - threatening market share and pricing power'
310
+ },
311
+ {
312
+ 'category': 'Customer Experience',
313
+ 'description': 'Inconsistent customer touchpoints and limited self-service options',
314
+ 'impact': 'Medium - affecting customer retention and acquisition costs'
315
+ }
316
+ ]
317
+
318
+ def _categorize_and_prioritize_pain_points(self, pain_points: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
319
+ """
320
+ Categorize and prioritize pain points.
321
+
322
+ Args:
323
+ pain_points: List of pain point dictionaries
324
+
325
+ Returns:
326
+ Categorized and prioritized pain points
327
+ """
328
+ try:
329
+ # Define priority mapping
330
+ impact_priority = {
331
+ 'high': 3,
332
+ 'medium': 2,
333
+ 'low': 1
334
+ }
335
+
336
+ # Add priority scores and normalize categories
337
+ for pain_point in pain_points:
338
+ # Normalize impact to get priority
339
+ impact = pain_point.get('impact', '').lower()
340
+ if 'high' in impact:
341
+ pain_point['priority'] = 3
342
+ elif 'medium' in impact:
343
+ pain_point['priority'] = 2
344
+ else:
345
+ pain_point['priority'] = 1
346
+
347
+ # Normalize category
348
+ category = pain_point.get('category', '').strip()
349
+ pain_point['category'] = self._normalize_pain_point_category(category)
350
+
351
+ # Sort by priority (highest first)
352
+ pain_points.sort(key=lambda x: x.get('priority', 0), reverse=True)
353
+
354
+ return pain_points
355
+
356
+ except Exception as e:
357
+ self.logger.error(f"Pain point categorization failed: {str(e)}")
358
+ return pain_points
359
+
360
+ def _normalize_pain_point_category(self, category: str) -> str:
361
+ """
362
+ Normalize pain point category names.
363
+
364
+ Args:
365
+ category: Original category name
366
+
367
+ Returns:
368
+ Normalized category name
369
+ """
370
+ category_mapping = {
371
+ 'operational': 'Operational Efficiency',
372
+ 'operations': 'Operational Efficiency',
373
+ 'efficiency': 'Operational Efficiency',
374
+ 'technology': 'Technology Infrastructure',
375
+ 'tech': 'Technology Infrastructure',
376
+ 'it': 'Technology Infrastructure',
377
+ 'financial': 'Financial Management',
378
+ 'finance': 'Financial Management',
379
+ 'money': 'Financial Management',
380
+ 'market': 'Market Competition',
381
+ 'competition': 'Market Competition',
382
+ 'competitive': 'Market Competition',
383
+ 'customer': 'Customer Experience',
384
+ 'customers': 'Customer Experience',
385
+ 'service': 'Customer Experience',
386
+ 'sales': 'Sales & Marketing',
387
+ 'marketing': 'Sales & Marketing',
388
+ 'growth': 'Business Growth',
389
+ 'scaling': 'Business Growth',
390
+ 'compliance': 'Regulatory Compliance',
391
+ 'legal': 'Regulatory Compliance',
392
+ 'hr': 'Human Resources',
393
+ 'talent': 'Human Resources',
394
+ 'staff': 'Human Resources'
395
+ }
396
+
397
+ category_lower = category.lower().strip()
398
+
399
+ # Check for exact matches first
400
+ if category_lower in category_mapping:
401
+ return category_mapping[category_lower]
402
+
403
+ # Check for partial matches
404
+ for key, value in category_mapping.items():
405
+ if key in category_lower:
406
+ return value
407
+
408
+ # Return original if no match found
409
+ return category.title() if category else 'General Business'
410
+
411
+ def _enhance_financial_analysis(self, structured_data: Dict[str, Any], customer_info_text: str) -> Dict[str, Any]:
412
+ """
413
+ Enhance financial analysis with additional insights.
414
+
415
+ Args:
416
+ structured_data: Current structured data
417
+ customer_info_text: Original customer information
418
+
419
+ Returns:
420
+ Enhanced structured data with better financial analysis
421
+ """
422
+ try:
423
+ company_info = structured_data.get('companyInfo', {})
424
+ current_financial = structured_data.get('financialInfo', {})
425
+
426
+ # If financial info is sparse, enhance it
427
+ if not current_financial.get('revenueLastThreeYears') and not current_financial.get('profit'):
428
+ enhanced_financial = self._generate_financial_estimates(company_info, customer_info_text)
429
+ if enhanced_financial:
430
+ structured_data['financialInfo'].update(enhanced_financial)
431
+
432
+ # Add financial health assessment
433
+ structured_data['financialInfo']['healthAssessment'] = self._assess_financial_health(
434
+ structured_data['financialInfo'], company_info
435
+ )
436
+
437
+ return structured_data
438
+
439
+ except Exception as e:
440
+ self.logger.error(f"Financial analysis enhancement failed: {str(e)}")
441
+ return structured_data
442
+
443
def _generate_financial_estimates(self, company_info: Dict[str, Any], customer_info_text: str) -> Optional[Dict[str, Any]]:
    """
    Ask the LLM for financial estimates.

    In dry-run mode the mock financial data is returned and no LLM call is
    made. Only the first 500 characters of ``customer_info_text`` are put
    into the prompt.

    Args:
        company_info: Company information
        customer_info_text: Original customer information

    Returns:
        Dict parsed from the LLM reply, or None when the reply is not a
        dict or any step raises
    """
    try:
        if self.is_dry_run():
            return self._get_mock_financial_data()

        company_name = company_info.get('name', '')
        industry = company_info.get('industry', '')
        company_size = company_info.get('size', '')

        # NOTE(review): prompt lines are reproduced as they appear in the
        # whitespace-stripped listing - confirm inner indentation against
        # the packaged file.
        financial_prompt = f"""Based on the following company information, provide realistic financial estimates:

Company: {company_name}
Industry: {industry}
Size: {company_size}
Additional Info: {customer_info_text[:500]}

Provide financial estimates in JSON format:
{{
"estimatedAnnualRevenue": "revenue range estimate",
"revenueGrowthTrend": "growth trend analysis",
"profitMarginEstimate": "estimated profit margin percentage",
"fundingStage": "likely funding stage",
"financialChallenges": ["list of likely financial challenges"],
"revenueStreams": ["likely revenue streams"]
}}

Base estimates on industry standards and company size indicators. Be conservative and realistic."""

        raw_reply = self.call_llm(financial_prompt, temperature=0.3)
        estimates = self.parse_json_response(raw_reply)

        if not isinstance(estimates, dict):
            return None

        self.logger.info("Generated financial estimates")
        return estimates

    except Exception as e:
        self.logger.error(f"Financial estimate generation failed: {str(e)}")
        return None
493
+
494
+ def _get_mock_financial_data(self) -> Dict[str, Any]:
495
+ """
496
+ Get mock financial data for dry run mode.
497
+
498
+ Returns:
499
+ Mock financial data
500
+ """
501
+ return {
502
+ 'estimatedAnnualRevenue': '$2-5M',
503
+ 'revenueGrowthTrend': 'Steady growth of 15-20% annually',
504
+ 'profitMarginEstimate': '12-18%',
505
+ 'fundingStage': 'Self-funded or Series A',
506
+ 'financialChallenges': [
507
+ 'Cash flow management during growth phases',
508
+ 'Balancing investment in growth vs profitability',
509
+ 'Managing operational costs as scale increases'
510
+ ],
511
+ 'revenueStreams': [
512
+ 'Product sales',
513
+ 'Service contracts',
514
+ 'Recurring subscriptions'
515
+ ]
516
+ }
517
+
518
+ def _assess_financial_health(self, financial_info: Dict[str, Any], company_info: Dict[str, Any]) -> Dict[str, Any]:
519
+ """
520
+ Assess financial health based on available information.
521
+
522
+ Args:
523
+ financial_info: Financial information
524
+ company_info: Company information
525
+
526
+ Returns:
527
+ Financial health assessment
528
+ """
529
+ try:
530
+ assessment = {
531
+ 'overallRating': 'Unknown',
532
+ 'strengths': [],
533
+ 'concerns': [],
534
+ 'recommendations': []
535
+ }
536
+
537
+ # Analyze revenue trend if available
538
+ revenue_years = financial_info.get('revenueLastThreeYears', [])
539
+ if len(revenue_years) >= 2:
540
+ # Calculate growth trend
541
+ recent_revenue = revenue_years[-1].get('revenue', 0)
542
+ previous_revenue = revenue_years[-2].get('revenue', 0)
543
+
544
+ if previous_revenue > 0:
545
+ growth_rate = ((recent_revenue - previous_revenue) / previous_revenue) * 100
546
+
547
+ if growth_rate > 20:
548
+ assessment['strengths'].append('Strong revenue growth')
549
+ assessment['overallRating'] = 'Good'
550
+ elif growth_rate > 0:
551
+ assessment['strengths'].append('Positive revenue growth')
552
+ assessment['overallRating'] = 'Fair'
553
+ else:
554
+ assessment['concerns'].append('Declining revenue trend')
555
+ assessment['overallRating'] = 'Concerning'
556
+
557
+ # Analyze profit margins
558
+ profit = financial_info.get('profit', 0)
559
+ if profit > 0:
560
+ assessment['strengths'].append('Profitable operations')
561
+ elif profit < 0:
562
+ assessment['concerns'].append('Operating at a loss')
563
+
564
+ # Industry-specific analysis
565
+ industry = company_info.get('industry', '').lower()
566
+ if 'technology' in industry or 'software' in industry:
567
+ assessment['recommendations'].append('Focus on recurring revenue models')
568
+ assessment['recommendations'].append('Invest in R&D for competitive advantage')
569
+ elif 'manufacturing' in industry:
570
+ assessment['recommendations'].append('Optimize supply chain efficiency')
571
+ assessment['recommendations'].append('Consider automation investments')
572
+
573
+ # General recommendations
574
+ if not assessment['recommendations']:
575
+ assessment['recommendations'] = [
576
+ 'Diversify revenue streams',
577
+ 'Improve operational efficiency',
578
+ 'Build cash reserves for growth opportunities'
579
+ ]
580
+
581
+ return assessment
582
+
583
+ except Exception as e:
584
+ self.logger.error(f"Financial health assessment failed: {str(e)}")
585
+ return {
586
+ 'overallRating': 'Unknown',
587
+ 'strengths': [],
588
+ 'concerns': [],
589
+ 'recommendations': ['Conduct detailed financial analysis']
590
+ }
591
+
592
+ def _enhance_research_analysis(self, structured_data: Dict[str, Any], customer_info_text: str) -> Dict[str, Any]:
593
+ """
594
+ Enhance research and development analysis.
595
+
596
+ Args:
597
+ structured_data: Current structured data
598
+ customer_info_text: Original customer information
599
+
600
+ Returns:
601
+ Enhanced structured data with R&D analysis
602
+ """
603
+ try:
604
+ company_info = structured_data.get('companyInfo', {})
605
+ current_tech = structured_data.get('technologyAndInnovation', {})
606
+
607
+ # Enhance technology stack analysis
608
+ enhanced_tech = self._analyze_technology_stack(company_info, customer_info_text)
609
+ if enhanced_tech:
610
+ current_tech.update(enhanced_tech)
611
+
612
+ # Enhance development plans
613
+ enhanced_plans = self._analyze_development_plans(structured_data, customer_info_text)
614
+ if enhanced_plans:
615
+ structured_data['developmentPlans'].update(enhanced_plans)
616
+
617
+ # Add competitive analysis
618
+ structured_data['competitiveAnalysis'] = self._generate_competitive_analysis(
619
+ company_info, customer_info_text
620
+ )
621
+
622
+ return structured_data
623
+
624
+ except Exception as e:
625
+ self.logger.error(f"Research analysis enhancement failed: {str(e)}")
626
+ return structured_data
627
+
628
def _analyze_technology_stack(self, company_info: Dict[str, Any], customer_info_text: str) -> Optional[Dict[str, Any]]:
    """
    Ask the LLM for a technology stack assessment.

    In dry-run mode the mock analysis is returned and no LLM call is made.
    Only the first 400 characters of ``customer_info_text`` are put into
    the prompt.

    Args:
        company_info: Company information
        customer_info_text: Original customer information

    Returns:
        Dict parsed from the LLM reply, or None when the reply is not a
        dict or any step raises
    """
    try:
        if self.is_dry_run():
            return self._get_mock_technology_analysis()

        industry = company_info.get('industry', '')
        company_size = company_info.get('size', '')

        # NOTE(review): prompt lines are reproduced as they appear in the
        # whitespace-stripped listing - confirm inner indentation against
        # the packaged file.
        tech_prompt = f"""Analyze the likely technology stack and innovation needs for this company:

Industry: {industry}
Company Size: {company_size}
Company Info: {customer_info_text[:400]}

Provide analysis in JSON format:
{{
"likelyTechStack": ["list of technologies they probably use"],
"technologyGaps": ["areas where they might need technology improvements"],
"innovationOpportunities": ["potential areas for innovation"],
"digitalMaturityLevel": "assessment of digital maturity (Basic/Intermediate/Advanced)",
"recommendedTechnologies": ["technologies that could benefit them"]
}}

Focus on realistic, industry-appropriate technology assessments."""

        raw_reply = self.call_llm(tech_prompt, temperature=0.3)
        analysis = self.parse_json_response(raw_reply)

        if not isinstance(analysis, dict):
            return None

        self.logger.info("Generated technology stack analysis")
        return analysis

    except Exception as e:
        self.logger.error(f"Technology stack analysis failed: {str(e)}")
        return None
675
+
676
+ def _get_mock_technology_analysis(self) -> Dict[str, Any]:
677
+ """
678
+ Get mock technology analysis for dry run mode.
679
+
680
+ Returns:
681
+ Mock technology analysis
682
+ """
683
+ return {
684
+ 'likelyTechStack': ['CRM System', 'Email Marketing', 'Basic Analytics', 'Office Suite'],
685
+ 'technologyGaps': ['Marketing Automation', 'Advanced Analytics', 'Customer Support Tools'],
686
+ 'innovationOpportunities': ['AI-powered customer insights', 'Process automation', 'Mobile solutions'],
687
+ 'digitalMaturityLevel': 'Intermediate',
688
+ 'recommendedTechnologies': ['Marketing Automation Platform', 'Business Intelligence Tools', 'Cloud Infrastructure']
689
+ }
690
+
691
+ def _analyze_development_plans(self, structured_data: Dict[str, Any], customer_info_text: str) -> Optional[Dict[str, Any]]:
692
+ """
693
+ Analyze and enhance development plans.
694
+
695
+ Args:
696
+ structured_data: Current structured data
697
+ customer_info_text: Original customer information
698
+
699
+ Returns:
700
+ Enhanced development plans or None if failed
701
+ """
702
+ try:
703
+ company_info = structured_data.get('companyInfo', {})
704
+ pain_points = structured_data.get('painPoints', [])
705
+
706
+ # Extract key challenges for development planning
707
+ key_challenges = [pp.get('description', '') for pp in pain_points[:3]]
708
+
709
+ development_analysis = {
710
+ 'priorityAreas': self._identify_priority_development_areas(company_info, pain_points),
711
+ 'timelineEstimates': self._estimate_development_timelines(company_info),
712
+ 'resourceRequirements': self._estimate_resource_requirements(company_info, pain_points),
713
+ 'riskFactors': self._identify_development_risks(company_info, pain_points)
714
+ }
715
+
716
+ return development_analysis
717
+
718
+ except Exception as e:
719
+ self.logger.error(f"Development plans analysis failed: {str(e)}")
720
+ return None
721
+
722
+ def _identify_priority_development_areas(self, company_info: Dict[str, Any], pain_points: List[Dict[str, Any]]) -> List[str]:
723
+ """
724
+ Identify priority development areas based on pain points.
725
+
726
+ Args:
727
+ company_info: Company information
728
+ pain_points: List of pain points
729
+
730
+ Returns:
731
+ List of priority development areas
732
+ """
733
+ priority_areas = []
734
+
735
+ for pain_point in pain_points:
736
+ category = pain_point.get('category', '').lower()
737
+
738
+ if 'operational' in category or 'efficiency' in category:
739
+ priority_areas.append('Process Optimization')
740
+ elif 'technology' in category:
741
+ priority_areas.append('Technology Modernization')
742
+ elif 'customer' in category:
743
+ priority_areas.append('Customer Experience Enhancement')
744
+ elif 'financial' in category:
745
+ priority_areas.append('Financial Management Systems')
746
+ elif 'market' in category or 'competition' in category:
747
+ priority_areas.append('Market Expansion Strategy')
748
+
749
+ # Remove duplicates and limit to top 5
750
+ return list(dict.fromkeys(priority_areas))[:5]
751
+
752
+ def _estimate_development_timelines(self, company_info: Dict[str, Any]) -> Dict[str, str]:
753
+ """
754
+ Estimate development timelines based on company size.
755
+
756
+ Args:
757
+ company_info: Company information
758
+
759
+ Returns:
760
+ Timeline estimates
761
+ """
762
+ company_size = company_info.get('size', '').lower()
763
+
764
+ if 'small' in company_size or 'startup' in company_size:
765
+ return {
766
+ 'shortTerm': '3-6 months',
767
+ 'mediumTerm': '6-12 months',
768
+ 'longTerm': '1-2 years'
769
+ }
770
+ elif 'large' in company_size or 'enterprise' in company_size:
771
+ return {
772
+ 'shortTerm': '6-12 months',
773
+ 'mediumTerm': '1-2 years',
774
+ 'longTerm': '2-3 years'
775
+ }
776
+ else: # Medium size
777
+ return {
778
+ 'shortTerm': '4-8 months',
779
+ 'mediumTerm': '8-18 months',
780
+ 'longTerm': '1.5-2.5 years'
781
+ }
782
+
783
+ def _estimate_resource_requirements(self, company_info: Dict[str, Any], pain_points: List[Dict[str, Any]]) -> Dict[str, Any]:
784
+ """
785
+ Estimate resource requirements for development.
786
+
787
+ Args:
788
+ company_info: Company information
789
+ pain_points: List of pain points
790
+
791
+ Returns:
792
+ Resource requirement estimates
793
+ """
794
+ return {
795
+ 'budgetRange': 'Varies by project scope',
796
+ 'keyRoles': ['Project Manager', 'Technical Lead', 'Business Analyst'],
797
+ 'externalSupport': 'May require consultants for specialized areas',
798
+ 'trainingNeeds': 'Staff training on new processes and technologies'
799
+ }
800
+
801
+ def _identify_development_risks(self, company_info: Dict[str, Any], pain_points: List[Dict[str, Any]]) -> List[str]:
802
+ """
803
+ Identify potential development risks.
804
+
805
+ Args:
806
+ company_info: Company information
807
+ pain_points: List of pain points
808
+
809
+ Returns:
810
+ List of development risks
811
+ """
812
+ return [
813
+ 'Resource allocation conflicts with daily operations',
814
+ 'Change management resistance from staff',
815
+ 'Technology integration challenges',
816
+ 'Budget overruns due to scope creep',
817
+ 'Timeline delays due to unforeseen complications'
818
+ ]
819
+
820
+ def _generate_competitive_analysis(self, company_info: Dict[str, Any], customer_info_text: str) -> Dict[str, Any]:
821
+ """
822
+ Generate competitive analysis insights.
823
+
824
+ Args:
825
+ company_info: Company information
826
+ customer_info_text: Original customer information
827
+
828
+ Returns:
829
+ Competitive analysis insights
830
+ """
831
+ try:
832
+ industry = company_info.get('industry', '')
833
+ company_size = company_info.get('size', '')
834
+
835
+ return {
836
+ 'competitivePosition': self._assess_competitive_position(industry, company_size),
837
+ 'marketTrends': self._identify_market_trends(industry),
838
+ 'competitiveAdvantages': self._identify_potential_advantages(company_info),
839
+ 'threats': self._identify_competitive_threats(industry, company_size),
840
+ 'opportunities': self._identify_market_opportunities(industry, company_size)
841
+ }
842
+
843
+ except Exception as e:
844
+ self.logger.error(f"Competitive analysis failed: {str(e)}")
845
+ return {
846
+ 'competitivePosition': 'Analysis pending',
847
+ 'marketTrends': [],
848
+ 'competitiveAdvantages': [],
849
+ 'threats': [],
850
+ 'opportunities': []
851
+ }
852
+
853
+ def _assess_competitive_position(self, industry: str, company_size: str) -> str:
854
+ """Assess competitive position based on industry and size."""
855
+ if 'small' in company_size.lower():
856
+ return 'Niche player with agility advantages'
857
+ elif 'large' in company_size.lower():
858
+ return 'Established player with resource advantages'
859
+ else:
860
+ return 'Mid-market player with growth potential'
861
+
862
+ def _identify_market_trends(self, industry: str) -> List[str]:
863
+ """Identify relevant market trends."""
864
+ industry_lower = industry.lower()
865
+
866
+ if 'technology' in industry_lower or 'software' in industry_lower:
867
+ return ['Digital transformation acceleration', 'AI/ML adoption', 'Cloud migration', 'Remote work tools']
868
+ elif 'retail' in industry_lower or 'ecommerce' in industry_lower:
869
+ return ['Omnichannel experiences', 'Personalization', 'Sustainability focus', 'Mobile commerce']
870
+ elif 'healthcare' in industry_lower:
871
+ return ['Telemedicine growth', 'Digital health records', 'Patient experience focus', 'Regulatory compliance']
872
+ else:
873
+ return ['Digital transformation', 'Customer experience focus', 'Operational efficiency', 'Sustainability']
874
+
875
+ def _identify_potential_advantages(self, company_info: Dict[str, Any]) -> List[str]:
876
+ """Identify potential competitive advantages."""
877
+ return [
878
+ 'Local market knowledge',
879
+ 'Personalized customer service',
880
+ 'Agile decision making',
881
+ 'Specialized expertise'
882
+ ]
883
+
884
+ def _identify_competitive_threats(self, industry: str, company_size: str) -> List[str]:
885
+ """Identify competitive threats."""
886
+ return [
887
+ 'Larger competitors with more resources',
888
+ 'New market entrants with innovative solutions',
889
+ 'Price competition from low-cost providers',
890
+ 'Technology disruption changing industry dynamics'
891
+ ]
892
+
893
+ def _identify_market_opportunities(self, industry: str, company_size: str) -> List[str]:
894
+ """Identify market opportunities."""
895
+ return [
896
+ 'Underserved market segments',
897
+ 'Technology adoption gaps',
898
+ 'Partnership opportunities',
899
+ 'Geographic expansion potential'
900
+ ]
901
+
902
def _get_llm_instruction(self) -> str:
    """
    Get the LLM instruction from the original YAML workflow.

    The text is a constant: it sets the analyst role and objective, embeds
    the expected output schema, and lists the key focus areas (pain points,
    accuracy, fallbacks for unavailable data).

    Returns:
        LLM instruction text
    """
    # NOTE: the embedded schema is written with single quotes, so it is a
    # shape description for the model rather than strict JSON. The literal is
    # runtime text sent as-is; edit it only when a prompt change is intended.
    return """Role: Customer research analyst conducting comprehensive data gathering on provided companies.

Objective: Based on the provided customer information. Conduct a comprehensive search to infer detailed customer information. Use online search tools, company databases, and public sources to gather accurate, up-to-date data. Ensure all fields in the JSON schema below are completed with reliable information.

If information is unavailable, use an empty string ('') for string fields. However, painPoints must always contain relevant data inferred from the company's description, industry, or general challenges associated with its sector.

Return only the JSON result, strictly following the schema, without any additional explanation.

**JSON Schema**:
```
{'companyInfo':{'name':'','industry':'','size':'','annualRevenue':'','address':'','website':''},'primaryContact':{'name':'','position':'','email':'','phone':'','linkedIn':''},'currentTechStack':[],'painPoints':[{'category':'','description':'','impact':''}],'financialInfo':{'revenueLastThreeYears':[{'year':0,'revenue':0}],'profit':0,'fundingSources':[]},'legalInfo':{'taxCode':'','businessLicense':'','foundingYear':0},'productsAndServices':{'mainProducts':[],'targetMarket':[]},'developmentPlans':{'shortTermGoals':[],'longTermGoals':[]},'technologyAndInnovation':{'rdProjects':[],'patents':[{'name':'','number':'','filingDate':''}]}}
```
**Key Focus Areas**:
1. Pain Points: Highlight specific issues the company may face, such as financial challenges, operational inefficiencies, market positioning struggles, or customer satisfaction concerns. Always include specific issues the company may face, inferred from its description, industry, or general market challenges.
2. Accuracy: Ensure all provided data is reliable and up-to-date.
3. Fallbacks: For unavailable data, fill fields with empty strings ('') or empty arrays ([]).
Note: Return only the JSON output, without the json keyword or additional commentary."""
926
+
927
+ def _get_mock_structured_data(self) -> Dict[str, Any]:
928
+ """
929
+ Get mock structured data for dry run mode.
930
+
931
+ Returns:
932
+ Mock structured customer data
933
+ """
934
+ return {
935
+ 'companyInfo': {
936
+ 'name': 'Example Corp',
937
+ 'industry': 'Technology',
938
+ 'size': 'Medium (50-200 employees)',
939
+ 'annualRevenue': '$5-10M',
940
+ 'address': '123 Main St, City, State',
941
+ 'website': 'https://example.com'
942
+ },
943
+ 'primaryContact': {
944
+ 'name': 'John Doe',
945
+ 'position': 'CEO',
946
+ 'email': 'john@example.com',
947
+ 'phone': '+1-555-0123',
948
+ 'linkedIn': 'https://linkedin.com/in/johndoe'
949
+ },
950
+ 'currentTechStack': ['CRM', 'Email Marketing', 'Analytics'],
951
+ 'painPoints': [
952
+ {
953
+ 'category': 'Operational Efficiency',
954
+ 'description': 'Manual processes causing delays and errors',
955
+ 'impact': 'High - affecting customer satisfaction and costs'
956
+ },
957
+ {
958
+ 'category': 'Data Management',
959
+ 'description': 'Scattered data across multiple systems',
960
+ 'impact': 'Medium - limiting insights and decision making'
961
+ }
962
+ ],
963
+ 'financialInfo': {
964
+ 'revenueLastThreeYears': [
965
+ {'year': 2023, 'revenue': 8500000},
966
+ {'year': 2022, 'revenue': 7200000},
967
+ {'year': 2021, 'revenue': 6100000}
968
+ ],
969
+ 'profit': 1200000,
970
+ 'fundingSources': ['Self-funded', 'Bank loan']
971
+ },
972
+ 'legalInfo': {
973
+ 'taxCode': 'TC123456789',
974
+ 'businessLicense': 'BL987654321',
975
+ 'foundingYear': 2018
976
+ },
977
+ 'productsAndServices': {
978
+ 'mainProducts': ['Software Solutions', 'Consulting Services'],
979
+ 'targetMarket': ['SMB', 'Enterprise']
980
+ },
981
+ 'developmentPlans': {
982
+ 'shortTermGoals': ['Improve operational efficiency', 'Expand customer base'],
983
+ 'longTermGoals': ['International expansion', 'Product diversification']
984
+ },
985
+ 'technologyAndInnovation': {
986
+ 'rdProjects': ['AI Integration', 'Mobile App Development'],
987
+ 'patents': [
988
+ {
989
+ 'name': 'Automated Process Management',
990
+ 'number': 'US123456789',
991
+ 'filingDate': '2023-01-15'
992
+ }
993
+ ]
994
+ }
995
+ }
996
+
997
+ def _get_fallback_structured_data(self, customer_info_text: str) -> Dict[str, Any]:
998
+ """
999
+ Get fallback structured data when LLM extraction fails.
1000
+ Uses data from acquisition stage if available.
1001
+
1002
+ Args:
1003
+ customer_info_text: Original customer information text
1004
+
1005
+ Returns:
1006
+ Minimal structured customer data with available contact info
1007
+ """
1008
+ # Try to get acquisition data from context
1009
+ acquisition_data = getattr(self, '_current_acquisition_data', {})
1010
+
1011
+ return {
1012
+ 'companyInfo': {
1013
+ 'name': acquisition_data.get('company_name', ''),
1014
+ 'industry': '',
1015
+ 'size': '',
1016
+ 'annualRevenue': '',
1017
+ 'address': acquisition_data.get('customer_address', ''),
1018
+ 'website': acquisition_data.get('company_website', '')
1019
+ },
1020
+ 'primaryContact': {
1021
+ 'name': acquisition_data.get('contact_name', ''),
1022
+ 'position': '',
1023
+ 'email': acquisition_data.get('customer_email', ''),
1024
+ 'phone': acquisition_data.get('customer_phone', ''),
1025
+ 'linkedIn': acquisition_data.get('customer_linkedin', '')
1026
+ },
1027
+ 'currentTechStack': [],
1028
+ 'painPoints': [
1029
+ {
1030
+ 'category': 'General Business Challenges',
1031
+ 'description': 'Common business challenges that may affect operational efficiency and growth',
1032
+ 'impact': 'Medium - typical for businesses in competitive markets'
1033
+ }
1034
+ ],
1035
+ 'financialInfo': {
1036
+ 'revenueLastThreeYears': [],
1037
+ 'profit': 0,
1038
+ 'fundingSources': []
1039
+ },
1040
+ 'legalInfo': {
1041
+ 'taxCode': '',
1042
+ 'businessLicense': '',
1043
+ 'foundingYear': 0
1044
+ },
1045
+ 'productsAndServices': {
1046
+ 'mainProducts': [],
1047
+ 'targetMarket': []
1048
+ },
1049
+ 'developmentPlans': {
1050
+ 'shortTermGoals': [],
1051
+ 'longTermGoals': []
1052
+ },
1053
+ 'technologyAndInnovation': {
1054
+ 'rdProjects': [],
1055
+ 'patents': []
1056
+ },
1057
+ 'rawCustomerInfo': customer_info_text[:1000] + "..." if len(customer_info_text) > 1000 else customer_info_text
1058
+ }
1059
+
1060
+ def _validate_and_clean_data(self, structured_data: Dict[str, Any]) -> Dict[str, Any]:
1061
+ """
1062
+ Validate and clean the structured data.
1063
+
1064
+ Args:
1065
+ structured_data: Raw structured data from LLM
1066
+
1067
+ Returns:
1068
+ Validated and cleaned structured data
1069
+ """
1070
+ try:
1071
+ # Ensure all required sections exist
1072
+ required_sections = [
1073
+ 'companyInfo', 'primaryContact', 'currentTechStack', 'painPoints',
1074
+ 'financialInfo', 'legalInfo', 'productsAndServices',
1075
+ 'developmentPlans', 'technologyAndInnovation'
1076
+ ]
1077
+
1078
+ for section in required_sections:
1079
+ if section not in structured_data:
1080
+ structured_data[section] = {}
1081
+
1082
+ # Validate companyInfo
1083
+ company_info = structured_data.get('companyInfo', {})
1084
+ required_company_fields = ['name', 'industry', 'size', 'annualRevenue', 'address', 'website']
1085
+ for field in required_company_fields:
1086
+ if field not in company_info:
1087
+ company_info[field] = ''
1088
+
1089
+ # Validate primaryContact
1090
+ contact = structured_data.get('primaryContact', {})
1091
+ required_contact_fields = ['name', 'position', 'email', 'phone', 'linkedIn']
1092
+ for field in required_contact_fields:
1093
+ if field not in contact:
1094
+ contact[field] = ''
1095
+
1096
+ # Ensure painPoints is always a list with at least one item
1097
+ pain_points = structured_data.get('painPoints', [])
1098
+ if not pain_points or not isinstance(pain_points, list):
1099
+ pain_points = [
1100
+ {
1101
+ 'category': 'Business Operations',
1102
+ 'description': 'General operational challenges common in the industry',
1103
+ 'impact': 'Medium'
1104
+ }
1105
+ ]
1106
+ structured_data['painPoints'] = pain_points
1107
+
1108
+ # Validate financial info
1109
+ financial_info = structured_data.get('financialInfo', {})
1110
+ if 'revenueLastThreeYears' not in financial_info:
1111
+ financial_info['revenueLastThreeYears'] = []
1112
+ if 'profit' not in financial_info:
1113
+ financial_info['profit'] = 0
1114
+ if 'fundingSources' not in financial_info:
1115
+ financial_info['fundingSources'] = []
1116
+
1117
+ # Validate legal info
1118
+ legal_info = structured_data.get('legalInfo', {})
1119
+ required_legal_fields = ['taxCode', 'businessLicense', 'foundingYear']
1120
+ for field in required_legal_fields:
1121
+ if field not in legal_info:
1122
+ legal_info[field] = '' if field != 'foundingYear' else 0
1123
+
1124
+ # Ensure lists are actually lists
1125
+ list_fields = [
1126
+ ('currentTechStack', []),
1127
+ ('productsAndServices', {'mainProducts': [], 'targetMarket': []}),
1128
+ ('developmentPlans', {'shortTermGoals': [], 'longTermGoals': []}),
1129
+ ('technologyAndInnovation', {'rdProjects': [], 'patents': []})
1130
+ ]
1131
+
1132
+ for field, default in list_fields:
1133
+ if field not in structured_data:
1134
+ structured_data[field] = default
1135
+ elif isinstance(default, dict):
1136
+ for subfield, subdefault in default.items():
1137
+ if subfield not in structured_data[field]:
1138
+ structured_data[field][subfield] = subdefault
1139
+
1140
+ self.logger.info("Successfully validated and cleaned structured data")
1141
+ return structured_data
1142
+
1143
+ except Exception as e:
1144
+ self.logger.error(f"Data validation failed: {str(e)}")
1145
+ return structured_data # Return as-is if validation fails
1146
+
1147
+ def _save_customer_data(self, context: Dict[str, Any], structured_data: Dict[str, Any]) -> None:
1148
+ """
1149
+ Save customer data to local database.
1150
+
1151
+ Args:
1152
+ context: Execution context
1153
+ structured_data: Structured customer data
1154
+ """
1155
+ try:
1156
+ execution_id = context.get('execution_id')
1157
+ task_id = context.get('task_id', execution_id)
1158
+ company_info = structured_data.get('companyInfo', {})
1159
+ contact_info = structured_data.get('primaryContact', {})
1160
+
1161
+ # Save to customers table (basic customer info)
1162
+ customer_data = {
1163
+ 'customer_id': execution_id,
1164
+ 'org_id': self.config.get('org_id', ''),
1165
+ 'company_name': company_info.get('name', ''),
1166
+ 'website': company_info.get('website', ''),
1167
+ 'industry': company_info.get('industry', ''),
1168
+ 'contact_name': contact_info.get('name', ''),
1169
+ 'contact_email': contact_info.get('email', ''),
1170
+ 'contact_phone': contact_info.get('phone', ''),
1171
+ 'address': company_info.get('address', ''),
1172
+ 'profile_data': json.dumps(structured_data)
1173
+ }
1174
+
1175
+ # Save customer data to customers table
1176
+ self.data_manager.save_customer(customer_data)
1177
+ self.logger.info(f"Customer data saved to customers table: {execution_id}")
1178
+
1179
+ # Save to gs_customer_llmtask table (server-compatible)
1180
+ customer_task_data = {
1181
+ 'task_id': task_id,
1182
+ 'customer_id': execution_id,
1183
+ 'customer_name': company_info.get('name', ''),
1184
+ 'customer_phone': contact_info.get('phone', ''),
1185
+ 'customer_address': company_info.get('address', ''),
1186
+ 'customer_email': contact_info.get('email', ''),
1187
+ 'customer_industry': company_info.get('industry', ''),
1188
+ 'customer_taxcode': company_info.get('taxCode', ''),
1189
+ 'customer_website': company_info.get('website', ''),
1190
+ 'contact_name': contact_info.get('name', ''),
1191
+ 'org_id': self.config.get('org_id', ''),
1192
+ 'org_name': self.config.get('org_name', ''),
1193
+ 'project_code': 'FUSESELL',
1194
+ 'crm_dob': contact_info.get('dateOfBirth'),
1195
+ 'image_url': ''
1196
+ }
1197
+
1198
+ # Save customer task data to gs_customer_llmtask table
1199
+ self.data_manager.save_customer_task(customer_task_data)
1200
+ self.logger.info(f"Customer task data saved to gs_customer_llmtask table: {task_id}")
1201
+
1202
+ except Exception as e:
1203
+ self.logger.warning(f"Failed to save customer data: {str(e)}")
1204
+
1205
def validate_input(self, context: Dict[str, Any]) -> bool:
    """
    Check whether the context carries enough data for this stage.

    Input is accepted when the context already holds a ``data_acquisition``
    entry under ``stage_results``, or, failing that, when ``input_data``
    has a non-empty ``customer_website`` or ``customer_description``.

    Args:
        context: Execution context

    Returns:
        True if input is valid
    """
    # Preferred source: output of the data acquisition stage.
    if 'data_acquisition' in context.get('stage_results', {}):
        return True

    # Otherwise accept raw input with at least one usable customer field.
    raw_input = context.get('input_data', {})
    for key in ('customer_website', 'customer_description'):
        if raw_input.get(key):
            return True
    return False
1223
+
1224
def get_required_fields(self) -> List[str]:
    """
    Report the input fields this stage requires.

    The list is empty: the stage works from the output of the
    data_acquisition stage rather than from named input fields.

    Returns:
        List of required field names
    """
    required_fields: List[str] = []
    return required_fields