leads-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. company_discovery/__init__.py +4 -0
  2. company_discovery/adapters/__init__.py +5 -0
  3. company_discovery/adapters/apollo.py +189 -0
  4. company_discovery/adapters/exa.py +112 -0
  5. company_discovery/adapters/llm.py +118 -0
  6. company_discovery/adapters/protocols.py +58 -0
  7. company_discovery/adapters/website.py +154 -0
  8. company_discovery/bundled_skills/__init__.py +1 -0
  9. company_discovery/bundled_skills/company-discovery-operator/SKILL.md +72 -0
  10. company_discovery/bundled_skills/company-discovery-operator/agents/openai.yaml +4 -0
  11. company_discovery/bundled_skills/company-enrichment-operator/SKILL.md +94 -0
  12. company_discovery/bundled_skills/company-enrichment-operator/agents/openai.yaml +4 -0
  13. company_discovery/bundled_skills/company-search-spec-writer/SKILL.md +109 -0
  14. company_discovery/bundled_skills/company-search-spec-writer/agents/openai.yaml +4 -0
  15. company_discovery/bundled_skills/contact-discovery-operator/SKILL.md +80 -0
  16. company_discovery/bundled_skills/contact-discovery-operator/agents/openai.yaml +4 -0
  17. company_discovery/bundled_skills/contact-enrichment-operator/SKILL.md +86 -0
  18. company_discovery/bundled_skills/contact-enrichment-operator/agents/openai.yaml +4 -0
  19. company_discovery/bundled_skills/contact-search-spec-writer/SKILL.md +86 -0
  20. company_discovery/bundled_skills/contact-search-spec-writer/agents/openai.yaml +4 -0
  21. company_discovery/bundled_skills/leads-update-operator/SKILL.md +60 -0
  22. company_discovery/bundled_skills/leads-update-operator/agents/openai.yaml +4 -0
  23. company_discovery/cli.py +1789 -0
  24. company_discovery/db/__init__.py +5 -0
  25. company_discovery/db/contact_enrichment_repository.py +268 -0
  26. company_discovery/db/contact_repository.py +366 -0
  27. company_discovery/db/enrichment_repository.py +207 -0
  28. company_discovery/db/models.py +324 -0
  29. company_discovery/db/repository.py +363 -0
  30. company_discovery/db/session.py +48 -0
  31. company_discovery/domain/__init__.py +24 -0
  32. company_discovery/domain/contact_models.py +178 -0
  33. company_discovery/domain/contact_spec.py +86 -0
  34. company_discovery/domain/models.py +287 -0
  35. company_discovery/domain/spec.py +263 -0
  36. company_discovery/migrations.py +190 -0
  37. company_discovery/prompts/__init__.py +8 -0
  38. company_discovery/prompts/candidate_evaluation/system.md +13 -0
  39. company_discovery/prompts/company_enrichment/system.md +42 -0
  40. company_discovery/prompts/contact_evaluation/system.md +18 -0
  41. company_discovery/prompts/query_generation/system.md +10 -0
  42. company_discovery/release_manifest.json +7 -0
  43. company_discovery/reports/__init__.py +4 -0
  44. company_discovery/reports/contact_enrichment_exporter.py +108 -0
  45. company_discovery/reports/contact_exporter.py +132 -0
  46. company_discovery/reports/enrichment_exporter.py +125 -0
  47. company_discovery/reports/exporter.py +135 -0
  48. company_discovery/runtime.py +336 -0
  49. company_discovery/services/__init__.py +4 -0
  50. company_discovery/services/contact_enrichment_pipeline.py +344 -0
  51. company_discovery/services/contact_enrichment_progress.py +37 -0
  52. company_discovery/services/contact_evaluator.py +110 -0
  53. company_discovery/services/contact_pipeline.py +295 -0
  54. company_discovery/services/contact_progress.py +38 -0
  55. company_discovery/services/enrichment_extractor.py +61 -0
  56. company_discovery/services/enrichment_pipeline.py +526 -0
  57. company_discovery/services/enrichment_progress.py +20 -0
  58. company_discovery/services/enrichment_resolver.py +148 -0
  59. company_discovery/services/evaluator.py +40 -0
  60. company_discovery/services/hygiene.py +51 -0
  61. company_discovery/services/memory.py +150 -0
  62. company_discovery/services/normalization.py +98 -0
  63. company_discovery/services/pipeline.py +628 -0
  64. company_discovery/services/progress.py +48 -0
  65. company_discovery/services/query_planner.py +47 -0
  66. company_discovery/settings.py +152 -0
  67. company_discovery/skill_installer.py +197 -0
  68. company_discovery/update_plan.py +79 -0
  69. leads_cli-0.1.0.dist-info/METADATA +277 -0
  70. leads_cli-0.1.0.dist-info/RECORD +72 -0
  71. leads_cli-0.1.0.dist-info/WHEEL +4 -0
  72. leads_cli-0.1.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,526 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Protocol
5
+
6
+ from company_discovery.adapters.protocols import CompanySearchProvider
7
+ from company_discovery.db.enrichment_repository import EnrichmentRepository
8
+ from company_discovery.domain.models import (
9
+ EnrichmentExtraction,
10
+ EnrichmentItem,
11
+ EnrichmentOutcome,
12
+ EnrichmentProfile,
13
+ EnrichmentRunResult,
14
+ EnrichmentSummary,
15
+ IndependenceStatus,
16
+ InheritedFieldStatus,
17
+ LinkedInObservation,
18
+ WebsitePage,
19
+ )
20
+ from company_discovery.domain.spec import CompanySearchSpec
21
+ from company_discovery.reports.enrichment_exporter import EnrichmentArtifactExporter
22
+ from company_discovery.services.enrichment_progress import (
23
+ EnrichmentProgressReporter,
24
+ NullEnrichmentProgressReporter,
25
+ )
26
+ from company_discovery.services.enrichment_resolver import (
27
+ normalize_linkedin_company_url,
28
+ resolve_independence,
29
+ resolve_linkedin,
30
+ resolve_location,
31
+ resolve_phone,
32
+ )
33
+
34
+
35
class WebsiteRetriever(Protocol):
    """Structural type for an object that can fetch website pages for a domain."""

    def fetch(self, domain: str) -> list[WebsitePage]:
        """Return the pages retrieved for ``domain``."""
        ...
37
+
38
+
39
class FactExtractor(Protocol):
    """Structural type for an object that turns fetched pages into enrichment facts."""

    def extract(
        self,
        discovery: dict[str, object],
        pages: list[WebsitePage],
    ) -> EnrichmentExtraction:
        """Return the facts extracted from ``pages`` for the company in ``discovery``."""
        ...
43
+
44
+
45
@dataclass(frozen=True)
class EnrichmentOptions:
    """Immutable settings for a single enrichment run."""

    # Discovery bucket whose candidates are enriched.
    bucket: str = "selected"
    # Candidate limit handed to the repository; None means no limit was requested.
    limit: int | None = None
    # Which cached facts to drop before enriching: "contact", "independence" or "all".
    # Any other value (including the default) leaves the cached profile untouched.
    refresh: str = "none"
    # When True, an unknown independence status no longer prevents a READY outcome.
    allow_unknown_independence: bool = False

    def as_dict(self) -> dict[str, object]:
        """Return the options as a plain dict, in field declaration order."""
        field_names = ("bucket", "limit", "refresh", "allow_unknown_independence")
        return {field_name: getattr(self, field_name) for field_name in field_names}
59
+
60
+
61
class EnrichmentPipeline:
    """Enrich discovery candidates with phone, address, LinkedIn and independence facts.

    For every candidate the pipeline reuses cached facts, reads the official website
    for whatever is still missing, optionally corroborates through a fallback search,
    and classifies the result into an ``EnrichmentOutcome``.
    """

    def __init__(
        self,
        *,
        repository: EnrichmentRepository,
        exporter: EnrichmentArtifactExporter,
        website: WebsiteRetriever | None,
        extractor: FactExtractor | None,
        fallback_search: CompanySearchProvider | None = None,
        freshness_days: int = 180,
        fallback_results: int = 5,
    ) -> None:
        """Store the collaborators used by the pipeline.

        Args:
            repository: Persistence for runs, items and cached profiles.
            exporter: Writes the run artifacts once all items are processed.
            website: Page retriever; ``None`` disables website reads.
            extractor: Fact extractor; ``None`` makes any extraction attempt raise.
            fallback_search: Optional search provider used for corroboration.
            freshness_days: Passed to ``repository.fresh_profile`` as the reuse window.
            fallback_results: Number of results requested per fallback search.
        """
        self._repository = repository
        self._exporter = exporter
        self._website = website
        self._extractor = extractor
        self._fallback_search = fallback_search
        self._freshness_days = freshness_days
        self._fallback_results = fallback_results

    def enrich(
        self,
        discovery_run_id: str,
        *,
        options: EnrichmentOptions | None = None,
        progress: EnrichmentProgressReporter | None = None,
    ) -> EnrichmentRunResult:
        """Enrich the candidates of a discovery run and return the run result.

        The candidates are loaded before the run row is created. Any exception raised
        while processing is recorded through ``repository.fail_run`` and re-raised.
        """
        options = options or EnrichmentOptions()
        reporter = progress or NullEnrichmentProgressReporter()
        candidates = self._repository.discovery_candidates(
            discovery_run_id, options.bucket, options.limit
        )
        run_id = self._repository.create_run(discovery_run_id, options.bucket, options.as_dict())
        try:
            return self._run(run_id, discovery_run_id, candidates, options, reporter)
        except Exception as exc:
            self._repository.fail_run(run_id, exc)
            raise

    def _run(
        self,
        run_id: str,
        discovery_run_id: str,
        candidates: list[dict[str, object]],
        options: EnrichmentOptions,
        reporter: EnrichmentProgressReporter,
    ) -> EnrichmentRunResult:
        """Process every candidate, export the artifacts and complete the run."""
        summary = EnrichmentSummary()
        items: list[EnrichmentItem] = []
        reporter.start(discovery_run_id, len(candidates), options.bucket)
        for index, record in enumerate(candidates, start=1):
            item = self._enrich_one(
                run_id, discovery_run_id, record, options, reporter, index, len(candidates), summary
            )
            items.append(item)
            # Each item is saved as soon as it is produced, before the next candidate starts.
            self._repository.save_item(run_id, item)
            self._count_outcome(summary, item.outcome)

        payload = {
            "run_id": run_id,
            "discovery_run_id": discovery_run_id,
            "bucket": options.bucket,
            "options": options.as_dict(),
            "status": "completed",
            "items": [item.model_dump(mode="json") for item in items],
        }
        paths = self._exporter.export(payload, summary)
        self._repository.complete_run(run_id, summary, paths)
        return EnrichmentRunResult(
            run_id=run_id,
            discovery_run_id=discovery_run_id,
            summary=summary,
            items=items,
            artifact_paths=paths,
        )

    def _enrich_one(
        self,
        run_id: str,
        discovery_run_id: str,
        record: dict[str, object],
        options: EnrichmentOptions,
        reporter: EnrichmentProgressReporter,
        index: int,
        total: int,
        summary: EnrichmentSummary,
    ) -> EnrichmentItem:
        """Enrich one candidate record and return its item.

        Configuration errors (see ``_is_configuration_error``) are re-raised so the
        whole run stops; any other exception raised while fetching, extracting or
        searching is converted into a FAILED item for this candidate only.
        ``summary`` is updated in place.
        """
        company = dict(record["company"])  # type: ignore[arg-type]
        evaluation = dict(record["evaluation"])  # type: ignore[arg-type]
        spec = CompanySearchSpec.model_validate(record["spec"])
        excluded_ownership_signals = {
            signal.value for signal in spec.exclude.structured.ownership_signals
        }
        discovery = {
            "run_id": discovery_run_id,
            "company_name": company["company_name"],
            "domain": company["domain"],
            "vertical": company.get("vertical"),
            "target_vertical": evaluation.get("target_vertical") or company.get("vertical"),
            "country": company.get("country"),
            "state": company.get("state"),
            "employee_min": company.get("employee_min"),
            "employee_max": company.get("employee_max"),
            "ownership_type": company.get("ownership_type"),
            "fit": evaluation.get("fit"),
            "reason": evaluation.get("reason"),
            "evidence": evaluation.get("evidence", []),
            "source": record["source"],
            "excluded_ownership_signals": sorted(excluded_ownership_signals),
        }
        reporter.company(index, total, str(discovery["company_name"]))
        reporter.event("INHERITED", "name, domain, vertical, geography, employees, ownership type")
        summary.processed += 1
        # Seven inherited facts: the same seven keys tracked in `statuses` below
        # ("geography" in the progress message covers both country and state).
        summary.inherited_facts += 7
        trace: list[dict[str, object]] = [
            {"stage": "inherited", "fields": [
                "company_name", "domain", "vertical", "geography", "employees", "ownership_type"
            ]}
        ]
        candidate_id = int(record["candidate_id"])  # type: ignore[arg-type]
        profile = self._repository.fresh_profile(candidate_id, self._freshness_days)
        profile = self._apply_refresh(profile, options.refresh)
        reused = sum(
            value is not None
            for value in (profile.phone, profile.location, profile.independence, profile.linkedin)
        )
        if reused:
            summary.memory_profiles_reused += 1
            reporter.event("MEMORY", f"reused {reused}/4 fresh enrichment facts")
            trace.append({"stage": "memory", "reused": reused})
        else:
            reporter.event("MEMORY", "no reusable enrichment profile")
            trace.append({"stage": "memory", "reused": 0})

        conflicts: list[str] = []
        statuses = {
            key: InheritedFieldStatus.INHERITED
            for key in (
                "company_name", "domain", "vertical", "country", "state",
                "employee_estimate", "ownership_type"
            )
        }
        # A fresh explicit `unknown` independence result is reusable until its freshness window
        # expires; only a newly fetched unknown result should trigger corroboration in this run.
        try:
            missing = self._missing(profile, include_unknown_independence=False)
            if missing:
                pages = self._fetch_pages(str(discovery["domain"]))
                if pages:
                    summary.websites_fetched += 1
                    reporter.event("WEBSITE", f"read {len(pages)} targeted official pages")
                    trace.append({"stage": "website", "pages": [page.url for page in pages]})
                    extraction = self._extract(discovery, pages)
                    profile, new_conflicts = self._merge(
                        profile, extraction, discovery, "official_site"
                    )
                    conflicts.extend(new_conflicts)
                    self._confirm_inherited(statuses, extraction, discovery, profile)

                # Corroboration sits inside the `if missing` branch so that a cached
                # `unknown` independence result on its own never triggers a search.
                missing = self._missing(profile, include_unknown_independence=True)
                if missing and self._fallback_search is not None and self._extractor is not None:
                    for query, fields in self._fallback_queries(discovery, missing):
                        results = self._fallback_search.search(
                            query,
                            country=str(discovery.get("country") or "US"),
                            num_results=self._fallback_results,
                        )
                        summary.fallback_searches += 1
                        reporter.event("FALLBACK", f"narrow corroboration for {', '.join(fields)}")
                        # Keep results that carry text or that are themselves a
                        # LinkedIn company URL.
                        pages = [
                            WebsitePage(
                                url=result.url,
                                title=result.title,
                                text=result.text or "",
                                page_type="search_evidence",
                            )
                            for result in results
                            if result.text or normalize_linkedin_company_url(result.url)
                        ]
                        trace.append({
                            "stage": "fallback",
                            "query": query,
                            "fields": fields,
                            "sources": [page.url for page in pages],
                        })
                        if pages:
                            extraction = self._extract(discovery, pages)
                            profile, new_conflicts = self._merge(
                                profile, extraction, discovery, "search_corroboration"
                            )
                            conflicts.extend(new_conflicts)
        except Exception as exc:
            if self._is_configuration_error(exc):
                raise
            return self._failed_item(
                candidate_id,
                discovery,
                profile,
                statuses,
                conflicts,
                trace,
                exc,
                reporter,
            )

        matched_exclusions = self._matched_ownership_exclusions(
            profile, excluded_ownership_signals
        )
        conflicts.extend(
            f"excluded_ownership_signal: {signal}" for signal in matched_exclusions
        )
        trace.append({
            "stage": "structured_exclusions",
            "requested": sorted(excluded_ownership_signals),
            "matched": matched_exclusions,
        })
        outcome, review_flags = self._outcome(
            profile,
            conflicts,
            options.allow_unknown_independence,
            matched_exclusions,
        )
        if outcome == EnrichmentOutcome.READY:
            label = "READY"
        elif outcome in {EnrichmentOutcome.GAPS, EnrichmentOutcome.INDEPENDENCE_UNCONFIRMED}:
            label = "REVIEW"
        else:
            label = "BLOCKED"
        reporter.event(label, outcome.value)
        trace.append({"stage": "outcome", "value": outcome.value})
        return EnrichmentItem(
            company_id=candidate_id,
            discovery=discovery,
            enrichment=profile,
            inherited_status=statuses,
            outcome=outcome,
            # dict.fromkeys removes duplicates while keeping first-seen order.
            conflicts=list(dict.fromkeys(conflicts)),
            review_flags=review_flags,
            trace=trace,
        )

    def _failed_item(
        self,
        candidate_id: int,
        discovery: dict[str, object],
        profile: EnrichmentProfile,
        statuses: dict[str, InheritedFieldStatus],
        conflicts: list[str],
        trace: list[dict[str, object]],
        error: Exception,
        reporter: EnrichmentProgressReporter,
    ) -> EnrichmentItem:
        """Build a FAILED item that keeps whatever was gathered before ``error``."""
        message = self._error_message(error)
        reporter.event("FAILED", message)
        trace.append({
            "stage": "error",
            "error_type": type(error).__name__,
            "message": message,
        })
        return EnrichmentItem(
            company_id=candidate_id,
            discovery=discovery,
            enrichment=profile,
            inherited_status=statuses,
            outcome=EnrichmentOutcome.FAILED,
            conflicts=list(dict.fromkeys([*conflicts, f"enrichment_failed: {message}"])),
            review_flags=["enrichment_failed"],
            trace=trace,
        )

    def _fetch_pages(self, domain: str) -> list[WebsitePage]:
        """Fetch pages for ``domain``; return an empty list when no retriever is set."""
        if self._website is None:
            return []
        return self._website.fetch(domain)

    def _extract(
        self, discovery: dict[str, object], pages: list[WebsitePage]
    ) -> EnrichmentExtraction:
        """Run the extractor and add LinkedIn URLs found as anchors on the pages.

        Raises:
            RuntimeError: If no extractor is configured.
        """
        if self._extractor is None:
            raise RuntimeError("LLM_API_KEY is required to extract enrichment facts")
        extraction = self._extractor.extract(discovery, pages)
        linkedin_profiles = list(extraction.linkedin_profiles)
        # Track both the raw and the normalized spelling of every URL the extractor
        # reported; page anchors are compared in normalized form, so seeding only the
        # raw spelling would let the same company page be appended a second time.
        seen: set[str] = set()
        for observation in linkedin_profiles:
            seen.add(observation.url)
            try:
                canonical = normalize_linkedin_company_url(observation.url)
            except ValueError:
                # URL parsing can reject a malformed URL; that is not a reason to
                # fail the extraction here.
                canonical = None
            if canonical is not None:
                seen.add(canonical)
        for page in pages:
            # Only official-site anchors are deterministic. Search-result URLs require the
            # extractor's company/domain identity check before they become observations.
            for candidate in page.linkedin_urls:
                normalized = normalize_linkedin_company_url(candidate)
                if normalized is not None and normalized not in seen:
                    linkedin_profiles.append(
                        LinkedInObservation(url=normalized, source_url=page.url)
                    )
                    seen.add(normalized)
        return extraction.model_copy(update={"linkedin_profiles": linkedin_profiles})

    @staticmethod
    def _apply_refresh(profile: EnrichmentProfile, refresh: str) -> EnrichmentProfile:
        """Return a copy of ``profile`` with the facts selected by ``refresh`` cleared.

        "contact" clears phone, location and LinkedIn; "independence" clears the
        independence fact; "all" clears both groups; any other value clears nothing.
        """
        updates: dict[str, object] = {}
        if refresh in {"contact", "all"}:
            updates.update(phone=None, location=None, linkedin=None)
        if refresh in {"independence", "all"}:
            updates["independence"] = None
        return profile.model_copy(update=updates)

    @staticmethod
    def _missing(
        profile: EnrichmentProfile,
        *,
        include_unknown_independence: bool,
    ) -> list[str]:
        """List the facts still missing from ``profile``.

        The location fact is reported as "address". Independence counts as missing
        when absent, and also when its status is UNKNOWN if
        ``include_unknown_independence`` is true.
        """
        missing = []
        if profile.phone is None:
            missing.append("phone")
        if profile.location is None:
            missing.append("address")
        if profile.linkedin is None:
            missing.append("linkedin")
        if profile.independence is None or (
            include_unknown_independence
            and profile.independence.status == IndependenceStatus.UNKNOWN
        ):
            missing.append("independence")
        return missing

    @staticmethod
    def _merge(
        profile: EnrichmentProfile,
        extraction: EnrichmentExtraction,
        discovery: dict[str, object],
        source: str,
    ) -> tuple[EnrichmentProfile, list[str]]:
        """Fill the gaps of ``profile`` from ``extraction``.

        Facts already present are kept, except that an UNKNOWN independence fact is
        replaced by the newly resolved one. Returns the merged profile and the
        conflicts detected during this merge.
        """
        conflicts: list[str] = []
        if extraction.identity_conflict:
            conflicts.append(
                f"identity_conflict: {extraction.identity_conflict_reason or 'source mismatch'}"
            )
        phone = profile.phone or resolve_phone(extraction, source)
        location = profile.location
        if location is None:
            # NOTE(review): the geography-conflict flag returned by resolve_location is
            # discarded, so this method never adds a "geography_conflict" entry even
            # though _outcome checks for one - confirm whether that is intentional.
            location, _ = resolve_location(
                extraction, discovery.get("state"), source  # type: ignore[arg-type]
            )
        independence = profile.independence
        resolved_independence = resolve_independence(extraction)
        if independence is None or independence.status == IndependenceStatus.UNKNOWN:
            independence = resolved_independence
        if independence.status == IndependenceStatus.NO:
            conflicts.append(
                "independence_conflict: explicit parent, franchise, or acquisition evidence"
            )
        linkedin = profile.linkedin or resolve_linkedin(extraction, source)
        return EnrichmentProfile(
            phone=phone,
            location=location,
            independence=independence,
            linkedin=linkedin,
        ), conflicts

    @staticmethod
    def _same_state(observed: object, expected: object) -> bool:
        """Tell whether two state values name the same state.

        Equal values always match. Otherwise both must be strings that
        ``normalize_state`` maps to the same non-None code, so "CA" matches
        "California".
        """
        if observed == expected:
            return True
        if not isinstance(observed, str) or not isinstance(expected, str):
            return False
        # Imported here because the module-level import list does not include it.
        from company_discovery.services.enrichment_resolver import normalize_state

        canonical = normalize_state(observed)
        return canonical is not None and canonical == normalize_state(expected)

    @staticmethod
    def _confirm_inherited(
        statuses: dict[str, InheritedFieldStatus],
        extraction: EnrichmentExtraction,
        discovery: dict[str, object],
        profile: EnrichmentProfile,
    ) -> None:
        """Update ``statuses`` in place from what the official site showed.

        Name and domain become CONFIRMED when a company name was observed without an
        identity conflict, and CONFLICT when an identity conflict was reported.
        Country and state become CONFIRMED when the resolved location's state matches
        the discovery state.
        """
        if extraction.observed_company_name and not extraction.identity_conflict:
            statuses["company_name"] = InheritedFieldStatus.CONFIRMED
            statuses["domain"] = InheritedFieldStatus.CONFIRMED
        # resolve_location stores a normalized state code, while the discovery record
        # may hold a full state name; compare them in normalized form.
        if profile.location is not None and EnrichmentPipeline._same_state(
            profile.location.state, discovery.get("state")
        ):
            statuses["country"] = InheritedFieldStatus.CONFIRMED
            statuses["state"] = InheritedFieldStatus.CONFIRMED
        if extraction.identity_conflict:
            statuses["company_name"] = InheritedFieldStatus.CONFLICT
            statuses["domain"] = InheritedFieldStatus.CONFLICT

    @staticmethod
    def _fallback_query(discovery: dict[str, object], missing: list[str]) -> str:
        """Build the site-restricted search query for the non-LinkedIn missing fields."""
        return (
            f'"{discovery["company_name"]}" site:{discovery["domain"]} '
            f'{" ".join(missing)} contact address franchise parent ownership'
        )

    @classmethod
    def _fallback_queries(
        cls,
        discovery: dict[str, object],
        missing: list[str],
    ) -> list[tuple[str, list[str]]]:
        """Return ``(query, fields)`` pairs covering ``missing``.

        All fields other than "linkedin" share one site-restricted query; "linkedin"
        gets its own query restricted to LinkedIn company pages.
        """
        queries: list[tuple[str, list[str]]] = []
        standard_fields = [field for field in missing if field != "linkedin"]
        if standard_fields:
            queries.append((cls._fallback_query(discovery, standard_fields), standard_fields))
        if "linkedin" in missing:
            queries.append((
                f'"{discovery["company_name"]}" "{discovery["domain"]}" '
                "site:linkedin.com/company",
                ["linkedin"],
            ))
        return queries

    @staticmethod
    def _outcome(
        profile: EnrichmentProfile,
        conflicts: list[str],
        allow_unknown: bool,
        matched_exclusions: list[str] | None = None,
    ) -> tuple[EnrichmentOutcome, list[str]]:
        """Classify the merged profile and return ``(outcome, review_flags)``.

        Checks run in priority order: identity conflict, geography conflict, matched
        ownership exclusions, explicit non-independence, missing contact facts, and
        finally unknown independence (READY only when ``allow_unknown`` is set).
        """
        if any(value.startswith("identity_conflict") for value in conflicts):
            return EnrichmentOutcome.IDENTITY_CONFLICT, ["identity_conflict"]
        if any(value.startswith("geography_conflict") for value in conflicts):
            return EnrichmentOutcome.GEOGRAPHY_CONFLICT, ["geography_conflict"]
        if matched_exclusions:
            return EnrichmentOutcome.FIT_CONFLICT, [
                f"excluded_{signal}" for signal in matched_exclusions
            ]
        if profile.independence and profile.independence.status == IndependenceStatus.NO:
            return EnrichmentOutcome.FIT_CONFLICT, ["not_independent"]
        gaps = []
        if profile.phone is None:
            gaps.append("phone_missing")
        if profile.location is None:
            gaps.append("address_missing")
        if profile.linkedin is None:
            gaps.append("linkedin_missing")
        if gaps:
            return EnrichmentOutcome.GAPS, gaps
        if (
            profile.independence is None
            or profile.independence.status == IndependenceStatus.UNKNOWN
        ):
            if allow_unknown:
                return EnrichmentOutcome.READY, ["independence_unknown_allowed"]
            return EnrichmentOutcome.INDEPENDENCE_UNCONFIRMED, ["independence_unknown"]
        return EnrichmentOutcome.READY, []

    @staticmethod
    def _matched_ownership_exclusions(
        profile: EnrichmentProfile,
        excluded_signals: set[str],
    ) -> list[str]:
        """Return the sorted excluded ownership signals observed on ``profile``."""
        if not excluded_signals or profile.independence is None:
            return []
        observed = set(profile.independence.signal_kinds)
        # Older cached facts predate signal_kinds. Preserve family-owned evidence across upgrades.
        if "family_owned" not in observed:
            evidence = " ".join(profile.independence.evidence).lower()
            if "family-owned" in evidence or "family owned" in evidence:
                observed.add("family_owned")
        return sorted(observed & excluded_signals)

    @staticmethod
    def _error_message(error: Exception) -> str:
        """Return a single-line message for ``error``, capped at 1000 characters.

        Whitespace runs are collapsed; an empty message falls back to the exception
        class name.
        """
        message = " ".join(str(error).split())
        if not message:
            message = type(error).__name__
        return message[:1000]

    @staticmethod
    def _is_configuration_error(error: Exception) -> bool:
        """Tell whether ``error`` is one of the messages that must abort the whole run."""
        message = str(error)
        return (
            "LLM_API_KEY is required" in message
            or "LLM API returned HTTP 401" in message
            or "LLM API returned HTTP 403" in message
        )

    @staticmethod
    def _count_outcome(summary: EnrichmentSummary, outcome: EnrichmentOutcome) -> None:
        """Increment the summary counter that ``outcome`` belongs to."""
        if outcome == EnrichmentOutcome.READY:
            summary.ready += 1
        elif outcome in {EnrichmentOutcome.GAPS, EnrichmentOutcome.INDEPENDENCE_UNCONFIRMED}:
            summary.review += 1
        elif outcome == EnrichmentOutcome.FAILED:
            summary.failed += 1
        else:
            # Every remaining outcome is counted as blocked.
            summary.blocked += 1
@@ -0,0 +1,20 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Protocol
4
+
5
+
6
class EnrichmentProgressReporter(Protocol):
    """Structural type for objects that receive enrichment progress notifications."""

    def start(self, discovery_run_id: str, total: int, bucket: str) -> None:
        """Announce a run over ``total`` candidates taken from ``bucket``."""
        ...

    def company(self, current: int, total: int, name: str) -> None:
        """Announce that candidate ``current`` of ``total``, called ``name``, is starting."""
        ...

    def event(self, label: str, message: str) -> None:
        """Report one labelled progress message."""
        ...
10
+
11
+
12
class NullEnrichmentProgressReporter:
    """Progress reporter that ignores every notification."""

    def start(self, discovery_run_id: str, total: int, bucket: str) -> None:
        """Do nothing."""
        return None

    def company(self, current: int, total: int, name: str) -> None:
        """Do nothing."""
        return None

    def event(self, label: str, message: str) -> None:
        """Do nothing."""
        return None
@@ -0,0 +1,148 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from datetime import UTC, datetime
5
+ from urllib.parse import urlparse
6
+
7
+ from company_discovery.domain.models import (
8
+ EnrichmentExtraction,
9
+ IndependenceFact,
10
+ IndependenceStatus,
11
+ LinkedInFact,
12
+ LocationFact,
13
+ PhoneFact,
14
+ )
15
+
16
+
17
# Upper-case US state names (plus the District of Columbia) mapped to their
# two-letter postal abbreviations.
US_STATE_NAMES = {
    "ALABAMA": "AL",
    "ALASKA": "AK",
    "ARIZONA": "AZ",
    "ARKANSAS": "AR",
    "CALIFORNIA": "CA",
    "COLORADO": "CO",
    "CONNECTICUT": "CT",
    "DELAWARE": "DE",
    "FLORIDA": "FL",
    "GEORGIA": "GA",
    "HAWAII": "HI",
    "IDAHO": "ID",
    "ILLINOIS": "IL",
    "INDIANA": "IN",
    "IOWA": "IA",
    "KANSAS": "KS",
    "KENTUCKY": "KY",
    "LOUISIANA": "LA",
    "MAINE": "ME",
    "MARYLAND": "MD",
    "MASSACHUSETTS": "MA",
    "MICHIGAN": "MI",
    "MINNESOTA": "MN",
    "MISSISSIPPI": "MS",
    "MISSOURI": "MO",
    "MONTANA": "MT",
    "NEBRASKA": "NE",
    "NEVADA": "NV",
    "NEW HAMPSHIRE": "NH",
    "NEW JERSEY": "NJ",
    "NEW MEXICO": "NM",
    "NEW YORK": "NY",
    "NORTH CAROLINA": "NC",
    "NORTH DAKOTA": "ND",
    "OHIO": "OH",
    "OKLAHOMA": "OK",
    "OREGON": "OR",
    "PENNSYLVANIA": "PA",
    "RHODE ISLAND": "RI",
    "SOUTH CAROLINA": "SC",
    "SOUTH DAKOTA": "SD",
    "TENNESSEE": "TN",
    "TEXAS": "TX",
    "UTAH": "UT",
    "VERMONT": "VT",
    "VIRGINIA": "VA",
    "WASHINGTON": "WA",
    "WEST VIRGINIA": "WV",
    "WISCONSIN": "WI",
    "WYOMING": "WY",
    "DISTRICT OF COLUMBIA": "DC",
}
33
+
34
+
35
def normalize_state(value: str | None) -> str | None:
    """Map a state name or abbreviation to an upper-case two-letter code.

    Full names listed in ``US_STATE_NAMES`` are translated. Any other value that is
    exactly two characters long after trimming is returned upper-cased without
    further validation. Everything else, including empty input, yields ``None``.
    """
    if not value:
        return None
    token = value.strip().upper()
    if token in US_STATE_NAMES:
        return US_STATE_NAMES[token]
    if len(token) == 2:
        return token
    return None
40
+
41
+
42
def resolve_phone(extraction: EnrichmentExtraction, source: str) -> PhoneFact | None:
    """Return the first usable, non-fax phone number from ``extraction``.

    Phones whose label contains "fax" are skipped. Each remaining phone is tried in
    order until one normalizes, so a malformed first entry no longer hides a valid
    later one.

    Normalization works on the digits only:
      * 10 digits are treated as a US number: ``+1XXXXXXXXXX``, shown as
        ``(XXX) XXX-XXXX``.
      * 11 digits starting with ``1`` are treated the same way.
      * Any other length from 8 to 15 digits becomes ``+<digits>`` and is shown as
        the trimmed original text.

    Args:
        extraction: Extraction result whose ``phones`` are examined.
        source: Source label stored on the returned fact.

    Returns:
        The resolved ``PhoneFact``, or ``None`` when no phone qualifies.
    """
    for phone in extraction.phones or ():
        if "fax" in (phone.label or "").lower():
            continue
        raw = phone.value or ""
        digits = re.sub(r"\D", "", raw)
        if len(digits) == 10:
            normalized = f"+1{digits}"
            display = f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
        elif len(digits) == 11 and digits.startswith("1"):
            normalized = f"+{digits}"
            display = f"({digits[1:4]}) {digits[4:7]}-{digits[7:]}"
        elif 8 <= len(digits) <= 15:
            normalized = f"+{digits}"
            display = raw.strip()
        else:
            # Too short or too long to be a phone number; try the next candidate.
            continue
        return PhoneFact(
            value=normalized,
            display_value=display,
            source=source,
            source_url=phone.source_url,
        )
    return None
69
+
70
+
71
def resolve_location(
    extraction: EnrichmentExtraction,
    target_state: str | None,
    source: str,
) -> tuple[LocationFact | None, bool]:
    """Pick the location that best matches ``target_state``.

    Returns ``(fact, geography_conflict)``:
      * no extracted locations: ``(None, False)``;
      * a target state is known and a location in that state exists: that location;
      * a target state is known but no location matches: ``(None, True)``;
      * no usable target state: the first location whose label contains "head",
        otherwise the first location.
    """
    candidates = extraction.locations
    if not candidates:
        return None, False

    wanted = normalize_state(target_state)
    candidate_states = [normalize_state(candidate.state) for candidate in candidates]

    selected = None
    if wanted:
        for candidate, state in zip(candidates, candidate_states):
            if state == wanted:
                selected = candidate
                break

    if selected is None:
        if wanted:
            # A target state was requested and nothing matched it.
            return None, True
        selected = candidates[0]
        for candidate in candidates:
            if "head" in (candidate.label or "").lower():
                selected = candidate
                break

    fact = LocationFact(
        street_address=selected.street_address,
        city=selected.city,
        # Keep the original spelling when the state cannot be normalized.
        state=normalize_state(selected.state) or selected.state,
        zip=selected.zip,
        country=selected.country,
        source=source,
        source_url=selected.source_url,
    )
    return fact, False
101
+
102
+
103
def resolve_independence(extraction: EnrichmentExtraction) -> IndependenceFact:
    """Derive an independence fact from the extracted ownership signals.

    Any negative signal (franchise, parent, subsidiary, division, acquired) yields NO
    and takes precedence over positive ones. Otherwise any positive signal
    (independent_explicit, family_owned, locally_owned) yields YES. With neither the
    status is UNKNOWN. The fact is stamped with the current UTC time.
    """
    signals = extraction.ownership_signals
    observed_kinds = [signal.kind for signal in signals]
    kind_set = set(observed_kinds)

    not_independent = {"franchise", "parent", "subsidiary", "division", "acquired"}
    independent = {"independent_explicit", "family_owned", "locally_owned"}

    if not kind_set.isdisjoint(not_independent):
        status = IndependenceStatus.NO
    elif not kind_set.isdisjoint(independent):
        status = IndependenceStatus.YES
    else:
        status = IndependenceStatus.UNKNOWN

    statements = [signal.statement for signal in signals]
    # dict.fromkeys de-duplicates while keeping first-seen order.
    unique_urls = list(dict.fromkeys(signal.source_url for signal in signals))
    unique_kinds = list(dict.fromkeys(observed_kinds))
    return IndependenceFact(
        status=status,
        evidence=statements,
        source_urls=unique_urls,
        signal_kinds=unique_kinds,
        observed_at=datetime.now(UTC),
    )
120
+
121
+
122
def normalize_linkedin_company_url(value: str) -> str | None:
    """Return the canonical LinkedIn company URL for ``value``, or ``None``.

    A value qualifies when its host is ``linkedin.com`` or any subdomain of it
    (``www.``, regional hosts such as ``uk.``), its path starts with ``/company/``
    and the following slug only contains letters, digits, ``_`` or ``-``. Anything
    after the slug (sub-pages, query string, fragment) is dropped.

    Input without a scheme is treated as ``https://``. Malformed URLs yield ``None``
    instead of raising, because this function is applied to URLs taken from search
    results and page anchors.
    """
    candidate = value.strip()
    if not candidate:
        return None
    # Compare the scheme case-insensitively so "HTTPS://..." is not prefixed again.
    if not candidate.lower().startswith(("http://", "https://")):
        # lstrip handles scheme-relative input such as "//www.linkedin.com/...".
        candidate = f"https://{candidate.lstrip('/')}"
    try:
        parsed = urlparse(candidate)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # urlparse rejects some malformed netlocs, e.g. an unbalanced "[".
        return None
    if host != "linkedin.com" and not host.endswith(".linkedin.com"):
        return None
    parts = [part for part in parsed.path.split("/") if part]
    if len(parts) < 2 or parts[0].lower() != "company":
        return None
    slug = parts[1]
    if not re.fullmatch(r"[A-Za-z0-9_-]+", slug):
        return None
    return f"https://www.linkedin.com/company/{slug}"
137
+
138
+
139
def resolve_linkedin(extraction: EnrichmentExtraction, source: str) -> LinkedInFact | None:
    """Return a fact for the first observed profile that is a LinkedIn company URL.

    Observations are examined in order; those that do not normalize to a company URL
    are skipped. ``None`` is returned when no observation qualifies.
    """
    for observed in extraction.linkedin_profiles:
        company_url = normalize_linkedin_company_url(observed.url)
        if company_url is None:
            continue
        return LinkedInFact(
            url=company_url,
            source=source,
            source_url=observed.source_url,
        )
    return None