leads-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- company_discovery/__init__.py +4 -0
- company_discovery/adapters/__init__.py +5 -0
- company_discovery/adapters/apollo.py +189 -0
- company_discovery/adapters/exa.py +112 -0
- company_discovery/adapters/llm.py +118 -0
- company_discovery/adapters/protocols.py +58 -0
- company_discovery/adapters/website.py +154 -0
- company_discovery/bundled_skills/__init__.py +1 -0
- company_discovery/bundled_skills/company-discovery-operator/SKILL.md +72 -0
- company_discovery/bundled_skills/company-discovery-operator/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/company-enrichment-operator/SKILL.md +94 -0
- company_discovery/bundled_skills/company-enrichment-operator/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/company-search-spec-writer/SKILL.md +109 -0
- company_discovery/bundled_skills/company-search-spec-writer/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/contact-discovery-operator/SKILL.md +80 -0
- company_discovery/bundled_skills/contact-discovery-operator/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/contact-enrichment-operator/SKILL.md +86 -0
- company_discovery/bundled_skills/contact-enrichment-operator/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/contact-search-spec-writer/SKILL.md +86 -0
- company_discovery/bundled_skills/contact-search-spec-writer/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/leads-update-operator/SKILL.md +60 -0
- company_discovery/bundled_skills/leads-update-operator/agents/openai.yaml +4 -0
- company_discovery/cli.py +1789 -0
- company_discovery/db/__init__.py +5 -0
- company_discovery/db/contact_enrichment_repository.py +268 -0
- company_discovery/db/contact_repository.py +366 -0
- company_discovery/db/enrichment_repository.py +207 -0
- company_discovery/db/models.py +324 -0
- company_discovery/db/repository.py +363 -0
- company_discovery/db/session.py +48 -0
- company_discovery/domain/__init__.py +24 -0
- company_discovery/domain/contact_models.py +178 -0
- company_discovery/domain/contact_spec.py +86 -0
- company_discovery/domain/models.py +287 -0
- company_discovery/domain/spec.py +263 -0
- company_discovery/migrations.py +190 -0
- company_discovery/prompts/__init__.py +8 -0
- company_discovery/prompts/candidate_evaluation/system.md +13 -0
- company_discovery/prompts/company_enrichment/system.md +42 -0
- company_discovery/prompts/contact_evaluation/system.md +18 -0
- company_discovery/prompts/query_generation/system.md +10 -0
- company_discovery/release_manifest.json +7 -0
- company_discovery/reports/__init__.py +4 -0
- company_discovery/reports/contact_enrichment_exporter.py +108 -0
- company_discovery/reports/contact_exporter.py +132 -0
- company_discovery/reports/enrichment_exporter.py +125 -0
- company_discovery/reports/exporter.py +135 -0
- company_discovery/runtime.py +336 -0
- company_discovery/services/__init__.py +4 -0
- company_discovery/services/contact_enrichment_pipeline.py +344 -0
- company_discovery/services/contact_enrichment_progress.py +37 -0
- company_discovery/services/contact_evaluator.py +110 -0
- company_discovery/services/contact_pipeline.py +295 -0
- company_discovery/services/contact_progress.py +38 -0
- company_discovery/services/enrichment_extractor.py +61 -0
- company_discovery/services/enrichment_pipeline.py +526 -0
- company_discovery/services/enrichment_progress.py +20 -0
- company_discovery/services/enrichment_resolver.py +148 -0
- company_discovery/services/evaluator.py +40 -0
- company_discovery/services/hygiene.py +51 -0
- company_discovery/services/memory.py +150 -0
- company_discovery/services/normalization.py +98 -0
- company_discovery/services/pipeline.py +628 -0
- company_discovery/services/progress.py +48 -0
- company_discovery/services/query_planner.py +47 -0
- company_discovery/settings.py +152 -0
- company_discovery/skill_installer.py +197 -0
- company_discovery/update_plan.py +79 -0
- leads_cli-0.1.0.dist-info/METADATA +277 -0
- leads_cli-0.1.0.dist-info/RECORD +72 -0
- leads_cli-0.1.0.dist-info/WHEEL +4 -0
- leads_cli-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,526 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Protocol
|
|
5
|
+
|
|
6
|
+
from company_discovery.adapters.protocols import CompanySearchProvider
|
|
7
|
+
from company_discovery.db.enrichment_repository import EnrichmentRepository
|
|
8
|
+
from company_discovery.domain.models import (
|
|
9
|
+
EnrichmentExtraction,
|
|
10
|
+
EnrichmentItem,
|
|
11
|
+
EnrichmentOutcome,
|
|
12
|
+
EnrichmentProfile,
|
|
13
|
+
EnrichmentRunResult,
|
|
14
|
+
EnrichmentSummary,
|
|
15
|
+
IndependenceStatus,
|
|
16
|
+
InheritedFieldStatus,
|
|
17
|
+
LinkedInObservation,
|
|
18
|
+
WebsitePage,
|
|
19
|
+
)
|
|
20
|
+
from company_discovery.domain.spec import CompanySearchSpec
|
|
21
|
+
from company_discovery.reports.enrichment_exporter import EnrichmentArtifactExporter
|
|
22
|
+
from company_discovery.services.enrichment_progress import (
|
|
23
|
+
EnrichmentProgressReporter,
|
|
24
|
+
NullEnrichmentProgressReporter,
|
|
25
|
+
)
|
|
26
|
+
from company_discovery.services.enrichment_resolver import (
|
|
27
|
+
normalize_linkedin_company_url,
|
|
28
|
+
resolve_independence,
|
|
29
|
+
resolve_linkedin,
|
|
30
|
+
resolve_location,
|
|
31
|
+
resolve_phone,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class WebsiteRetriever(Protocol):
    """Structural interface for the object that loads pages for a company domain."""

    def fetch(self, domain: str) -> list[WebsitePage]:
        """Return the ``WebsitePage`` objects retrieved for ``domain``."""
        ...
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class FactExtractor(Protocol):
    """Structural interface for the object that turns pages into an ``EnrichmentExtraction``."""

    def extract(
        self, discovery: dict[str, object], pages: list[WebsitePage]
    ) -> EnrichmentExtraction:
        """Extract enrichment facts from ``pages`` given the discovery context dict."""
        ...
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass(frozen=True)
class EnrichmentOptions:
    """Immutable settings for one enrichment run."""

    # Discovery bucket whose candidates are loaded for enrichment.
    bucket: str = "selected"
    # Optional cap handed to the repository when candidates are loaded.
    limit: int | None = None
    # Which cached facts to discard; the pipeline reacts to "contact",
    # "independence" and "all", any other value keeps the cached profile.
    refresh: str = "none"
    # When true, an unknown independence status no longer blocks READY.
    allow_unknown_independence: bool = False

    def as_dict(self) -> dict[str, object]:
        """Return the four option values as a plain dict keyed by field name."""
        return dict(
            bucket=self.bucket,
            limit=self.limit,
            refresh=self.refresh,
            allow_unknown_independence=self.allow_unknown_independence,
        )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class EnrichmentPipeline:
|
|
62
|
+
def __init__(
    self,
    *,
    repository: EnrichmentRepository,
    exporter: EnrichmentArtifactExporter,
    website: WebsiteRetriever | None,
    extractor: FactExtractor | None,
    fallback_search: CompanySearchProvider | None = None,
    freshness_days: int = 180,
    fallback_results: int = 5,
) -> None:
    """Store the collaborators and tuning values used by the pipeline.

    Args:
        repository: Persistence layer for runs, items and cached profiles.
        exporter: Writes the run artifacts once all items are processed.
        website: Page retriever; when ``None`` no website pages are fetched.
        extractor: Fact extractor; when ``None`` extraction raises
            ``RuntimeError`` and fallback search is skipped.
        fallback_search: Optional search provider used for corroboration.
        freshness_days: Passed to the repository when loading a cached profile.
        fallback_results: ``num_results`` requested from each fallback search.
    """
    self._repository = repository
    self._exporter = exporter
    self._website = website
    self._extractor = extractor
    self._fallback_search = fallback_search
    self._freshness_days = freshness_days
    self._fallback_results = fallback_results
|
|
80
|
+
|
|
81
|
+
def enrich(
    self,
    discovery_run_id: str,
    *,
    options: EnrichmentOptions | None = None,
    progress: EnrichmentProgressReporter | None = None,
) -> EnrichmentRunResult:
    """Enrich the candidates of a discovery run and return the run result.

    Candidates are loaded first, then a run record is created. Any exception
    raised while processing is recorded on the run via ``fail_run`` and
    re-raised unchanged.
    """
    if not options:
        options = EnrichmentOptions()
    reporter = progress if progress else NullEnrichmentProgressReporter()
    repository = self._repository
    candidates = repository.discovery_candidates(
        discovery_run_id, options.bucket, options.limit
    )
    run_id = repository.create_run(discovery_run_id, options.bucket, options.as_dict())
    try:
        result = self._run(run_id, discovery_run_id, candidates, options, reporter)
    except Exception as exc:
        # Mark the run as failed before letting the error propagate.
        repository.fail_run(run_id, exc)
        raise
    return result
|
|
99
|
+
|
|
100
|
+
def _run(
    self,
    run_id: str,
    discovery_run_id: str,
    candidates: list[dict[str, object]],
    options: EnrichmentOptions,
    reporter: EnrichmentProgressReporter,
) -> EnrichmentRunResult:
    """Process every candidate, export the artifacts and complete the run.

    Each item is saved and counted into the summary as soon as it is
    produced; the export payload is built only after the loop finishes.
    """
    total = len(candidates)
    summary = EnrichmentSummary()
    collected: list[EnrichmentItem] = []
    reporter.start(discovery_run_id, total, options.bucket)
    for position, record in enumerate(candidates, 1):
        enriched = self._enrich_one(
            run_id, discovery_run_id, record, options, reporter, position, total, summary
        )
        collected.append(enriched)
        self._repository.save_item(run_id, enriched)
        self._count_outcome(summary, enriched.outcome)

    payload = dict(
        run_id=run_id,
        discovery_run_id=discovery_run_id,
        bucket=options.bucket,
        options=options.as_dict(),
        status="completed",
        items=[entry.model_dump(mode="json") for entry in collected],
    )
    artifact_paths = self._exporter.export(payload, summary)
    self._repository.complete_run(run_id, summary, artifact_paths)
    return EnrichmentRunResult(
        run_id=run_id,
        discovery_run_id=discovery_run_id,
        summary=summary,
        items=collected,
        artifact_paths=artifact_paths,
    )
|
|
136
|
+
|
|
137
|
+
def _enrich_one(
    self,
    run_id: str,
    discovery_run_id: str,
    record: dict[str, object],
    options: EnrichmentOptions,
    reporter: EnrichmentProgressReporter,
    index: int,
    total: int,
    summary: EnrichmentSummary,
) -> EnrichmentItem:
    """Enrich a single discovery candidate and return its ``EnrichmentItem``.

    Stages, in order: build the inherited discovery dict, load and optionally
    refresh the cached profile, read the official website when facts are
    missing, run fallback searches for what is still missing, apply structured
    ownership exclusions, and classify the outcome. ``summary`` counters and
    the ``trace`` list are updated as each stage runs.

    Exceptions raised by the website/fallback stages are turned into a FAILED
    item, except configuration errors (see ``_is_configuration_error``), which
    are re-raised. ``run_id`` is accepted but not read by this method.
    """
    company = dict(record["company"])  # type: ignore[arg-type]
    evaluation = dict(record["evaluation"])  # type: ignore[arg-type]
    spec = CompanySearchSpec.model_validate(record["spec"])
    excluded_ownership_signals = {
        signal.value for signal in spec.exclude.structured.ownership_signals
    }
    # Facts carried over from discovery; this dict is also the extractor's context.
    discovery = {
        "run_id": discovery_run_id,
        "company_name": company["company_name"],
        "domain": company["domain"],
        "vertical": company.get("vertical"),
        "target_vertical": evaluation.get("target_vertical") or company.get("vertical"),
        "country": company.get("country"),
        "state": company.get("state"),
        "employee_min": company.get("employee_min"),
        "employee_max": company.get("employee_max"),
        "ownership_type": company.get("ownership_type"),
        "fit": evaluation.get("fit"),
        "reason": evaluation.get("reason"),
        "evidence": evaluation.get("evidence", []),
        "source": record["source"],
        "excluded_ownership_signals": sorted(excluded_ownership_signals),
    }
    reporter.company(index, total, str(discovery["company_name"]))
    reporter.event("INHERITED", "name, domain, vertical, geography, employees, ownership type")
    summary.processed += 1
    # Seven matches the number of keys tracked in `statuses` below.
    summary.inherited_facts += 7
    trace: list[dict[str, object]] = [
        {"stage": "inherited", "fields": [
            "company_name", "domain", "vertical", "geography", "employees", "ownership_type"
        ]}
    ]
    candidate_id = int(record["candidate_id"])  # type: ignore[arg-type]
    profile = self._repository.fresh_profile(candidate_id, self._freshness_days)
    # Drop cached facts the caller asked to refresh before counting what is reusable.
    profile = self._apply_refresh(profile, options.refresh)
    reused = sum(
        value is not None
        for value in (profile.phone, profile.location, profile.independence, profile.linkedin)
    )
    if reused:
        summary.memory_profiles_reused += 1
        reporter.event("MEMORY", f"reused {reused}/4 fresh enrichment facts")
        trace.append({"stage": "memory", "reused": reused})
    else:
        reporter.event("MEMORY", "no reusable enrichment profile")
        trace.append({"stage": "memory", "reused": 0})

    conflicts: list[str] = []
    # Every inherited field starts as INHERITED; the website stage may upgrade or flag it.
    statuses = {
        key: InheritedFieldStatus.INHERITED
        for key in (
            "company_name", "domain", "vertical", "country", "state",
            "employee_estimate", "ownership_type"
        )
    }
    # A fresh explicit `unknown` independence result is reusable until its freshness window
    # expires; only a newly fetched unknown result should trigger corroboration in this run.
    try:
        missing = self._missing(profile, include_unknown_independence=False)
        if missing:
            pages = self._fetch_pages(str(discovery["domain"]))
            if pages:
                summary.websites_fetched += 1
                reporter.event("WEBSITE", f"read {len(pages)} targeted official pages")
                trace.append({"stage": "website", "pages": [page.url for page in pages]})
                extraction = self._extract(discovery, pages)
                profile, new_conflicts = self._merge(
                    profile, extraction, discovery, "official_site"
                )
                conflicts.extend(new_conflicts)
                self._confirm_inherited(statuses, extraction, discovery, profile)

            # NOTE(review): the listing this was reconstructed from lost its indentation.
            # The recheck is nested under `if missing:` to match the comment above the
            # `try`; confirm against the original file.
            missing = self._missing(profile, include_unknown_independence=True)
            if missing and self._fallback_search is not None and self._extractor is not None:
                for query, fields in self._fallback_queries(discovery, missing):
                    results = self._fallback_search.search(
                        query,
                        country=str(discovery.get("country") or "US"),
                        num_results=self._fallback_results,
                    )
                    summary.fallback_searches += 1
                    reporter.event("FALLBACK", f"narrow corroboration for {', '.join(fields)}")
                    # Keep only results with text or a URL that normalises to a
                    # LinkedIn company page.
                    pages = [
                        WebsitePage(
                            url=result.url,
                            title=result.title,
                            text=result.text or "",
                            page_type="search_evidence",
                        )
                        for result in results
                        if result.text or normalize_linkedin_company_url(result.url)
                    ]
                    trace.append({
                        "stage": "fallback",
                        "query": query,
                        "fields": fields,
                        "sources": [page.url for page in pages],
                    })
                    if pages:
                        extraction = self._extract(discovery, pages)
                        profile, new_conflicts = self._merge(
                            profile, extraction, discovery, "search_corroboration"
                        )
                        conflicts.extend(new_conflicts)
    except Exception as exc:
        # Configuration problems abort the whole run; anything else fails this item only.
        if self._is_configuration_error(exc):
            raise
        return self._failed_item(
            candidate_id,
            discovery,
            profile,
            statuses,
            conflicts,
            trace,
            exc,
            reporter,
        )

    matched_exclusions = self._matched_ownership_exclusions(
        profile, excluded_ownership_signals
    )
    conflicts.extend(
        f"excluded_ownership_signal: {signal}" for signal in matched_exclusions
    )
    trace.append({
        "stage": "structured_exclusions",
        "requested": sorted(excluded_ownership_signals),
        "matched": matched_exclusions,
    })
    outcome, review_flags = self._outcome(
        profile,
        conflicts,
        options.allow_unknown_independence,
        matched_exclusions,
    )
    # Progress label: READY, REVIEW for gaps/unconfirmed independence, BLOCKED otherwise.
    label = "READY" if outcome == EnrichmentOutcome.READY else "REVIEW" if outcome in {
        EnrichmentOutcome.GAPS, EnrichmentOutcome.INDEPENDENCE_UNCONFIRMED
    } else "BLOCKED"
    reporter.event(label, outcome.value)
    trace.append({"stage": "outcome", "value": outcome.value})
    return EnrichmentItem(
        company_id=candidate_id,
        discovery=discovery,
        enrichment=profile,
        inherited_status=statuses,
        outcome=outcome,
        # dict.fromkeys removes duplicate conflict strings while keeping first-seen order.
        conflicts=list(dict.fromkeys(conflicts)),
        review_flags=review_flags,
        trace=trace,
    )
|
|
298
|
+
|
|
299
|
+
def _failed_item(
    self,
    candidate_id: int,
    discovery: dict[str, object],
    profile: EnrichmentProfile,
    statuses: dict[str, InheritedFieldStatus],
    conflicts: list[str],
    trace: list[dict[str, object]],
    error: Exception,
    reporter: EnrichmentProgressReporter,
) -> EnrichmentItem:
    """Build the FAILED item for a candidate whose enrichment raised ``error``.

    Reports the failure, appends an ``error`` stage to ``trace`` in place and
    adds an ``enrichment_failed`` entry to the de-duplicated conflicts.
    """
    message = self._error_message(error)
    reporter.event("FAILED", message)
    trace.append(
        dict(stage="error", error_type=type(error).__name__, message=message)
    )
    with_failure = [*conflicts, f"enrichment_failed: {message}"]
    return EnrichmentItem(
        company_id=candidate_id,
        discovery=discovery,
        enrichment=profile,
        inherited_status=statuses,
        outcome=EnrichmentOutcome.FAILED,
        # Preserve first-seen order while dropping repeated conflict strings.
        conflicts=list(dict.fromkeys(with_failure)),
        review_flags=["enrichment_failed"],
        trace=trace,
    )
|
|
327
|
+
|
|
328
|
+
def _fetch_pages(self, domain: str) -> list[WebsitePage]:
|
|
329
|
+
if self._website is None:
|
|
330
|
+
return []
|
|
331
|
+
return self._website.fetch(domain)
|
|
332
|
+
|
|
333
|
+
def _extract(
|
|
334
|
+
self, discovery: dict[str, object], pages: list[WebsitePage]
|
|
335
|
+
) -> EnrichmentExtraction:
|
|
336
|
+
if self._extractor is None:
|
|
337
|
+
raise RuntimeError("LLM_API_KEY is required to extract enrichment facts")
|
|
338
|
+
extraction = self._extractor.extract(discovery, pages)
|
|
339
|
+
linkedin_profiles = list(extraction.linkedin_profiles)
|
|
340
|
+
seen = {profile.url for profile in linkedin_profiles}
|
|
341
|
+
for page in pages:
|
|
342
|
+
# Only official-site anchors are deterministic. Search-result URLs require the
|
|
343
|
+
# extractor's company/domain identity check before they become observations.
|
|
344
|
+
for candidate in page.linkedin_urls:
|
|
345
|
+
normalized = normalize_linkedin_company_url(candidate)
|
|
346
|
+
if normalized is not None and normalized not in seen:
|
|
347
|
+
linkedin_profiles.append(
|
|
348
|
+
LinkedInObservation(url=normalized, source_url=page.url)
|
|
349
|
+
)
|
|
350
|
+
seen.add(normalized)
|
|
351
|
+
return extraction.model_copy(update={"linkedin_profiles": linkedin_profiles})
|
|
352
|
+
|
|
353
|
+
@staticmethod
|
|
354
|
+
def _apply_refresh(profile: EnrichmentProfile, refresh: str) -> EnrichmentProfile:
|
|
355
|
+
updates: dict[str, object] = {}
|
|
356
|
+
if refresh in {"contact", "all"}:
|
|
357
|
+
updates.update(phone=None, location=None, linkedin=None)
|
|
358
|
+
if refresh in {"independence", "all"}:
|
|
359
|
+
updates["independence"] = None
|
|
360
|
+
return profile.model_copy(update=updates)
|
|
361
|
+
|
|
362
|
+
@staticmethod
|
|
363
|
+
def _missing(
|
|
364
|
+
profile: EnrichmentProfile,
|
|
365
|
+
*,
|
|
366
|
+
include_unknown_independence: bool,
|
|
367
|
+
) -> list[str]:
|
|
368
|
+
missing = []
|
|
369
|
+
if profile.phone is None:
|
|
370
|
+
missing.append("phone")
|
|
371
|
+
if profile.location is None:
|
|
372
|
+
missing.append("address")
|
|
373
|
+
if profile.linkedin is None:
|
|
374
|
+
missing.append("linkedin")
|
|
375
|
+
if profile.independence is None or (
|
|
376
|
+
include_unknown_independence
|
|
377
|
+
and profile.independence.status == IndependenceStatus.UNKNOWN
|
|
378
|
+
):
|
|
379
|
+
missing.append("independence")
|
|
380
|
+
return missing
|
|
381
|
+
|
|
382
|
+
@staticmethod
def _merge(
    profile: EnrichmentProfile,
    extraction: EnrichmentExtraction,
    discovery: dict[str, object],
    source: str,
) -> tuple[EnrichmentProfile, list[str]]:
    """Combine ``profile`` with facts resolved from ``extraction``.

    Existing phone, location and linkedin facts win over newly resolved ones.
    Independence is replaced only when it is absent or UNKNOWN. Returns the
    merged profile and the conflict strings found during this merge.
    """
    conflicts: list[str] = []
    if extraction.identity_conflict:
        conflicts.append(f"identity_conflict: {extraction.identity_conflict_reason or 'source mismatch'}")
    phone = profile.phone or resolve_phone(extraction, source)
    location = profile.location
    if location is None:
        # The second element returned by resolve_location is discarded here, so this
        # method never adds a "geography_conflict" entry.
        # NOTE(review): confirm that dropping it is intended.
        location, _ = resolve_location(
            extraction, discovery.get("state"), source  # type: ignore[arg-type]
        )
    independence = profile.independence
    resolved_independence = resolve_independence(extraction)
    if independence is None or independence.status == IndependenceStatus.UNKNOWN:
        independence = resolved_independence
    # NOTE(review): the listing this was reconstructed from lost its indentation; this
    # check is kept at function level (not nested in the branch above) — confirm.
    if independence.status == IndependenceStatus.NO:
        conflicts.append("independence_conflict: explicit parent, franchise, or acquisition evidence")
    linkedin = profile.linkedin or resolve_linkedin(extraction, source)
    return EnrichmentProfile(
        phone=phone,
        location=location,
        independence=independence,
        linkedin=linkedin,
    ), conflicts
|
|
411
|
+
|
|
412
|
+
@staticmethod
|
|
413
|
+
def _confirm_inherited(
|
|
414
|
+
statuses: dict[str, InheritedFieldStatus],
|
|
415
|
+
extraction: EnrichmentExtraction,
|
|
416
|
+
discovery: dict[str, object],
|
|
417
|
+
profile: EnrichmentProfile,
|
|
418
|
+
) -> None:
|
|
419
|
+
if extraction.observed_company_name and not extraction.identity_conflict:
|
|
420
|
+
statuses["company_name"] = InheritedFieldStatus.CONFIRMED
|
|
421
|
+
statuses["domain"] = InheritedFieldStatus.CONFIRMED
|
|
422
|
+
if profile.location is not None and profile.location.state == discovery.get("state"):
|
|
423
|
+
statuses["country"] = InheritedFieldStatus.CONFIRMED
|
|
424
|
+
statuses["state"] = InheritedFieldStatus.CONFIRMED
|
|
425
|
+
if extraction.identity_conflict:
|
|
426
|
+
statuses["company_name"] = InheritedFieldStatus.CONFLICT
|
|
427
|
+
statuses["domain"] = InheritedFieldStatus.CONFLICT
|
|
428
|
+
|
|
429
|
+
@staticmethod
|
|
430
|
+
def _fallback_query(discovery: dict[str, object], missing: list[str]) -> str:
|
|
431
|
+
return (
|
|
432
|
+
f'"{discovery["company_name"]}" site:{discovery["domain"]} '
|
|
433
|
+
f'{" ".join(missing)} contact address franchise parent ownership'
|
|
434
|
+
)
|
|
435
|
+
|
|
436
|
+
@classmethod
|
|
437
|
+
def _fallback_queries(
|
|
438
|
+
cls,
|
|
439
|
+
discovery: dict[str, object],
|
|
440
|
+
missing: list[str],
|
|
441
|
+
) -> list[tuple[str, list[str]]]:
|
|
442
|
+
queries: list[tuple[str, list[str]]] = []
|
|
443
|
+
standard_fields = [field for field in missing if field != "linkedin"]
|
|
444
|
+
if standard_fields:
|
|
445
|
+
queries.append((cls._fallback_query(discovery, standard_fields), standard_fields))
|
|
446
|
+
if "linkedin" in missing:
|
|
447
|
+
queries.append((
|
|
448
|
+
f'"{discovery["company_name"]}" "{discovery["domain"]}" '
|
|
449
|
+
"site:linkedin.com/company",
|
|
450
|
+
["linkedin"],
|
|
451
|
+
))
|
|
452
|
+
return queries
|
|
453
|
+
|
|
454
|
+
@staticmethod
def _outcome(
    profile: EnrichmentProfile,
    conflicts: list[str],
    allow_unknown: bool,
    matched_exclusions: list[str] | None = None,
) -> tuple[EnrichmentOutcome, list[str]]:
    """Classify an enriched profile and return ``(outcome, review_flags)``.

    Checks run in priority order: identity conflict, geography conflict,
    matched ownership exclusions, explicit non-independence, missing contact
    facts, and finally unknown independence (allowed only when
    ``allow_unknown`` is true).
    """

    def has_conflict(prefix: str) -> bool:
        return any(entry.startswith(prefix) for entry in conflicts)

    if has_conflict("identity_conflict"):
        return EnrichmentOutcome.IDENTITY_CONFLICT, ["identity_conflict"]
    if has_conflict("geography_conflict"):
        return EnrichmentOutcome.GEOGRAPHY_CONFLICT, ["geography_conflict"]
    if matched_exclusions:
        flags = [f"excluded_{signal}" for signal in matched_exclusions]
        return EnrichmentOutcome.FIT_CONFLICT, flags

    independence = profile.independence
    if independence and independence.status == IndependenceStatus.NO:
        return EnrichmentOutcome.FIT_CONFLICT, ["not_independent"]

    contact_checks = (
        (profile.phone, "phone_missing"),
        (profile.location, "address_missing"),
        (profile.linkedin, "linkedin_missing"),
    )
    gaps = [flag for value, flag in contact_checks if value is None]
    if gaps:
        return EnrichmentOutcome.GAPS, gaps

    if independence is None or independence.status == IndependenceStatus.UNKNOWN:
        if not allow_unknown:
            return EnrichmentOutcome.INDEPENDENCE_UNCONFIRMED, ["independence_unknown"]
        return EnrichmentOutcome.READY, ["independence_unknown_allowed"]
    return EnrichmentOutcome.READY, []
|
|
485
|
+
|
|
486
|
+
@staticmethod
|
|
487
|
+
def _matched_ownership_exclusions(
|
|
488
|
+
profile: EnrichmentProfile,
|
|
489
|
+
excluded_signals: set[str],
|
|
490
|
+
) -> list[str]:
|
|
491
|
+
if not excluded_signals or profile.independence is None:
|
|
492
|
+
return []
|
|
493
|
+
observed = set(profile.independence.signal_kinds)
|
|
494
|
+
# Older cached facts predate signal_kinds. Preserve family-owned evidence across upgrades.
|
|
495
|
+
if "family_owned" not in observed:
|
|
496
|
+
evidence = " ".join(profile.independence.evidence).lower()
|
|
497
|
+
if "family-owned" in evidence or "family owned" in evidence:
|
|
498
|
+
observed.add("family_owned")
|
|
499
|
+
return sorted(observed & excluded_signals)
|
|
500
|
+
|
|
501
|
+
@staticmethod
|
|
502
|
+
def _error_message(error: Exception) -> str:
|
|
503
|
+
message = " ".join(str(error).split())
|
|
504
|
+
if not message:
|
|
505
|
+
message = type(error).__name__
|
|
506
|
+
return message[:1000]
|
|
507
|
+
|
|
508
|
+
@staticmethod
|
|
509
|
+
def _is_configuration_error(error: Exception) -> bool:
|
|
510
|
+
message = str(error)
|
|
511
|
+
return (
|
|
512
|
+
"LLM_API_KEY is required" in message
|
|
513
|
+
or "LLM API returned HTTP 401" in message
|
|
514
|
+
or "LLM API returned HTTP 403" in message
|
|
515
|
+
)
|
|
516
|
+
|
|
517
|
+
@staticmethod
def _count_outcome(summary: EnrichmentSummary, outcome: EnrichmentOutcome) -> None:
    """Increment the ``summary`` counter that corresponds to ``outcome``.

    READY counts as ready, GAPS and INDEPENDENCE_UNCONFIRMED as review,
    FAILED as failed and every other outcome as blocked.
    """
    if outcome == EnrichmentOutcome.READY:
        counter = "ready"
    elif outcome in {EnrichmentOutcome.GAPS, EnrichmentOutcome.INDEPENDENCE_UNCONFIRMED}:
        counter = "review"
    elif outcome == EnrichmentOutcome.FAILED:
        counter = "failed"
    else:
        counter = "blocked"
    setattr(summary, counter, getattr(summary, counter) + 1)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Protocol
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class EnrichmentProgressReporter(Protocol):
    """Structural interface for objects that receive enrichment progress callbacks."""

    def start(self, discovery_run_id: str, total: int, bucket: str) -> None:
        """Receive the discovery run id, the candidate count and the bucket name."""
        ...

    def company(self, current: int, total: int, name: str) -> None:
        """Receive the position, the total and the name of a company."""
        ...

    def event(self, label: str, message: str) -> None:
        """Receive a labelled progress message."""
        ...
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class NullEnrichmentProgressReporter:
    """Progress reporter whose callbacks all do nothing."""

    def start(self, discovery_run_id: str, total: int, bucket: str) -> None:
        """Ignore the start notification."""
        return None

    def company(self, current: int, total: int, name: str) -> None:
        """Ignore the per-company notification."""
        return None

    def event(self, label: str, message: str) -> None:
        """Ignore the event notification."""
        return None
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from datetime import UTC, datetime
|
|
5
|
+
from urllib.parse import urlparse
|
|
6
|
+
|
|
7
|
+
from company_discovery.domain.models import (
|
|
8
|
+
EnrichmentExtraction,
|
|
9
|
+
IndependenceFact,
|
|
10
|
+
IndependenceStatus,
|
|
11
|
+
LinkedInFact,
|
|
12
|
+
LocationFact,
|
|
13
|
+
PhoneFact,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Upper-case US state names (plus the District of Columbia) mapped to their
# two-letter postal abbreviations; used by normalize_state.
US_STATE_NAMES = {
    "ALABAMA": "AL", "ALASKA": "AK", "ARIZONA": "AZ", "ARKANSAS": "AR",
    "CALIFORNIA": "CA", "COLORADO": "CO", "CONNECTICUT": "CT", "DELAWARE": "DE",
    "FLORIDA": "FL", "GEORGIA": "GA", "HAWAII": "HI", "IDAHO": "ID",
    "ILLINOIS": "IL", "INDIANA": "IN", "IOWA": "IA", "KANSAS": "KS",
    "KENTUCKY": "KY", "LOUISIANA": "LA", "MAINE": "ME", "MARYLAND": "MD",
    "MASSACHUSETTS": "MA", "MICHIGAN": "MI", "MINNESOTA": "MN", "MISSISSIPPI": "MS",
    "MISSOURI": "MO", "MONTANA": "MT", "NEBRASKA": "NE", "NEVADA": "NV",
    "NEW HAMPSHIRE": "NH", "NEW JERSEY": "NJ", "NEW MEXICO": "NM", "NEW YORK": "NY",
    "NORTH CAROLINA": "NC", "NORTH DAKOTA": "ND", "OHIO": "OH", "OKLAHOMA": "OK",
    "OREGON": "OR", "PENNSYLVANIA": "PA", "RHODE ISLAND": "RI",
    "SOUTH CAROLINA": "SC", "SOUTH DAKOTA": "SD", "TENNESSEE": "TN", "TEXAS": "TX",
    "UTAH": "UT", "VERMONT": "VT", "VIRGINIA": "VA", "WASHINGTON": "WA",
    "WEST VIRGINIA": "WV", "WISCONSIN": "WI", "WYOMING": "WY",
    "DISTRICT OF COLUMBIA": "DC",
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def normalize_state(value: str | None) -> str | None:
    """Return the two-letter code for a state name or abbreviation.

    Known full names map through ``US_STATE_NAMES``; any other two-character
    value is returned upper-cased as is. Empty input and everything else
    yield ``None``.
    """
    if not value:
        return None
    token = value.strip().upper()
    if token in US_STATE_NAMES:
        return US_STATE_NAMES[token]
    return token if len(token) == 2 else None
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def resolve_phone(extraction: EnrichmentExtraction, source: str) -> PhoneFact | None:
    """Pick the first usable non-fax phone number from ``extraction``.

    Candidates are tried in extraction order. A candidate is skipped when its
    label mentions "fax" or when its digits cannot be normalised, so one
    malformed entry no longer hides a valid number listed after it.

    Normalisation rules:
        * 10 digits: treated as a US number, stored as ``+1XXXXXXXXXX``.
        * 11 digits starting with ``1``: stored as ``+1XXXXXXXXXX``.
        * 8 to 15 digits otherwise: stored as ``+<digits>`` with the original
          text as display value.

    Args:
        extraction: Extracted facts; reads ``phones``.
        source: Source label recorded on the resulting fact.

    Returns:
        A ``PhoneFact`` for the first usable candidate, or ``None`` when there
        is none.
    """
    for candidate in extraction.phones or ():
        if "fax" in (candidate.label or "").lower():
            continue
        raw_value = candidate.value or ""
        digits = re.sub(r"\D", "", raw_value)
        if len(digits) == 10:
            normalized = f"+1{digits}"
            display = f"({digits[:3]}) {digits[3:6]}-{digits[6:]}"
        elif len(digits) == 11 and digits.startswith("1"):
            normalized = f"+{digits}"
            display = f"({digits[1:4]}) {digits[4:7]}-{digits[7:]}"
        elif 8 <= len(digits) <= 15:
            normalized = f"+{digits}"
            display = raw_value.strip()
        else:
            # Not a plausible phone number; try the next candidate.
            continue
        return PhoneFact(
            value=normalized,
            display_value=display,
            source=source,
            source_url=candidate.source_url,
        )
    return None
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def resolve_location(
    extraction: EnrichmentExtraction,
    target_state: str | None,
    source: str,
) -> tuple[LocationFact | None, bool]:
    """Choose a location from ``extraction`` and report a geography conflict.

    With a target state, the first location in that state is chosen; if none
    matches the result is ``(None, True)``. Without a target state, the first
    location whose label contains "head" is chosen, else the first location.
    A chosen location is always returned with ``False`` as the flag.
    """
    observed = extraction.locations
    if not observed:
        return None, False
    wanted = normalize_state(target_state)
    with_codes = [(entry, normalize_state(entry.state)) for entry in observed]
    selected = None
    if wanted:
        for entry, code in with_codes:
            if code == wanted:
                selected = entry
                break
    if selected is None:
        if wanted is not None:
            # A target state was given but no observed location is in it.
            return None, bool(wanted)
        selected = next(
            (entry for entry in observed if "head" in (entry.label or "").lower()),
            observed[0],
        )
    fact = LocationFact(
        street_address=selected.street_address,
        city=selected.city,
        state=normalize_state(selected.state) or selected.state,
        zip=selected.zip,
        country=selected.country,
        source=source,
        source_url=selected.source_url,
    )
    return fact, False
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def resolve_independence(extraction: EnrichmentExtraction) -> IndependenceFact:
    """Derive an ``IndependenceFact`` from the extraction's ownership signals.

    Any negative signal kind yields NO; otherwise any positive kind yields
    YES; otherwise the status is UNKNOWN. Evidence, source URLs and signal
    kinds are copied from the signals, the latter two de-duplicated in order.
    """
    signals = extraction.ownership_signals
    observed_kinds = {signal.kind for signal in signals}
    negative_kinds = {"franchise", "parent", "subsidiary", "division", "acquired"}
    positive_kinds = {"independent_explicit", "family_owned", "locally_owned"}
    if not observed_kinds.isdisjoint(negative_kinds):
        status = IndependenceStatus.NO
    elif not observed_kinds.isdisjoint(positive_kinds):
        status = IndependenceStatus.YES
    else:
        status = IndependenceStatus.UNKNOWN
    statements = [signal.statement for signal in signals]
    unique_urls = list(dict.fromkeys(signal.source_url for signal in signals))
    unique_kinds = list(dict.fromkeys(signal.kind for signal in signals))
    return IndependenceFact(
        status=status,
        evidence=statements,
        source_urls=unique_urls,
        signal_kinds=unique_kinds,
        observed_at=datetime.now(UTC),
    )
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def normalize_linkedin_company_url(value: str) -> str | None:
    """Return the canonical LinkedIn company URL for ``value``, or ``None``.

    Accepts URLs with or without a scheme whose host is ``linkedin.com``
    (optionally prefixed with ``www.``) and whose path starts with
    ``/company/<slug>``, where the slug uses only letters, digits, ``_`` and
    ``-``. Anything after the slug is dropped.
    """
    text = value.strip()
    if not text:
        return None
    has_scheme = text.startswith("http://") or text.startswith("https://")
    parsed = urlparse(text if has_scheme else f"https://{text}")
    host = (parsed.hostname or "").lower().removeprefix("www.")
    segments = [segment for segment in parsed.path.split("/") if segment]
    if host != "linkedin.com":
        return None
    if len(segments) < 2 or segments[0].lower() != "company":
        return None
    slug = segments[1]
    if re.fullmatch(r"[A-Za-z0-9_-]+", slug) is None:
        return None
    return f"https://www.linkedin.com/company/{slug}"
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def resolve_linkedin(extraction: EnrichmentExtraction, source: str) -> LinkedInFact | None:
    """Return a ``LinkedInFact`` for the first observation with a valid company URL.

    Observations are checked in order; ``None`` is returned when none of them
    normalises to a LinkedIn company URL.
    """
    normalized_pairs = (
        (observation, normalize_linkedin_company_url(observation.url))
        for observation in extraction.linkedin_profiles
    )
    for observation, company_url in normalized_pairs:
        if company_url is None:
            continue
        return LinkedInFact(
            url=company_url,
            source=source,
            source_url=observation.source_url,
        )
    return None
|