leads-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. company_discovery/__init__.py +4 -0
  2. company_discovery/adapters/__init__.py +5 -0
  3. company_discovery/adapters/apollo.py +189 -0
  4. company_discovery/adapters/exa.py +112 -0
  5. company_discovery/adapters/llm.py +118 -0
  6. company_discovery/adapters/protocols.py +58 -0
  7. company_discovery/adapters/website.py +154 -0
  8. company_discovery/bundled_skills/__init__.py +1 -0
  9. company_discovery/bundled_skills/company-discovery-operator/SKILL.md +72 -0
  10. company_discovery/bundled_skills/company-discovery-operator/agents/openai.yaml +4 -0
  11. company_discovery/bundled_skills/company-enrichment-operator/SKILL.md +94 -0
  12. company_discovery/bundled_skills/company-enrichment-operator/agents/openai.yaml +4 -0
  13. company_discovery/bundled_skills/company-search-spec-writer/SKILL.md +109 -0
  14. company_discovery/bundled_skills/company-search-spec-writer/agents/openai.yaml +4 -0
  15. company_discovery/bundled_skills/contact-discovery-operator/SKILL.md +80 -0
  16. company_discovery/bundled_skills/contact-discovery-operator/agents/openai.yaml +4 -0
  17. company_discovery/bundled_skills/contact-enrichment-operator/SKILL.md +86 -0
  18. company_discovery/bundled_skills/contact-enrichment-operator/agents/openai.yaml +4 -0
  19. company_discovery/bundled_skills/contact-search-spec-writer/SKILL.md +86 -0
  20. company_discovery/bundled_skills/contact-search-spec-writer/agents/openai.yaml +4 -0
  21. company_discovery/bundled_skills/leads-update-operator/SKILL.md +60 -0
  22. company_discovery/bundled_skills/leads-update-operator/agents/openai.yaml +4 -0
  23. company_discovery/cli.py +1789 -0
  24. company_discovery/db/__init__.py +5 -0
  25. company_discovery/db/contact_enrichment_repository.py +268 -0
  26. company_discovery/db/contact_repository.py +366 -0
  27. company_discovery/db/enrichment_repository.py +207 -0
  28. company_discovery/db/models.py +324 -0
  29. company_discovery/db/repository.py +363 -0
  30. company_discovery/db/session.py +48 -0
  31. company_discovery/domain/__init__.py +24 -0
  32. company_discovery/domain/contact_models.py +178 -0
  33. company_discovery/domain/contact_spec.py +86 -0
  34. company_discovery/domain/models.py +287 -0
  35. company_discovery/domain/spec.py +263 -0
  36. company_discovery/migrations.py +190 -0
  37. company_discovery/prompts/__init__.py +8 -0
  38. company_discovery/prompts/candidate_evaluation/system.md +13 -0
  39. company_discovery/prompts/company_enrichment/system.md +42 -0
  40. company_discovery/prompts/contact_evaluation/system.md +18 -0
  41. company_discovery/prompts/query_generation/system.md +10 -0
  42. company_discovery/release_manifest.json +7 -0
  43. company_discovery/reports/__init__.py +4 -0
  44. company_discovery/reports/contact_enrichment_exporter.py +108 -0
  45. company_discovery/reports/contact_exporter.py +132 -0
  46. company_discovery/reports/enrichment_exporter.py +125 -0
  47. company_discovery/reports/exporter.py +135 -0
  48. company_discovery/runtime.py +336 -0
  49. company_discovery/services/__init__.py +4 -0
  50. company_discovery/services/contact_enrichment_pipeline.py +344 -0
  51. company_discovery/services/contact_enrichment_progress.py +37 -0
  52. company_discovery/services/contact_evaluator.py +110 -0
  53. company_discovery/services/contact_pipeline.py +295 -0
  54. company_discovery/services/contact_progress.py +38 -0
  55. company_discovery/services/enrichment_extractor.py +61 -0
  56. company_discovery/services/enrichment_pipeline.py +526 -0
  57. company_discovery/services/enrichment_progress.py +20 -0
  58. company_discovery/services/enrichment_resolver.py +148 -0
  59. company_discovery/services/evaluator.py +40 -0
  60. company_discovery/services/hygiene.py +51 -0
  61. company_discovery/services/memory.py +150 -0
  62. company_discovery/services/normalization.py +98 -0
  63. company_discovery/services/pipeline.py +628 -0
  64. company_discovery/services/progress.py +48 -0
  65. company_discovery/services/query_planner.py +47 -0
  66. company_discovery/settings.py +152 -0
  67. company_discovery/skill_installer.py +197 -0
  68. company_discovery/update_plan.py +79 -0
  69. leads_cli-0.1.0.dist-info/METADATA +277 -0
  70. leads_cli-0.1.0.dist-info/RECORD +72 -0
  71. leads_cli-0.1.0.dist-info/WHEEL +4 -0
  72. leads_cli-0.1.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,40 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+
5
+ from company_discovery.adapters.protocols import StructuredLLM
6
+ from company_discovery.domain.models import CandidateEvaluation, NormalizedCandidate
7
+ from company_discovery.domain.spec import CompanySearchSpec
8
+ from company_discovery.prompts import load_prompt
9
+
10
+
11
class CandidateEvaluator:
    """Ask a structured LLM to judge one candidate against a search spec."""

    def __init__(self, llm: StructuredLLM) -> None:
        """Store *llm* and load the ``candidate_evaluation`` system prompt."""
        self._llm = llm
        self._system_prompt = load_prompt("candidate_evaluation")

    def evaluate(
        self,
        spec: CompanySearchSpec,
        candidate: NormalizedCandidate,
    ) -> CandidateEvaluation:
        """Return the LLM's evaluation of *candidate* for *spec*.

        The spec and the candidate are dumped in JSON mode and sent as one
        indented JSON document in the user prompt. The returned evaluation is
        a copy whose ``company_name`` and ``domain`` are overwritten with the
        candidate's own values.

        Raises:
            TypeError: if the LLM returns something other than a
                ``CandidateEvaluation``.
        """
        payload = {
            "search_spec": spec.model_dump(mode="json"),
            "candidate": candidate.model_dump(mode="json"),
        }
        user_prompt = json.dumps(payload, indent=2)
        response = self._llm.generate(
            system_prompt=self._system_prompt,
            user_prompt=user_prompt,
            response_model=CandidateEvaluation,
        )
        if isinstance(response, CandidateEvaluation):
            # Identity fields always come from the candidate, never the LLM.
            overrides = {
                "company_name": candidate.company_name,
                "domain": candidate.domain,
            }
            return response.model_copy(update=overrides)
        raise TypeError(
            f"LLM returned {type(response).__name__}, expected CandidateEvaluation"
        )
@@ -0,0 +1,51 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+ from company_discovery.domain.models import NormalizedCandidate
6
+
7
+
8
# Domains that filter_hygiene rejects with the reason code
# "known_non_company_domain". Kept in alphabetical order.
KNOWN_NON_COMPANY_DOMAINS = frozenset(
    (
        "bbb.org",
        "bloomberg.com",
        "chamberofcommerce.com",
        "crunchbase.com",
        "facebook.com",
        "glassdoor.com",
        "instagram.com",
        "linkedin.com",
        "manta.com",
        "mapquest.com",
        "opencorporates.com",
        "pitchbook.com",
        "wikipedia.org",
        "yellowpages.com",
        "yelp.com",
        "youtube.com",
    )
)
28
+
29
+
30
@dataclass(frozen=True)
class HygieneResult:
    """Immutable outcome of one hygiene pass over a candidate list."""

    # Candidates that passed every hygiene check.
    accepted: list[NormalizedCandidate]
    # One (candidate, reason code) pair per candidate that was dropped.
    rejected: list[tuple[NormalizedCandidate, str]]
34
+
35
+
36
def filter_hygiene(candidates: list[NormalizedCandidate]) -> HygieneResult:
    """Split *candidates* into accepted and rejected lists.

    Candidates are checked in input order, and the first failing check gives
    the rejection reason:

    1. ``"duplicate_domain"`` - the domain was already accepted earlier.
    2. ``"known_non_company_domain"`` - the domain is listed in
       ``KNOWN_NON_COMPANY_DOMAINS``.
    3. ``"missing_company_name"`` - the name is empty or only whitespace.

    Only accepted candidates register their domain, so a rejected candidate
    never makes a later one count as a duplicate.
    """
    kept: list[NormalizedCandidate] = []
    dropped: list[tuple[NormalizedCandidate, str]] = []
    accepted_domains: set[str] = set()
    for item in candidates:
        if item.domain in accepted_domains:
            dropped.append((item, "duplicate_domain"))
            continue
        if item.domain in KNOWN_NON_COMPANY_DOMAINS:
            dropped.append((item, "known_non_company_domain"))
            continue
        if not item.company_name.strip():
            dropped.append((item, "missing_company_name"))
            continue
        accepted_domains.add(item.domain)
        kept.append(item)
    return HygieneResult(accepted=kept, rejected=dropped)
51
+
@@ -0,0 +1,150 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+ from company_discovery.db.repository import MemoryRecord
6
+ from company_discovery.domain.models import FitVerdict
7
+ from company_discovery.domain.spec import CompanySearchSpec, NoveltyMode
8
+
9
+
10
@dataclass(frozen=True)
class SkippedMemoryRecord:
    """A memory record left out of a scan, paired with the reason code."""

    # The stored record that was not used.
    record: MemoryRecord
    # Reason code explaining the skip, e.g. "previously_selected".
    reason: str
14
+
15
+
16
@dataclass(frozen=True)
class MemoryScanResult:
    """Immutable summary of how stored records were classified for a search."""

    # Number of records that were not skipped (reusable plus recheck).
    matched: int
    # Records whose stored evaluation can be used as-is.
    reusable: list[MemoryRecord]
    # Records that must be evaluated again before use.
    recheck: list[MemoryRecord]
    # Records excluded from this search, each with its reason.
    skipped: list[SkippedMemoryRecord]
22
+
23
+
24
class MemoryMatcher:
    """Sort stored company records into reuse, recheck and skip buckets."""

    def scan(self, spec: CompanySearchSpec, records: list[MemoryRecord]) -> MemoryScanResult:
        """Classify every record in *records* against *spec*.

        When ``spec.novelty_mode`` is ``NoveltyMode.ONLY_NEW`` nothing is
        matched and every record is skipped with the reason
        ``"memory_disabled_only_new"``. Otherwise each record is:

        * skipped, when ``_hard_mismatch`` returns a reason;
        * reusable, when its latest fit is ``FitVerdict.GOOD``, it has a
          stored evaluation, and ``_requires_recheck`` is false;
        * queued for recheck in every other case.

        ``matched`` on the result is the number of reusable plus recheck
        records.
        """
        if spec.novelty_mode == NoveltyMode.ONLY_NEW:
            bypassed = [
                SkippedMemoryRecord(item, "memory_disabled_only_new") for item in records
            ]
            return MemoryScanResult(matched=0, reusable=[], recheck=[], skipped=bypassed)

        trusted: list[MemoryRecord] = []
        stale: list[MemoryRecord] = []
        dropped: list[SkippedMemoryRecord] = []
        for item in records:
            reason = self._hard_mismatch(spec, item)
            if reason:
                dropped.append(SkippedMemoryRecord(item, reason))
                continue
            can_reuse = (
                item.latest_fit == FitVerdict.GOOD.value
                and item.latest_evaluation is not None
                and not self._requires_recheck(spec, item)
            )
            if can_reuse:
                trusted.append(item)
            else:
                stale.append(item)

        return MemoryScanResult(
            matched=len(trusted) + len(stale),
            reusable=trusted,
            recheck=stale,
            skipped=dropped,
        )

    @staticmethod
    def _hard_mismatch(spec: CompanySearchSpec, record: MemoryRecord) -> str | None:
        """Return a reason code if *record* can never fit *spec*, else ``None``.

        Checks run in this order, and the first hit wins:

        1. Earlier rejection codes that still apply because the relevant part
           of the previous spec equals the current one (``prior_*_same_spec``).
        2. ``"previously_selected"`` in ``NoveltyMode.UNUSED_MEMORY`` mode.
        3. Known candidate attributes that contradict the spec: vertical,
           country, state, employee range, ownership type.
        4. An excluded keyword appearing in the lower-cased company name,
           sighting titles or sighting texts.
        """
        candidate = record.candidate
        previous = record.latest_spec
        reason_codes = set(record.latest_reason_codes)

        if previous is not None:
            evaluation = record.latest_evaluation
            previous_target = None if evaluation is None else evaluation.target_vertical
            # The last run targeted the same vertical if the stored evaluation
            # says so, or if the previous spec had exactly this one vertical.
            same_vertical = previous_target == spec.vertical.key or (
                len(previous.verticals) == 1 and previous.verticals[0] == spec.vertical
            )
            if same_vertical and "vertical_mismatch" in reason_codes:
                return "prior_vertical_mismatch_same_spec"
            if "geography_mismatch" in reason_codes:
                if previous.geography == spec.geography:
                    return "prior_geography_mismatch_same_spec"
            if "size_mismatch" in reason_codes:
                if previous.company_size == spec.company_size:
                    return "prior_size_mismatch_same_spec"
            exclusion_codes = {
                "excluded_ownership",
                "excluded_keyword",
                "excluded_company_pattern",
            }
            if not reason_codes.isdisjoint(exclusion_codes):
                if previous.exclude == spec.exclude:
                    return "prior_exclusion_same_spec"

        if spec.novelty_mode == NoveltyMode.UNUSED_MEMORY and record.ever_selected:
            return "previously_selected"

        # Attribute checks only fire when the candidate value is known; a
        # missing value is never treated as a mismatch here.
        if candidate.vertical and candidate.vertical != spec.vertical.key:
            return "vertical_mismatch"
        if candidate.country and candidate.country.upper() != spec.geography.country:
            return "country_mismatch"
        if (
            spec.geography.states
            and candidate.state
            and candidate.state.upper() not in spec.geography.states
        ):
            return "state_mismatch"

        limits = spec.company_size
        if limits.employee_min is not None and candidate.employee_max is not None:
            if candidate.employee_max < limits.employee_min:
                return "size_below_minimum"
        if limits.employee_max is not None and candidate.employee_min is not None:
            if candidate.employee_min > limits.employee_max:
                return "size_above_maximum"

        ownership = candidate.ownership_type
        if ownership and ownership.lower() in spec.exclude.ownership_types:
            return "excluded_ownership"

        fragments = [candidate.company_name]
        fragments.extend(sighting.title for sighting in candidate.sightings)
        fragments.extend(sighting.text or "" for sighting in candidate.sightings)
        haystack = " ".join(fragments).lower()
        for keyword in spec.exclude.keywords:
            if keyword in haystack:
                return "excluded_keyword"
        return None

    @staticmethod
    def _requires_recheck(spec: CompanySearchSpec, record: MemoryRecord) -> bool:
        """Return ``True`` when a stored good verdict cannot be trusted for *spec*.

        A recheck is required when the candidate lacks data the spec filters
        on (vertical, country, state when states are requested, employee
        bounds when the size range is bounded). It is also required when the
        spec has include or exclude criteria that differ from, or have no
        counterpart in, the record's previous spec.
        """
        candidate = record.candidate
        if candidate.vertical is None or candidate.country is None:
            return True
        if spec.geography.states and candidate.state is None:
            return True
        if not spec.company_size.is_unbounded:
            if candidate.employee_min is None or candidate.employee_max is None:
                return True

        previous = record.latest_spec
        if spec.include.keywords or spec.include.subtypes:
            if previous is None or previous.include != spec.include:
                return True

        custom_exclusions = (
            spec.exclude.keywords,
            spec.exclude.ownership_types,
            spec.exclude.company_patterns,
        )
        if any(custom_exclusions):
            if previous is None or previous.exclude != spec.exclude:
                return True
        return False
@@ -0,0 +1,98 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from collections import OrderedDict
5
+ from datetime import UTC, datetime
6
+ from urllib.parse import urlparse
7
+
8
+ import tldextract
9
+
10
+ from company_discovery.domain.models import ExaSearchResult, NormalizedCandidate, SourceSighting
11
+
12
+
13
# Shared domain extractor, configured with no suffix-list URLs and no cache
# directory. NOTE(review): per the tldextract docs this should mean no network
# fetch and no on-disk cache (bundled suffix snapshot only) - confirm against
# the pinned tldextract version.
_extract_domain = tldextract.TLDExtract(cache_dir=None, suffix_list_urls=())

# Matches a whitespace-delimited "|", "-", en-dash or em-dash separator
# together with the text that follows it, so it can be cut from a page title.
_TITLE_SUFFIX = re.compile(r"\s+[|\-\u2013\u2014]\s+.*$")
15
+
16
+
17
def canonical_domain(url: str) -> str | None:
    """Return the registrable domain (``name.suffix``) of *url*, or ``None``.

    *url* may be a full ``http``/``https`` URL or a scheme-less host such as
    ``example.com``, ``example.com:8080`` or ``example.com:8080/path``.

    ``None`` is returned when:

    * the value is empty or only whitespace;
    * a scheme-less value has a colon in its host part that is not a
      ``host.with.dot:digits`` port (this rejects ``mailto:``, ``tel:`` and
      similar pseudo-URLs);
    * the value cannot be parsed as a URL;
    * the scheme is not ``http``/``https`` or there is no hostname;
    * the extractor finds no domain or no public suffix.
    """
    value = url.strip()
    if not value:
        return None

    has_scheme = "://" in value
    if not has_scheme:
        # Only the part before any path, query or fragment can hold a port.
        # Checking the whole string would wrongly reject "host:8080/path" and
        # any scheme-less URL with a colon later in its path or query.
        authority = value
        for separator in "/?#":
            authority = authority.partition(separator)[0]
        if ":" in authority:
            host, _, port = authority.partition(":")
            if not ("." in host and port.isdigit()):
                return None

    try:
        parsed = urlparse(value if has_scheme else f"https://{value}")
    except ValueError:
        # urlparse raises on malformed bracketed hosts such as "http://[".
        # Treat those like any other unusable input.
        return None
    if parsed.scheme not in {"http", "https"} or not parsed.hostname:
        return None

    extracted = _extract_domain(parsed.hostname.lower())
    if not extracted.domain or not extracted.suffix:
        return None
    return f"{extracted.domain}.{extracted.suffix}"
32
+
33
+
34
def candidate_name(title: str, domain: str) -> str:
    """Derive a display name for a company from a page title or its domain.

    The title is used after ``_TITLE_SUFFIX`` has removed its trailing
    separator-and-suffix part and the result has been stripped. If nothing is
    left, the first label of *domain* is used instead, with hyphens turned
    into spaces and title-cased.
    """
    stripped = _TITLE_SUFFIX.sub("", title).strip()
    if not stripped:
        first_label, _, _ = domain.partition(".")
        return first_label.replace("-", " ").title()
    return stripped
39
+
40
+
41
+ def normalize_results(results: list[ExaSearchResult]) -> list[NormalizedCandidate]:
42
+ by_domain: OrderedDict[str, NormalizedCandidate] = OrderedDict()
43
+ now = datetime.now(UTC)
44
+ for result in results:
45
+ domain = canonical_domain(result.url)
46
+ if domain is None:
47
+ continue
48
+ sighting = SourceSighting(
49
+ query=result.query,
50
+ url=result.url,
51
+ title=result.title,
52
+ text=result.text,
53
+ exa_id=result.exa_id,
54
+ raw=result.raw,
55
+ )
56
+ existing = by_domain.get(domain)
57
+ if existing is None:
58
+ entity = _company_entity(result.raw)
59
+ properties = entity.get("properties", {}) if entity else {}
60
+ workforce = properties.get("workforce") or {}
61
+ headquarters = properties.get("headquarters") or {}
62
+ employee_total = workforce.get("total")
63
+ if not isinstance(employee_total, int) or employee_total < 1:
64
+ employee_total = None
65
+ country = _country_code(headquarters.get("country"))
66
+ by_domain[domain] = NormalizedCandidate(
67
+ company_name=properties.get("name") or candidate_name(result.title, domain),
68
+ domain=domain,
69
+ dedupe_key=domain,
70
+ country=country,
71
+ employee_min=employee_total,
72
+ employee_max=employee_total,
73
+ sightings=[sighting],
74
+ first_seen_at=now,
75
+ last_seen_at=now,
76
+ )
77
+ elif all(item.url != sighting.url for item in existing.sightings):
78
+ existing.sightings.append(sighting)
79
+ return list(by_domain.values())
80
+
81
+
82
def _company_entity(raw: dict) -> dict:
    """Return the first company-like entity in ``raw["entities"]``.

    An entity qualifies when it is a dict whose ``"type"`` is ``"company"``
    or is absent/``None``. An empty dict is returned when ``"entities"`` is
    not a list or no entry qualifies.
    """
    entities = raw.get("entities")
    if not isinstance(entities, list):
        return {}
    company_like = (
        item
        for item in entities
        if isinstance(item, dict) and item.get("type") in {None, "company"}
    )
    return next(company_like, {})
90
+
91
+
92
def _country_code(value: object) -> str | None:
    """Normalise a country value to an upper-case two-letter code.

    ``"United States"``, ``"United States of America"`` and ``"USA"`` (any
    case, surrounding whitespace ignored) map to ``"US"``. Any other value is
    returned upper-cased only if it is exactly two ASCII letters. Non-strings
    and everything else yield ``None``.
    """
    if not isinstance(value, str):
        return None
    normalized = value.strip().upper()
    if normalized in {"UNITED STATES", "UNITED STATES OF AMERICA", "USA"}:
        return "US"
    # Require two ASCII letters so junk such as "12" or "--" is not stored as
    # a country code and later compared against the spec's country.
    if len(normalized) == 2 and normalized.isascii() and normalized.isalpha():
        return normalized
    return None