autonitia-intel 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. autonitia_intel-0.2.0/LICENSE +21 -0
  2. autonitia_intel-0.2.0/PKG-INFO +119 -0
  3. autonitia_intel-0.2.0/README.md +88 -0
  4. autonitia_intel-0.2.0/autonitia_intel/__init__.py +27 -0
  5. autonitia_intel-0.2.0/autonitia_intel/__main__.py +3 -0
  6. autonitia_intel-0.2.0/autonitia_intel/cli.py +56 -0
  7. autonitia_intel-0.2.0/autonitia_intel/config.py +36 -0
  8. autonitia_intel-0.2.0/autonitia_intel/detection/__init__.py +4 -0
  9. autonitia_intel-0.2.0/autonitia_intel/detection/capabilities.py +136 -0
  10. autonitia_intel-0.2.0/autonitia_intel/detection/fingerprints.py +63 -0
  11. autonitia_intel-0.2.0/autonitia_intel/fetchers/__init__.py +4 -0
  12. autonitia_intel-0.2.0/autonitia_intel/fetchers/fetcher.py +146 -0
  13. autonitia_intel-0.2.0/autonitia_intel/fetchers/robots.py +50 -0
  14. autonitia_intel-0.2.0/autonitia_intel/graph/__init__.py +5 -0
  15. autonitia_intel-0.2.0/autonitia_intel/graph/base_graph.py +64 -0
  16. autonitia_intel-0.2.0/autonitia_intel/graph/base_node.py +13 -0
  17. autonitia_intel-0.2.0/autonitia_intel/graph/profile_graph.py +148 -0
  18. autonitia_intel-0.2.0/autonitia_intel/lenses/__init__.py +3 -0
  19. autonitia_intel-0.2.0/autonitia_intel/lenses/catalog.py +110 -0
  20. autonitia_intel-0.2.0/autonitia_intel/models.py +136 -0
  21. autonitia_intel-0.2.0/autonitia_intel/nodes/__init__.py +15 -0
  22. autonitia_intel-0.2.0/autonitia_intel/nodes/basic_assemble_node.py +76 -0
  23. autonitia_intel-0.2.0/autonitia_intel/nodes/fact_extraction_node.py +41 -0
  24. autonitia_intel-0.2.0/autonitia_intel/nodes/fetch_node.py +70 -0
  25. autonitia_intel-0.2.0/autonitia_intel/nodes/markdownify_node.py +35 -0
  26. autonitia_intel-0.2.0/autonitia_intel/nodes/positive_detection_node.py +24 -0
  27. autonitia_intel-0.2.0/autonitia_intel/nodes/repair_extraction_node.py +51 -0
  28. autonitia_intel-0.2.0/autonitia_intel/signal_packs/industries/real_estate.yaml +23 -0
  29. autonitia_intel-0.2.0/autonitia_intel/signal_packs/lenses/automation.yaml +57 -0
  30. autonitia_intel-0.2.0/autonitia_intel/signal_packs/lenses/marketing.yaml +37 -0
  31. autonitia_intel-0.2.0/autonitia_intel/signal_packs/lenses/sales.yaml +19 -0
  32. autonitia_intel-0.2.0/autonitia_intel/telemetry/__init__.py +3 -0
  33. autonitia_intel-0.2.0/autonitia_intel/telemetry/telemetry.py +84 -0
  34. autonitia_intel-0.2.0/autonitia_intel/usage.py +32 -0
  35. autonitia_intel-0.2.0/autonitia_intel.egg-info/PKG-INFO +119 -0
  36. autonitia_intel-0.2.0/autonitia_intel.egg-info/SOURCES.txt +45 -0
  37. autonitia_intel-0.2.0/autonitia_intel.egg-info/dependency_links.txt +1 -0
  38. autonitia_intel-0.2.0/autonitia_intel.egg-info/entry_points.txt +2 -0
  39. autonitia_intel-0.2.0/autonitia_intel.egg-info/requires.txt +12 -0
  40. autonitia_intel-0.2.0/autonitia_intel.egg-info/top_level.txt +1 -0
  41. autonitia_intel-0.2.0/pyproject.toml +48 -0
  42. autonitia_intel-0.2.0/setup.cfg +4 -0
  43. autonitia_intel-0.2.0/tests/test_catalog.py +76 -0
  44. autonitia_intel-0.2.0/tests/test_detection.py +87 -0
  45. autonitia_intel-0.2.0/tests/test_fetch.py +65 -0
  46. autonitia_intel-0.2.0/tests/test_graph.py +77 -0
  47. autonitia_intel-0.2.0/tests/test_integration.py +52 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Autonitia
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,119 @@
1
+ Metadata-Version: 2.4
2
+ Name: autonitia-intel
3
+ Version: 0.2.0
4
+ Summary: Turn any business website into a clean, structured company profile — a graph-based extraction engine.
5
+ Author: Syed Mukarramuddin
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/Autonitia/autonitia-intel
8
+ Project-URL: Repository, https://github.com/Autonitia/autonitia-intel
9
+ Keywords: web-scraping,llm,company-data,lead-generation,openai,extraction,graph
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Topic :: Internet :: WWW/HTTP
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Requires-Python: >=3.10
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: requests>=2.31
20
+ Requires-Dist: beautifulsoup4>=4.12
21
+ Requires-Dist: lxml>=5.0
22
+ Requires-Dist: openai>=1.40
23
+ Requires-Dist: pydantic>=2.0
24
+ Requires-Dist: python-dotenv>=1.0
25
+ Requires-Dist: pyyaml>=6.0
26
+ Requires-Dist: cloudscraper>=1.2.71
27
+ Requires-Dist: playwright>=1.40
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest>=8.0; extra == "dev"
30
+ Dynamic: license-file
31
+
32
+ # autonitia-intel
33
+
34
+ Turn any business website into a clean, structured company profile — and a quick read on where the opportunities are.
35
+
36
+ Point it at a URL and get back the company's details (description, services, contact info, social presence) plus the tools and capabilities its site exposes. It also tells you how many opportunities a given **lens** (automation, marketing, sales…) would surface.
37
+
38
+ ## Install
39
+
40
+ ```bash
41
+ pip install autonitia-intel
42
+ playwright install chromium # only needed for JavaScript-heavy sites
43
+ export OPENAI_API_KEY=sk-... # or pass api_key in the config
44
+ ```
45
+
46
+ ## Use it
47
+
48
+ ```python
49
+ from autonitia_intel import ProfileGraph
50
+
51
+ config = {
52
+ "llm": {"model": "gpt-4o-mini"}, # add "api_key": "sk-..." or use the env var
53
+ "lens": "automation", # automation | marketing | sales | …
54
+ "verbose": True,
55
+ }
56
+
57
+ graph = ProfileGraph(source="https://example.com", config=config)
58
+ result = graph.run()
59
+
60
+ print(result.model_dump_json(indent=2))
61
+ ```
62
+
63
+ Prefer the command line?
64
+
65
+ ```bash
66
+ python -m autonitia_intel analyse --target-url https://example.com --lens marketing
67
+ ```
68
+
69
+ ## What you get
70
+
71
+ ```json
72
+ {
73
+ "target_company": {
74
+ "name": "Example Co",
75
+ "industry": "Real Estate",
76
+ "description": "...",
77
+ "location": "Dubai, UAE",
78
+ "contact": { "phones": ["..."], "emails": ["..."], "addresses": ["..."] }
79
+ },
80
+ "digital_presence": { "social_media": { "linkedin": "...", "instagram": "..." } },
81
+ "capabilities_present": ["phone", "whatsapp", "online_booking"],
82
+ "pro_features": { "lens": "automation", "opportunities_found": 2 }
83
+ }
84
+ ```
85
+
86
+ ## How it works
87
+
88
+ It fetches the site politely (respecting `robots.txt`, with retries and a real-browser fallback for JS-heavy pages), uses one LLM call to read out the company profile, and runs fast local checks to spot the tools and capabilities present. The opportunity count for a lens is computed locally — no guessing.
89
+
90
+ ## Lenses
91
+
92
+ A **lens** is the perspective you analyse a site through — `automation`, `marketing`, `sales`, and more. Lenses and the signals they look for are defined as simple **YAML packs** in [`autonitia_intel/signal_packs/`](autonitia_intel/signal_packs), so you can add a new lens or industry pack without touching the Python.
93
+
94
+ ## Contributing
95
+
96
+ Contributions welcome — the easiest place to start is a signal pack: drop a YAML file under `signal_packs/lenses/` or `signal_packs/industries/` and open a PR. Run the tests with `pytest -m "not integration"`.
97
+
98
+ ## Hosted version
99
+
100
+ This open-source engine gives you the profile and the opportunity count. The hosted **Autonitia Intel** turns those opportunities into verified, ranked, outreach-ready intelligence over a REST API.
101
+
102
+ **→ Docs & access: [autonitia.ai/intel](https://autonitia.ai/intel)**
103
+
104
+ | | Free — `autonitia-intel` | Hosted — Autonitia Intel |
105
+ |---|:---:|:---:|
106
+ | Company profile + contact + socials | ✅ | ✅ |
107
+ | Tool & capability detection | ✅ | ✅ |
108
+ | Opportunity count | ✅ | — |
109
+ | Verified capability analysis | — | ✅ |
110
+ | Pain signals with evidence | — | ✅ |
111
+ | Scoring (fit / opportunity / confidence) | — | ✅ |
112
+ | Offer matching + ranked opportunities | — | ✅ |
113
+ | Outreach messages | — | ✅ |
114
+ | External enrichment (founders, HQ, funding) | — | ✅ |
115
+ | REST API, async jobs, webhooks, CRM export | — | ✅ |
116
+
117
+ ## License
118
+
119
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,88 @@
1
+ # autonitia-intel
2
+
3
+ Turn any business website into a clean, structured company profile — and a quick read on where the opportunities are.
4
+
5
+ Point it at a URL and get back the company's details (description, services, contact info, social presence) plus the tools and capabilities its site exposes. It also tells you how many opportunities a given **lens** (automation, marketing, sales…) would surface.
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pip install autonitia-intel
11
+ playwright install chromium # only needed for JavaScript-heavy sites
12
+ export OPENAI_API_KEY=sk-... # or pass api_key in the config
13
+ ```
14
+
15
+ ## Use it
16
+
17
+ ```python
18
+ from autonitia_intel import ProfileGraph
19
+
20
+ config = {
21
+ "llm": {"model": "gpt-4o-mini"}, # add "api_key": "sk-..." or use the env var
22
+ "lens": "automation", # automation | marketing | sales | …
23
+ "verbose": True,
24
+ }
25
+
26
+ graph = ProfileGraph(source="https://example.com", config=config)
27
+ result = graph.run()
28
+
29
+ print(result.model_dump_json(indent=2))
30
+ ```
31
+
32
+ Prefer the command line?
33
+
34
+ ```bash
35
+ python -m autonitia_intel analyse --target-url https://example.com --lens marketing
36
+ ```
37
+
38
+ ## What you get
39
+
40
+ ```json
41
+ {
42
+ "target_company": {
43
+ "name": "Example Co",
44
+ "industry": "Real Estate",
45
+ "description": "...",
46
+ "location": "Dubai, UAE",
47
+ "contact": { "phones": ["..."], "emails": ["..."], "addresses": ["..."] }
48
+ },
49
+ "digital_presence": { "social_media": { "linkedin": "...", "instagram": "..." } },
50
+ "capabilities_present": ["phone", "whatsapp", "online_booking"],
51
+ "pro_features": { "lens": "automation", "opportunities_found": 2 }
52
+ }
53
+ ```
54
+
55
+ ## How it works
56
+
57
+ It fetches the site politely (respecting `robots.txt`, with retries and a real-browser fallback for JS-heavy pages), uses one LLM call to read out the company profile, and runs fast local checks to spot the tools and capabilities present. The opportunity count for a lens is computed locally — no guessing.
58
+
59
+ ## Lenses
60
+
61
+ A **lens** is the perspective you analyse a site through — `automation`, `marketing`, `sales`, and more. Lenses and the signals they look for are defined as simple **YAML packs** in [`autonitia_intel/signal_packs/`](autonitia_intel/signal_packs), so you can add a new lens or industry pack without touching the Python.
62
+
63
+ ## Contributing
64
+
65
+ Contributions welcome — the easiest place to start is a signal pack: drop a YAML file under `signal_packs/lenses/` or `signal_packs/industries/` and open a PR. Run the tests with `pytest -m "not integration"`.
66
+
67
+ ## Hosted version
68
+
69
+ This open-source engine gives you the profile and the opportunity count. The hosted **Autonitia Intel** turns those opportunities into verified, ranked, outreach-ready intelligence over a REST API.
70
+
71
+ **→ Docs & access: [autonitia.ai/intel](https://autonitia.ai/intel)**
72
+
73
+ | | Free — `autonitia-intel` | Hosted — Autonitia Intel |
74
+ |---|:---:|:---:|
75
+ | Company profile + contact + socials | ✅ | ✅ |
76
+ | Tool & capability detection | ✅ | ✅ |
77
+ | Opportunity count | ✅ | — |
78
+ | Verified capability analysis | — | ✅ |
79
+ | Pain signals with evidence | — | ✅ |
80
+ | Scoring (fit / opportunity / confidence) | — | ✅ |
81
+ | Offer matching + ranked opportunities | — | ✅ |
82
+ | Outreach messages | — | ✅ |
83
+ | External enrichment (founders, HQ, funding) | — | ✅ |
84
+ | REST API, async jobs, webhooks, CRM export | — | ✅ |
85
+
86
+ ## License
87
+
88
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,27 @@
1
+ """
2
+ autonitia-intel — open-source business-website profile extractor.
3
+
4
+ The FREE engine turns any business website into a clean structured profile
5
+ (company facts, contact details, social presence, detected tools/capabilities)
6
+ plus a *pro_features* count of opportunities. The intelligence layer — verified
7
+ signals, scoring, offer matching, outreach — is Autonitia Intel Pro, which
8
+ imports these same building blocks.
9
+
10
+ Quick start:
11
+
12
+ from autonitia_intel import ProfileGraph
13
+
14
+ graph = ProfileGraph(lens="automation") # bring your own key via env or args
15
+ profile = graph.run("https://example.com")
16
+ print(profile.model_dump_json(indent=2))
17
+
18
+ Bring your own model key:
19
+
20
+ ProfileGraph(api_key="sk-...", model="gpt-4o-mini")
21
+ """
22
+
23
+ from .graph import ProfileGraph
24
+ from .models import CompanyProfile, ProfileResult
25
+
26
+ __version__ = "0.2.0"
27
+ __all__ = ["ProfileGraph", "CompanyProfile", "ProfileResult"]
@@ -0,0 +1,3 @@
1
+ from .cli import main
2
+
3
+ main()
@@ -0,0 +1,56 @@
1
+ """
2
+ CLI: autonitia-intel analyse — extract a company profile (+ opportunity pro_features).
3
+
4
+ Example:
5
+ python -m autonitia_intel analyse --target-url https://example.com --lens automation
6
+ """
7
+
8
+ import argparse
9
+ import sys
10
+ from pathlib import Path
11
+
12
+ from .config import OUTPUT_DIR
13
+ from .graph import ProfileGraph
14
+ from .lenses import LENSES
15
+
16
+
17
def main(argv=None):
    """Run the ``autonitia-intel`` command-line interface.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    For the ``analyse`` sub-command this builds a ``ProfileGraph``, runs it on
    the target URL, writes the result as JSON under ``OUTPUT_DIR`` and prints
    the same JSON to stdout. Any other invocation prints the help text and
    exits with status 1.
    """
    parser = argparse.ArgumentParser(prog="autonitia-intel", description="Business-website profile extractor (free tier).")
    sub = parser.add_subparsers(dest="command")

    a = sub.add_parser("analyse", help="Extract a profile from a business website")
    a.add_argument("--target-url", required=True)
    a.add_argument("--lens", default="automation", choices=LENSES, help="Lens used only for the opportunity pro_features count")
    a.add_argument("--api-key", default=None, help="Bring your own model key (overrides env)")
    a.add_argument("--model", default=None, help="Model id (overrides env)")
    a.add_argument("--no-cache", action="store_true")
    a.add_argument("--no-telemetry", action="store_true")
    a.add_argument("--quiet", action="store_true")

    args = parser.parse_args(argv)
    if args.command != "analyse":
        parser.print_help()
        sys.exit(1)

    graph = ProfileGraph(
        lens=args.lens,
        telemetry=not args.no_telemetry,
        verbose=not args.quiet,
        api_key=args.api_key,
        model=args.model,
    )
    if not args.quiet:
        print(f"Analysing {args.target_url} (lens={args.lens}) ...")
    result = graph.run(args.target_url, use_cache=not args.no_cache)

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # The company name is extracted from the analysed site, so treat it as
    # untrusted: neutralise both path separators, characters that are invalid
    # in Windows file names, and any whitespace/control characters.
    raw_name = result.target_company.name or "result"
    unsafe = '/\\:*?"<>|'
    name = "".join(
        "_" if (ch in unsafe or ch.isspace() or not ch.isprintable()) else ch
        for ch in raw_name
    )
    path = OUTPUT_DIR / f"{name}_profile.json"

    payload = result.model_dump_json(indent=2)
    # Explicit UTF-8: the platform default encoding may not be able to
    # represent non-ASCII text in the profile.
    path.write_text(payload, encoding="utf-8")

    print(f"\nSaved: {path}\n")
    print(payload)
53
+
54
+
55
+ if __name__ == "__main__":
56
+ main()
@@ -0,0 +1,36 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ from dotenv import load_dotenv
5
+
6
+ load_dotenv()
7
+
8
def _env_int(name: str, default: int) -> int:
    """Read a non-negative integer from the environment variable *name*.

    Returns *default* when the variable is unset or blank. A value that is not
    a valid integer also yields *default*, with a ``RuntimeWarning``, so a typo
    in the environment cannot make importing this module fail. Negative values
    are clamped to 0.
    """
    raw = os.getenv(name)
    if raw is None or not raw.strip():
        return default
    try:
        value = int(raw)
    except ValueError:
        import warnings  # local import: only needed on this error path

        warnings.warn(
            f"Ignoring invalid {name}={raw!r}; using default {default}.",
            RuntimeWarning,
            stacklevel=2,
        )
        return default
    return max(0, value)


OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")

BROWSER_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/125.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}

# NOTE(review): both directories resolve next to the package directory (the
# parent of this file's folder) — confirm that location is intended/writable
# for pip-installed copies.
CACHE_DIR = Path(__file__).parent.parent / "output" / ".cache"
OUTPUT_DIR = Path(__file__).parent.parent / "output"
REQUEST_TIMEOUT = 15
MAX_CONTENT_CHARS = 24_000  # overall budget sent to the LLM
PER_PAGE_CHARS = 6_000  # per-page cap so no single page (e.g. a bloated homepage) starves the rest
MAX_SUBPAGES = 3

# Crawling politeness/resilience
RESPECT_ROBOTS = os.getenv("AUTONITIA_RESPECT_ROBOTS", "true").lower() != "false"
FETCH_RETRIES = _env_int("AUTONITIA_FETCH_RETRIES", 2)  # extra attempts on transient errors
ROBOTS_UA = "autonitia-intel"

# Telemetry — see telemetry/telemetry.py. Nothing is sent over the network in v0.1.
# Level 1 (execution metrics) is opt-OUT. Level 2 (dataset capture) is opt-IN.
TELEMETRY_ENABLED = os.getenv("AUTONITIA_TELEMETRY", "true").lower() != "false"
DATASET_CONTRIBUTION = os.getenv("AUTONITIA_DATASET", "false").lower() == "true"
@@ -0,0 +1,4 @@
1
+ from .capabilities import detect_capabilities, extract_contacts
2
+ from .fingerprints import detect_tools
3
+
4
+ __all__ = ["detect_capabilities", "detect_tools", "extract_contacts"]
@@ -0,0 +1,136 @@
1
+ """
2
+ Deterministic capability + digital-presence detection.
3
+
4
+ Inspects raw HTML across all fetched pages to determine observable facts:
5
+ lead-capture methods, social links, SEO basics, tracking. No LLM.
6
+
7
+ These are FACTS (present/absent), which become the evidence base for signals.
8
+ """
9
+
10
+ import re
11
+
12
+ from ..detection.fingerprints import detect_tools
13
+ from ..models import Capabilities, SEO, SocialMedia, Tracking
14
+
15
# Regexes (applied case-insensitively by the caller) that recognise a
# profile-looking URL for each supported social network. Keys are the
# SocialMedia field names the matched URL is stored under.
SOCIAL_PATTERNS = {
    "facebook": r"https?://(?:www\.)?facebook\.com/[A-Za-z0-9_.\-/]+",
    "instagram": r"https?://(?:www\.)?instagram\.com/[A-Za-z0-9_.\-/]+",
    # LinkedIn is served from "www." as well as two-letter country subdomains
    # ("uk.", "ae.", ...); a bare two-letter class would miss the "www." form.
    "linkedin": r"https?://(?:(?:www|[a-z]{2})\.)?linkedin\.com/(?:company|in)/[A-Za-z0-9_.\-/]+",
    "tiktok": r"https?://(?:www\.)?tiktok\.com/@[A-Za-z0-9_.\-/]+",
    "youtube": r"https?://(?:www\.)?youtube\.com/[A-Za-z0-9_.\-/@]+",
    "x": r"https?://(?:www\.)?(?:twitter|x)\.com/[A-Za-z0-9_]+",
}
23
+
24
# STRONG patterns = a real third-party tool / explicit URL → trustworthy, the
# LLM verifier may NOT downgrade these. WEAK patterns = generic text heuristics
# ("book now") that are easily wrong → the LLM verifier MAY override them.
BOOKING_STRONG = [r"calendly\.com", r"fresha\.com", r"acuityscheduling\.com", r"booksy\.com",
                  r"simplybook\.(me|it)", r"setmore\.com", r"squareup\.com/appointments"]
BOOKING_WEAK = [r"book\s*now", r"book\s*online", r"schedule\s*(an?\s*)?appointment", r"book\s*a\s*viewing"]

# Intercom is matched through its script hosts / settings object rather than
# the bare word "intercom": the plain word also appears in ordinary page copy
# (e.g. door or video intercom hardware) and is therefore not a STRONG signal.
LIVE_CHAT_STRONG = [r"intercom\.io", r"intercomcdn\.com", r"intercomSettings",
                    r"driftt?\.com", r"tidio", r"tawk\.to", r"crisp\.chat", r"hs-scripts"]
LIVE_CHAT_WEAK = [r"livechat", r"chat\s*with\s*us", r"live\s*chat"]

WHATSAPP_STRONG = [r"wa\.me/", r"api\.whatsapp\.com", r"whatsapp://", r"web\.whatsapp\.com",
                   r"chat\.whatsapp\.com", r"wa\.link/"]
WHATSAPP_WEAK = [r"click\s*to\s*whatsapp", r'aria-label=["\'][^"\']*whatsapp', r"whatsapp\s*us"]

NEWSLETTER_STRONG = [r"chimpstatic", r"klaviyo", r"list-manage\.com"]
NEWSLETTER_WEAK = [r"newsletter", r"subscribe"]

# These have no reliable "strong" structural signal — treat as weak (downgradable).
PRICING_WEAK = [r"/pricing", r">\s*pricing\s*<", r">\s*plans\s*<", r"per\s*month", r"/mo\b"]
CASE_STUDY_WEAK = [r"case\s*stud", r"success\s*stor", r"/portfolio", r"testimonial"]
FORM_WEAK = [r"<form[\s>]"]  # a <form> could be search/login, not a contact form → downgradable

PHONE_PATTERN = r"tel:\+?[\d\s\-()]{7,}"
EMAIL_PATTERN = r"mailto:[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+"

# Capabilities the LLM verifier is allowed to downgrade when only a WEAK signal fired.
DOWNGRADABLE = {"has_online_booking", "has_whatsapp", "has_live_chat",
                "has_pricing", "has_case_studies", "has_contact_form", "has_newsletter"}
52
+
53
+
54
+ def _any(patterns: list[str], html: str) -> bool:
55
+ return any(re.search(p, html, re.IGNORECASE) for p in patterns)
56
+
57
+
58
_EMAIL_RE = re.compile(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}")
_TEL_RE = re.compile(r'tel:(\+?[\d\s\-()]{7,})', re.IGNORECASE)
_WA_RE = re.compile(r'(https?://(?:wa\.me|wa\.link|api\.whatsapp\.com|web\.whatsapp\.com|chat\.whatsapp\.com)/[^\s"\'<>]+)', re.IGNORECASE)

# Asset names such as "logo@2x.jpeg" satisfy the e-mail regex; none of these
# suffixes is a real top-level domain, so anything ending in one is a file.
_NON_EMAIL_SUFFIXES = (
    ".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg",
    ".avif", ".bmp", ".ico", ".js", ".css",
)


def extract_contacts(html: str) -> dict:
    """Deterministic contact extraction — a backstop/merge for the LLM output.

    Args:
        html: Raw HTML to scan.

    Returns:
        A dict with ``"emails"`` (sorted unique addresses, asset file names
        filtered out), ``"phones"`` (sorted unique numbers taken from ``tel:``
        links, whitespace runs collapsed) and ``"whatsapp"`` (the first
        WhatsApp link found, or ``""``).
    """
    emails = sorted({
        m.group(0)
        for m in _EMAIL_RE.finditer(html)
        if not m.group(0).lower().endswith(_NON_EMAIL_SUFFIXES)
    })
    phones = sorted({re.sub(r"\s+", " ", m.group(1)).strip() for m in _TEL_RE.finditer(html)})
    wa_match = _WA_RE.search(html)
    wa = wa_match.group(1) if wa_match else ""
    return {"emails": emails, "phones": phones, "whatsapp": wa}
73
+
74
+
75
def _first_profile_url(pattern: str, html: str) -> str | None:
    """Return the first match of *pattern* in *html* that is not a share/intent link."""
    for match in re.finditer(pattern, html, re.IGNORECASE):
        url = match.group(0)
        if "sharer" not in url and "intent" not in url and "/share" not in url:
            return url
    return None


def detect_capabilities(combined_html: str):
    """
    Returns (capabilities, social, seo, tracking, tools, strongly_detected).

    `strongly_detected` is the set of capability names backed by a STRONG
    structural signal (a real tool/URL). The LLM verifier may only downgrade
    capabilities NOT in this set.

    `combined_html` is the raw HTML to inspect; every check is a
    case-insensitive regex search over that one string.
    """
    booking_strong = _any(BOOKING_STRONG, combined_html)
    chat_strong = _any(LIVE_CHAT_STRONG, combined_html)
    wa_strong = _any(WHATSAPP_STRONG, combined_html)
    news_strong = _any(NEWSLETTER_STRONG, combined_html)

    caps = Capabilities(
        has_phone=bool(re.search(PHONE_PATTERN, combined_html, re.IGNORECASE)),
        has_email=bool(re.search(EMAIL_PATTERN, combined_html, re.IGNORECASE)),
        has_contact_form=_any(FORM_WEAK, combined_html),
        has_whatsapp=wa_strong or _any(WHATSAPP_WEAK, combined_html),
        has_online_booking=booking_strong or _any(BOOKING_WEAK, combined_html),
        has_live_chat=chat_strong or _any(LIVE_CHAT_WEAK, combined_html),
        has_pricing=_any(PRICING_WEAK, combined_html),
        has_case_studies=_any(CASE_STUDY_WEAK, combined_html),
        has_newsletter=news_strong or _any(NEWSLETTER_WEAK, combined_html),
    )

    strongly_detected = set()
    if booking_strong:
        strongly_detected.add("has_online_booking")
    if chat_strong:
        strongly_detected.add("has_live_chat")
    if wa_strong:
        strongly_detected.add("has_whatsapp")
    if news_strong:
        strongly_detected.add("has_newsletter")

    social = SocialMedia()
    for field, pattern in SOCIAL_PATTERNS.items():
        # Scan every match, not just the first: a share/intent button that
        # appears earlier in the page must not hide a genuine profile link
        # that appears later.
        url = _first_profile_url(pattern, combined_html)
        if url:
            setattr(social, field, url)
    caps.has_social_links = any(getattr(social, f) for f in SOCIAL_PATTERNS)

    seo = SEO(
        title_tag_present=bool(re.search(r"<title[\s>]", combined_html, re.IGNORECASE)),
        meta_description_present=bool(re.search(r'<meta[^>]+name=["\']description["\']', combined_html, re.IGNORECASE)),
    )

    # Tracking flags are derived from the tool fingerprints rather than from
    # separate regexes, so both views of the page stay consistent.
    tools = detect_tools(combined_html)
    tool_names = {t["name"] for t in tools}
    tracking = Tracking(
        google_analytics="Google Analytics" in tool_names,
        google_tag_manager="Google Tag Manager" in tool_names,
        meta_pixel="Meta Pixel" in tool_names,
        tiktok_pixel="TikTok Pixel" in tool_names,
        linkedin_pixel="LinkedIn Insight" in tool_names,
        hotjar="Hotjar" in tool_names,
    )

    return caps, social, seo, tracking, tools, strongly_detected
@@ -0,0 +1,63 @@
1
+ """
2
+ Deterministic technology detection.
3
+
4
+ A pragmatic subset of Wappalyzer-style fingerprints: each entry matches a
5
+ substring/regex in the raw HTML. NO LLM involved — this is fact, not inference,
6
+ which is why it's the most defensible signal in the product.
7
+
8
+ For production, swap this dict for the full Wappalyzer fingerprint database
9
+ (https://github.com/enthec/webappanalyzer) — same matching approach, ~3000 apps.
10
+ """
11
+
12
+ import re
13
+
14
# name -> (category, [patterns], confidence)
FINGERPRINTS: dict[str, tuple[str, list[str], float]] = {
    # CMS / site builders
    "WordPress": ("cms", [r"wp-content", r"wp-includes"], 0.92),
    "Shopify": ("ecommerce", [r"cdn\.shopify\.com", r"Shopify\.theme"], 0.95),
    "Wix": ("cms", [r"static\.wixstatic\.com", r"_wixCssImports"], 0.9),
    # A bare "wf-" also matches unrelated markup (e.g. the "wf-active" /
    # "wf-loading" classes added by web-font loaders), so anchor on the
    # data-wf-page / data-wf-site attributes instead.
    "Webflow": ("cms", [r"assets\.website-files\.com", r"webflow\.js", r"data-wf-(?:page|site)"], 0.88),
    "Squarespace": ("cms", [r"squarespace", r"static1\.squarespace\.com"], 0.9),
    "WooCommerce": ("ecommerce", [r"woocommerce", r"wc-ajax"], 0.85),
    # Analytics / tracking
    "Google Analytics": ("analytics", [r"google-analytics\.com", r"gtag\(", r"ga\('create'"], 0.9),
    "Google Tag Manager": ("analytics", [r"googletagmanager\.com"], 0.92),
    "Meta Pixel": ("marketing_tracking", [r"fbq\(", r"connect\.facebook\.net/[a-z_]+/fbevents\.js"], 0.93),
    "TikTok Pixel": ("marketing_tracking", [r"analytics\.tiktok\.com"], 0.9),
    "LinkedIn Insight": ("marketing_tracking", [r"snap\.licdn\.com"], 0.9),
    "Hotjar": ("analytics", [r"static\.hotjar\.com", r"hotjar"], 0.85),
    # CRM / marketing / chat
    "HubSpot": ("crm", [r"js\.hs-scripts\.com", r"hs-scripts"], 0.9),
    "Intercom": ("live_chat", [r"widget\.intercom\.io", r"intercomSettings"], 0.9),
    "Drift": ("live_chat", [r"js\.driftt\.com", r"drift\.com"], 0.88),
    "Tidio": ("live_chat", [r"code\.tidio\.co"], 0.9),
    "Tawk.to": ("live_chat", [r"embed\.tawk\.to"], 0.9),
    "Crisp": ("live_chat", [r"client\.crisp\.chat"], 0.9),
    "Mailchimp": ("email_marketing", [r"chimpstatic\.com", r"list-manage\.com"], 0.85),
    "Klaviyo": ("email_marketing", [r"klaviyo"], 0.85),
    # Booking / forms
    "Calendly": ("booking", [r"calendly\.com"], 0.92),
    "Fresha": ("booking", [r"fresha\.com"], 0.9),
    "Acuity Scheduling": ("booking", [r"acuityscheduling\.com"], 0.9),
    "Booksy": ("booking", [r"booksy\.com"], 0.9),
    "SimplyBook": ("booking", [r"simplybook\.(me|it)"], 0.88),
    "Typeform": ("forms", [r"typeform\.com"], 0.88),
    "Jotform": ("forms", [r"jotform\.com"], 0.88),
}


def detect_tools(html: str) -> list[dict]:
    """Return a list of detected tools: {name, category, confidence, evidence}.

    Each fingerprint is reported at most once, in ``FINGERPRINTS`` order;
    ``evidence`` names the first of its patterns that matched *html*
    (case-insensitive).
    """
    found = []
    for name, (category, patterns, confidence) in FINGERPRINTS.items():
        for pat in patterns:
            if re.search(pat, html, re.IGNORECASE):
                found.append({
                    "name": name,
                    "category": category,
                    "confidence": confidence,
                    "evidence": f"matched /{pat}/",
                })
                break  # one hit per tool is enough
    return found
@@ -0,0 +1,4 @@
1
+ from .fetcher import fetch_html
2
+ from .robots import RobotsDisallowed, allowed
3
+
4
+ __all__ = ["fetch_html", "RobotsDisallowed", "allowed"]