autonitia-intel 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. autonitia_intel/__init__.py +27 -0
  2. autonitia_intel/__main__.py +3 -0
  3. autonitia_intel/cli.py +56 -0
  4. autonitia_intel/config.py +36 -0
  5. autonitia_intel/detection/__init__.py +4 -0
  6. autonitia_intel/detection/capabilities.py +136 -0
  7. autonitia_intel/detection/fingerprints.py +63 -0
  8. autonitia_intel/fetchers/__init__.py +4 -0
  9. autonitia_intel/fetchers/fetcher.py +146 -0
  10. autonitia_intel/fetchers/robots.py +50 -0
  11. autonitia_intel/graph/__init__.py +5 -0
  12. autonitia_intel/graph/base_graph.py +64 -0
  13. autonitia_intel/graph/base_node.py +13 -0
  14. autonitia_intel/graph/profile_graph.py +148 -0
  15. autonitia_intel/lenses/__init__.py +3 -0
  16. autonitia_intel/lenses/catalog.py +110 -0
  17. autonitia_intel/models.py +136 -0
  18. autonitia_intel/nodes/__init__.py +15 -0
  19. autonitia_intel/nodes/basic_assemble_node.py +76 -0
  20. autonitia_intel/nodes/fact_extraction_node.py +41 -0
  21. autonitia_intel/nodes/fetch_node.py +70 -0
  22. autonitia_intel/nodes/markdownify_node.py +35 -0
  23. autonitia_intel/nodes/positive_detection_node.py +24 -0
  24. autonitia_intel/nodes/repair_extraction_node.py +51 -0
  25. autonitia_intel/signal_packs/industries/real_estate.yaml +23 -0
  26. autonitia_intel/signal_packs/lenses/automation.yaml +57 -0
  27. autonitia_intel/signal_packs/lenses/marketing.yaml +37 -0
  28. autonitia_intel/signal_packs/lenses/sales.yaml +19 -0
  29. autonitia_intel/telemetry/__init__.py +3 -0
  30. autonitia_intel/telemetry/telemetry.py +84 -0
  31. autonitia_intel/usage.py +32 -0
  32. autonitia_intel-0.2.0.dist-info/METADATA +119 -0
  33. autonitia_intel-0.2.0.dist-info/RECORD +37 -0
  34. autonitia_intel-0.2.0.dist-info/WHEEL +5 -0
  35. autonitia_intel-0.2.0.dist-info/entry_points.txt +2 -0
  36. autonitia_intel-0.2.0.dist-info/licenses/LICENSE +21 -0
  37. autonitia_intel-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,27 @@
1
+ """
2
+ autonitia-intel — open-source business-website profile extractor.
3
+
4
+ The FREE engine turns any business website into a clean structured profile
5
+ (company facts, contact details, social presence, detected tools/capabilities)
6
+ plus a *pro_features* count of opportunities. The intelligence layer — verified
7
+ signals, scoring, offer matching, outreach — is Autonitia Intel Pro, which
8
+ imports these same building blocks.
9
+
10
+ Quick start:
11
+
12
+ from autonitia_intel import ProfileGraph
13
+
14
+ graph = ProfileGraph(lens="automation") # bring your own key via env or args
15
+ profile = graph.run("https://example.com")
16
+ print(profile.model_dump_json(indent=2))
17
+
18
+ Bring your own model key:
19
+
20
+ ProfileGraph(api_key="sk-...", model="gpt-4o-mini")
21
+ """
22
+
23
+ from .graph import ProfileGraph
24
+ from .models import CompanyProfile, ProfileResult
25
+
26
+ __version__ = "0.2.0"
27
+ __all__ = ["ProfileGraph", "CompanyProfile", "ProfileResult"]
@@ -0,0 +1,3 @@
1
+ from .cli import main
2
+
3
+ main()
autonitia_intel/cli.py ADDED
@@ -0,0 +1,56 @@
1
+ """
2
+ CLI: autonitia-intel analyse — extract a company profile (+ opportunity pro_features).
3
+
4
+ Example:
5
+ python -m autonitia_intel analyse --target-url https://example.com --lens automation
6
+ """
7
+
8
+ import argparse
9
+ import sys
10
+ from pathlib import Path
11
+
12
+ from .config import OUTPUT_DIR
13
+ from .graph import ProfileGraph
14
+ from .lenses import LENSES
15
+
16
+
17
def main(argv=None):
    """
    Command-line entry point for `autonitia-intel`.

    Parses `argv` (argparse falls back to the process arguments when it is
    None), runs the `analyse` sub-command through ProfileGraph, writes the
    resulting profile as JSON under OUTPUT_DIR and prints it to stdout.
    Any other (or missing) sub-command prints the help text and exits with
    status 1.
    """
    import re  # local import: only needed to sanitise the output file name

    parser = argparse.ArgumentParser(prog="autonitia-intel", description="Business-website profile extractor (free tier).")
    sub = parser.add_subparsers(dest="command")

    a = sub.add_parser("analyse", help="Extract a profile from a business website")
    a.add_argument("--target-url", required=True)
    a.add_argument("--lens", default="automation", choices=LENSES, help="Lens used only for the opportunity pro_features count")
    a.add_argument("--api-key", default=None, help="Bring your own model key (overrides env)")
    a.add_argument("--model", default=None, help="Model id (overrides env)")
    a.add_argument("--no-cache", action="store_true")
    a.add_argument("--no-telemetry", action="store_true")
    a.add_argument("--quiet", action="store_true")

    args = parser.parse_args(argv)
    if args.command != "analyse":
        parser.print_help()
        sys.exit(1)

    graph = ProfileGraph(
        lens=args.lens,
        telemetry=not args.no_telemetry,
        verbose=not args.quiet,
        api_key=args.api_key,
        model=args.model,
    )
    if not args.quiet:
        print(f"Analysing {args.target_url} (lens={args.lens}) ...")
    result = graph.run(args.target_url, use_cache=not args.no_cache)

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    # The company name comes from a scraped website, so treat it as untrusted:
    # neutralise path separators, characters that are illegal in file names on
    # common platforms, control characters and spaces. "/" and " " map to "_"
    # exactly as before.
    raw_name = result.target_company.name or "result"
    name = re.sub(r'[\\/:*?"<>|\x00-\x1f ]', "_", raw_name)
    path = OUTPUT_DIR / f"{name}_profile.json"

    payload = result.model_dump_json(indent=2)
    # Explicit UTF-8: the platform default encoding may be unable to represent
    # non-ASCII characters present in the profile.
    path.write_text(payload, encoding="utf-8")

    print(f"\nSaved: {path}\n")
    print(payload)
53
+
54
+
55
+ if __name__ == "__main__":
56
+ main()
@@ -0,0 +1,36 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ from dotenv import load_dotenv
5
+
6
+ load_dotenv()
7
+
8
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
9
+ MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
10
+
11
+ BROWSER_HEADERS = {
12
+ "User-Agent": (
13
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
14
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
15
+ "Chrome/125.0.0.0 Safari/537.36"
16
+ ),
17
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
18
+ "Accept-Language": "en-US,en;q=0.9",
19
+ }
20
+
21
+ CACHE_DIR = Path(__file__).parent.parent / "output" / ".cache"
22
+ OUTPUT_DIR = Path(__file__).parent.parent / "output"
23
+ REQUEST_TIMEOUT = 15
24
+ MAX_CONTENT_CHARS = 24_000 # overall budget sent to the LLM
25
+ PER_PAGE_CHARS = 6_000 # per-page cap so no single page (e.g. a bloated homepage) starves the rest
26
+ MAX_SUBPAGES = 3
27
+
28
+ # Crawling politeness/resilience
29
+ RESPECT_ROBOTS = os.getenv("AUTONITIA_RESPECT_ROBOTS", "true").lower() != "false"
30
+ FETCH_RETRIES = int(os.getenv("AUTONITIA_FETCH_RETRIES", "2")) # extra attempts on transient errors
31
+ ROBOTS_UA = "autonitia-intel"
32
+
33
+ # Telemetry — see telemetry/telemetry.py. Nothing is sent over the network in v0.1.
34
+ # Level 1 (execution metrics) is opt-OUT. Level 2 (dataset capture) is opt-IN.
35
+ TELEMETRY_ENABLED = os.getenv("AUTONITIA_TELEMETRY", "true").lower() != "false"
36
+ DATASET_CONTRIBUTION = os.getenv("AUTONITIA_DATASET", "false").lower() == "true"
@@ -0,0 +1,4 @@
1
+ from .capabilities import detect_capabilities, extract_contacts
2
+ from .fingerprints import detect_tools
3
+
4
+ __all__ = ["detect_capabilities", "detect_tools", "extract_contacts"]
@@ -0,0 +1,136 @@
1
+ """
2
+ Deterministic capability + digital-presence detection.
3
+
4
+ Inspects raw HTML across all fetched pages to determine observable facts:
5
+ lead-capture methods, social links, SEO basics, tracking. No LLM.
6
+
7
+ These are FACTS (present/absent), which become the evidence base for signals.
8
+ """
9
+
10
+ import re
11
+
12
+ from ..detection.fingerprints import detect_tools
13
+ from ..models import Capabilities, SEO, SocialMedia, Tracking
14
+
15
+ SOCIAL_PATTERNS = {
16
+ "facebook": r"https?://(?:www\.)?facebook\.com/[A-Za-z0-9_.\-/]+",
17
+ "instagram": r"https?://(?:www\.)?instagram\.com/[A-Za-z0-9_.\-/]+",
18
+ "linkedin": r"https?://(?:[a-z]{2}\.)?linkedin\.com/(?:company|in)/[A-Za-z0-9_.\-/]+",
19
+ "tiktok": r"https?://(?:www\.)?tiktok\.com/@[A-Za-z0-9_.\-/]+",
20
+ "youtube": r"https?://(?:www\.)?youtube\.com/[A-Za-z0-9_.\-/@]+",
21
+ "x": r"https?://(?:www\.)?(?:twitter|x)\.com/[A-Za-z0-9_]+",
22
+ }
23
+
24
+ # STRONG patterns = a real third-party tool / explicit URL → trustworthy, the
25
+ # LLM verifier may NOT downgrade these. WEAK patterns = generic text heuristics
26
+ # ("book now") that are easily wrong → the LLM verifier MAY override them.
27
+ BOOKING_STRONG = [r"calendly\.com", r"fresha\.com", r"acuityscheduling\.com", r"booksy\.com",
28
+ r"simplybook\.(me|it)", r"setmore\.com", r"squareup\.com/appointments"]
29
+ BOOKING_WEAK = [r"book\s*now", r"book\s*online", r"schedule\s*(an?\s*)?appointment", r"book\s*a\s*viewing"]
30
+
31
+ LIVE_CHAT_STRONG = [r"intercom", r"driftt?\.com", r"tidio", r"tawk\.to", r"crisp\.chat", r"hs-scripts"]
32
+ LIVE_CHAT_WEAK = [r"livechat", r"chat\s*with\s*us", r"live\s*chat"]
33
+
34
+ WHATSAPP_STRONG = [r"wa\.me/", r"api\.whatsapp\.com", r"whatsapp://", r"web\.whatsapp\.com",
35
+ r"chat\.whatsapp\.com", r"wa\.link/"]
36
+ WHATSAPP_WEAK = [r"click\s*to\s*whatsapp", r'aria-label=["\'][^"\']*whatsapp', r"whatsapp\s*us"]
37
+
38
+ NEWSLETTER_STRONG = [r"chimpstatic", r"klaviyo", r"list-manage\.com"]
39
+ NEWSLETTER_WEAK = [r"newsletter", r"subscribe"]
40
+
41
+ # These have no reliable "strong" structural signal — treat as weak (downgradable).
42
+ PRICING_WEAK = [r"/pricing", r">\s*pricing\s*<", r">\s*plans\s*<", r"per\s*month", r"/mo\b"]
43
+ CASE_STUDY_WEAK = [r"case\s*stud", r"success\s*stor", r"/portfolio", r"testimonial"]
44
+ FORM_WEAK = [r"<form[\s>]"] # a <form> could be search/login, not a contact form → downgradable
45
+
46
+ PHONE_PATTERN = r"tel:\+?[\d\s\-()]{7,}"
47
+ EMAIL_PATTERN = r"mailto:[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+"
48
+
49
+ # Capabilities the LLM verifier is allowed to downgrade when only a WEAK signal fired.
50
+ DOWNGRADABLE = {"has_online_booking", "has_whatsapp", "has_live_chat",
51
+ "has_pricing", "has_case_studies", "has_contact_form", "has_newsletter"}
52
+
53
+
54
+ def _any(patterns: list[str], html: str) -> bool:
55
+ return any(re.search(p, html, re.IGNORECASE) for p in patterns)
56
+
57
+
58
_EMAIL_RE = re.compile(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}")
_TEL_RE = re.compile(r'tel:(\+?[\d\s\-()]{7,})', re.IGNORECASE)
_WA_RE = re.compile(r'(https?://(?:wa\.me|wa\.link|api\.whatsapp\.com|web\.whatsapp\.com|chat\.whatsapp\.com)/[^\s"\'<>]+)', re.IGNORECASE)


def extract_contacts(html: str) -> dict:
    """
    Pull contact details out of raw HTML with regexes only (no LLM).

    Returns a dict with:
      - "emails":   sorted unique address-looking matches, minus image-file names
      - "phones":   sorted unique `tel:` targets with whitespace runs collapsed
      - "whatsapp": the first WhatsApp link found, or "" when there is none
    """
    image_suffixes = (".png", ".jpg", ".gif", ".webp", ".svg")

    unique_emails = set()
    for hit in _EMAIL_RE.finditer(html):
        candidate = hit.group(0)
        # Drop matches ending in an image extension (presumably asset names
        # such as "logo@2x.png" that merely look like addresses).
        if candidate.lower().endswith(image_suffixes):
            continue
        unique_emails.add(candidate)

    unique_phones = set()
    for hit in _TEL_RE.finditer(html):
        unique_phones.add(re.sub(r"\s+", " ", hit.group(1)).strip())

    first_wa = _WA_RE.search(html)
    return {
        "emails": sorted(unique_emails),
        "phones": sorted(unique_phones),
        "whatsapp": first_wa.group(1) if first_wa else "",
    }
73
+
74
+
75
def detect_capabilities(combined_html: str):
    """
    Returns (capabilities, social, seo, tracking, tools, strongly_detected).

    `combined_html` is scanned with the module's regex tables only (no LLM).

    `strongly_detected` is the set of capability names backed by a STRONG
    structural signal (a real tool/URL). The LLM verifier may only downgrade
    capabilities NOT in this set.
    """
    booking_strong = _any(BOOKING_STRONG, combined_html)
    chat_strong = _any(LIVE_CHAT_STRONG, combined_html)
    wa_strong = _any(WHATSAPP_STRONG, combined_html)
    news_strong = _any(NEWSLETTER_STRONG, combined_html)

    caps = Capabilities(
        has_phone=bool(re.search(PHONE_PATTERN, combined_html, re.IGNORECASE)),
        has_email=bool(re.search(EMAIL_PATTERN, combined_html, re.IGNORECASE)),
        has_contact_form=_any(FORM_WEAK, combined_html),
        has_whatsapp=wa_strong or _any(WHATSAPP_WEAK, combined_html),
        has_online_booking=booking_strong or _any(BOOKING_WEAK, combined_html),
        has_live_chat=chat_strong or _any(LIVE_CHAT_WEAK, combined_html),
        has_pricing=_any(PRICING_WEAK, combined_html),
        has_case_studies=_any(CASE_STUDY_WEAK, combined_html),
        has_newsletter=news_strong or _any(NEWSLETTER_WEAK, combined_html),
    )

    strongly_detected = set()
    if booking_strong:
        strongly_detected.add("has_online_booking")
    if chat_strong:
        strongly_detected.add("has_live_chat")
    if wa_strong:
        strongly_detected.add("has_whatsapp")
    if news_strong:
        strongly_detected.add("has_newsletter")

    social = SocialMedia()
    for field, pattern in SOCIAL_PATTERNS.items():
        # Walk every match, not just the first: a share/intent link that
        # appears earlier in the page must not hide a real profile link
        # that follows it.
        for m in re.finditer(pattern, combined_html, re.IGNORECASE):
            url = m.group(0)
            # Skip share/intent links — keep only profile-looking URLs
            if "sharer" not in url and "intent" not in url and "/share" not in url:
                setattr(social, field, url)
                break
    caps.has_social_links = any(getattr(social, f) for f in SOCIAL_PATTERNS)

    seo = SEO(
        title_tag_present=bool(re.search(r"<title[\s>]", combined_html, re.IGNORECASE)),
        meta_description_present=bool(re.search(r'<meta[^>]+name=["\']description["\']', combined_html, re.IGNORECASE)),
    )

    # Tracking flags are derived from the tool fingerprints by display name.
    tools = detect_tools(combined_html)
    tool_names = {t["name"] for t in tools}
    tracking = Tracking(
        google_analytics="Google Analytics" in tool_names,
        google_tag_manager="Google Tag Manager" in tool_names,
        meta_pixel="Meta Pixel" in tool_names,
        tiktok_pixel="TikTok Pixel" in tool_names,
        linkedin_pixel="LinkedIn Insight" in tool_names,
        hotjar="Hotjar" in tool_names,
    )

    return caps, social, seo, tracking, tools, strongly_detected
@@ -0,0 +1,63 @@
1
+ """
2
+ Deterministic technology detection.
3
+
4
+ A pragmatic subset of Wappalyzer-style fingerprints: each entry matches a
5
+ substring/regex in the raw HTML. NO LLM involved — this is fact, not inference,
6
+ which is why it's the most defensible signal in the product.
7
+
8
+ For production, swap this dict for the full Wappalyzer fingerprint database
9
+ (https://github.com/enthec/webappanalyzer) — same matching approach, ~3000 apps.
10
+ """
11
+
12
+ import re
13
+
14
+ # name -> (category, [patterns], confidence)
15
FINGERPRINTS: dict[str, tuple[str, list[str], float]] = {
    # CMS / site builders
    "WordPress": ("cms", [r"wp-content", r"wp-includes"], 0.92),
    "Shopify": ("ecommerce", [r"cdn\.shopify\.com", r"Shopify\.theme"], 0.95),
    "Wix": ("cms", [r"static\.wixstatic\.com", r"_wixCssImports"], 0.9),
    "Webflow": ("cms", [r"assets\.website-files\.com", r"webflow\.js", r"wf-"], 0.88),
    "Squarespace": ("cms", [r"squarespace", r"static1\.squarespace\.com"], 0.9),
    "WooCommerce": ("ecommerce", [r"woocommerce", r"wc-ajax"], 0.85),
    # Analytics / tracking
    "Google Analytics": ("analytics", [r"google-analytics\.com", r"gtag\(", r"ga\('create'"], 0.9),
    "Google Tag Manager": ("analytics", [r"googletagmanager\.com"], 0.92),
    "Meta Pixel": ("marketing_tracking", [r"fbq\(", r"connect\.facebook\.net/[a-z_]+/fbevents\.js"], 0.93),
    "TikTok Pixel": ("marketing_tracking", [r"analytics\.tiktok\.com"], 0.9),
    "LinkedIn Insight": ("marketing_tracking", [r"snap\.licdn\.com"], 0.9),
    "Hotjar": ("analytics", [r"static\.hotjar\.com", r"hotjar"], 0.85),
    # CRM / marketing / chat
    "HubSpot": ("crm", [r"js\.hs-scripts\.com", r"hs-scripts"], 0.9),
    "Intercom": ("live_chat", [r"widget\.intercom\.io", r"intercomSettings"], 0.9),
    "Drift": ("live_chat", [r"js\.driftt\.com", r"drift\.com"], 0.88),
    "Tidio": ("live_chat", [r"code\.tidio\.co"], 0.9),
    "Tawk.to": ("live_chat", [r"embed\.tawk\.to"], 0.9),
    "Crisp": ("live_chat", [r"client\.crisp\.chat"], 0.9),
    "Mailchimp": ("email_marketing", [r"chimpstatic\.com", r"list-manage\.com"], 0.85),
    "Klaviyo": ("email_marketing", [r"klaviyo"], 0.85),
    # Booking / forms
    "Calendly": ("booking", [r"calendly\.com"], 0.92),
    "Fresha": ("booking", [r"fresha\.com"], 0.9),
    "Acuity Scheduling": ("booking", [r"acuityscheduling\.com"], 0.9),
    "Booksy": ("booking", [r"booksy\.com"], 0.9),
    "SimplyBook": ("booking", [r"simplybook\.(me|it)"], 0.88),
    "Typeform": ("forms", [r"typeform\.com"], 0.88),
    "Jotform": ("forms", [r"jotform\.com"], 0.88),
}


def detect_tools(html: str) -> list[dict]:
    """
    Match `html` against every FINGERPRINTS entry, case-insensitively.

    Each tool is reported at most once, in table order, as a dict with keys
    name, category, confidence and evidence; the evidence names the first
    pattern of that entry that matched.
    """
    detected: list[dict] = []
    for tool, (kind, regexes, score) in FINGERPRINTS.items():
        first_hit = next(
            (rx for rx in regexes if re.search(rx, html, re.IGNORECASE)),
            None,
        )
        if first_hit is None:
            continue
        detected.append({
            "name": tool,
            "category": kind,
            "confidence": score,
            "evidence": f"matched /{first_hit}/",
        })
    return detected
@@ -0,0 +1,4 @@
1
+ from .fetcher import fetch_html
2
+ from .robots import RobotsDisallowed, allowed
3
+
4
+ __all__ = ["fetch_html", "RobotsDisallowed", "allowed"]
@@ -0,0 +1,146 @@
1
+ """
2
+ 3-tier fetch: requests → cloudscraper → Playwright. Auto-escalation.
3
+ Returns raw HTML. Caching is keyed by URL with a 24h TTL.
4
+ """
5
+
6
+ import hashlib
7
+ import json
8
+ import time
9
+ from pathlib import Path
10
+
11
+ import requests
12
+
13
+ from ..config import BROWSER_HEADERS, CACHE_DIR, FETCH_RETRIES, REQUEST_TIMEOUT
14
+ from .robots import RobotsDisallowed, allowed
15
+
16
+ TTL_SECONDS = 86400
17
+ _RETRYABLE_STATUS = {429, 500, 502, 503, 504}
18
+
19
+
20
def _cache_path(url: str) -> Path:
    """Map a URL to its cache file: first 16 hex chars of its SHA-256 digest plus `.json`, under CACHE_DIR."""
    digest = hashlib.sha256(url.encode()).hexdigest()
    short_key = digest[:16]
    return CACHE_DIR / (short_key + ".json")
22
+
23
+
24
def _cache_get(url: str) -> str | None:
    """
    Return the cached HTML for `url`, or None on a cache miss.

    A miss is any of: no cache file, an entry older than TTL_SECONDS, or an
    entry that cannot be read back (truncated/corrupt JSON, missing keys,
    wrong value types). Stale and unreadable entries are deleted so the next
    fetch can replace them instead of failing on them again.
    """
    p = _cache_path(url)
    try:
        data = json.loads(p.read_text())
        ts, html = data["ts"], data["html"]
        fresh = time.time() - ts <= TTL_SECONDS
    except FileNotFoundError:
        return None
    except (OSError, ValueError, KeyError, TypeError):
        # Unreadable or malformed entry (json.JSONDecodeError is a ValueError):
        # treat it as a miss rather than breaking the whole fetch.
        fresh, html = False, None

    if fresh and isinstance(html, str):
        return html

    try:
        p.unlink(missing_ok=True)
    except OSError:
        # Best effort: failing to remove a bad entry must not block a re-fetch.
        pass
    return None
33
+
34
+
35
def _cache_put(url: str, html: str) -> None:
    """Store `html` for `url` as a JSON record (timestamp, url, html), creating CACHE_DIR if needed."""
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    record = {"ts": time.time(), "url": url, "html": html}
    target = _cache_path(url)
    target.write_text(json.dumps(record))
38
+
39
+
40
+ def _is_blocked(html: str, status: int) -> bool:
41
+ if status in (403, 503):
42
+ return True
43
+ snippet = html[:3000].lower()
44
+ return any(s in snippet for s in [
45
+ "just a moment", "captcha", "enable javascript",
46
+ "challenge-platform", "cf-browser-verification", "attention required",
47
+ ])
48
+
49
+
50
def _is_thin(html: str, min_chars: int = 600) -> bool:
    """
    True when the page has fewer than `min_chars` characters of text once
    script, style and noscript elements are removed.

    Pages rendered by JavaScript often hand plain `requests` a near-empty
    shell; callers use this to escalate to a real browser so client-side
    content (addresses, WhatsApp links, etc.) is not missed.
    """
    from bs4 import BeautifulSoup

    document = BeautifulSoup(html, "lxml")
    for element in document.find_all(["script", "style", "noscript"]):
        element.decompose()
    remaining_text = document.get_text(strip=True)
    return len(remaining_text) < min_chars
61
+
62
+
63
def _insufficient(html: str, status: int) -> bool:
    """True when the response is blocked or too thin; the thin check only runs if it is not blocked."""
    if _is_blocked(html, status):
        return True
    return _is_thin(html)
65
+
66
+
67
def _via_cloudscraper(url: str) -> str:
    """Fetch `url` through a cloudscraper session and return the body text; HTTP error statuses raise."""
    import cloudscraper

    browser_profile = {"browser": "chrome", "platform": "darwin", "mobile": False}
    client = cloudscraper.create_scraper(browser=browser_profile)
    response = client.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text
73
+
74
+
75
def _via_playwright(url: str) -> str:
    """
    Render `url` in headless Chromium and return the resulting page HTML.

    Navigation waits for DOMContentLoaded (up to 60 s), then waits up to 12 s
    for the network to go idle and a further fixed 4 s before the DOM is
    serialised. Errors from launch or navigation propagate to the caller;
    the browser is closed on every path.
    """
    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            ctx = browser.new_context(user_agent=BROWSER_HEADERS["User-Agent"], locale="en-US",
                                      viewport={"width": 1920, "height": 1080})
            page = ctx.new_page()
            page.goto(url, wait_until="domcontentloaded", timeout=60_000)
            try:
                page.wait_for_load_state("networkidle", timeout=12_000)
            except Exception:
                # Best effort: if the network never goes idle, carry on with
                # whatever has rendered so far.
                pass
            page.wait_for_timeout(4000)
            return page.content()
        finally:
            # Close explicitly even when navigation raises, instead of
            # relying on the enclosing context manager's teardown.
            browser.close()
91
+
92
+
93
def _tier1_with_retry(url: str) -> str | None:
    """
    Plain `requests` fetch with exponential backoff on transient failures.

    Retries (up to FETCH_RETRIES extra attempts) only when the request itself
    fails or the status is in _RETRYABLE_STATUS. Any other error status, or a
    blocked/thin body, is final for this tier.

    Returns the response text on success, or None to tell the caller to
    escalate to the next fetch tier.
    """
    # The context manager releases the session's pooled connections.
    with requests.Session() as session:
        session.headers.update(BROWSER_HEADERS)
        for attempt in range(FETCH_RETRIES + 1):
            retries_left = attempt < FETCH_RETRIES
            backoff = 0.5 * (2 ** attempt)  # 0.5s, 1s, 2s …
            try:
                resp = session.get(url, timeout=REQUEST_TIMEOUT, allow_redirects=True)
            except requests.RequestException:
                if retries_left:
                    time.sleep(backoff)
                    continue
                return None

            if resp.status_code in _RETRYABLE_STATUS and retries_left:
                time.sleep(backoff)
                continue

            # A non-transient HTTP error (e.g. 404) will not improve on retry,
            # so escalate immediately instead of sleeping and re-requesting.
            if not resp.ok:
                return None
            if _insufficient(resp.text, resp.status_code):
                return None  # blocked/thin → escalate to cloudscraper/Playwright
            return resp.text
    return None
113
+
114
+
115
def fetch_html(url: str, use_cache: bool = True) -> str:
    """
    Fetch raw HTML for `url`, escalating requests → cloudscraper → Playwright.

    With `use_cache` on, a non-empty cached copy is returned without any
    network access and every successful fetch is written back to the cache.
    Raises RobotsDisallowed when robots.txt forbids the URL; errors from the
    final Playwright tier propagate.
    """
    def _remember(page: str) -> str:
        # Persist the page when caching is enabled, then hand it back.
        if use_cache:
            _cache_put(url, page)
        return page

    if use_cache:
        hit = _cache_get(url)
        if hit:
            return hit

    # Politeness: respect robots.txt before any network fetch.
    if not allowed(url):
        raise RobotsDisallowed(f"robots.txt disallows fetching {url}")

    # Tier 1: requests with retry/backoff (escalate on bot-block OR thin JS shell)
    first_try = _tier1_with_retry(url)
    if first_try is not None:
        return _remember(first_try)

    # Tier 2: cloudscraper — any failure here simply falls through to tier 3.
    try:
        second_try = _via_cloudscraper(url)
        if not _insufficient(second_try, 200):
            return _remember(second_try)
    except Exception:
        pass

    # Tier 3: Playwright (full JS render)
    return _remember(_via_playwright(url))
@@ -0,0 +1,50 @@
1
+ """
2
+ robots.txt awareness — polite, opt-out crawling.
3
+
4
+ Before fetching, we check the target's robots.txt for our agent. Results are
5
+ cached per-domain. If robots.txt is missing/unreachable we default to ALLOW
6
+ (standard behaviour). Disabled with AUTONITIA_RESPECT_ROBOTS=false.
7
+ """
8
+
9
+ from urllib.parse import urlparse
10
+ from urllib.robotparser import RobotFileParser
11
+
12
+ import requests
13
+
14
+ from ..config import BROWSER_HEADERS, RESPECT_ROBOTS, ROBOTS_UA
15
+
16
+ _CACHE: dict[str, RobotFileParser | None] = {}
17
+
18
+
19
class RobotsDisallowed(Exception):
    """Signals that the target site's robots.txt forbids fetching the requested URL."""
21
+
22
+
23
def _parser_for(domain_root: str) -> RobotFileParser | None:
    """
    Return the robots.txt parser for `domain_root`, fetching it at most once.

    The result is memoised in _CACHE. None means "no usable robots.txt": the
    request failed or the server answered with a status of 400 or above.
    """
    try:
        return _CACHE[domain_root]
    except KeyError:
        pass

    try:
        reply = requests.get(f"{domain_root}/robots.txt", headers=BROWSER_HEADERS, timeout=8)
    except requests.RequestException:
        reply = None

    parser = None
    if reply is not None and reply.status_code < 400:
        parser = RobotFileParser()
        parser.parse(reply.text.splitlines())
    # else: no robots.txt → allow all

    _CACHE[domain_root] = parser
    return parser
37
+
38
+
39
def allowed(url: str) -> bool:
    """
    Tell whether `url` may be fetched according to robots.txt.

    Always True when RESPECT_ROBOTS is off, when the URL has no scheme, or
    when no robots.txt parser is available for the site.
    """
    if not RESPECT_ROBOTS:
        return True

    parts = urlparse(url)
    if parts.scheme:
        rules = _parser_for(f"{parts.scheme}://{parts.netloc}")
        if rules is not None:
            return rules.can_fetch(ROBOTS_UA, url)
    return True
@@ -0,0 +1,5 @@
1
+ from .base_graph import END, BaseGraph
2
+ from .base_node import BaseNode
3
+ from .profile_graph import ProfileGraph
4
+
5
+ __all__ = ["BaseGraph", "BaseNode", "END", "ProfileGraph"]
@@ -0,0 +1,64 @@
1
+ """
2
+ Graph executor with conditional edges.
3
+
4
+ A graph is: nodes (keyed by name) + an `edges` map + an `entry` node. Each edge
5
+ value is one of:
6
+ - a node name (str) → always go there next
7
+ - None (END) → stop
8
+ - a callable(state) -> str|None → CONDITIONAL: decide the next node at runtime
9
+
10
+ This makes the pipeline a real graph: branches (repair vs continue), short-circuits
11
+ (skip the LLM when there's nothing to do), and bounded loops. Still mirrors
12
+ LangGraph's contract so it can be swapped later. `max_steps` guards against loops.
13
+ """
14
+
15
+ import time
16
+
17
+ from .base_node import BaseNode
18
+
19
+ END = None
20
+
21
+
22
class BaseGraph:
    """
    Runs nodes in sequence, following `edges` from the `entry` node.

    An edge value is a node name, None (END), or a callable that receives the
    current state and returns the next node name or None.
    """

    def __init__(self, nodes: list[BaseNode], edges: dict, entry: str,
                 verbose: bool = True, max_steps: int = 25):
        """
        nodes:     node objects; indexed by their `name` attribute.
        edges:     map of node name → next node (str), END, or callable(state).
        entry:     name of the first node to run.
        verbose:   print a progress line per node when True.
        max_steps: upper bound on node executions before giving up.
        """
        self.nodes = {n.name: n for n in nodes}
        self.edges = edges
        self.entry = entry
        self.verbose = verbose
        self.max_steps = max_steps

    def execute(self, state: dict) -> dict:
        """
        Run the graph on `state` and return the final state.

        The returned state carries `_trace`: one entry per executed node with
        its status and duration in milliseconds. When a node raises, `_trace`
        and `_error` are stored on the state passed into that node and the
        exception is re-raised.

        Raises RuntimeError when `max_steps` is exceeded and KeyError when an
        edge (or the entry) names a node that was not registered.
        """
        trace = []
        current = self.entry
        steps = 0

        while current is not None:
            if steps >= self.max_steps:
                raise RuntimeError(f"Graph exceeded max_steps={self.max_steps} (possible loop)")
            if current not in self.nodes:
                raise KeyError(f"Edge points to unknown node '{current}'")

            node = self.nodes[current]
            # perf_counter is monotonic, so durations cannot go negative or
            # jump when the system clock is adjusted mid-run.
            start = time.perf_counter()
            try:
                state = node.execute(state)
            except Exception as e:
                elapsed = int((time.perf_counter() - start) * 1000)
                trace.append({"node": current, "status": "error", "ms": elapsed, "error": str(e)})
                if self.verbose:
                    print(f" ✗ {current} failed: {e}")
                state["_trace"] = trace
                state["_error"] = {"node": current, "error": str(e)}
                raise
            elapsed = int((time.perf_counter() - start) * 1000)
            trace.append({"node": current, "status": "success", "ms": elapsed})
            if self.verbose:
                print(f" ✓ {current} ({elapsed} ms)")

            # A node with no outgoing edge ends the run.
            nxt = self.edges.get(current, END)
            current = nxt(state) if callable(nxt) else nxt
            steps += 1

        state["_trace"] = trace
        return state
@@ -0,0 +1,13 @@
1
+ """
2
+ Minimal node abstraction. Each node has a single responsibility and transforms
3
+ a shared `state` dict. This intentionally mirrors LangGraph's node contract so
4
+ the engine can be swapped for LangGraph later without rewriting nodes.
5
+ """
6
+
7
+
8
class BaseNode:
    """Base class for graph nodes: carries a `name` and an `execute(state)` hook for subclasses."""

    def __init__(self, name: str | None = None):
        # Fall back to the concrete class name when no (or an empty) name is given.
        self.name = name if name else type(self).__name__

    def execute(self, state: dict) -> dict:  # pragma: no cover - interface
        """Transform and return `state`; subclasses must override this."""
        raise NotImplementedError