autonitia-intel 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autonitia_intel/__init__.py +27 -0
- autonitia_intel/__main__.py +3 -0
- autonitia_intel/cli.py +56 -0
- autonitia_intel/config.py +36 -0
- autonitia_intel/detection/__init__.py +4 -0
- autonitia_intel/detection/capabilities.py +136 -0
- autonitia_intel/detection/fingerprints.py +63 -0
- autonitia_intel/fetchers/__init__.py +4 -0
- autonitia_intel/fetchers/fetcher.py +146 -0
- autonitia_intel/fetchers/robots.py +50 -0
- autonitia_intel/graph/__init__.py +5 -0
- autonitia_intel/graph/base_graph.py +64 -0
- autonitia_intel/graph/base_node.py +13 -0
- autonitia_intel/graph/profile_graph.py +148 -0
- autonitia_intel/lenses/__init__.py +3 -0
- autonitia_intel/lenses/catalog.py +110 -0
- autonitia_intel/models.py +136 -0
- autonitia_intel/nodes/__init__.py +15 -0
- autonitia_intel/nodes/basic_assemble_node.py +76 -0
- autonitia_intel/nodes/fact_extraction_node.py +41 -0
- autonitia_intel/nodes/fetch_node.py +70 -0
- autonitia_intel/nodes/markdownify_node.py +35 -0
- autonitia_intel/nodes/positive_detection_node.py +24 -0
- autonitia_intel/nodes/repair_extraction_node.py +51 -0
- autonitia_intel/signal_packs/industries/real_estate.yaml +23 -0
- autonitia_intel/signal_packs/lenses/automation.yaml +57 -0
- autonitia_intel/signal_packs/lenses/marketing.yaml +37 -0
- autonitia_intel/signal_packs/lenses/sales.yaml +19 -0
- autonitia_intel/telemetry/__init__.py +3 -0
- autonitia_intel/telemetry/telemetry.py +84 -0
- autonitia_intel/usage.py +32 -0
- autonitia_intel-0.2.0.dist-info/METADATA +119 -0
- autonitia_intel-0.2.0.dist-info/RECORD +37 -0
- autonitia_intel-0.2.0.dist-info/WHEEL +5 -0
- autonitia_intel-0.2.0.dist-info/entry_points.txt +2 -0
- autonitia_intel-0.2.0.dist-info/licenses/LICENSE +21 -0
- autonitia_intel-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""
|
|
2
|
+
autonitia-intel — open-source business-website profile extractor.
|
|
3
|
+
|
|
4
|
+
The FREE engine turns any business website into a clean structured profile
|
|
5
|
+
(company facts, contact details, social presence, detected tools/capabilities)
|
|
6
|
+
plus a *pro_features* count of opportunities. The intelligence layer — verified
|
|
7
|
+
signals, scoring, offer matching, outreach — is Autonitia Intel Pro, which
|
|
8
|
+
imports these same building blocks.
|
|
9
|
+
|
|
10
|
+
Quick start:
|
|
11
|
+
|
|
12
|
+
from autonitia_intel import ProfileGraph
|
|
13
|
+
|
|
14
|
+
graph = ProfileGraph(lens="automation") # bring your own key via env or args
|
|
15
|
+
profile = graph.run("https://example.com")
|
|
16
|
+
print(profile.model_dump_json(indent=2))
|
|
17
|
+
|
|
18
|
+
Bring your own model key:
|
|
19
|
+
|
|
20
|
+
ProfileGraph(api_key="sk-...", model="gpt-4o-mini")
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from .graph import ProfileGraph
|
|
24
|
+
from .models import CompanyProfile, ProfileResult
|
|
25
|
+
|
|
26
|
+
__version__ = "0.2.0"
|
|
27
|
+
__all__ = ["ProfileGraph", "CompanyProfile", "ProfileResult"]
|
autonitia_intel/cli.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI: autonitia-intel analyse — extract a company profile (+ opportunity pro_features).
|
|
3
|
+
|
|
4
|
+
Example:
|
|
5
|
+
python -m autonitia_intel analyse --target-url https://example.com --lens automation
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import argparse
|
|
9
|
+
import sys
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from .config import OUTPUT_DIR
|
|
13
|
+
from .graph import ProfileGraph
|
|
14
|
+
from .lenses import LENSES
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def main(argv=None):
|
|
18
|
+
parser = argparse.ArgumentParser(prog="autonitia-intel", description="Business-website profile extractor (free tier).")
|
|
19
|
+
sub = parser.add_subparsers(dest="command")
|
|
20
|
+
|
|
21
|
+
a = sub.add_parser("analyse", help="Extract a profile from a business website")
|
|
22
|
+
a.add_argument("--target-url", required=True)
|
|
23
|
+
a.add_argument("--lens", default="automation", choices=LENSES, help="Lens used only for the opportunity pro_features count")
|
|
24
|
+
a.add_argument("--api-key", default=None, help="Bring your own model key (overrides env)")
|
|
25
|
+
a.add_argument("--model", default=None, help="Model id (overrides env)")
|
|
26
|
+
a.add_argument("--no-cache", action="store_true")
|
|
27
|
+
a.add_argument("--no-telemetry", action="store_true")
|
|
28
|
+
a.add_argument("--quiet", action="store_true")
|
|
29
|
+
|
|
30
|
+
args = parser.parse_args(argv)
|
|
31
|
+
if args.command != "analyse":
|
|
32
|
+
parser.print_help()
|
|
33
|
+
sys.exit(1)
|
|
34
|
+
|
|
35
|
+
graph = ProfileGraph(
|
|
36
|
+
lens=args.lens,
|
|
37
|
+
telemetry=not args.no_telemetry,
|
|
38
|
+
verbose=not args.quiet,
|
|
39
|
+
api_key=args.api_key,
|
|
40
|
+
model=args.model,
|
|
41
|
+
)
|
|
42
|
+
if not args.quiet:
|
|
43
|
+
print(f"Analysing {args.target_url} (lens={args.lens}) ...")
|
|
44
|
+
result = graph.run(args.target_url, use_cache=not args.no_cache)
|
|
45
|
+
|
|
46
|
+
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
|
47
|
+
name = (result.target_company.name or "result").replace("/", "_").replace(" ", "_")
|
|
48
|
+
path = OUTPUT_DIR / f"{name}_profile.json"
|
|
49
|
+
path.write_text(result.model_dump_json(indent=2))
|
|
50
|
+
|
|
51
|
+
print(f"\nSaved: {path}\n")
|
|
52
|
+
print(result.model_dump_json(indent=2))
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
if __name__ == "__main__":
|
|
56
|
+
main()
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from dotenv import load_dotenv
|
|
5
|
+
|
|
6
|
+
load_dotenv()
|
|
7
|
+
|
|
8
|
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
|
9
|
+
MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
|
|
10
|
+
|
|
11
|
+
BROWSER_HEADERS = {
|
|
12
|
+
"User-Agent": (
|
|
13
|
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
|
14
|
+
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
|
15
|
+
"Chrome/125.0.0.0 Safari/537.36"
|
|
16
|
+
),
|
|
17
|
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
18
|
+
"Accept-Language": "en-US,en;q=0.9",
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
CACHE_DIR = Path(__file__).parent.parent / "output" / ".cache"
|
|
22
|
+
OUTPUT_DIR = Path(__file__).parent.parent / "output"
|
|
23
|
+
REQUEST_TIMEOUT = 15
|
|
24
|
+
MAX_CONTENT_CHARS = 24_000 # overall budget sent to the LLM
|
|
25
|
+
PER_PAGE_CHARS = 6_000 # per-page cap so no single page (e.g. a bloated homepage) starves the rest
|
|
26
|
+
MAX_SUBPAGES = 3
|
|
27
|
+
|
|
28
|
+
# Crawling politeness/resilience
|
|
29
|
+
RESPECT_ROBOTS = os.getenv("AUTONITIA_RESPECT_ROBOTS", "true").lower() != "false"
|
|
30
|
+
FETCH_RETRIES = int(os.getenv("AUTONITIA_FETCH_RETRIES", "2")) # extra attempts on transient errors
|
|
31
|
+
ROBOTS_UA = "autonitia-intel"
|
|
32
|
+
|
|
33
|
+
# Telemetry — see telemetry/telemetry.py. Nothing is sent over the network in v0.1.
|
|
34
|
+
# Level 1 (execution metrics) is opt-OUT. Level 2 (dataset capture) is opt-IN.
|
|
35
|
+
TELEMETRY_ENABLED = os.getenv("AUTONITIA_TELEMETRY", "true").lower() != "false"
|
|
36
|
+
DATASET_CONTRIBUTION = os.getenv("AUTONITIA_DATASET", "false").lower() == "true"
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Deterministic capability + digital-presence detection.
|
|
3
|
+
|
|
4
|
+
Inspects raw HTML across all fetched pages to determine observable facts:
|
|
5
|
+
lead-capture methods, social links, SEO basics, tracking. No LLM.
|
|
6
|
+
|
|
7
|
+
These are FACTS (present/absent), which become the evidence base for signals.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
|
|
12
|
+
from ..detection.fingerprints import detect_tools
|
|
13
|
+
from ..models import Capabilities, SEO, SocialMedia, Tracking
|
|
14
|
+
|
|
15
|
+
# One profile-URL regex per social network; keys are the SocialMedia field names.
SOCIAL_PATTERNS = {
    "facebook": r"https?://(?:www\.)?facebook\.com/[A-Za-z0-9_.\-/]+",
    "instagram": r"https?://(?:www\.)?instagram\.com/[A-Za-z0-9_.\-/]+",
    # Accept the bare host, "www." and two-letter country subdomains ("uk.").
    "linkedin": r"https?://(?:www\.|[a-z]{2}\.)?linkedin\.com/(?:company|in)/[A-Za-z0-9_.\-/]+",
    "tiktok": r"https?://(?:www\.)?tiktok\.com/@[A-Za-z0-9_.\-/]+",
    "youtube": r"https?://(?:www\.)?youtube\.com/[A-Za-z0-9_.\-/@]+",
    "x": r"https?://(?:www\.)?(?:twitter|x)\.com/[A-Za-z0-9_]+",
}

# STRONG patterns = a real third-party tool / explicit URL → trustworthy, the
# LLM verifier may NOT downgrade these. WEAK patterns = generic text heuristics
# ("book now") that are easily wrong → the LLM verifier MAY override them.
BOOKING_STRONG = [r"calendly\.com", r"fresha\.com", r"acuityscheduling\.com", r"booksy\.com",
                  r"simplybook\.(me|it)", r"setmore\.com", r"squareup\.com/appointments"]
BOOKING_WEAK = [r"book\s*now", r"book\s*online", r"schedule\s*(an?\s*)?appointment", r"book\s*a\s*viewing"]

LIVE_CHAT_STRONG = [r"intercom", r"driftt?\.com", r"tidio", r"tawk\.to", r"crisp\.chat", r"hs-scripts"]
LIVE_CHAT_WEAK = [r"livechat", r"chat\s*with\s*us", r"live\s*chat"]

WHATSAPP_STRONG = [r"wa\.me/", r"api\.whatsapp\.com", r"whatsapp://", r"web\.whatsapp\.com",
                   r"chat\.whatsapp\.com", r"wa\.link/"]
WHATSAPP_WEAK = [r"click\s*to\s*whatsapp", r'aria-label=["\'][^"\']*whatsapp', r"whatsapp\s*us"]

NEWSLETTER_STRONG = [r"chimpstatic", r"klaviyo", r"list-manage\.com"]
NEWSLETTER_WEAK = [r"newsletter", r"subscribe"]

# These have no reliable "strong" structural signal — treat as weak (downgradable).
PRICING_WEAK = [r"/pricing", r">\s*pricing\s*<", r">\s*plans\s*<", r"per\s*month", r"/mo\b"]
CASE_STUDY_WEAK = [r"case\s*stud", r"success\s*stor", r"/portfolio", r"testimonial"]
FORM_WEAK = [r"<form[\s>]"]  # a <form> could be search/login, not a contact form → downgradable

# Presence checks for explicit tel:/mailto: links.
PHONE_PATTERN = r"tel:\+?[\d\s\-()]{7,}"
EMAIL_PATTERN = r"mailto:[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+"

# Capabilities the LLM verifier is allowed to downgrade when only a WEAK signal fired.
DOWNGRADABLE = {"has_online_booking", "has_whatsapp", "has_live_chat",
                "has_pricing", "has_case_studies", "has_contact_form", "has_newsletter"}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _any(patterns: list[str], html: str) -> bool:
|
|
55
|
+
return any(re.search(p, html, re.IGNORECASE) for p in patterns)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
_EMAIL_RE = re.compile(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}")
_TEL_RE = re.compile(r'tel:(\+?[\d\s\-()]{7,})', re.IGNORECASE)
_WA_RE = re.compile(r'(https?://(?:wa\.me|wa\.link|api\.whatsapp\.com|web\.whatsapp\.com|chat\.whatsapp\.com)/[^\s"\'<>]+)', re.IGNORECASE)

# Asset filenames such as "logo@2x.png" look like addresses to _EMAIL_RE;
# anything ending in one of these extensions is a file, not a mailbox.
_ASSET_SUFFIXES = (".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg",
                   ".avif", ".bmp", ".ico", ".css", ".js")


def extract_contacts(html: str) -> dict:
    """Deterministic contact extraction — a backstop/merge for the LLM output.

    Args:
        html: Raw HTML (one page or several concatenated).

    Returns:
        A dict with three keys:
        ``"emails"``   – sorted, de-duplicated address-looking strings, with
                         asset filenames filtered out;
        ``"phones"``   – sorted, de-duplicated numbers taken from ``tel:``
                         links, inner whitespace collapsed to single spaces;
        ``"whatsapp"`` – the first WhatsApp link found, or ``""``.
    """
    emails = sorted({
        address
        for address in _EMAIL_RE.findall(html)
        if not address.lower().endswith(_ASSET_SUFFIXES)
    })

    phone_set = set()
    for raw_number in _TEL_RE.findall(html):  # findall yields capture group 1
        number = re.sub(r"\s+", " ", raw_number).strip()
        # The character class also admits whitespace/brackets/dashes, so a
        # malformed "tel:" link can match without a single digit — skip those.
        if any(ch.isdigit() for ch in number):
            phone_set.add(number)
    phones = sorted(phone_set)

    wa_match = _WA_RE.search(html)
    wa = wa_match.group(1) if wa_match else ""

    return {"emails": emails, "phones": phones, "whatsapp": wa}
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def detect_capabilities(combined_html: str):
|
|
76
|
+
"""
|
|
77
|
+
Returns (capabilities, social, seo, tracking, tools, strongly_detected).
|
|
78
|
+
|
|
79
|
+
`strongly_detected` is the set of capability names backed by a STRONG
|
|
80
|
+
structural signal (a real tool/URL). The LLM verifier may only downgrade
|
|
81
|
+
capabilities NOT in this set.
|
|
82
|
+
"""
|
|
83
|
+
booking_strong = _any(BOOKING_STRONG, combined_html)
|
|
84
|
+
chat_strong = _any(LIVE_CHAT_STRONG, combined_html)
|
|
85
|
+
wa_strong = _any(WHATSAPP_STRONG, combined_html)
|
|
86
|
+
news_strong = _any(NEWSLETTER_STRONG, combined_html)
|
|
87
|
+
|
|
88
|
+
caps = Capabilities(
|
|
89
|
+
has_phone=bool(re.search(PHONE_PATTERN, combined_html, re.IGNORECASE)),
|
|
90
|
+
has_email=bool(re.search(EMAIL_PATTERN, combined_html, re.IGNORECASE)),
|
|
91
|
+
has_contact_form=_any(FORM_WEAK, combined_html),
|
|
92
|
+
has_whatsapp=wa_strong or _any(WHATSAPP_WEAK, combined_html),
|
|
93
|
+
has_online_booking=booking_strong or _any(BOOKING_WEAK, combined_html),
|
|
94
|
+
has_live_chat=chat_strong or _any(LIVE_CHAT_WEAK, combined_html),
|
|
95
|
+
has_pricing=_any(PRICING_WEAK, combined_html),
|
|
96
|
+
has_case_studies=_any(CASE_STUDY_WEAK, combined_html),
|
|
97
|
+
has_newsletter=news_strong or _any(NEWSLETTER_WEAK, combined_html),
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
strongly_detected = set()
|
|
101
|
+
if booking_strong:
|
|
102
|
+
strongly_detected.add("has_online_booking")
|
|
103
|
+
if chat_strong:
|
|
104
|
+
strongly_detected.add("has_live_chat")
|
|
105
|
+
if wa_strong:
|
|
106
|
+
strongly_detected.add("has_whatsapp")
|
|
107
|
+
if news_strong:
|
|
108
|
+
strongly_detected.add("has_newsletter")
|
|
109
|
+
|
|
110
|
+
social = SocialMedia()
|
|
111
|
+
for field, pattern in SOCIAL_PATTERNS.items():
|
|
112
|
+
m = re.search(pattern, combined_html, re.IGNORECASE)
|
|
113
|
+
if m:
|
|
114
|
+
# Skip share/intent links — keep only profile-looking URLs
|
|
115
|
+
url = m.group(0)
|
|
116
|
+
if "sharer" not in url and "intent" not in url and "/share" not in url:
|
|
117
|
+
setattr(social, field, url)
|
|
118
|
+
caps.has_social_links = any(getattr(social, f) for f in SOCIAL_PATTERNS)
|
|
119
|
+
|
|
120
|
+
seo = SEO(
|
|
121
|
+
title_tag_present=bool(re.search(r"<title[\s>]", combined_html, re.IGNORECASE)),
|
|
122
|
+
meta_description_present=bool(re.search(r'<meta[^>]+name=["\']description["\']', combined_html, re.IGNORECASE)),
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
tools = detect_tools(combined_html)
|
|
126
|
+
tool_names = {t["name"] for t in tools}
|
|
127
|
+
tracking = Tracking(
|
|
128
|
+
google_analytics="Google Analytics" in tool_names,
|
|
129
|
+
google_tag_manager="Google Tag Manager" in tool_names,
|
|
130
|
+
meta_pixel="Meta Pixel" in tool_names,
|
|
131
|
+
tiktok_pixel="TikTok Pixel" in tool_names,
|
|
132
|
+
linkedin_pixel="LinkedIn Insight" in tool_names,
|
|
133
|
+
hotjar="Hotjar" in tool_names,
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
return caps, social, seo, tracking, tools, strongly_detected
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Deterministic technology detection.
|
|
3
|
+
|
|
4
|
+
A pragmatic subset of Wappalyzer-style fingerprints: each entry matches a
|
|
5
|
+
substring/regex in the raw HTML. NO LLM involved — this is fact, not inference,
|
|
6
|
+
which is why it's the most defensible signal in the product.
|
|
7
|
+
|
|
8
|
+
For production, swap this dict for the full Wappalyzer fingerprint database
|
|
9
|
+
(https://github.com/enthec/webappanalyzer) — same matching approach, ~3000 apps.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
|
|
14
|
+
# name -> (category, [patterns], confidence)
# Patterns are regexes searched case-insensitively in the raw HTML; the first
# one that matches is reported as the evidence for that tool.
FINGERPRINTS: dict[str, tuple[str, list[str], float]] = {
    # CMS / site builders
    "WordPress": ("cms", [r"wp-content", r"wp-includes"], 0.92),
    "Shopify": ("ecommerce", [r"cdn\.shopify\.com", r"Shopify\.theme"], 0.95),
    "Wix": ("cms", [r"static\.wixstatic\.com", r"_wixCssImports"], 0.9),
    # "data-wf-site"/"data-wf-page" are Webflow's own <html> attributes. A bare
    # "wf-" would also match the "wf-active"/"wf-loading" classes that the
    # WebFont Loader script adds on any site that uses it.
    "Webflow": ("cms", [r"assets\.website-files\.com", r"webflow\.js", r"data-wf-(?:site|page)"], 0.88),
    "Squarespace": ("cms", [r"squarespace", r"static1\.squarespace\.com"], 0.9),
    "WooCommerce": ("ecommerce", [r"woocommerce", r"wc-ajax"], 0.85),
    # Analytics / tracking
    "Google Analytics": ("analytics", [r"google-analytics\.com", r"gtag\(", r"ga\('create'"], 0.9),
    "Google Tag Manager": ("analytics", [r"googletagmanager\.com"], 0.92),
    "Meta Pixel": ("marketing_tracking", [r"fbq\(", r"connect\.facebook\.net/[a-z_]+/fbevents\.js"], 0.93),
    "TikTok Pixel": ("marketing_tracking", [r"analytics\.tiktok\.com"], 0.9),
    "LinkedIn Insight": ("marketing_tracking", [r"snap\.licdn\.com"], 0.9),
    "Hotjar": ("analytics", [r"static\.hotjar\.com", r"hotjar"], 0.85),
    # CRM / marketing / chat
    "HubSpot": ("crm", [r"js\.hs-scripts\.com", r"hs-scripts"], 0.9),
    "Intercom": ("live_chat", [r"widget\.intercom\.io", r"intercomSettings"], 0.9),
    "Drift": ("live_chat", [r"js\.driftt\.com", r"drift\.com"], 0.88),
    "Tidio": ("live_chat", [r"code\.tidio\.co"], 0.9),
    "Tawk.to": ("live_chat", [r"embed\.tawk\.to"], 0.9),
    "Crisp": ("live_chat", [r"client\.crisp\.chat"], 0.9),
    "Mailchimp": ("email_marketing", [r"chimpstatic\.com", r"list-manage\.com"], 0.85),
    "Klaviyo": ("email_marketing", [r"klaviyo"], 0.85),
    # Booking / forms
    "Calendly": ("booking", [r"calendly\.com"], 0.92),
    "Fresha": ("booking", [r"fresha\.com"], 0.9),
    "Acuity Scheduling": ("booking", [r"acuityscheduling\.com"], 0.9),
    "Booksy": ("booking", [r"booksy\.com"], 0.9),
    "SimplyBook": ("booking", [r"simplybook\.(me|it)"], 0.88),
    "Typeform": ("forms", [r"typeform\.com"], 0.88),
    "Jotform": ("forms", [r"jotform\.com"], 0.88),
}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def detect_tools(html: str) -> list[dict]:
    """Return a list of detected tools: {name, category, confidence, evidence}.

    Each FINGERPRINTS entry contributes at most one result: its patterns are
    tried in order (case-insensitive search) and the first one that matches is
    recorded as the evidence. Results follow the iteration order of FINGERPRINTS.
    """
    detected: list[dict] = []
    for tool, (kind, regexes, score) in FINGERPRINTS.items():
        first_hit = next(
            (rx for rx in regexes if re.search(rx, html, re.IGNORECASE)),
            None,
        )
        if first_hit is None:
            continue
        detected.append({
            "name": tool,
            "category": kind,
            "confidence": score,
            "evidence": f"matched /{first_hit}/",
        })
    return detected
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""
|
|
2
|
+
3-tier fetch: requests → cloudscraper → Playwright. Auto-escalation.
|
|
3
|
+
Returns raw HTML. Caching is keyed by URL with a 24h TTL.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import hashlib
|
|
7
|
+
import json
|
|
8
|
+
import time
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
import requests
|
|
12
|
+
|
|
13
|
+
from ..config import BROWSER_HEADERS, CACHE_DIR, FETCH_RETRIES, REQUEST_TIMEOUT
|
|
14
|
+
from .robots import RobotsDisallowed, allowed
|
|
15
|
+
|
|
16
|
+
TTL_SECONDS = 86400
|
|
17
|
+
_RETRYABLE_STATUS = {429, 500, 502, 503, 504}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _cache_path(url: str) -> Path:
    """Map *url* to its cache file inside CACHE_DIR.

    The file name is the first 16 hex characters of the URL's SHA-256 digest
    plus a ``.json`` extension.
    """
    digest = hashlib.sha256(url.encode()).hexdigest()
    return CACHE_DIR / (digest[:16] + ".json")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _cache_get(url: str) -> str | None:
    """Return the cached HTML for *url*, or ``None`` on a miss.

    A miss is: no cache file, an entry older than TTL_SECONDS, or an entry that
    cannot be read/parsed. Expired and unreadable entries are deleted
    (best-effort) so they are not re-examined on every call.
    """
    p = _cache_path(url)
    try:
        data = json.loads(p.read_text(encoding="utf-8"))
        expired = time.time() - data["ts"] > TTL_SECONDS
        html = data["html"]
    except FileNotFoundError:
        return None
    except (OSError, ValueError, KeyError, TypeError):
        # Truncated write, invalid JSON or unexpected shape: treat the entry as
        # a miss instead of failing the whole fetch.
        expired = True
        html = None

    if expired:
        try:
            # missing_ok: another process may have removed the file already.
            p.unlink(missing_ok=True)
        except OSError:
            pass  # cleanup is best-effort; the caller simply re-fetches
        return None
    return html
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _cache_put(url: str, html: str) -> None:
    """Store *html* for *url* in the on-disk cache with the current timestamp.

    The entry is a JSON object with keys ``ts``, ``url`` and ``html``. It is
    written to a sibling temporary file first and then moved over the final
    path, so an interrupted write cannot leave a truncated entry behind.
    """
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    target = _cache_path(url)
    payload = json.dumps({"ts": time.time(), "url": url, "html": html})
    staging = target.with_name(target.name + ".tmp")
    staging.write_text(payload, encoding="utf-8")
    staging.replace(target)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _is_blocked(html: str, status: int) -> bool:
|
|
41
|
+
if status in (403, 503):
|
|
42
|
+
return True
|
|
43
|
+
snippet = html[:3000].lower()
|
|
44
|
+
return any(s in snippet for s in [
|
|
45
|
+
"just a moment", "captcha", "enable javascript",
|
|
46
|
+
"challenge-platform", "cf-browser-verification", "attention required",
|
|
47
|
+
])
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _is_thin(html: str, min_chars: int = 600) -> bool:
    """
    Tell whether *html* carries fewer than *min_chars* characters of visible text.

    JS-rendered pages often hand plain `requests` a near-empty shell. When the
    text left after dropping <script>, <style> and <noscript> is tiny, the
    caller escalates to a real browser so client-side content (addresses,
    WhatsApp links, etc.) is not missed.
    """
    from bs4 import BeautifulSoup

    parsed = BeautifulSoup(html, "lxml")
    for element in parsed.find_all(["script", "style", "noscript"]):
        element.decompose()
    visible_text = parsed.get_text(strip=True)
    return len(visible_text) < min_chars
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _insufficient(html: str, status: int) -> bool:
    """Return True when a response should not be used as-is.

    That is the case when it looks bot-blocked (`_is_blocked`) or carries too
    little visible text (`_is_thin`). The thin check only runs when the
    response is not already considered blocked.
    """
    if _is_blocked(html, status):
        return True
    return _is_thin(html)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _via_cloudscraper(url: str) -> str:
    """Fetch *url* with cloudscraper and return the response body as text.

    Uses a desktop Chrome-on-macOS browser profile and REQUEST_TIMEOUT. An
    HTTP error status raises via ``raise_for_status()``.
    """
    import cloudscraper

    browser_profile = {"browser": "chrome", "platform": "darwin", "mobile": False}
    client = cloudscraper.create_scraper(browser=browser_profile)
    response = client.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _via_playwright(url: str) -> str:
    """Render *url* in headless Chromium and return the resulting page HTML.

    Navigation waits for ``domcontentloaded`` (60 s limit), then makes a
    best-effort wait for network idle (12 s limit) followed by a fixed 4 s
    pause before the DOM is serialised. The browser is closed even when
    navigation or rendering raises.
    """
    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            ctx = browser.new_context(user_agent=BROWSER_HEADERS["User-Agent"], locale="en-US",
                                      viewport={"width": 1920, "height": 1080})
            page = ctx.new_page()
            page.goto(url, wait_until="domcontentloaded", timeout=60_000)
            try:
                page.wait_for_load_state("networkidle", timeout=12_000)
            except Exception:
                # Deliberately best-effort: if the page never goes idle, carry
                # on with whatever has rendered so far.
                pass
            page.wait_for_timeout(4000)
            return page.content()
        finally:
            browser.close()
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _tier1_with_retry(url: str) -> str | None:
    """Plain requests with exponential backoff on transient errors. None → escalate.

    Makes up to ``FETCH_RETRIES + 1`` attempts. Network errors and statuses in
    _RETRYABLE_STATUS are retried after a 0.5s, 1s, 2s … pause. Any other HTTP
    error status, or a blocked/thin page, ends tier 1 immediately, since
    retrying a non-transient failure only costs time.

    Returns:
        The response text when it is usable, otherwise ``None`` so the caller
        escalates to cloudscraper/Playwright.
    """
    # The context manager releases the session's pooled connections on exit.
    with requests.Session() as session:
        session.headers.update(BROWSER_HEADERS)
        for attempt in range(FETCH_RETRIES + 1):
            retries_left = attempt < FETCH_RETRIES
            try:
                resp = session.get(url, timeout=REQUEST_TIMEOUT, allow_redirects=True)
            except requests.RequestException:
                if not retries_left:
                    return None
                time.sleep(0.5 * (2 ** attempt))
                continue

            if resp.status_code in _RETRYABLE_STATUS and retries_left:
                time.sleep(0.5 * (2 ** attempt))
                continue
            if not resp.ok:
                return None  # HTTP error that is not worth retrying → escalate
            if _insufficient(resp.text, resp.status_code):
                return None  # blocked/thin → escalate to cloudscraper/Playwright
            return resp.text
    return None
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def fetch_html(url: str, use_cache: bool = True) -> str:
    """Fetch the raw HTML of *url*, escalating through three fetch tiers.

    Order of operations: cache lookup (when *use_cache*), robots.txt check,
    then plain requests → cloudscraper → Playwright. The first tier that yields
    a usable page wins and its result is cached.

    Args:
        url: Page to fetch.
        use_cache: Read from and write to the on-disk cache.

    Returns:
        The page HTML. The Playwright result is returned even if it still looks
        like a block page, because there is no further tier to try.

    Raises:
        RobotsDisallowed: robots.txt forbids fetching *url*.
        Exception: whatever the Playwright tier raises when it is reached and fails.
    """
    if use_cache:
        cached = _cache_get(url)
        if cached:
            return cached

    # Politeness: respect robots.txt before any network fetch.
    if not allowed(url):
        raise RobotsDisallowed(f"robots.txt disallows fetching {url}")

    # Tier 1: requests with retry/backoff (escalate on bot-block OR thin JS shell)
    html = _tier1_with_retry(url)
    if html is not None:
        if use_cache:
            _cache_put(url, html)
        return html

    # Tier 2: cloudscraper
    try:
        html = _via_cloudscraper(url)
        if not _insufficient(html, 200):
            if use_cache:
                _cache_put(url, html)
            return html
    except Exception:
        # Best-effort tier: any failure here just means we fall through to
        # the browser tier.
        pass

    # Tier 3: Playwright (full JS render)
    html = _via_playwright(url)
    # Do not cache an obvious challenge page: it would be replayed from cache
    # for the whole TTL instead of being retried on the next run.
    if use_cache and not _is_blocked(html, 200):
        _cache_put(url, html)
    return html
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""
|
|
2
|
+
robots.txt awareness — polite, opt-out crawling.
|
|
3
|
+
|
|
4
|
+
Before fetching, we check the target's robots.txt for our agent. Results are
|
|
5
|
+
cached per-domain. If robots.txt is missing/unreachable we default to ALLOW
|
|
6
|
+
(standard behaviour). Disabled with AUTONITIA_RESPECT_ROBOTS=false.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from urllib.parse import urlparse
|
|
10
|
+
from urllib.robotparser import RobotFileParser
|
|
11
|
+
|
|
12
|
+
import requests
|
|
13
|
+
|
|
14
|
+
from ..config import BROWSER_HEADERS, RESPECT_ROBOTS, ROBOTS_UA
|
|
15
|
+
|
|
16
|
+
_CACHE: dict[str, RobotFileParser | None] = {}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class RobotsDisallowed(Exception):
    """Signals that the target site's robots.txt forbids fetching a URL."""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _parser_for(domain_root: str) -> RobotFileParser | None:
    """Return the robots.txt parser for *domain_root*, fetching it at most once.

    *domain_root* is ``scheme://netloc``. The outcome is memoised in _CACHE.
    ``None`` means "no usable robots.txt" (request failed or status >= 400),
    which callers treat as allow-all.
    """
    try:
        return _CACHE[domain_root]
    except KeyError:
        pass

    try:
        response = requests.get(f"{domain_root}/robots.txt", headers=BROWSER_HEADERS, timeout=8)
    except requests.RequestException:
        response = None

    parser = None
    if response is not None and response.status_code < 400:
        parser = RobotFileParser()
        parser.parse(response.text.splitlines())

    _CACHE[domain_root] = parser
    return parser
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def allowed(url: str) -> bool:
    """True if we may fetch this URL (always True when robots respect is off).

    URLs without a scheme, and sites without a usable robots.txt, are allowed.
    Otherwise the site's rules are evaluated for the ROBOTS_UA agent.
    """
    if RESPECT_ROBOTS:
        parts = urlparse(url)
        if parts.scheme:
            rules = _parser_for(f"{parts.scheme}://{parts.netloc}")
            if rules is not None:
                return rules.can_fetch(ROBOTS_UA, url)
    return True
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Graph executor with conditional edges.
|
|
3
|
+
|
|
4
|
+
A graph is: nodes (keyed by name) + an `edges` map + an `entry` node. Each edge
|
|
5
|
+
value is one of:
|
|
6
|
+
- a node name (str) → always go there next
|
|
7
|
+
- None (END) → stop
|
|
8
|
+
- a callable(state) -> str|None → CONDITIONAL: decide the next node at runtime
|
|
9
|
+
|
|
10
|
+
This makes the pipeline a real graph: branches (repair vs continue), short-circuits
|
|
11
|
+
(skip the LLM when there's nothing to do), and bounded loops. Still mirrors
|
|
12
|
+
LangGraph's contract so it can be swapped later. `max_steps` guards against loops.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import time
|
|
16
|
+
|
|
17
|
+
from .base_node import BaseNode
|
|
18
|
+
|
|
19
|
+
END = None


class BaseGraph:
    """Sequential graph executor with conditional edges.

    Nodes are looked up by their ``name``. After a node runs, ``edges[name]``
    decides what happens next: a node name (go there), ``END``/missing (stop),
    or a callable taking the state and returning one of those.
    """

    def __init__(self, nodes: "list[BaseNode]", edges: dict, entry: str,
                 verbose: bool = True, max_steps: int = 25):
        """
        Args:
            nodes: Node objects exposing ``name`` and ``execute(state)``.
            edges: Mapping of node name to next node name, END, or callable.
            entry: Name of the first node to run.
            verbose: Print one line per node as it finishes or fails.
            max_steps: Upper bound on executed nodes; guards against loops.
        """
        self.nodes = {n.name: n for n in nodes}
        self.edges = edges
        self.entry = entry
        self.verbose = verbose
        self.max_steps = max_steps

    def execute(self, state: dict) -> dict:
        """Run the graph from the entry node and return the final state.

        The returned state carries ``_trace``: one dict per executed node with
        ``node``, ``status`` and ``ms``.

        Raises:
            RuntimeError: more than ``max_steps`` nodes would be executed.
            KeyError: an edge (or ``entry``) names a node that does not exist.
            Exception: whatever a node raises is re-raised after ``_trace`` and
                ``_error`` have been recorded on the state passed to that node.
        """
        trace = []
        current = self.entry
        steps = 0

        while current is not None:
            if steps >= self.max_steps:
                raise RuntimeError(f"Graph exceeded max_steps={self.max_steps} (possible loop)")
            if current not in self.nodes:
                raise KeyError(f"Edge points to unknown node '{current}'")

            node = self.nodes[current]
            # perf_counter is monotonic, so durations cannot go negative or
            # jump when the wall clock is adjusted.
            start = time.perf_counter()
            try:
                result = node.execute(state)
                if result is None:
                    # Fail here, attributed to the offending node, rather than
                    # with an opaque error further down the pipeline.
                    raise TypeError(f"Node '{current}' returned None; nodes must return the state dict")
            except Exception as e:
                elapsed = int((time.perf_counter() - start) * 1000)
                trace.append({"node": current, "status": "error", "ms": elapsed, "error": str(e)})
                if self.verbose:
                    print(f"  ✗ {current} failed: {e}")
                state["_trace"] = trace
                state["_error"] = {"node": current, "error": str(e)}
                raise
            state = result
            elapsed = int((time.perf_counter() - start) * 1000)
            trace.append({"node": current, "status": "success", "ms": elapsed})
            if self.verbose:
                print(f"  ✓ {current} ({elapsed} ms)")

            nxt = self.edges.get(current, END)
            current = nxt(state) if callable(nxt) else nxt
            steps += 1

        state["_trace"] = trace
        return state
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Minimal node abstraction. Each node has a single responsibility and transforms
|
|
3
|
+
a shared `state` dict. This intentionally mirrors LangGraph's node contract so
|
|
4
|
+
the engine can be swapped for LangGraph later without rewriting nodes.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class BaseNode:
|
|
9
|
+
def __init__(self, name: str | None = None):
|
|
10
|
+
self.name = name or self.__class__.__name__
|
|
11
|
+
|
|
12
|
+
def execute(self, state: dict) -> dict: # pragma: no cover - interface
|
|
13
|
+
raise NotImplementedError
|