agent-census 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_census/__init__.py +23 -0
- agent_census/__main__.py +10 -0
- agent_census/classify/__init__.py +53 -0
- agent_census/classify/ai_crawler.py +13 -0
- agent_census/classify/app.py +46 -0
- agent_census/classify/archiver.py +13 -0
- agent_census/classify/base.py +37 -0
- agent_census/classify/browser.py +180 -0
- agent_census/classify/combiner.py +175 -0
- agent_census/classify/crawler.py +56 -0
- agent_census/classify/data_harvester.py +19 -0
- agent_census/classify/feed_reader.py +101 -0
- agent_census/classify/known_bot.py +51 -0
- agent_census/classify/monitor.py +54 -0
- agent_census/classify/registry.py +46 -0
- agent_census/classify/scraper.py +46 -0
- agent_census/classify/search_engine.py +13 -0
- agent_census/classify/seo_marketing.py +13 -0
- agent_census/classify/social_preview.py +13 -0
- agent_census/classify/spam_bot.py +44 -0
- agent_census/classify/tags.py +236 -0
- agent_census/classify/vuln_scanner.py +73 -0
- agent_census/cli.py +477 -0
- agent_census/data/__init__.py +5 -0
- agent_census/data/ai_crawler.toml +155 -0
- agent_census/data/app_clients.toml +17 -0
- agent_census/data/archiver.toml +30 -0
- agent_census/data/browser_releases.toml +33 -0
- agent_census/data/data_harvester.toml +27 -0
- agent_census/data/datacenter_ranges.toml +242 -0
- agent_census/data/egress_networks.toml +65 -0
- agent_census/data/feed_readers.toml +15 -0
- agent_census/data/scanner_ua.toml +9 -0
- agent_census/data/search_engine.toml +130 -0
- agent_census/data/seo_marketing.toml +69 -0
- agent_census/data/social_preview.toml +78 -0
- agent_census/data/vuln_paths.toml +23 -0
- agent_census/dataload.py +342 -0
- agent_census/egress.py +59 -0
- agent_census/errors.py +20 -0
- agent_census/features.py +599 -0
- agent_census/hosting.py +133 -0
- agent_census/identity.py +107 -0
- agent_census/iprange.py +390 -0
- agent_census/model.py +283 -0
- agent_census/netverify.py +371 -0
- agent_census/parsing/__init__.py +15 -0
- agent_census/parsing/apache.py +196 -0
- agent_census/parsing/apache_directives.py +340 -0
- agent_census/parsing/base.py +38 -0
- agent_census/parsing/cloudflare.py +114 -0
- agent_census/parsing/registry.py +39 -0
- agent_census/pipeline.py +755 -0
- agent_census/py.typed +0 -0
- agent_census/report/__init__.py +17 -0
- agent_census/report/aggregate.py +240 -0
- agent_census/report/calibrate.py +400 -0
- agent_census/report/format.py +266 -0
- agent_census/report/html.py +880 -0
- agent_census/report/inspect.py +203 -0
- agent_census/report/markdown.py +226 -0
- agent_census/robots/__init__.py +16 -0
- agent_census/robots/compliance.py +118 -0
- agent_census/robots/parser.py +55 -0
- agent_census/robots/source.py +69 -0
- agent_census/uas.py +292 -0
- agent_census/userconfig.py +42 -0
- agent_census-0.0.1.dist-info/METADATA +266 -0
- agent_census-0.0.1.dist-info/RECORD +73 -0
- agent_census-0.0.1.dist-info/WHEEL +5 -0
- agent_census-0.0.1.dist-info/entry_points.txt +2 -0
- agent_census-0.0.1.dist-info/licenses/LICENSE.md +19 -0
- agent_census-0.0.1.dist-info/top_level.txt +1 -0
agent_census/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
__version__ = "0.0.1"
|
|
2
|
+
__author__ = "Mark Nottingham <mnot@mnot.net>"
|
|
3
|
+
__copyright__ = """\
|
|
4
|
+
Copyright (c) Mark Nottingham
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in
|
|
14
|
+
all copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
22
|
+
THE SOFTWARE.
|
|
23
|
+
"""
|
agent_census/__main__.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Client classification: independent rule-based classifiers + a combiner.
|
|
2
|
+
|
|
3
|
+
The public entry point is :func:`classify_client`, which runs every registered
|
|
4
|
+
classifier over a client's features and combines their signals into a single
|
|
5
|
+
:class:`~agent_census.model.Classification` (primary kind plus tags).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from ..model import BotVerification, Classification, ClientFeatures, ComplianceReport, Signal
|
|
11
|
+
from .base import Classifier
|
|
12
|
+
from .combiner import DEFAULT_UNKNOWN_THRESHOLD, combine
|
|
13
|
+
from .registry import all_classifiers
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def run_classifiers(features: ClientFeatures) -> list[Signal]:
    """Gather the signals every registered classifier emits for one client.

    Classifiers are consulted in the order ``all_classifiers()`` yields them, and
    each classifier's signals keep their own relative order in the flat result.
    """
    return [
        signal
        for classifier in all_classifiers()
        for signal in classifier.evaluate(features)
    ]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def classify_client(
    features: ClientFeatures,
    *,
    compliance: ComplianceReport | None = None,
    verification: BotVerification | None = None,
    datacenter: bool = False,
    unknown_threshold: float = DEFAULT_UNKNOWN_THRESHOLD,
    keep_signals: bool = True,
) -> Classification:
    """Classify one client: collect every classifier's signals, then combine them.

    All keyword-only arguments are forwarded unchanged to :func:`combine`; this
    function adds nothing beyond running the classifiers first.
    """
    return combine(
        run_classifiers(features),
        features,
        compliance=compliance,
        verification=verification,
        datacenter=datacenter,
        unknown_threshold=unknown_threshold,
        keep_signals=keep_signals,
    )
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
__all__ = [
|
|
47
|
+
"Classifier",
|
|
48
|
+
"classify_client",
|
|
49
|
+
"run_classifiers",
|
|
50
|
+
"combine",
|
|
51
|
+
"all_classifiers",
|
|
52
|
+
"DEFAULT_UNKNOWN_THRESHOLD",
|
|
53
|
+
]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Declared AI / LLM data-gathering crawlers (GPTBot, ClaudeBot, Google-Extended, ...)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from ..model import Kind
|
|
6
|
+
from .known_bot import KnownBotClassifier
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class AiCrawlerClassifier(KnownBotClassifier):
    """Votes ``Kind.AI_CRAWLER`` for declared AI / LLM data-gathering crawlers.

    Purely declarative: this subclass defines no methods of its own, so all
    behaviour comes from :class:`KnownBotClassifier`; only the attributes below
    are supplied here.
    """

    # The kind this classifier votes for.
    label = Kind.AI_CRAWLER
    # Short classifier name.
    name = "ai_crawler"
    # NOTE(review): presumably selects the bundled ``ai_crawler`` data list --
    # confirm against known_bot.py.
    category = "ai_crawler"
    # NOTE(review): presumably the human-readable label used in evidence text --
    # confirm against known_bot.py.
    descriptor = "AI / LLM crawler"
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Native mobile / desktop app clients.
|
|
2
|
+
|
|
3
|
+
Not a browser and not a crawler: a first-party app making requests through a
|
|
4
|
+
platform networking stack (Apple's CFNetwork, Flutter's dart:io, …) or a named
|
|
5
|
+
networking framework. The User-Agent names the stack, not a browser engine, so
|
|
6
|
+
these otherwise fall through to UNKNOWN despite being ordinary app traffic.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
from functools import lru_cache
|
|
13
|
+
|
|
14
|
+
from ..dataload import load_list
|
|
15
|
+
from ..model import ClientFeatures, Kind, Signal
|
|
16
|
+
from .base import Classifier
|
|
17
|
+
from .tags import identifies_as_known_agent
|
|
18
|
+
|
|
19
|
+
_APP_TOKENS = re.compile("|".join(re.escape(token) for token in load_list("app_clients")), re.I)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@lru_cache(maxsize=16384)
def app_stack_token(ua: str | None) -> str | None:
    """Return the native-app networking token found in ``ua``, or ``None``.

    The token is the text matched by the module-level ``_APP_TOKENS`` pattern
    (a case-insensitive alternation of the ``app_clients`` list). Results are
    memoised per User-Agent string.

    Args:
        ua: The raw User-Agent header value; ``None`` or empty means "no UA".

    Returns:
        The matched token exactly as it appears in ``ua``, or ``None`` when the
        UA is missing or contains no known token.
    """
    if not ua:
        return None
    match = _APP_TOKENS.search(ua)
    if match is None:
        return None
    # An empty (or blank-entry) token list compiles to a pattern that "matches"
    # every User-Agent with a zero-length hit. That is not a token, and reporting
    # it would turn every client into an app, so treat it as no match.
    return match.group(0) or None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class AppClientClassifier(Classifier):
    """Votes ``Kind.APP`` when the User-Agent names a native-app networking stack."""

    label = Kind.APP
    name = "app"

    def evaluate(self, features: ClientFeatures) -> list[Signal]:
        """Return one APP signal for a bare networking-stack UA, else nothing.

        A networking stack is the least specific identity a UA can carry, so a
        UA that also names a feed reader, crawler, or bot is left to the
        classifier for that more specific identity (a feed reader running on
        CFNetwork is a feed reader, not merely "an app").
        """
        if not identifies_as_known_agent(features):
            stack = app_stack_token(features.user_agent)
            if stack is not None:
                # The stack token alone identifies a native app, so a single
                # match is scored high enough to clear the unknown threshold.
                reason = f"native-app networking stack in User-Agent ({stack})"
                return [self._signal(0.65, [reason])]
        return []
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Web-archiving / preservation crawlers (Internet Archive / Wayback Machine)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from ..model import Kind
|
|
6
|
+
from .known_bot import KnownBotClassifier
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ArchiverClassifier(KnownBotClassifier):
    """Votes ``Kind.ARCHIVER`` for declared web-archiving / preservation crawlers.

    Purely declarative: this subclass defines no methods of its own, so all
    behaviour comes from :class:`KnownBotClassifier`; only the attributes below
    are supplied here.
    """

    # The kind this classifier votes for.
    label = Kind.ARCHIVER
    # Short classifier name.
    name = "archiver"
    # NOTE(review): presumably selects the bundled ``archiver`` data list --
    # confirm against known_bot.py.
    category = "archiver"
    # NOTE(review): presumably the human-readable label used in evidence text --
    # confirm against known_bot.py.
    descriptor = "web archiver"
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""The classifier contract.
|
|
2
|
+
|
|
3
|
+
A classifier is a pure function of :class:`ClientFeatures`: it reads only the
|
|
4
|
+
feature vector (and its own static data lists) and emits zero or more
|
|
5
|
+
:class:`Signal` votes for the kind it argues for. It never imports another
|
|
6
|
+
classifier and never sees the final decision — that keeps each one independently
|
|
7
|
+
testable and free to evolve. "This client is NOT a browser" is expressed simply
|
|
8
|
+
by the browser classifier not firing.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from abc import ABC, abstractmethod
|
|
14
|
+
|
|
15
|
+
from ..model import ClientFeatures, Kind, Signal
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Classifier(ABC):
    """Argues, from features alone, that a client is of a particular kind.

    Subclasses set :attr:`label` and :attr:`name` and implement
    :meth:`evaluate`; :meth:`_signal` is the shared helper for building the
    signals they return.
    """

    #: the kind this classifier votes for
    label: Kind = Kind.UNKNOWN
    #: short stable name, recorded on each signal for provenance
    name: str = ""

    @abstractmethod
    def evaluate(self, features: ClientFeatures) -> list[Signal]:
        """Return signals supporting :attr:`label` (possibly empty)."""

    def _signal(self, confidence: float, evidence: list[str]) -> Signal:
        """Build a signal for this classifier's label.

        Args:
            confidence: Accumulated strength for :attr:`label`. Classifiers add
                bonuses and subtract penalties, so the raw value may fall outside
                ``[0, 1]``; it is clamped into that range here.
            evidence: Human-readable reasons, stored on the signal as a tuple.

        Returns:
            A :class:`Signal` tagged with this classifier's :attr:`name`.
        """
        # Clamp both ends: penalties can push the running total below zero, and a
        # negative "strength" is meaningless to anything that reads the signal.
        bounded = max(0.0, min(confidence, 1.0))
        return Signal(
            kind=self.label,
            confidence=bounded,
            evidence=tuple(evidence),
            classifier=self.name,
        )
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
"""Interactive browsers.
|
|
2
|
+
|
|
3
|
+
The strongest tell is sub-resource co-loading: a real browser, after fetching a
|
|
4
|
+
page, pulls its CSS/JS/images within seconds. Bursty (irregular) timing, on-site
|
|
5
|
+
link navigation, a browser-shaped UA, and a low error rate corroborate it.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from .. import uas
|
|
11
|
+
from ..model import ClientFeatures, Kind, Signal
|
|
12
|
+
from .base import Classifier
|
|
13
|
+
from .tags import identifies_as_known_agent
|
|
14
|
+
|
|
15
|
+
# A browser-shaped UA with positive evidence but no disqualifying behaviour is
|
|
16
|
+
# floored to here, so a brief, asset-less visit (a real person who didn't trigger
|
|
17
|
+
# sub-resource loading) isn't lost to UNKNOWN. It matches the default unknown
|
|
18
|
+
# threshold; the combiner's datacenter discount then keeps the rescue to
|
|
19
|
+
# residential clients -- a hosting "browser" still drops to spoofed_browser.
|
|
20
|
+
_BROWSER_FLOOR = 0.45
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class BrowserClassifier(Classifier):
    """Votes ``Kind.BROWSER`` from behavioural and User-Agent features."""

    label = Kind.BROWSER
    name = "browser"

    def evaluate(self, features: ClientFeatures) -> list[Signal]:
        """Score how browser-like ``features`` looks.

        Returns an empty list when the UA identifies a known agent, or when no
        evidence line at all was recorded. Otherwise returns a single signal
        carrying the accumulated confidence and every evidence line, including
        the lines recorded by penalties and caps.
        """
        # A UA that names a feed reader, crawler, or bot is not a browser, even if
        # it renders pages and co-loads their sub-resources -- its declared
        # identity wins.
        if identifies_as_known_agent(features):
            return []

        confidence = 0.0
        evidence: list[str] = []
        # Set when a signal positively argues *against* a browser (a cap or a
        # penalty). While clean, an under-supported browser shape is floored at the
        # end; once disqualified, it is left to fall where its confidence lands. A
        # stale (but not ancient) version is a mild nudge, not a disqualifier.
        disqualified = False

        # Strongest single contribution: pages followed by sub-resource loads.
        if features.asset_coload_ratio > 0.4:
            confidence += 0.45
            evidence.append(
                f"{features.asset_coload_ratio:.0%} of pages followed by sub-resource loads"
            )

        if features.ua_looks_like_browser:
            confidence += 0.2
            evidence.append("User-Agent matches a real browser profile")

        regularity = features.rate_regularity
        if regularity is not None and regularity > 0.6:
            confidence += 0.1
            evidence.append("irregular, bursty timing (human-like)")
        elif regularity is not None and regularity < 0.15 and features.request_count >= 5:
            # Metronomic cadence is a machine, not a person clicking around.
            # NOTE(review): unlike the later ``if evidence and ...`` checks, this
            # penalty is not gated on prior browser evidence, so a client with no
            # positive browser evidence still records an evidence line here and
            # ends up emitting a BROWSER signal whose computed confidence is <= 0.
            confidence -= 0.2
            disqualified = True
            evidence.append("metronomic timing — automated, not human")

        if features.referer_following_ratio > 0.3:
            confidence += 0.1
            evidence.append(
                f"{features.referer_following_ratio:.0%} of requests follow on-site links"
            )

        # NOTE(review): this fires for any low-error, non-probing client whatever
        # its UA, which makes ``evidence`` non-empty and so satisfies the
        # ``if evidence and ...`` gates below -- confirm that is intended.
        if features.ratio_404 < 0.1 and features.vuln_path_hits == 0:
            confidence += 0.1
            evidence.append("low error rate, no probing")

        if features.static_ratio > 0.3:
            confidence += 0.05
            evidence.append(f"{features.static_ratio:.0%} static-asset requests")

        if features.status_counts.get(304, 0) > 0:
            # Conditional requests answered 304 mean a real cache -- a browser tell.
            confidence += 0.1
            evidence.append("revalidates from cache (304 Not Modified)")

        # Chromium-based browsers and Firefox auto-update on a ~monthly cadence, so
        # a real browser is rarely far behind. A UA claiming a version years old
        # (measured against when the client was active) is almost always a frozen,
        # spoofed string; a current one weakly corroborates. Bots can copy a recent
        # UA, so the fresh bonus is small and the stale penalty is the load-bearing
        # half. Old but real cases exist (locked fleets, embedded WebViews, ESR),
        # so only a very old version caps the verdict.
        band = uas.version_age_band(features.user_agent, features.last_seen)
        if evidence and band == "current":
            confidence += 0.1
            evidence.append("up-to-date browser version")
        elif evidence and band == "stale":
            # A penalty, but deliberately not a disqualifier: the floor at the end
            # of this method can still apply.
            confidence -= 0.15
            evidence.append("browser version well out of date")
        elif evidence and band == "ancient":
            # Years behind on a family that auto-updates: almost always a frozen,
            # spoofed UA, so cap the browser hypothesis below the threshold.
            confidence = min(confidence, 0.3)
            disqualified = True
            evidence.append("browser version years out of date — modern browsers auto-update")
        elif evidence and band == "impossible":
            # Claims a version that doesn't exist yet: a forged UA, not a real
            # browser, so cap the hypothesis just like an ancient one.
            confidence = min(confidence, 0.3)
            disqualified = True
            evidence.append("browser version is impossibly new — forged User-Agent")

        # A browser never auto-fetches /robots.txt; checking it is a crawler's
        # habit. Slight on its own (a person could type the URL once), but it
        # nudges an otherwise browser-shaped client the right way.
        # NOTE(review): the penalty is small, but it also sets ``disqualified``,
        # which switches off the floor at the end of this method.
        if evidence and features.fetched_robots_txt:
            confidence -= 0.15
            disqualified = True
            evidence.append("fetched /robots.txt — a crawler's habit, not a browser's")

        # A browser revalidates what it re-requests (earning 304s) and serves
        # cached assets without hitting the server at all -- so at any real volume
        # a genuine browser leaves some 304s behind, or simply makes few requests.
        # A client that re-fetches the same URLs, or just makes a large number of
        # requests, yet never receives a single 304 holds no cache: not browser
        # behaviour. Only meaningful once paths are measured, and a 304 can only
        # arise from a re-request, so distinct-once fetching at low volume is spared.
        revisits = features.request_count - features.distinct_paths
        no_304 = features.status_counts.get(304, 0) == 0
        cold_refetch = revisits >= 20
        high_volume = features.request_count >= 500
        if evidence and no_304 and features.distinct_paths > 0 and (cold_refetch or high_volume):
            dominant = (cold_refetch and revisits >= features.request_count * 0.5) or (
                features.request_count >= 2000
            )
            disqualified = True
            if dominant:
                # Re-fetching dominates, or the volume is large enough that zero
                # revalidations is itself damning: cap below the confident threshold.
                confidence = min(confidence, 0.3)
                evidence.append("heavy traffic without a single 304 — holds no browser cache")
            else:
                confidence -= 0.2
                evidence.append("many requests, never revalidated (no 304s)")

        # A person at a browser never fetches attack paths. Vuln probing or
        # directory traversal means this is automation wearing a browser engine
        # (e.g. headless Chrome), so cap the browser hypothesis below the unknown
        # threshold rather than let asset co-loading carry it to a confident
        # verdict. (Ignoring robots.txt is NOT penalised -- it does not bind a
        # human browsing by hand.)
        # NOTE(review): not gated on existing evidence, so this line is recorded
        # (and a signal emitted) even when nothing above argued for a browser.
        if features.traversal_hits > 0 or features.vuln_path_hits >= 2:
            confidence = min(confidence, 0.3)
            disqualified = True
            evidence.append("but probes attack paths — not human browsing")

        # Fabricated referers (the Referer is the requested URL itself) are
        # impossible from real navigation; a client doing this systematically is
        # faking organic traffic, not browsing.
        # NOTE(review): likewise not gated on existing evidence.
        if features.self_referer_ratio >= 0.5 and features.request_count >= 4:
            confidence = min(confidence, 0.3)
            disqualified = True
            evidence.append("but referers are fabricated (Referer = the requested URL)")

        # A browser fetches pages and their sub-resources with GET; it does not
        # issue HEAD. Meaningful HEAD traffic from something otherwise browser-
        # shaped is a machine (a monitor, link-checker, or other bot) behind a
        # browser UA -- cap the hypothesis below the confident threshold. Gated on
        # an existing browser signal so monitors/feed readers that legitimately
        # HEAD don't each pick up a spurious browser signal.
        if evidence and features.head_ratio > 0.1:
            confidence = min(confidence, 0.3)
            disqualified = True
            evidence.append(f"but {features.head_ratio:.0%} HEAD requests — browsers issue GET")

        # No evidence line of any kind (positive or negative): stay silent.
        if not evidence:
            return []
        # A browser-shaped UA with positive evidence and nothing arguing against it
        # is a probable browser even without the asset-loading proof -- a brief
        # visit. Floor it so it clears the unknown threshold rather than being lost;
        # ancient/forged UAs and any non-browser behaviour disqualify it above.
        if features.ua_looks_like_browser and not disqualified and confidence < _BROWSER_FLOOR:
            confidence = _BROWSER_FLOOR
            evidence.append("browser-shaped User-Agent with no non-browser behaviour")
        return [self._signal(confidence, evidence)]
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
"""Combine classifier signals into a final classification.
|
|
2
|
+
|
|
3
|
+
Confidence is treated as an ordinal strength per label, not a probability, so
|
|
4
|
+
signals are aggregated per kind (taking the strongest) rather than multiplied.
|
|
5
|
+
The strongest label wins; ties break by a fixed priority. Below a threshold the
|
|
6
|
+
honest answer is ``UNKNOWN``. Tags are derived separately and can demote a
|
|
7
|
+
falsely-claimed good-bot identity.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from .. import uas
|
|
13
|
+
from ..model import (
|
|
14
|
+
BotVerification,
|
|
15
|
+
Classification,
|
|
16
|
+
ClientFeatures,
|
|
17
|
+
ComplianceReport,
|
|
18
|
+
Kind,
|
|
19
|
+
Signal,
|
|
20
|
+
)
|
|
21
|
+
from .tags import derive_tags, impersonation, looks_like_fake_browser
|
|
22
|
+
|
|
23
|
+
# Tie-break order when two kinds share the top confidence: earlier wins.
# Every kind a classifier can vote for should appear here: ``_pick`` ranks a kind
# missing from this tuple at ``len(_PRIORITY)``, i.e. below even UNKNOWN.
# DATA_HARVESTER sits with the other declared-bot kinds for that reason.
_PRIORITY: tuple[Kind, ...] = (
    Kind.IMPERSONATOR,
    Kind.SEARCH_ENGINE,
    Kind.SOCIAL_PREVIEW,
    Kind.ARCHIVER,
    Kind.AI_CRAWLER,
    Kind.SEO_MARKETING,
    Kind.DATA_HARVESTER,
    Kind.VULN_SCANNER,
    Kind.SPAM_BOT,
    Kind.FEED_READER,
    Kind.MONITOR,
    Kind.BROWSER,
    Kind.APP,
    Kind.SCRAPER,
    Kind.CRAWLER,
    Kind.SPOOFED_BROWSER,
    Kind.SINGLETON,
    Kind.UNKNOWN,
)
# Position of each kind in the tie-break order (lower index = higher priority).
_RANK = {kind: i for i, kind in enumerate(_PRIORITY)}
|
|
44
|
+
|
|
45
|
+
DEFAULT_UNKNOWN_THRESHOLD = 0.45
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _pick(by_label: dict[Kind, float]) -> Kind:
|
|
49
|
+
return max(by_label, key=lambda k: (by_label[k], -_RANK.get(k, len(_PRIORITY))))
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _top_evidence(signals: tuple[Signal, ...]) -> tuple[str, ...]:
|
|
53
|
+
if not signals:
|
|
54
|
+
return ("no classifier produced a signal",)
|
|
55
|
+
strongest = max(signals, key=lambda s: s.confidence)
|
|
56
|
+
return strongest.evidence or ("no specific evidence recorded",)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def combine(
    signals: list[Signal],
    features: ClientFeatures,
    *,
    compliance: ComplianceReport | None = None,
    verification: BotVerification | None = None,
    datacenter: bool = False,
    unknown_threshold: float = DEFAULT_UNKNOWN_THRESHOLD,
    keep_signals: bool = True,
) -> Classification:
    """Fold classifier ``signals`` into one verdict: a primary kind plus tags.

    Decision order: a detected impersonation wins outright. Otherwise, if no
    kind reaches ``unknown_threshold``, the client falls into one of the
    fallback buckets (spoofed browser or scraper for datacenter clients, then
    singleton, then unknown). Otherwise the strongest kind wins, with ties
    broken by ``_pick``.

    ``keep_signals`` retains every contributing signal on the result for inspect
    mode's rationale. The ``analyze`` report never reads them, so it passes
    ``False`` to avoid holding a Signal (and its evidence strings) per client.
    """
    # Strongest vote per kind. Votes are rounded first: classifiers build
    # confidence from 0.05-step increments, and float error (0.3 + 0.15 evaluates
    # to 0.44999999999999996) would otherwise leave a sum that should equal the
    # threshold just under it -- filing a clear client as UNKNOWN while it still
    # displays as the rounded percentage.
    strongest: dict[Kind, float] = {}
    for vote in signals:
        previous = strongest.get(vote.kind, 0.0)
        strongest[vote.kind] = max(previous, round(vote.confidence, 3))

    # People rarely browse from hosting infrastructure, so a datacenter "browser"
    # is discounted slightly: enough to tip a borderline verdict, not enough to
    # overrule a browser that behaves strongly like one.
    if datacenter and Kind.BROWSER in strongest:
        discounted = strongest[Kind.BROWSER] - 0.1
        strongest[Kind.BROWSER] = round(max(0.0, discounted), 3)

    tags = derive_tags(features, compliance, verification, datacenter=datacenter)
    retained = tuple(signals) if keep_signals else ()

    def verdict(kind: Kind, strength: float, reasons: tuple[str, ...]) -> Classification:
        # Tags are frozen at call time, so a tag added just before returning is kept.
        return Classification(
            primary=kind,
            confidence=strength,
            tags=frozenset(tags),
            evidence=reasons,
            all_signals=retained,
        )

    # Impersonation is decisive: a client faking a declared identity is an
    # impersonator regardless of what its behaviour otherwise suggests.
    faking, why = impersonation(verification)
    if faking:
        return verdict(Kind.IMPERSONATOR, 0.9, why)

    best = max(strongest.values()) if strongest else None
    if best is None or best < unknown_threshold:
        if datacenter:
            # Browser UA from a hosting IP with no browser behaviour: automation
            # in disguise, so name it as such rather than leaving it unknown.
            if looks_like_fake_browser(features):
                return verdict(
                    Kind.SPOOFED_BROWSER,
                    0.6,
                    ("browser User-Agent from a datacenter IP, without browser behaviour",),
                )
            # A generic HTTP library (or no UA) fetching several pages from
            # hosting infrastructure is harvesting content even though no single
            # signal cleared the bar. The datacenter origin is what tips it; from
            # a residential IP the same library could be an app or a one-off script.
            if _looks_like_datacenter_scraper(features):
                return verdict(
                    Kind.SCRAPER,
                    0.5,
                    ("generic HTTP client harvesting pages from a datacenter IP",),
                )
        # One request is too little to characterize, so it is filed by volume.
        if features.request_count == 1:
            return verdict(
                Kind.SINGLETON,
                1.0,
                ("single request — too little activity to characterize",),
            )
        fallback_strength = 0.0 if best is None else best
        return verdict(Kind.UNKNOWN, fallback_strength, _top_evidence(tuple(signals)))

    primary = _pick(strongest)
    if primary is Kind.FEED_READER and _fetches_non_feeds(features):
        tags.add("fetches-non-feeds")
    # Evidence from every signal that voted for the winning kind, in signal order.
    reasons: list[str] = []
    for vote in signals:
        if vote.kind is primary:
            reasons.extend(vote.evidence)
    return verdict(primary, strongest[primary], tuple(reasons))
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _looks_like_datacenter_scraper(features: ClientFeatures) -> bool:
|
|
160
|
+
"""A generic-library / UA-less client harvesting several pages, benignly."""
|
|
161
|
+
return (
|
|
162
|
+
features.request_count >= 2
|
|
163
|
+
and features.distinct_paths >= 2
|
|
164
|
+
and (uas.is_library(features.user_agent) or features.ua_empty)
|
|
165
|
+
and features.vuln_path_hits == 0
|
|
166
|
+
and features.traversal_hits == 0
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def _fetches_non_feeds(features: ClientFeatures) -> bool:
|
|
171
|
+
"""True if a feed reader also requested non-feed resources (robots.txt aside)."""
|
|
172
|
+
non_feed = features.request_count - features.feed_requests
|
|
173
|
+
if features.fetched_robots_txt:
|
|
174
|
+
non_feed -= 1 # a polite robots.txt fetch does not count as content scraping
|
|
175
|
+
return non_feed > 0
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Generic crawlers that walk the site following links at a steady pace.
|
|
2
|
+
|
|
3
|
+
Distinguished from a browser by the absence of sub-resource co-loading, and from
|
|
4
|
+
a scraper by actually following on-site links (high referer-following) rather
|
|
5
|
+
than hitting URLs cold.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from ..model import ClientFeatures, Kind, Signal
|
|
11
|
+
from .base import Classifier
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class CrawlerClassifier(Classifier):
    """Votes ``Kind.CRAWLER`` for clients that walk the site like a generic crawler."""

    label = Kind.CRAWLER
    name = "crawler"

    def evaluate(self, features: ClientFeatures) -> list[Signal]:
        """Accumulate crawler evidence; return one signal, or nothing if none fired."""
        score = 0.0
        reasons: list[str] = []

        # Broad path coverage.
        if features.distinct_paths >= 20 and features.coverage > 0.7:
            score += 0.3
            path_count = features.distinct_paths
            unique_share = features.coverage
            reasons.append(f"broad coverage: {path_count} distinct paths ({unique_share:.0%} unique)")

        # Steady cadence, judged only once there are enough requests to measure it.
        cadence = features.rate_regularity
        if cadence is not None:
            if cadence < 0.5 and features.request_count >= 10:
                score += 0.2
                reasons.append("steady, regular request cadence")

        if features.referer_following_ratio > 0.3:
            score += 0.15
            followed = features.referer_following_ratio
            reasons.append(f"{followed:.0%} of requests follow on-site links")

        if features.ua_declares_bot:
            score += 0.15
            reasons.append("User-Agent self-identifies as a bot")
            # A self-declared bot that walks many pages without loading browser
            # sub-resources is a crawler even without broad unique coverage --
            # the "MyBot/1.0 (+url)" sort of client that re-requests a modest
            # set of paths.
            if features.distinct_paths >= 20 and features.asset_coload_ratio < 0.1:
                score += 0.3
                reasons.append("walks many pages without browser sub-resource loading")

        if features.ratio_2xx > 0.7 and features.asset_coload_ratio < 0.1:
            score += 0.1
            reasons.append("mostly successful page fetches, no browser sub-resource loading")

        return [self._signal(score, reasons)] if reasons else []
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Data harvesters: crawl content into a private corpus or dataset.
|
|
2
|
+
|
|
3
|
+
Commercial crawlers that ingest pages into a proprietary database for their own
|
|
4
|
+
product -- plagiarism/similarity indexes (Turnitin), data brokers and dataset
|
|
5
|
+
builders (Panscient) -- as opposed to public search, preservation, AI/LLM
|
|
6
|
+
training, or SEO. Recognised by a known UA token, like the other declared kinds.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from ..model import Kind
|
|
12
|
+
from .known_bot import KnownBotClassifier
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class DataHarvesterClassifier(KnownBotClassifier):
    """Votes ``Kind.DATA_HARVESTER`` for declared data-harvesting crawlers.

    Purely declarative: this subclass defines no methods of its own, so all
    behaviour comes from :class:`KnownBotClassifier`; only the attributes below
    are supplied here.
    """

    # The kind this classifier votes for.
    label = Kind.DATA_HARVESTER
    # Short classifier name.
    name = "data_harvester"
    # NOTE(review): presumably selects the bundled ``data_harvester`` data list --
    # confirm against known_bot.py.
    category = "data_harvester"
    # NOTE(review): presumably the human-readable label used in evidence text --
    # confirm against known_bot.py.
    descriptor = "data harvester"
|