agent-census 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. agent_census/__init__.py +23 -0
  2. agent_census/__main__.py +10 -0
  3. agent_census/classify/__init__.py +53 -0
  4. agent_census/classify/ai_crawler.py +13 -0
  5. agent_census/classify/app.py +46 -0
  6. agent_census/classify/archiver.py +13 -0
  7. agent_census/classify/base.py +37 -0
  8. agent_census/classify/browser.py +180 -0
  9. agent_census/classify/combiner.py +175 -0
  10. agent_census/classify/crawler.py +56 -0
  11. agent_census/classify/data_harvester.py +19 -0
  12. agent_census/classify/feed_reader.py +101 -0
  13. agent_census/classify/known_bot.py +51 -0
  14. agent_census/classify/monitor.py +54 -0
  15. agent_census/classify/registry.py +46 -0
  16. agent_census/classify/scraper.py +46 -0
  17. agent_census/classify/search_engine.py +13 -0
  18. agent_census/classify/seo_marketing.py +13 -0
  19. agent_census/classify/social_preview.py +13 -0
  20. agent_census/classify/spam_bot.py +44 -0
  21. agent_census/classify/tags.py +236 -0
  22. agent_census/classify/vuln_scanner.py +73 -0
  23. agent_census/cli.py +477 -0
  24. agent_census/data/__init__.py +5 -0
  25. agent_census/data/ai_crawler.toml +155 -0
  26. agent_census/data/app_clients.toml +17 -0
  27. agent_census/data/archiver.toml +30 -0
  28. agent_census/data/browser_releases.toml +33 -0
  29. agent_census/data/data_harvester.toml +27 -0
  30. agent_census/data/datacenter_ranges.toml +242 -0
  31. agent_census/data/egress_networks.toml +65 -0
  32. agent_census/data/feed_readers.toml +15 -0
  33. agent_census/data/scanner_ua.toml +9 -0
  34. agent_census/data/search_engine.toml +130 -0
  35. agent_census/data/seo_marketing.toml +69 -0
  36. agent_census/data/social_preview.toml +78 -0
  37. agent_census/data/vuln_paths.toml +23 -0
  38. agent_census/dataload.py +342 -0
  39. agent_census/egress.py +59 -0
  40. agent_census/errors.py +20 -0
  41. agent_census/features.py +599 -0
  42. agent_census/hosting.py +133 -0
  43. agent_census/identity.py +107 -0
  44. agent_census/iprange.py +390 -0
  45. agent_census/model.py +283 -0
  46. agent_census/netverify.py +371 -0
  47. agent_census/parsing/__init__.py +15 -0
  48. agent_census/parsing/apache.py +196 -0
  49. agent_census/parsing/apache_directives.py +340 -0
  50. agent_census/parsing/base.py +38 -0
  51. agent_census/parsing/cloudflare.py +114 -0
  52. agent_census/parsing/registry.py +39 -0
  53. agent_census/pipeline.py +755 -0
  54. agent_census/py.typed +0 -0
  55. agent_census/report/__init__.py +17 -0
  56. agent_census/report/aggregate.py +240 -0
  57. agent_census/report/calibrate.py +400 -0
  58. agent_census/report/format.py +266 -0
  59. agent_census/report/html.py +880 -0
  60. agent_census/report/inspect.py +203 -0
  61. agent_census/report/markdown.py +226 -0
  62. agent_census/robots/__init__.py +16 -0
  63. agent_census/robots/compliance.py +118 -0
  64. agent_census/robots/parser.py +55 -0
  65. agent_census/robots/source.py +69 -0
  66. agent_census/uas.py +292 -0
  67. agent_census/userconfig.py +42 -0
  68. agent_census-0.0.1.dist-info/METADATA +266 -0
  69. agent_census-0.0.1.dist-info/RECORD +73 -0
  70. agent_census-0.0.1.dist-info/WHEEL +5 -0
  71. agent_census-0.0.1.dist-info/entry_points.txt +2 -0
  72. agent_census-0.0.1.dist-info/licenses/LICENSE.md +19 -0
  73. agent_census-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,23 @@
1
+ __version__ = "0.0.1"
2
+ __author__ = "Mark Nottingham <mnot@mnot.net>"
3
+ __copyright__ = """\
4
+ Copyright (c) Mark Nottingham
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in
14
+ all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ THE SOFTWARE.
23
+ """
@@ -0,0 +1,10 @@
1
+ """Enable ``python -m agent_census``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+
7
+ from .cli import main
8
+
9
if __name__ == "__main__":
    # Run the CLI and hand its return value to the interpreter as the exit status.
    exit_status = main()
    sys.exit(exit_status)
@@ -0,0 +1,53 @@
1
+ """Client classification: independent rule-based classifiers + a combiner.
2
+
3
+ The public entry point is :func:`classify_client`, which runs every registered
4
+ classifier over a client's features and combines their signals into a single
5
+ :class:`~agent_census.model.Classification` (primary kind plus tags).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from ..model import BotVerification, Classification, ClientFeatures, ComplianceReport, Signal
11
+ from .base import Classifier
12
+ from .combiner import DEFAULT_UNKNOWN_THRESHOLD, combine
13
+ from .registry import all_classifiers
14
+
15
+
16
def run_classifiers(features: ClientFeatures) -> list[Signal]:
    """Gather the signals every registered classifier emits for one client.

    Classifiers are consulted in the order ``all_classifiers()`` yields them,
    and each classifier's signals keep the order in which it returned them.
    """
    return [
        signal
        for classifier in all_classifiers()
        for signal in classifier.evaluate(features)
    ]
22
+
23
+
24
def classify_client(
    features: ClientFeatures,
    *,
    compliance: ComplianceReport | None = None,
    verification: BotVerification | None = None,
    datacenter: bool = False,
    unknown_threshold: float = DEFAULT_UNKNOWN_THRESHOLD,
    keep_signals: bool = True,
) -> Classification:
    """Classify one client: collect every classifier's signals, then combine them.

    All keyword arguments are forwarded unchanged to :func:`combine`; this
    function adds no logic of its own beyond running the classifiers first.
    """
    combine_options = {
        "compliance": compliance,
        "verification": verification,
        "datacenter": datacenter,
        "unknown_threshold": unknown_threshold,
        "keep_signals": keep_signals,
    }
    collected = run_classifiers(features)
    return combine(collected, features, **combine_options)
44
+
45
+
46
+ __all__ = [
47
+ "Classifier",
48
+ "classify_client",
49
+ "run_classifiers",
50
+ "combine",
51
+ "all_classifiers",
52
+ "DEFAULT_UNKNOWN_THRESHOLD",
53
+ ]
@@ -0,0 +1,13 @@
1
+ """Declared AI / LLM data-gathering crawlers (GPTBot, ClaudeBot, Google-Extended, ...)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from ..model import Kind
6
+ from .known_bot import KnownBotClassifier
7
+
8
+
9
class AiCrawlerClassifier(KnownBotClassifier):
    """Known-bot classifier configured for declared AI / LLM crawlers.

    Only class attributes are set here; the evaluation logic is whatever
    ``KnownBotClassifier`` provides.
    """

    # Provenance name recorded on emitted signals, and the kind voted for.
    name = "ai_crawler"
    label = Kind.AI_CRAWLER
    # Consumed by KnownBotClassifier -- presumably the data-list key and the
    # human-readable wording for evidence; confirm against known_bot.py.
    category = "ai_crawler"
    descriptor = "AI / LLM crawler"
@@ -0,0 +1,46 @@
1
+ """Native mobile / desktop app clients.
2
+
3
+ Not a browser and not a crawler: a first-party app making requests through a
4
+ platform networking stack (Apple's CFNetwork, Flutter's dart:io, …) or a named
5
+ networking framework. The User-Agent names the stack, not a browser engine, so
6
+ these otherwise fall through to UNKNOWN despite being ordinary app traffic.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ from functools import lru_cache
13
+
14
+ from ..dataload import load_list
15
+ from ..model import ClientFeatures, Kind, Signal
16
+ from .base import Classifier
17
+ from .tags import identifies_as_known_agent
18
+
19
# Alternation of every configured native-app token, matched case-insensitively.
# Empty entries are dropped, and an empty list compiles to the never-matching
# pattern ``(?!)``: an empty pattern (or an empty alternative) would otherwise
# match *every* User-Agent and label every client as a native app.
_APP_TOKENS = re.compile(
    "|".join(re.escape(token) for token in load_list("app_clients") if token) or r"(?!)",
    re.I,
)


@lru_cache(maxsize=16384)
def app_stack_token(ua: str | None) -> str | None:
    """Return the native-app networking token found in ``ua``, or ``None``.

    The returned text is the matched span of ``ua`` itself (so it keeps the
    User-Agent's own casing). ``None`` is returned for a missing or empty
    User-Agent and when no configured token occurs in it. Results are memoised
    per User-Agent string.
    """
    if not ua:
        return None
    match = _APP_TOKENS.search(ua)
    return match.group(0) if match else None
29
+
30
+
31
class AppClientClassifier(Classifier):
    """Votes ``Kind.APP`` when the User-Agent names a native-app networking stack."""

    name = "app"
    label = Kind.APP

    def evaluate(self, features: ClientFeatures) -> list[Signal]:
        """Return one APP signal if an app-stack token is present, else nothing."""
        # The networking stack is the *weakest* identity a UA can carry: when the
        # UA also names a feed reader, crawler, or bot, that more specific
        # identity wins (a feed reader on CFNetwork is a feed reader, not just
        # "an app"), so the token lookup is skipped entirely.
        if identifies_as_known_agent(features):
            token = None
        else:
            token = app_stack_token(features.user_agent)
        if token is None:
            return []
        # A single match is treated as sufficient on its own: 0.65 is above the
        # default unknown threshold.
        reason = f"native-app networking stack in User-Agent ({token})"
        return [self._signal(0.65, [reason])]
@@ -0,0 +1,13 @@
1
+ """Web-archiving / preservation crawlers (Internet Archive / Wayback Machine)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from ..model import Kind
6
+ from .known_bot import KnownBotClassifier
7
+
8
+
9
class ArchiverClassifier(KnownBotClassifier):
    """Known-bot classifier configured for web-archiving crawlers.

    Only class attributes are set here; the evaluation logic is whatever
    ``KnownBotClassifier`` provides.
    """

    # Provenance name recorded on emitted signals, and the kind voted for.
    name = "archiver"
    label = Kind.ARCHIVER
    # Consumed by KnownBotClassifier -- presumably the data-list key and the
    # human-readable wording for evidence; confirm against known_bot.py.
    category = "archiver"
    descriptor = "web archiver"
@@ -0,0 +1,37 @@
1
+ """The classifier contract.
2
+
3
+ A classifier is a pure function of :class:`ClientFeatures`: it reads only the
4
+ feature vector (and its own static data lists) and emits zero or more
5
+ :class:`Signal` votes for the kind it argues for. It never imports another
6
+ classifier and never sees the final decision — that keeps each one independently
7
+ testable and free to evolve. "This client is NOT a browser" is expressed simply
8
+ by the browser classifier not firing.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from abc import ABC, abstractmethod
14
+
15
+ from ..model import ClientFeatures, Kind, Signal
16
+
17
+
18
class Classifier(ABC):
    """Argues, from features alone, that a client is of a particular kind."""

    #: the kind this classifier votes for
    label: Kind = Kind.UNKNOWN
    #: short stable name, recorded on each signal for provenance
    name: str = ""

    @abstractmethod
    def evaluate(self, features: ClientFeatures) -> list[Signal]:
        """Return signals supporting :attr:`label` (possibly empty)."""

    def _signal(self, confidence: float, evidence: list[str]) -> Signal:
        """Build a signal for this classifier's label.

        ``confidence`` is clamped to ``[0.0, 1.0]``: classifiers accumulate it
        from bonuses and penalties, so a penalty-only path can drive the running
        total below zero, and a negative strength is meaningless downstream.
        ``evidence`` is frozen into a tuple so the signal does not alias the
        caller's list.
        """
        bounded = max(0.0, min(confidence, 1.0))
        return Signal(
            kind=self.label,
            confidence=bounded,
            evidence=tuple(evidence),
            classifier=self.name,
        )
@@ -0,0 +1,180 @@
1
+ """Interactive browsers.
2
+
3
+ The strongest tell is sub-resource co-loading: a real browser, after fetching a
4
+ page, pulls its CSS/JS/images within seconds. Bursty (irregular) timing, on-site
5
+ link navigation, a browser-shaped UA, and a low error rate corroborate it.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from .. import uas
11
+ from ..model import ClientFeatures, Kind, Signal
12
+ from .base import Classifier
13
+ from .tags import identifies_as_known_agent
14
+
15
+ # A browser-shaped UA with positive evidence but no disqualifying behaviour is
16
+ # floored to here, so a brief, asset-less visit (a real person who didn't trigger
17
+ # sub-resource loading) isn't lost to UNKNOWN. It matches the default unknown
18
+ # threshold; the combiner's datacenter discount then keeps the rescue to
19
+ # residential clients -- a hosting "browser" still drops to spoofed_browser.
20
+ _BROWSER_FLOOR = 0.45
21
+
22
+
23
class BrowserClassifier(Classifier):
    """Votes ``Kind.BROWSER`` from co-loading, timing, navigation and UA evidence.

    Positive observations add to a running confidence. Observations that argue
    against a browser either subtract from it or cap it at 0.3, and most of them
    also mark the client as disqualified from the ``_BROWSER_FLOOR`` rescue.
    """

    label = Kind.BROWSER
    name = "browser"

    def evaluate(self, features: ClientFeatures) -> list[Signal]:
        """Return at most one browser signal for ``features``.

        Returns an empty list when the User-Agent declares a known agent or when
        no observation produced any evidence. Every "argues against" check that
        only makes sense on top of an existing browser case is gated on
        ``evidence`` already being non-empty, so a client with no browser
        evidence at all never receives a browser signal made solely of
        counter-evidence.
        """
        # A UA that names a feed reader, crawler, or bot is not a browser, even if
        # it renders pages and co-loads their sub-resources -- its declared
        # identity wins.
        if identifies_as_known_agent(features):
            return []

        confidence = 0.0
        evidence: list[str] = []
        # Set when a signal positively argues *against* a browser (a cap or a
        # penalty). While clean, an under-supported browser shape is floored at the
        # end; once disqualified, it is left to fall where its confidence lands. A
        # stale (but not ancient) version is a mild nudge, not a disqualifier.
        disqualified = False

        if features.asset_coload_ratio > 0.4:
            confidence += 0.45
            evidence.append(
                f"{features.asset_coload_ratio:.0%} of pages followed by sub-resource loads"
            )

        if features.ua_looks_like_browser:
            confidence += 0.2
            evidence.append("User-Agent matches a real browser profile")

        regularity = features.rate_regularity
        if regularity is not None and regularity > 0.6:
            confidence += 0.1
            evidence.append("irregular, bursty timing (human-like)")
        elif regularity is not None and regularity < 0.15 and features.request_count >= 5:
            # Metronomic cadence is a machine, not a person clicking around.
            confidence -= 0.2
            disqualified = True
            evidence.append("metronomic timing — automated, not human")

        if features.referer_following_ratio > 0.3:
            confidence += 0.1
            evidence.append(
                f"{features.referer_following_ratio:.0%} of requests follow on-site links"
            )

        if features.ratio_404 < 0.1 and features.vuln_path_hits == 0:
            confidence += 0.1
            evidence.append("low error rate, no probing")

        if features.static_ratio > 0.3:
            confidence += 0.05
            evidence.append(f"{features.static_ratio:.0%} static-asset requests")

        if features.status_counts.get(304, 0) > 0:
            # Conditional requests answered 304 mean a real cache -- a browser tell.
            confidence += 0.1
            evidence.append("revalidates from cache (304 Not Modified)")

        # Chromium-based browsers and Firefox auto-update on a ~monthly cadence, so
        # a real browser is rarely far behind. A UA claiming a version years old
        # (measured against when the client was active) is almost always a frozen,
        # spoofed string; a current one weakly corroborates. Bots can copy a recent
        # UA, so the fresh bonus is small and the stale penalty is the load-bearing
        # half. Old but real cases exist (locked fleets, embedded WebViews, ESR),
        # so only a very old version caps the verdict.
        band = uas.version_age_band(features.user_agent, features.last_seen)
        if evidence and band == "current":
            confidence += 0.1
            evidence.append("up-to-date browser version")
        elif evidence and band == "stale":
            confidence -= 0.15
            evidence.append("browser version well out of date")
        elif evidence and band == "ancient":
            # Years behind on a family that auto-updates: almost always a frozen,
            # spoofed UA, so cap the browser hypothesis below the threshold.
            confidence = min(confidence, 0.3)
            disqualified = True
            evidence.append("browser version years out of date — modern browsers auto-update")
        elif evidence and band == "impossible":
            # Claims a version that doesn't exist yet: a forged UA, not a real
            # browser, so cap the hypothesis just like an ancient one.
            confidence = min(confidence, 0.3)
            disqualified = True
            evidence.append("browser version is impossibly new — forged User-Agent")

        # A browser never auto-fetches /robots.txt; checking it is a crawler's
        # habit. Slight on its own (a person could type the URL once), but it
        # nudges an otherwise browser-shaped client the right way.
        if evidence and features.fetched_robots_txt:
            confidence -= 0.15
            disqualified = True
            evidence.append("fetched /robots.txt — a crawler's habit, not a browser's")

        # A browser revalidates what it re-requests (earning 304s) and serves
        # cached assets without hitting the server at all -- so at any real volume
        # a genuine browser leaves some 304s behind, or simply makes few requests.
        # A client that re-fetches the same URLs, or just makes a large number of
        # requests, yet never receives a single 304 holds no cache: not browser
        # behaviour. Only meaningful once paths are measured, and a 304 can only
        # arise from a re-request, so distinct-once fetching at low volume is spared.
        revisits = features.request_count - features.distinct_paths
        no_304 = features.status_counts.get(304, 0) == 0
        cold_refetch = revisits >= 20
        high_volume = features.request_count >= 500
        if evidence and no_304 and features.distinct_paths > 0 and (cold_refetch or high_volume):
            dominant = (cold_refetch and revisits >= features.request_count * 0.5) or (
                features.request_count >= 2000
            )
            disqualified = True
            if dominant:
                # Re-fetching dominates, or the volume is large enough that zero
                # revalidations is itself damning: cap below the confident threshold.
                confidence = min(confidence, 0.3)
                evidence.append("heavy traffic without a single 304 — holds no browser cache")
            else:
                confidence -= 0.2
                evidence.append("many requests, never revalidated (no 304s)")

        # A person at a browser never fetches attack paths. Vuln probing or
        # directory traversal means this is automation wearing a browser engine
        # (e.g. headless Chrome), so cap the browser hypothesis below the unknown
        # threshold rather than let asset co-loading carry it to a confident
        # verdict. (Ignoring robots.txt is NOT penalised -- it does not bind a
        # human browsing by hand.) Gated on an existing browser signal, like the
        # other counter-evidence checks: without one there is no browser
        # hypothesis to cap, and the client should get no browser signal at all.
        if evidence and (features.traversal_hits > 0 or features.vuln_path_hits >= 2):
            confidence = min(confidence, 0.3)
            disqualified = True
            evidence.append("but probes attack paths — not human browsing")

        # Fabricated referers (the Referer is the requested URL itself) are
        # impossible from real navigation; a client doing this systematically is
        # faking organic traffic, not browsing. Gated on an existing browser
        # signal for the same reason as the attack-path check above.
        if evidence and features.self_referer_ratio >= 0.5 and features.request_count >= 4:
            confidence = min(confidence, 0.3)
            disqualified = True
            evidence.append("but referers are fabricated (Referer = the requested URL)")

        # A browser fetches pages and their sub-resources with GET; it does not
        # issue HEAD. Meaningful HEAD traffic from something otherwise browser-
        # shaped is a machine (a monitor, link-checker, or other bot) behind a
        # browser UA -- cap the hypothesis below the confident threshold. Gated on
        # an existing browser signal so monitors/feed readers that legitimately
        # HEAD don't each pick up a spurious browser signal.
        if evidence and features.head_ratio > 0.1:
            confidence = min(confidence, 0.3)
            disqualified = True
            evidence.append(f"but {features.head_ratio:.0%} HEAD requests — browsers issue GET")

        if not evidence:
            return []
        # A browser-shaped UA with positive evidence and nothing arguing against it
        # is a probable browser even without the asset-loading proof -- a brief
        # visit. Floor it so it clears the unknown threshold rather than being lost;
        # ancient/forged UAs and any non-browser behaviour disqualify it above.
        if features.ua_looks_like_browser and not disqualified and confidence < _BROWSER_FLOOR:
            confidence = _BROWSER_FLOOR
            evidence.append("browser-shaped User-Agent with no non-browser behaviour")
        return [self._signal(confidence, evidence)]
@@ -0,0 +1,175 @@
1
+ """Combine classifier signals into a final classification.
2
+
3
+ Confidence is treated as an ordinal strength per label, not a probability, so
4
+ signals are aggregated per kind (taking the strongest) rather than multiplied.
5
+ The strongest label wins; ties break by a fixed priority. Below a threshold the
6
+ honest answer is ``UNKNOWN``. Tags are derived separately and can demote a
7
+ falsely-claimed good-bot identity.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from .. import uas
13
+ from ..model import (
14
+ BotVerification,
15
+ Classification,
16
+ ClientFeatures,
17
+ ComplianceReport,
18
+ Kind,
19
+ Signal,
20
+ )
21
+ from .tags import derive_tags, impersonation, looks_like_fake_browser
22
+
23
# Tie-break order when two kinds share the top confidence: earlier wins.
# Every kind a classifier can vote for should appear here; a kind that is
# missing falls back to the lowest rank (below UNKNOWN) in ``_pick``.
_PRIORITY: tuple[Kind, ...] = (
    Kind.IMPERSONATOR,
    Kind.SEARCH_ENGINE,
    Kind.SOCIAL_PREVIEW,
    Kind.ARCHIVER,
    Kind.AI_CRAWLER,
    Kind.SEO_MARKETING,
    # Declared by a known UA token like the kinds above, so it ties like them.
    Kind.DATA_HARVESTER,
    Kind.VULN_SCANNER,
    Kind.SPAM_BOT,
    Kind.FEED_READER,
    Kind.MONITOR,
    Kind.BROWSER,
    Kind.APP,
    Kind.SCRAPER,
    Kind.CRAWLER,
    Kind.SPOOFED_BROWSER,
    Kind.SINGLETON,
    Kind.UNKNOWN,
)
# Position of each kind in the tie-break order (0 = highest priority).
_RANK = {kind: i for i, kind in enumerate(_PRIORITY)}
44
+
45
+ DEFAULT_UNKNOWN_THRESHOLD = 0.45
46
+
47
+
48
def _pick(by_label: dict[Kind, float]) -> Kind:
    """Return the kind with the highest strength, ties broken by ``_PRIORITY``.

    A kind absent from ``_PRIORITY`` is ranked after every listed kind.
    """
    unlisted_rank = len(_PRIORITY)

    def sort_key(kind: Kind) -> tuple[float, int]:
        # Higher strength first; among equals, the smaller rank (earlier
        # priority) must compare larger, hence the negation.
        return (by_label[kind], -_RANK.get(kind, unlisted_rank))

    return max(by_label, key=sort_key)
50
+
51
+
52
+ def _top_evidence(signals: tuple[Signal, ...]) -> tuple[str, ...]:
53
+ if not signals:
54
+ return ("no classifier produced a signal",)
55
+ strongest = max(signals, key=lambda s: s.confidence)
56
+ return strongest.evidence or ("no specific evidence recorded",)
57
+
58
+
59
def combine(
    signals: list[Signal],
    features: ClientFeatures,
    *,
    compliance: ComplianceReport | None = None,
    verification: BotVerification | None = None,
    datacenter: bool = False,
    unknown_threshold: float = DEFAULT_UNKNOWN_THRESHOLD,
    keep_signals: bool = True,
) -> Classification:
    """Aggregate ``signals`` into a primary kind plus secondary tags.

    Decision order: impersonation first; then, if no kind reaches
    ``unknown_threshold``, the spoofed-browser, datacenter-scraper, singleton
    and unknown fallbacks in that order; otherwise the strongest kind wins.

    ``keep_signals`` retains every contributing signal on the result for inspect
    mode's rationale. The ``analyze`` report never reads them, so it passes
    ``False`` to avoid holding a Signal (and its evidence strings) per client.
    """
    strengths: dict[Kind, float] = {}
    for sig in signals:
        # Round when aggregating: classifiers build confidence from 0.05-step
        # increments, and float error (0.3 + 0.15 == 0.44999999999999996) would
        # otherwise drop a sum that equals the threshold just below it --
        # misfiling a clear client as UNKNOWN while it still displays as the
        # rounded percentage.
        previous = strengths.get(sig.kind, 0.0)
        strengths[sig.kind] = max(previous, round(sig.confidence, 3))

    # A person rarely browses from hosting infrastructure, so nudge a datacenter
    # "browser" verdict down a little -- enough to tip a borderline one, not to
    # overrule a strongly-behaving real browser.
    if datacenter and Kind.BROWSER in strengths:
        discounted = max(0.0, strengths[Kind.BROWSER] - 0.1)
        strengths[Kind.BROWSER] = round(discounted, 3)

    tags = derive_tags(features, compliance, verification, datacenter=datacenter)
    stored = tuple(signals) if keep_signals else ()

    def verdict(primary: Kind, confidence: float, evidence: tuple[str, ...]) -> Classification:
        # Tags are frozen at call time, so a tag added just before returning is
        # included.
        return Classification(
            primary=primary,
            confidence=confidence,
            tags=frozenset(tags),
            evidence=evidence,
            all_signals=stored,
        )

    # Impersonation is decisive: a client faking a declared identity is an
    # impersonator, whatever else it looks like.
    faking, why = impersonation(verification)
    if faking:
        return verdict(Kind.IMPERSONATOR, 0.9, why)

    top_strength = max(strengths.values(), default=0.0)
    if not strengths or top_strength < unknown_threshold:
        if datacenter:
            # A would-be-unknown client wearing a browser UA from a hosting IP,
            # with no browser behaviour, is automation in disguise -- name it.
            if looks_like_fake_browser(features):
                return verdict(
                    Kind.SPOOFED_BROWSER,
                    0.6,
                    ("browser User-Agent from a datacenter IP, without browser behaviour",),
                )
            # A generic HTTP library (or no UA) fetching several pages from
            # hosting infrastructure is harvesting content -- a scraper -- even
            # when no single signal cleared the bar. The datacenter origin is
            # what tips it: the same library from a residential IP could be an
            # app or a one-off script.
            if _looks_like_datacenter_scraper(features):
                return verdict(
                    Kind.SCRAPER,
                    0.5,
                    ("generic HTTP client harvesting pages from a datacenter IP",),
                )
        # A would-be-unknown client with a single request gets its own bucket:
        # one hit is too little to characterize, so it is filed by volume.
        if features.request_count == 1:
            return verdict(
                Kind.SINGLETON,
                1.0,
                ("single request — too little activity to characterize",),
            )
        return verdict(Kind.UNKNOWN, top_strength, _top_evidence(tuple(signals)))

    primary = _pick(strengths)
    if primary is Kind.FEED_READER and _fetches_non_feeds(features):
        tags.add("fetches-non-feeds")
    # Evidence from every signal that voted for the winning kind, in signal order.
    winning_evidence: list[str] = []
    for sig in signals:
        if sig.kind is primary:
            winning_evidence.extend(sig.evidence)
    return verdict(primary, strengths[primary], tuple(winning_evidence))
157
+
158
+
159
+ def _looks_like_datacenter_scraper(features: ClientFeatures) -> bool:
160
+ """A generic-library / UA-less client harvesting several pages, benignly."""
161
+ return (
162
+ features.request_count >= 2
163
+ and features.distinct_paths >= 2
164
+ and (uas.is_library(features.user_agent) or features.ua_empty)
165
+ and features.vuln_path_hits == 0
166
+ and features.traversal_hits == 0
167
+ )
168
+
169
+
170
+ def _fetches_non_feeds(features: ClientFeatures) -> bool:
171
+ """True if a feed reader also requested non-feed resources (robots.txt aside)."""
172
+ non_feed = features.request_count - features.feed_requests
173
+ if features.fetched_robots_txt:
174
+ non_feed -= 1 # a polite robots.txt fetch does not count as content scraping
175
+ return non_feed > 0
@@ -0,0 +1,56 @@
1
+ """Generic crawlers that walk the site following links at a steady pace.
2
+
3
+ Distinguished from a browser by the absence of sub-resource co-loading, and from
4
+ a scraper by actually following on-site links (high referer-following) rather
5
+ than hitting URLs cold.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from ..model import ClientFeatures, Kind, Signal
11
+ from .base import Classifier
12
+
13
+
14
class CrawlerClassifier(Classifier):
    """Votes ``Kind.CRAWLER`` from coverage, cadence, link-following and UA evidence."""

    name = "crawler"
    label = Kind.CRAWLER

    def evaluate(self, features: ClientFeatures) -> list[Signal]:
        """Return one crawler signal built from every matching observation, or nothing."""
        # Each finding is (confidence weight, evidence text), kept in check order.
        findings: list[tuple[float, str]] = []

        many_paths = features.distinct_paths >= 20
        no_coload = features.asset_coload_ratio < 0.1

        if many_paths and features.coverage > 0.7:
            findings.append(
                (
                    0.3,
                    f"broad coverage: {features.distinct_paths} distinct paths "
                    f"({features.coverage:.0%} unique)",
                )
            )

        cadence = features.rate_regularity
        if cadence is not None and cadence < 0.5 and features.request_count >= 10:
            findings.append((0.2, "steady, regular request cadence"))

        if features.referer_following_ratio > 0.3:
            findings.append(
                (0.15, f"{features.referer_following_ratio:.0%} of requests follow on-site links")
            )

        if features.ua_declares_bot:
            findings.append((0.15, "User-Agent self-identifies as a bot"))
            # A self-declared bot walking many pages with no browser sub-resource
            # loading is a crawler even without broad unique coverage -- the kind of
            # "MyBot/1.0 (+url)" client that re-requests a modest path set.
            if many_paths and no_coload:
                findings.append((0.3, "walks many pages without browser sub-resource loading"))

        if features.ratio_2xx > 0.7 and no_coload:
            findings.append(
                (0.1, "mostly successful page fetches, no browser sub-resource loading")
            )

        if not findings:
            return []

        # Accumulate one weight at a time, in check order, so the floating-point
        # total is the same as adding each bonus as it is found.
        confidence = 0.0
        for weight, _ in findings:
            confidence += weight
        return [self._signal(confidence, [reason for _, reason in findings])]
@@ -0,0 +1,19 @@
1
+ """Data harvesters: crawl content into a private corpus or dataset.
2
+
3
+ Commercial crawlers that ingest pages into a proprietary database for their own
4
+ product -- plagiarism/similarity indexes (Turnitin), data brokers and dataset
5
+ builders (Panscient) -- as opposed to public search, preservation, AI/LLM
6
+ training, or SEO. Recognised by a known UA token, like the other declared kinds.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from ..model import Kind
12
+ from .known_bot import KnownBotClassifier
13
+
14
+
15
class DataHarvesterClassifier(KnownBotClassifier):
    """Known-bot classifier configured for data-harvesting crawlers.

    Only class attributes are set here; the evaluation logic is whatever
    ``KnownBotClassifier`` provides.
    """

    # Provenance name recorded on emitted signals, and the kind voted for.
    name = "data_harvester"
    label = Kind.DATA_HARVESTER
    # Consumed by KnownBotClassifier -- presumably the data-list key and the
    # human-readable wording for evidence; confirm against known_bot.py.
    category = "data_harvester"
    descriptor = "data harvester"