agent-census 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agent_census-0.0.1/LICENSE.md +19 -0
- agent_census-0.0.1/MANIFEST.in +9 -0
- agent_census-0.0.1/PKG-INFO +266 -0
- agent_census-0.0.1/README.md +241 -0
- agent_census-0.0.1/agent_census/__init__.py +23 -0
- agent_census-0.0.1/agent_census/__main__.py +10 -0
- agent_census-0.0.1/agent_census/classify/__init__.py +53 -0
- agent_census-0.0.1/agent_census/classify/ai_crawler.py +13 -0
- agent_census-0.0.1/agent_census/classify/app.py +46 -0
- agent_census-0.0.1/agent_census/classify/archiver.py +13 -0
- agent_census-0.0.1/agent_census/classify/base.py +37 -0
- agent_census-0.0.1/agent_census/classify/browser.py +180 -0
- agent_census-0.0.1/agent_census/classify/combiner.py +175 -0
- agent_census-0.0.1/agent_census/classify/crawler.py +56 -0
- agent_census-0.0.1/agent_census/classify/data_harvester.py +19 -0
- agent_census-0.0.1/agent_census/classify/feed_reader.py +101 -0
- agent_census-0.0.1/agent_census/classify/known_bot.py +51 -0
- agent_census-0.0.1/agent_census/classify/monitor.py +54 -0
- agent_census-0.0.1/agent_census/classify/registry.py +46 -0
- agent_census-0.0.1/agent_census/classify/scraper.py +46 -0
- agent_census-0.0.1/agent_census/classify/search_engine.py +13 -0
- agent_census-0.0.1/agent_census/classify/seo_marketing.py +13 -0
- agent_census-0.0.1/agent_census/classify/social_preview.py +13 -0
- agent_census-0.0.1/agent_census/classify/spam_bot.py +44 -0
- agent_census-0.0.1/agent_census/classify/tags.py +236 -0
- agent_census-0.0.1/agent_census/classify/vuln_scanner.py +73 -0
- agent_census-0.0.1/agent_census/cli.py +477 -0
- agent_census-0.0.1/agent_census/data/__init__.py +5 -0
- agent_census-0.0.1/agent_census/data/ai_crawler.toml +155 -0
- agent_census-0.0.1/agent_census/data/app_clients.toml +17 -0
- agent_census-0.0.1/agent_census/data/archiver.toml +30 -0
- agent_census-0.0.1/agent_census/data/browser_releases.toml +33 -0
- agent_census-0.0.1/agent_census/data/data_harvester.toml +27 -0
- agent_census-0.0.1/agent_census/data/datacenter_ranges.toml +242 -0
- agent_census-0.0.1/agent_census/data/egress_networks.toml +65 -0
- agent_census-0.0.1/agent_census/data/feed_readers.toml +15 -0
- agent_census-0.0.1/agent_census/data/scanner_ua.toml +9 -0
- agent_census-0.0.1/agent_census/data/search_engine.toml +130 -0
- agent_census-0.0.1/agent_census/data/seo_marketing.toml +69 -0
- agent_census-0.0.1/agent_census/data/social_preview.toml +78 -0
- agent_census-0.0.1/agent_census/data/vuln_paths.toml +23 -0
- agent_census-0.0.1/agent_census/dataload.py +342 -0
- agent_census-0.0.1/agent_census/egress.py +59 -0
- agent_census-0.0.1/agent_census/errors.py +20 -0
- agent_census-0.0.1/agent_census/features.py +599 -0
- agent_census-0.0.1/agent_census/hosting.py +133 -0
- agent_census-0.0.1/agent_census/identity.py +107 -0
- agent_census-0.0.1/agent_census/iprange.py +390 -0
- agent_census-0.0.1/agent_census/model.py +283 -0
- agent_census-0.0.1/agent_census/netverify.py +371 -0
- agent_census-0.0.1/agent_census/parsing/__init__.py +15 -0
- agent_census-0.0.1/agent_census/parsing/apache.py +196 -0
- agent_census-0.0.1/agent_census/parsing/apache_directives.py +340 -0
- agent_census-0.0.1/agent_census/parsing/base.py +38 -0
- agent_census-0.0.1/agent_census/parsing/cloudflare.py +114 -0
- agent_census-0.0.1/agent_census/parsing/registry.py +39 -0
- agent_census-0.0.1/agent_census/pipeline.py +755 -0
- agent_census-0.0.1/agent_census/py.typed +0 -0
- agent_census-0.0.1/agent_census/report/__init__.py +17 -0
- agent_census-0.0.1/agent_census/report/aggregate.py +240 -0
- agent_census-0.0.1/agent_census/report/calibrate.py +400 -0
- agent_census-0.0.1/agent_census/report/format.py +266 -0
- agent_census-0.0.1/agent_census/report/html.py +880 -0
- agent_census-0.0.1/agent_census/report/inspect.py +203 -0
- agent_census-0.0.1/agent_census/report/markdown.py +226 -0
- agent_census-0.0.1/agent_census/robots/__init__.py +16 -0
- agent_census-0.0.1/agent_census/robots/compliance.py +118 -0
- agent_census-0.0.1/agent_census/robots/parser.py +55 -0
- agent_census-0.0.1/agent_census/robots/source.py +69 -0
- agent_census-0.0.1/agent_census/uas.py +292 -0
- agent_census-0.0.1/agent_census/userconfig.py +42 -0
- agent_census-0.0.1/agent_census.egg-info/SOURCES.txt +90 -0
- agent_census-0.0.1/pyproject.toml +63 -0
- agent_census-0.0.1/setup.cfg +4 -0
- agent_census-0.0.1/tests/test_actor_groups.py +157 -0
- agent_census-0.0.1/tests/test_apache_parser.py +236 -0
- agent_census-0.0.1/tests/test_calibrate.py +222 -0
- agent_census-0.0.1/tests/test_classify.py +1074 -0
- agent_census-0.0.1/tests/test_cli.py +251 -0
- agent_census-0.0.1/tests/test_cloudflare.py +76 -0
- agent_census-0.0.1/tests/test_dataload.py +118 -0
- agent_census-0.0.1/tests/test_egress.py +98 -0
- agent_census-0.0.1/tests/test_features.py +200 -0
- agent_census-0.0.1/tests/test_format.py +116 -0
- agent_census-0.0.1/tests/test_hosting.py +94 -0
- agent_census-0.0.1/tests/test_html.py +180 -0
- agent_census-0.0.1/tests/test_identity.py +46 -0
- agent_census-0.0.1/tests/test_inspect_select.py +54 -0
- agent_census-0.0.1/tests/test_iprange.py +119 -0
- agent_census-0.0.1/tests/test_netverify.py +350 -0
- agent_census-0.0.1/tests/test_network_matrix.py +84 -0
- agent_census-0.0.1/tests/test_pipeline_e2e.py +624 -0
- agent_census-0.0.1/tests/test_robots.py +95 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Copyright (c) Mark Nottingham
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
5
|
+
in the Software without restriction, including without limitation the rights
|
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
8
|
+
furnished to do so, subject to the following conditions:
|
|
9
|
+
|
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
|
11
|
+
all copies or substantial portions of the Software.
|
|
12
|
+
|
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
19
|
+
THE SOFTWARE.
|
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agent-census
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Characterize the clients hitting a web site by analyzing its access logs.
|
|
5
|
+
Author-email: Mark Nottingham <mnot@mnot.net>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: homepage, https://github.com/mnot/agent-census
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE.md
|
|
13
|
+
Requires-Dist: tomli; python_version < "3.11"
|
|
14
|
+
Provides-Extra: dev
|
|
15
|
+
Requires-Dist: mypy; extra == "dev"
|
|
16
|
+
Requires-Dist: black; extra == "dev"
|
|
17
|
+
Requires-Dist: isort; extra == "dev"
|
|
18
|
+
Requires-Dist: pylint; extra == "dev"
|
|
19
|
+
Requires-Dist: pytest; extra == "dev"
|
|
20
|
+
Requires-Dist: pytest-md; extra == "dev"
|
|
21
|
+
Requires-Dist: validate-pyproject; extra == "dev"
|
|
22
|
+
Requires-Dist: build; extra == "dev"
|
|
23
|
+
Requires-Dist: tomli; extra == "dev"
|
|
24
|
+
Dynamic: license-file
|
|
25
|
+
|
|
26
|
+
# agent-census
|
|
27
|
+
|
|
28
|
+
*What's hitting your site, classified by how it behaves -- not just what it claims to be.*
|
|
29
|
+
|
|
30
|
+
Most of the traffic to a typical site isn't people; it's software, and a fair bit
|
|
31
|
+
of it lies about what it is. agent-census reads your access log and sorts the
|
|
32
|
+
clients by what they actually do -- whether they pull a page's sub-resources like
|
|
33
|
+
a browser, walk the site like a crawler, poll a feed on a schedule, or go looking
|
|
34
|
+
for known-vulnerable paths. Anything claiming to be a known crawler is checked
|
|
35
|
+
against DNS and published address ranges, so a Googlebot arriving from some random
|
|
36
|
+
datacentre gets called what it is. What you end up with is your traffic broken
|
|
37
|
+
down by what each client is for. The User-Agent still counts -- it's just treated
|
|
38
|
+
as a claim to weigh against behaviour and origin, not a fact to take on trust.
|
|
39
|
+
|
|
40
|
+
[Here's a sample report](https://projects.mnot.net/agent-census/) generated from a real access log.
|
|
41
|
+
|
|
42
|
+
## Install
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
pipx install agent-census
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Use
|
|
49
|
+
|
|
50
|
+
The simplest case is an Apache log in the default `combined` format:
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
agent-census analyze /var/log/apache2/access.log
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
You can pass several rotated logs at once. They're pooled into one analysis, so a
|
|
57
|
+
client that spans the rotation is counted once:
|
|
58
|
+
|
|
59
|
+
```
|
|
60
|
+
agent-census analyze /var/log/httpd/access.log*
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
For a custom format, pass the `LogFormat`/`CustomLog` directive string verbatim
|
|
64
|
+
from your Apache config. Tab separators (`\t`), quoted fields with spaces,
|
|
65
|
+
`%{...}x` SSL variables, and `%{...}e` environment variables are all handled:
|
|
66
|
+
|
|
67
|
+
```
|
|
68
|
+
agent-census analyze access.log \
|
|
69
|
+
--log-format '%h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-Agent}i" %D'
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
The presets `common`, `combined`, and `vhost_combined` are available via
|
|
73
|
+
`--log-format-preset`. Options may appear before, after, or between the log files.
|
|
74
|
+
|
|
75
|
+
Cloudflare Logpush logs (newline-delimited JSON) are also supported, as another
|
|
76
|
+
preset:
|
|
77
|
+
|
|
78
|
+
```
|
|
79
|
+
agent-census analyze cloudflare-logs.json --log-format-preset cloudflare
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Cloudflare logs carry the client's AS number, so network and ASN-based detection
|
|
83
|
+
work without any extra configuration.
|
|
84
|
+
|
|
85
|
+
### What to log
|
|
86
|
+
|
|
87
|
+
The Apache `combined` format already carries everything the core analysis needs.
|
|
88
|
+
The `common` preset drops the User-Agent and the Referer, so prefer `combined`,
|
|
89
|
+
or a custom format that includes them.
|
|
90
|
+
|
|
91
|
+
Required (all present in `combined`):
|
|
92
|
+
|
|
93
|
+
- **Client address** (`%h`) -- the identity everything else groups on, and the
|
|
94
|
+
basis for the network, datacentre, and crawler-verification checks.
|
|
95
|
+
- **Timestamp** (`%t`) -- timing regularity, peak request rate, the reported time
|
|
96
|
+
range, and (with `--quiescent-hours`) freeing memory mid-run.
|
|
97
|
+
- **Request line** (`"%r"`) -- the method and path; the most load-bearing field,
|
|
98
|
+
behind vulnerability probing, feed detection, path coverage, and crawl shape.
|
|
99
|
+
- **Status code** (`%>s`) -- the status mix, 404 storms, `304 Not Modified` (the
|
|
100
|
+
`has-cache` tag), and robots.txt compliance.
|
|
101
|
+
- **User-Agent** (`"%{User-Agent}i"`) -- browser, bot, and declared-crawler
|
|
102
|
+
recognition.
|
|
103
|
+
|
|
104
|
+
Recommended. The first two are already in `combined`; the rest aren't in any Apache
|
|
105
|
+
preset, so add them to a custom `LogFormat` (quoted) -- they're worth it:
|
|
106
|
+
|
|
107
|
+
- **Referer** (`"%{Referer}i"`, in `combined`) -- referer-following, which
|
|
108
|
+
separates crawlers from scrapers and flags fabricated referers.
|
|
109
|
+
- **Bytes sent** (`%b` or `%B`, in `combined`) -- the bandwidth figures in the
|
|
110
|
+
report.
|
|
111
|
+
- **AS organisation and number** (`"%{MM_ASORG}e"` and `"%{MM_ASN}e"`, MaxMind
|
|
112
|
+
`mod_maxminddb`) -- name datacentre clients by their hosting organisation, and
|
|
113
|
+
recognise datacentres and ASN-listed crawlers by AS number. Much of
|
|
114
|
+
[Networks and hosting](#networks-and-hosting) leans on these; log **both** (the
|
|
115
|
+
number drives recognition, the org names it).
|
|
116
|
+
- **Content-Type** (`"%{Content-Type}o"`) -- the response media type, which
|
|
117
|
+
sharpens feed-reader detection (an RSS/Atom type, not just a feed-shaped URL).
|
|
118
|
+
- **X-Forwarded-For** (`"%{X-Forwarded-For}i"`) -- if you're behind a CDN or
|
|
119
|
+
proxy, for `--identity forwarded`.
|
|
120
|
+
|
|
121
|
+
Response time (`%D` / `%T`) and the virtual host are parsed if present but not
|
|
122
|
+
currently used by the analysis itself (the virtual host is only used to filter lines with `--vhost`).
|
|
123
|
+
|
|
124
|
+
Output is Markdown by default. Pass `--html` for a self-contained, styled page
|
|
125
|
+
(one file, no external assets) you can open in a browser. Both formats work for
|
|
126
|
+
`analyze` and `inspect`:
|
|
127
|
+
|
|
128
|
+
```
|
|
129
|
+
agent-census analyze access.log --html -o census.html
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
The report opens with a summary of each kind, then a cross-tab of where each
|
|
133
|
+
kind's traffic came from (see [Networks and hosting](#networks-and-hosting)),
|
|
134
|
+
then the notable clients in each kind. Within a kind, clients that differ only
|
|
135
|
+
by IP address and origin AS — same User-Agent, same tags — are collapsed into
|
|
136
|
+
one row showing their combined traffic; in the HTML report a disclosure expands
|
|
137
|
+
to the per-IP/ASN breakdown, and `inspect` always lists them individually.
|
|
138
|
+
|
|
139
|
+
### robots.txt compliance
|
|
140
|
+
|
|
141
|
+
To check `robots.txt` compliance, give agent-census the file. A local copy is the
|
|
142
|
+
default, since it should match the period the log covers:
|
|
143
|
+
|
|
144
|
+
```
|
|
145
|
+
agent-census analyze access.log --robots-file ./robots.txt
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
Naming a host (`--host`) or URL (`--robots-url`) instead fetches it over the network. A live `robots.txt` may
|
|
149
|
+
not match the rules that applied when the log was written, so the report flags it:
|
|
150
|
+
|
|
151
|
+
```
|
|
152
|
+
agent-census analyze access.log --host example.com
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
The summary's robots column reads `N✓ / M✗ / K?`: respected, ignored, or too few
|
|
156
|
+
requests to tell (a client that hasn't yet requested a disallowed path isn't
|
|
157
|
+
counted either way).
|
|
158
|
+
|
|
159
|
+
### Verifying declared crawlers
|
|
160
|
+
|
|
161
|
+
A User-Agent claiming Googlebot proves nothing on its own. Verification checks the
|
|
162
|
+
client's IP against the crawler's published address ranges and its reverse/forward
|
|
163
|
+
DNS. It runs by default and makes network calls (DNS lookups, and the occasional
|
|
164
|
+
ranges fetch); turn it off for an offline, faster run:
|
|
165
|
+
|
|
166
|
+
```
|
|
167
|
+
agent-census analyze access.log --no-verify-bots
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
A verified crawler's IPs collapse into one entry keyed by its domain. A client
|
|
171
|
+
whose IP is outside the published ranges, or whose reverse DNS doesn't check out,
|
|
172
|
+
is classed `impersonator`, which means a forged identity that verification has
|
|
173
|
+
disproved. Misbehaviour is separate: a "Googlebot" that probes for `/.env` keeps
|
|
174
|
+
its declared kind and gets a `probing` tag (and `ignores-robots` if it earns one),
|
|
175
|
+
because a real crawler can still behave badly. With verification off there's
|
|
176
|
+
nothing to disprove the claim, so it stays a declared crawler with those tags.
|
|
177
|
+
|
|
178
|
+
### Networks and hosting
|
|
179
|
+
|
|
180
|
+
Where a client comes from matters. A "browser" arriving from a datacentre rather
|
|
181
|
+
than a consumer ISP is usually automation. agent-census recognises the major
|
|
182
|
+
cloud and hosting providers (AWS, Google Cloud, Cloudflare, Hetzner) from their
|
|
183
|
+
published IP ranges, folds shared-egress traffic (iCloud Private Relay, Tor) into
|
|
184
|
+
one entry per network, and breaks the kinds down by origin network in a cross-tab.
|
|
185
|
+
In the HTML report that table is interactive: switch between raw counts, share of
|
|
186
|
+
each kind, and share of each network, with the busier cells shaded.
|
|
187
|
+
|
|
188
|
+
Range lists are fetched and cached weekly by default. `--no-fetch-ranges` stays
|
|
189
|
+
offline on the bundled data.
|
|
190
|
+
|
|
191
|
+
If your log carries the client's autonomous-system details (for example from
|
|
192
|
+
MaxMind's `mod_maxminddb`: `%{MM_ASORG}e` for the organisation and `%{MM_ASN}e`
|
|
193
|
+
for the number, quoted in your `LogFormat`), datacentre clients are named by their
|
|
194
|
+
hosting organisation. You can also list extra AS numbers to treat as datacentres
|
|
195
|
+
in the bundled `datacenter_ranges.toml`.
|
|
196
|
+
|
|
197
|
+
### Inspecting a client
|
|
198
|
+
|
|
199
|
+
To see why a client was classified the way it was, use `inspect`. It shows every
|
|
200
|
+
signal that fired (including the runners-up), the measured features, the
|
|
201
|
+
`robots.txt` finding, and the request trace:
|
|
202
|
+
|
|
203
|
+
```
|
|
204
|
+
agent-census inspect access.log --kind vuln_scanner
|
|
205
|
+
agent-census inspect access.log --client 203.0.113.66
|
|
206
|
+
agent-census inspect access.log --kind scraper --network aws
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
`--network` matches a substring of the origin-network name and composes with
|
|
210
|
+
`--kind`, so the two together select a single cell of the cross-tab.
|
|
211
|
+
|
|
212
|
+
### Identity
|
|
213
|
+
|
|
214
|
+
How requests are grouped into clients is configurable, since no single rule fits
|
|
215
|
+
every deployment. The default, `ip_ua`, groups by (IP, User-Agent). Behind a CDN,
|
|
216
|
+
use `forwarded` (the left-most `X-Forwarded-For`); for IP-rotating bots in one
|
|
217
|
+
range, `ip_ua_subnet`. The report notes how the chosen strategy fragmented or
|
|
218
|
+
merged the data, so you can judge whether it fit.
|
|
219
|
+
|
|
220
|
+
```
|
|
221
|
+
agent-census analyze access.log --identity forwarded
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
### Scoping to one site
|
|
225
|
+
|
|
226
|
+
If one server's log mixes several virtual hosts, `--vhost SUBSTRING` analyses
|
|
227
|
+
only the lines served for a matching host (matched against the logged `%v`, or
|
|
228
|
+
the `Host` header if you don't log `%v`). The filtered lines are reported as
|
|
229
|
+
excluded, separately from parse skips. `--vhost` is repeatable — a line is kept
|
|
230
|
+
if it matches any of the given hosts.
|
|
231
|
+
|
|
232
|
+
```
|
|
233
|
+
agent-census analyze access.log --log-format-preset vhost_combined \
|
|
234
|
+
--vhost mnot.net --vhost www.mnot.net
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
This also sidesteps a CDN artefact: if a slice of your traffic was proxied to
|
|
238
|
+
this origin under another hostname, those requests arrive from the CDN's IPs
|
|
239
|
+
(so they can't be attributed or crawler-verified). Scoping to your own host
|
|
240
|
+
drops that slice cleanly.
|
|
241
|
+
|
|
242
|
+
### Remembered settings
|
|
243
|
+
|
|
244
|
+
Some options are sticky, so you needn't retype them. `--log-format` /
|
|
245
|
+
`--log-format-preset`, `--identity`, and `--robots-file` / `--robots-url` are
|
|
246
|
+
saved to `~/.config/agent-census/config.json` and reused when a later run omits
|
|
247
|
+
them. Passing one updates the saved value.
|
|
248
|
+
|
|
249
|
+
## How it works
|
|
250
|
+
|
|
251
|
+
Classification is based on behaviour, not just the User-Agent (which is easy to
|
|
252
|
+
forge). Each client's requests are reduced to measured features: request volume,
|
|
253
|
+
status mix, timing regularity, sub-resource co-loading, path coverage, and the
|
|
254
|
+
like. A set of independent classifiers each vote for a kind, with a confidence and
|
|
255
|
+
the reasons behind it. The strongest vote wins, or `unknown` if nothing clears a
|
|
256
|
+
threshold. Secondary tags such as `verified`, `ignores-robots`, `datacenter`, and
|
|
257
|
+
`has-cache` annotate the result.
|
|
258
|
+
|
|
259
|
+
The confidence weights and the threshold are hand-tuned, so check the
|
|
260
|
+
classifications against your own logs before trusting the headline numbers.
|
|
261
|
+
`inspect` shows why any client landed where it did.
|
|
262
|
+
|
|
263
|
+
## Contributing
|
|
264
|
+
|
|
265
|
+
Contributions are welcome. See [CONTRIBUTING.md](CONTRIBUTING.md) for the
|
|
266
|
+
development setup, conventions, and an outline of how the code fits together.
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
# agent-census
|
|
2
|
+
|
|
3
|
+
*What's hitting your site, classified by how it behaves -- not just what it claims to be.*
|
|
4
|
+
|
|
5
|
+
Most of the traffic to a typical site isn't people; it's software, and a fair bit
|
|
6
|
+
of it lies about what it is. agent-census reads your access log and sorts the
|
|
7
|
+
clients by what they actually do -- whether they pull a page's sub-resources like
|
|
8
|
+
a browser, walk the site like a crawler, poll a feed on a schedule, or go looking
|
|
9
|
+
for known-vulnerable paths. Anything claiming to be a known crawler is checked
|
|
10
|
+
against DNS and published address ranges, so a Googlebot arriving from some random
|
|
11
|
+
datacentre gets called what it is. What you end up with is your traffic broken
|
|
12
|
+
down by what each client is for. The User-Agent still counts -- it's just treated
|
|
13
|
+
as a claim to weigh against behaviour and origin, not a fact to take on trust.
|
|
14
|
+
|
|
15
|
+
[Here's a sample report](https://projects.mnot.net/agent-census/) generated from a real access log.
|
|
16
|
+
|
|
17
|
+
## Install
|
|
18
|
+
|
|
19
|
+
```
|
|
20
|
+
pipx install agent-census
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Use
|
|
24
|
+
|
|
25
|
+
The simplest case is an Apache log in the default `combined` format:
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
agent-census analyze /var/log/apache2/access.log
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
You can pass several rotated logs at once. They're pooled into one analysis, so a
|
|
32
|
+
client that spans the rotation is counted once:
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
agent-census analyze /var/log/httpd/access.log*
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
For a custom format, pass the `LogFormat`/`CustomLog` directive string verbatim
|
|
39
|
+
from your Apache config. Tab separators (`\t`), quoted fields with spaces,
|
|
40
|
+
`%{...}x` SSL variables, and `%{...}e` environment variables are all handled:
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
agent-census analyze access.log \
|
|
44
|
+
--log-format '%h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-Agent}i" %D'
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
The presets `common`, `combined`, and `vhost_combined` are available via
|
|
48
|
+
`--log-format-preset`. Options may appear before, after, or between the log files.
|
|
49
|
+
|
|
50
|
+
Cloudflare Logpush logs (newline-delimited JSON) are also supported, as another
|
|
51
|
+
preset:
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
agent-census analyze cloudflare-logs.json --log-format-preset cloudflare
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Cloudflare logs carry the client's AS number, so network and ASN-based detection
|
|
58
|
+
work without any extra configuration.
|
|
59
|
+
|
|
60
|
+
### What to log
|
|
61
|
+
|
|
62
|
+
The Apache `combined` format already carries everything the core analysis needs.
|
|
63
|
+
The `common` preset drops the User-Agent and the Referer, so prefer `combined`,
|
|
64
|
+
or a custom format that includes them.
|
|
65
|
+
|
|
66
|
+
Required (all present in `combined`):
|
|
67
|
+
|
|
68
|
+
- **Client address** (`%h`) -- the identity everything else groups on, and the
|
|
69
|
+
basis for the network, datacentre, and crawler-verification checks.
|
|
70
|
+
- **Timestamp** (`%t`) -- timing regularity, peak request rate, the reported time
|
|
71
|
+
range, and (with `--quiescent-hours`) freeing memory mid-run.
|
|
72
|
+
- **Request line** (`"%r"`) -- the method and path; the most load-bearing field,
|
|
73
|
+
behind vulnerability probing, feed detection, path coverage, and crawl shape.
|
|
74
|
+
- **Status code** (`%>s`) -- the status mix, 404 storms, `304 Not Modified` (the
|
|
75
|
+
`has-cache` tag), and robots.txt compliance.
|
|
76
|
+
- **User-Agent** (`"%{User-Agent}i"`) -- browser, bot, and declared-crawler
|
|
77
|
+
recognition.
|
|
78
|
+
|
|
79
|
+
Recommended. The first two are already in `combined`; the rest aren't in any Apache
|
|
80
|
+
preset, so add them to a custom `LogFormat` (quoted) -- they're worth it:
|
|
81
|
+
|
|
82
|
+
- **Referer** (`"%{Referer}i"`, in `combined`) -- referer-following, which
|
|
83
|
+
separates crawlers from scrapers and flags fabricated referers.
|
|
84
|
+
- **Bytes sent** (`%b` or `%B`, in `combined`) -- the bandwidth figures in the
|
|
85
|
+
report.
|
|
86
|
+
- **AS organisation and number** (`"%{MM_ASORG}e"` and `"%{MM_ASN}e"`, MaxMind
|
|
87
|
+
`mod_maxminddb`) -- name datacentre clients by their hosting organisation, and
|
|
88
|
+
recognise datacentres and ASN-listed crawlers by AS number. Much of
|
|
89
|
+
[Networks and hosting](#networks-and-hosting) leans on these; log **both** (the
|
|
90
|
+
number drives recognition, the org names it).
|
|
91
|
+
- **Content-Type** (`"%{Content-Type}o"`) -- the response media type, which
|
|
92
|
+
sharpens feed-reader detection (an RSS/Atom type, not just a feed-shaped URL).
|
|
93
|
+
- **X-Forwarded-For** (`"%{X-Forwarded-For}i"`) -- if you're behind a CDN or
|
|
94
|
+
proxy, for `--identity forwarded`.
|
|
95
|
+
|
|
96
|
+
Response time (`%D` / `%T`) and the virtual host are parsed if present but not
|
|
97
|
+
currently used by the analysis itself (the virtual host is only used to filter lines with `--vhost`).
|
|
98
|
+
|
|
99
|
+
Output is Markdown by default. Pass `--html` for a self-contained, styled page
|
|
100
|
+
(one file, no external assets) you can open in a browser. Both formats work for
|
|
101
|
+
`analyze` and `inspect`:
|
|
102
|
+
|
|
103
|
+
```
|
|
104
|
+
agent-census analyze access.log --html -o census.html
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
The report opens with a summary of each kind, then a cross-tab of where each
|
|
108
|
+
kind's traffic came from (see [Networks and hosting](#networks-and-hosting)),
|
|
109
|
+
then the notable clients in each kind. Within a kind, clients that differ only
|
|
110
|
+
by IP address and origin AS — same User-Agent, same tags — are collapsed into
|
|
111
|
+
one row showing their combined traffic; in the HTML report a disclosure expands
|
|
112
|
+
to the per-IP/ASN breakdown, and `inspect` always lists them individually.
|
|
113
|
+
|
|
114
|
+
### robots.txt compliance
|
|
115
|
+
|
|
116
|
+
To check `robots.txt` compliance, give agent-census the file. A local copy is the
|
|
117
|
+
default, since it should match the period the log covers:
|
|
118
|
+
|
|
119
|
+
```
|
|
120
|
+
agent-census analyze access.log --robots-file ./robots.txt
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Naming a host (`--host`) or URL (`--robots-url`) instead fetches it over the network. A live `robots.txt` may
|
|
124
|
+
not match the rules that applied when the log was written, so the report flags it:
|
|
125
|
+
|
|
126
|
+
```
|
|
127
|
+
agent-census analyze access.log --host example.com
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
The summary's robots column reads `N✓ / M✗ / K?`: respected, ignored, or too few
|
|
131
|
+
requests to tell (a client that hasn't yet requested a disallowed path isn't
|
|
132
|
+
counted either way).
|
|
133
|
+
|
|
134
|
+
### Verifying declared crawlers
|
|
135
|
+
|
|
136
|
+
A User-Agent claiming Googlebot proves nothing on its own. Verification checks the
|
|
137
|
+
client's IP against the crawler's published address ranges and its reverse/forward
|
|
138
|
+
DNS. It runs by default and makes network calls (DNS lookups, and the occasional
|
|
139
|
+
ranges fetch); turn it off for an offline, faster run:
|
|
140
|
+
|
|
141
|
+
```
|
|
142
|
+
agent-census analyze access.log --no-verify-bots
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
A verified crawler's IPs collapse into one entry keyed by its domain. A client
|
|
146
|
+
whose IP is outside the published ranges, or whose reverse DNS doesn't check out,
|
|
147
|
+
is classed `impersonator`, which means a forged identity that verification has
|
|
148
|
+
disproved. Misbehaviour is separate: a "Googlebot" that probes for `/.env` keeps
|
|
149
|
+
its declared kind and gets a `probing` tag (and `ignores-robots` if it earns one),
|
|
150
|
+
because a real crawler can still behave badly. With verification off there's
|
|
151
|
+
nothing to disprove the claim, so it stays a declared crawler with those tags.
|
|
152
|
+
|
|
153
|
+
### Networks and hosting
|
|
154
|
+
|
|
155
|
+
Where a client comes from matters. A "browser" arriving from a datacentre rather
|
|
156
|
+
than a consumer ISP is usually automation. agent-census recognises the major
|
|
157
|
+
cloud and hosting providers (AWS, Google Cloud, Cloudflare, Hetzner) from their
|
|
158
|
+
published IP ranges, folds shared-egress traffic (iCloud Private Relay, Tor) into
|
|
159
|
+
one entry per network, and breaks the kinds down by origin network in a cross-tab.
|
|
160
|
+
In the HTML report that table is interactive: switch between raw counts, share of
|
|
161
|
+
each kind, and share of each network, with the busier cells shaded.
|
|
162
|
+
|
|
163
|
+
Range lists are fetched and cached weekly by default. `--no-fetch-ranges` stays
|
|
164
|
+
offline on the bundled data.
|
|
165
|
+
|
|
166
|
+
If your log carries the client's autonomous-system details (for example from
|
|
167
|
+
MaxMind's `mod_maxminddb`: `%{MM_ASORG}e` for the organisation and `%{MM_ASN}e`
|
|
168
|
+
for the number, quoted in your `LogFormat`), datacentre clients are named by their
|
|
169
|
+
hosting organisation. You can also list extra AS numbers to treat as datacentres
|
|
170
|
+
in the bundled `datacenter_ranges.toml`.
|
|
171
|
+
|
|
172
|
+
### Inspecting a client
|
|
173
|
+
|
|
174
|
+
To see why a client was classified the way it was, use `inspect`. It shows every
|
|
175
|
+
signal that fired (including the runners-up), the measured features, the
|
|
176
|
+
`robots.txt` finding, and the request trace:
|
|
177
|
+
|
|
178
|
+
```
|
|
179
|
+
agent-census inspect access.log --kind vuln_scanner
|
|
180
|
+
agent-census inspect access.log --client 203.0.113.66
|
|
181
|
+
agent-census inspect access.log --kind scraper --network aws
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
`--network` matches a substring of the origin-network name and composes with
|
|
185
|
+
`--kind`, so the two together select a single cell of the cross-tab.
|
|
186
|
+
|
|
187
|
+
### Identity
|
|
188
|
+
|
|
189
|
+
How requests are grouped into clients is configurable, since no single rule fits
|
|
190
|
+
every deployment. The default, `ip_ua`, groups by (IP, User-Agent). Behind a CDN,
|
|
191
|
+
use `forwarded` (the left-most `X-Forwarded-For`); for IP-rotating bots in one
|
|
192
|
+
range, `ip_ua_subnet`. The report notes how the chosen strategy fragmented or
|
|
193
|
+
merged the data, so you can judge whether it fit.
|
|
194
|
+
|
|
195
|
+
```
|
|
196
|
+
agent-census analyze access.log --identity forwarded
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
### Scoping to one site
|
|
200
|
+
|
|
201
|
+
If one server's log mixes several virtual hosts, `--vhost SUBSTRING` analyses
|
|
202
|
+
only the lines served for a matching host (matched against the logged `%v`, or
|
|
203
|
+
the `Host` header if you don't log `%v`). The filtered lines are reported as
|
|
204
|
+
excluded, separately from parse skips. `--vhost` is repeatable — a line is kept
|
|
205
|
+
if it matches any of the given hosts.
|
|
206
|
+
|
|
207
|
+
```
|
|
208
|
+
agent-census analyze access.log --log-format-preset vhost_combined \
|
|
209
|
+
--vhost mnot.net --vhost www.mnot.net
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
This also sidesteps a CDN artefact: if a slice of your traffic was proxied to
|
|
213
|
+
this origin under another hostname, those requests arrive from the CDN's IPs
|
|
214
|
+
(so they can't be attributed or crawler-verified). Scoping to your own host
|
|
215
|
+
drops that slice cleanly.
|
|
216
|
+
|
|
217
|
+
### Remembered settings
|
|
218
|
+
|
|
219
|
+
Some options are sticky, so you needn't retype them. `--log-format` /
|
|
220
|
+
`--log-format-preset`, `--identity`, and `--robots-file` / `--robots-url` are
|
|
221
|
+
saved to `~/.config/agent-census/config.json` and reused when a later run omits
|
|
222
|
+
them. Passing one updates the saved value.
|
|
223
|
+
|
|
224
|
+
## How it works
|
|
225
|
+
|
|
226
|
+
Classification is based on behaviour, not just the User-Agent (which is easy to
|
|
227
|
+
forge). Each client's requests are reduced to measured features: request volume,
|
|
228
|
+
status mix, timing regularity, sub-resource co-loading, path coverage, and the
|
|
229
|
+
like. A set of independent classifiers each vote for a kind, with a confidence and
|
|
230
|
+
the reasons behind it. The strongest vote wins, or `unknown` if nothing clears a
|
|
231
|
+
threshold. Secondary tags such as `verified`, `ignores-robots`, `datacenter`, and
|
|
232
|
+
`has-cache` annotate the result.
|
|
233
|
+
|
|
234
|
+
The confidence weights and the threshold are hand-tuned, so check the
|
|
235
|
+
classifications against your own logs before trusting the headline numbers.
|
|
236
|
+
`inspect` shows why any client landed where it did.
|
|
237
|
+
|
|
238
|
+
## Contributing
|
|
239
|
+
|
|
240
|
+
Contributions are welcome. See [CONTRIBUTING.md](CONTRIBUTING.md) for the
|
|
241
|
+
development setup, conventions, and an outline of how the code fits together.
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
__version__ = "0.0.1"
|
|
2
|
+
__author__ = "Mark Nottingham <mnot@mnot.net>"
|
|
3
|
+
__copyright__ = """\
|
|
4
|
+
Copyright (c) Mark Nottingham
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in
|
|
14
|
+
all copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
22
|
+
THE SOFTWARE.
|
|
23
|
+
"""
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Client classification: independent rule-based classifiers + a combiner.
|
|
2
|
+
|
|
3
|
+
The public entry point is :func:`classify_client`, which runs every registered
|
|
4
|
+
classifier over a client's features and combines their signals into a single
|
|
5
|
+
:class:`~agent_census.model.Classification` (primary kind plus tags).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from ..model import BotVerification, Classification, ClientFeatures, ComplianceReport, Signal
|
|
11
|
+
from .base import Classifier
|
|
12
|
+
from .combiner import DEFAULT_UNKNOWN_THRESHOLD, combine
|
|
13
|
+
from .registry import all_classifiers
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def run_classifiers(features: ClientFeatures) -> list[Signal]:
    """Gather the signals every registered classifier emits for one client.

    Classifiers are consulted in the order ``all_classifiers()`` yields them,
    and each classifier's signals keep the order in which it produced them.
    """
    # Flatten the per-classifier results into one list.
    return [
        signal
        for classifier in all_classifiers()
        for signal in classifier.evaluate(features)
    ]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def classify_client(
    features: ClientFeatures,
    *,
    compliance: ComplianceReport | None = None,
    verification: BotVerification | None = None,
    datacenter: bool = False,
    unknown_threshold: float = DEFAULT_UNKNOWN_THRESHOLD,
    keep_signals: bool = True,
) -> Classification:
    """Classify one client: gather classifier signals, then combine them.

    The signals come from :func:`run_classifiers`; every keyword-only
    argument is forwarded unchanged to :func:`combine`, whose result is
    returned as-is.
    """
    # Options handed straight through to the combiner under the same names.
    combiner_options = {
        "compliance": compliance,
        "verification": verification,
        "datacenter": datacenter,
        "unknown_threshold": unknown_threshold,
        "keep_signals": keep_signals,
    }
    return combine(run_classifiers(features), features, **combiner_options)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
__all__ = [
|
|
47
|
+
"Classifier",
|
|
48
|
+
"classify_client",
|
|
49
|
+
"run_classifiers",
|
|
50
|
+
"combine",
|
|
51
|
+
"all_classifiers",
|
|
52
|
+
"DEFAULT_UNKNOWN_THRESHOLD",
|
|
53
|
+
]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Declared AI / LLM data-gathering crawlers (GPTBot, ClaudeBot, Google-Extended, ...)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from ..model import Kind
|
|
6
|
+
from .known_bot import KnownBotClassifier
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class AiCrawlerClassifier(KnownBotClassifier):
    """Known-bot classifier configured for declared AI / LLM crawlers.

    This subclass only sets class-level attributes; all behaviour is
    inherited from ``KnownBotClassifier``.
    """

    # Identifiers for this classifier.
    # NOTE(review): how the base class consumes these is not visible here;
    # presumably ``category`` selects the bundled ``ai_crawler`` data - confirm.
    name = "ai_crawler"
    category = "ai_crawler"

    # Kind associated with this classifier, and its human-readable description.
    label = Kind.AI_CRAWLER
    descriptor = "AI / LLM crawler"
|