agent-census 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. agent_census-0.0.1/LICENSE.md +19 -0
  2. agent_census-0.0.1/MANIFEST.in +9 -0
  3. agent_census-0.0.1/PKG-INFO +266 -0
  4. agent_census-0.0.1/README.md +241 -0
  5. agent_census-0.0.1/agent_census/__init__.py +23 -0
  6. agent_census-0.0.1/agent_census/__main__.py +10 -0
  7. agent_census-0.0.1/agent_census/classify/__init__.py +53 -0
  8. agent_census-0.0.1/agent_census/classify/ai_crawler.py +13 -0
  9. agent_census-0.0.1/agent_census/classify/app.py +46 -0
  10. agent_census-0.0.1/agent_census/classify/archiver.py +13 -0
  11. agent_census-0.0.1/agent_census/classify/base.py +37 -0
  12. agent_census-0.0.1/agent_census/classify/browser.py +180 -0
  13. agent_census-0.0.1/agent_census/classify/combiner.py +175 -0
  14. agent_census-0.0.1/agent_census/classify/crawler.py +56 -0
  15. agent_census-0.0.1/agent_census/classify/data_harvester.py +19 -0
  16. agent_census-0.0.1/agent_census/classify/feed_reader.py +101 -0
  17. agent_census-0.0.1/agent_census/classify/known_bot.py +51 -0
  18. agent_census-0.0.1/agent_census/classify/monitor.py +54 -0
  19. agent_census-0.0.1/agent_census/classify/registry.py +46 -0
  20. agent_census-0.0.1/agent_census/classify/scraper.py +46 -0
  21. agent_census-0.0.1/agent_census/classify/search_engine.py +13 -0
  22. agent_census-0.0.1/agent_census/classify/seo_marketing.py +13 -0
  23. agent_census-0.0.1/agent_census/classify/social_preview.py +13 -0
  24. agent_census-0.0.1/agent_census/classify/spam_bot.py +44 -0
  25. agent_census-0.0.1/agent_census/classify/tags.py +236 -0
  26. agent_census-0.0.1/agent_census/classify/vuln_scanner.py +73 -0
  27. agent_census-0.0.1/agent_census/cli.py +477 -0
  28. agent_census-0.0.1/agent_census/data/__init__.py +5 -0
  29. agent_census-0.0.1/agent_census/data/ai_crawler.toml +155 -0
  30. agent_census-0.0.1/agent_census/data/app_clients.toml +17 -0
  31. agent_census-0.0.1/agent_census/data/archiver.toml +30 -0
  32. agent_census-0.0.1/agent_census/data/browser_releases.toml +33 -0
  33. agent_census-0.0.1/agent_census/data/data_harvester.toml +27 -0
  34. agent_census-0.0.1/agent_census/data/datacenter_ranges.toml +242 -0
  35. agent_census-0.0.1/agent_census/data/egress_networks.toml +65 -0
  36. agent_census-0.0.1/agent_census/data/feed_readers.toml +15 -0
  37. agent_census-0.0.1/agent_census/data/scanner_ua.toml +9 -0
  38. agent_census-0.0.1/agent_census/data/search_engine.toml +130 -0
  39. agent_census-0.0.1/agent_census/data/seo_marketing.toml +69 -0
  40. agent_census-0.0.1/agent_census/data/social_preview.toml +78 -0
  41. agent_census-0.0.1/agent_census/data/vuln_paths.toml +23 -0
  42. agent_census-0.0.1/agent_census/dataload.py +342 -0
  43. agent_census-0.0.1/agent_census/egress.py +59 -0
  44. agent_census-0.0.1/agent_census/errors.py +20 -0
  45. agent_census-0.0.1/agent_census/features.py +599 -0
  46. agent_census-0.0.1/agent_census/hosting.py +133 -0
  47. agent_census-0.0.1/agent_census/identity.py +107 -0
  48. agent_census-0.0.1/agent_census/iprange.py +390 -0
  49. agent_census-0.0.1/agent_census/model.py +283 -0
  50. agent_census-0.0.1/agent_census/netverify.py +371 -0
  51. agent_census-0.0.1/agent_census/parsing/__init__.py +15 -0
  52. agent_census-0.0.1/agent_census/parsing/apache.py +196 -0
  53. agent_census-0.0.1/agent_census/parsing/apache_directives.py +340 -0
  54. agent_census-0.0.1/agent_census/parsing/base.py +38 -0
  55. agent_census-0.0.1/agent_census/parsing/cloudflare.py +114 -0
  56. agent_census-0.0.1/agent_census/parsing/registry.py +39 -0
  57. agent_census-0.0.1/agent_census/pipeline.py +755 -0
  58. agent_census-0.0.1/agent_census/py.typed +0 -0
  59. agent_census-0.0.1/agent_census/report/__init__.py +17 -0
  60. agent_census-0.0.1/agent_census/report/aggregate.py +240 -0
  61. agent_census-0.0.1/agent_census/report/calibrate.py +400 -0
  62. agent_census-0.0.1/agent_census/report/format.py +266 -0
  63. agent_census-0.0.1/agent_census/report/html.py +880 -0
  64. agent_census-0.0.1/agent_census/report/inspect.py +203 -0
  65. agent_census-0.0.1/agent_census/report/markdown.py +226 -0
  66. agent_census-0.0.1/agent_census/robots/__init__.py +16 -0
  67. agent_census-0.0.1/agent_census/robots/compliance.py +118 -0
  68. agent_census-0.0.1/agent_census/robots/parser.py +55 -0
  69. agent_census-0.0.1/agent_census/robots/source.py +69 -0
  70. agent_census-0.0.1/agent_census/uas.py +292 -0
  71. agent_census-0.0.1/agent_census/userconfig.py +42 -0
  72. agent_census-0.0.1/agent_census.egg-info/SOURCES.txt +90 -0
  73. agent_census-0.0.1/pyproject.toml +63 -0
  74. agent_census-0.0.1/setup.cfg +4 -0
  75. agent_census-0.0.1/tests/test_actor_groups.py +157 -0
  76. agent_census-0.0.1/tests/test_apache_parser.py +236 -0
  77. agent_census-0.0.1/tests/test_calibrate.py +222 -0
  78. agent_census-0.0.1/tests/test_classify.py +1074 -0
  79. agent_census-0.0.1/tests/test_cli.py +251 -0
  80. agent_census-0.0.1/tests/test_cloudflare.py +76 -0
  81. agent_census-0.0.1/tests/test_dataload.py +118 -0
  82. agent_census-0.0.1/tests/test_egress.py +98 -0
  83. agent_census-0.0.1/tests/test_features.py +200 -0
  84. agent_census-0.0.1/tests/test_format.py +116 -0
  85. agent_census-0.0.1/tests/test_hosting.py +94 -0
  86. agent_census-0.0.1/tests/test_html.py +180 -0
  87. agent_census-0.0.1/tests/test_identity.py +46 -0
  88. agent_census-0.0.1/tests/test_inspect_select.py +54 -0
  89. agent_census-0.0.1/tests/test_iprange.py +119 -0
  90. agent_census-0.0.1/tests/test_netverify.py +350 -0
  91. agent_census-0.0.1/tests/test_network_matrix.py +84 -0
  92. agent_census-0.0.1/tests/test_pipeline_e2e.py +624 -0
  93. agent_census-0.0.1/tests/test_robots.py +95 -0
@@ -0,0 +1,19 @@
1
+ Copyright (c) Mark Nottingham
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
@@ -0,0 +1,9 @@
1
+ global-include py.typed
2
+ recursive-include agent_census/data *.toml
3
+ prune build
4
+ prune dist
5
+ prune test
6
+ prune tools
7
+ prune src
8
+ prune *.egg-info
9
+ exclude .gitignore .editorconfig .dockerignore .coveragerc
@@ -0,0 +1,266 @@
1
+ Metadata-Version: 2.4
2
+ Name: agent-census
3
+ Version: 0.0.1
4
+ Summary: Characterize the clients hitting a web site by analyzing its access logs.
5
+ Author-email: Mark Nottingham <mnot@mnot.net>
6
+ License-Expression: MIT
7
+ Project-URL: homepage, https://github.com/mnot/agent-census
8
+ Classifier: Operating System :: OS Independent
9
+ Classifier: Development Status :: 4 - Beta
10
+ Requires-Python: >=3.10
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE.md
13
+ Requires-Dist: tomli; python_version < "3.11"
14
+ Provides-Extra: dev
15
+ Requires-Dist: mypy; extra == "dev"
16
+ Requires-Dist: black; extra == "dev"
17
+ Requires-Dist: isort; extra == "dev"
18
+ Requires-Dist: pylint; extra == "dev"
19
+ Requires-Dist: pytest; extra == "dev"
20
+ Requires-Dist: pytest-md; extra == "dev"
21
+ Requires-Dist: validate-pyproject; extra == "dev"
22
+ Requires-Dist: build; extra == "dev"
23
+ Requires-Dist: tomli; extra == "dev"
24
+ Dynamic: license-file
25
+
26
+ # agent-census
27
+
28
+ *What's hitting your site, classified by how it behaves -- not just what it claims to be.*
29
+
30
+ Most of the traffic to a typical site isn't people; it's software, and a fair bit
31
+ of it lies about what it is. agent-census reads your access log and sorts the
32
+ clients by what they actually do -- whether they pull a page's sub-resources like
33
+ a browser, walk the site like a crawler, poll a feed on a schedule, or go looking
34
+ for known-vulnerable paths. Anything claiming to be a known crawler is checked
35
+ against DNS and published address ranges, so a Googlebot arriving from some random
36
+ datacentre gets called what it is. What you end up with is your traffic broken
37
+ down by what each client is for. The User-Agent still counts -- it's just treated
38
+ as a claim to weigh against behaviour and origin, not a fact to take on trust.
39
+
40
+ [Here's a sample report](https://projects.mnot.net/agent-census/) generated from a real access log.
41
+
42
+ ## Install
43
+
44
+ ```
45
+ pipx install agent-census
46
+ ```
47
+
48
+ ## Use
49
+
50
+ The simplest case is an Apache log in the default `combined` format:
51
+
52
+ ```
53
+ agent-census analyze /var/log/apache2/access.log
54
+ ```
55
+
56
+ You can pass several rotated logs at once. They're pooled into one analysis, so a
57
+ client that spans the rotation is counted once:
58
+
59
+ ```
60
+ agent-census analyze /var/log/httpd/access.log*
61
+ ```
62
+
63
+ For a custom format, pass the `LogFormat`/`CustomLog` directive string verbatim
64
+ from your Apache config. Tab separators (`\t`), quoted fields with spaces,
65
+ `%{...}x` SSL variables, and `%{...}e` environment variables are all handled:
66
+
67
+ ```
68
+ agent-census analyze access.log \
69
+ --log-format '%h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-Agent}i" %D'
70
+ ```
71
+
72
+ The presets `common`, `combined`, and `vhost_combined` are available via
73
+ `--log-format-preset`. Options may appear before, after, or between the log files.
74
+
75
+ Cloudflare Logpush logs (newline-delimited JSON) are also supported, as another
76
+ preset:
77
+
78
+ ```
79
+ agent-census analyze cloudflare-logs.json --log-format-preset cloudflare
80
+ ```
81
+
82
+ Cloudflare logs carry the client's AS number, so network and ASN-based detection
83
+ work without any extra configuration.
84
+
85
+ ### What to log
86
+
87
+ The Apache `combined` format already carries everything the core analysis needs.
88
+ The `common` preset drops the User-Agent and the Referer, so prefer `combined`,
89
+ or a custom format that includes them.
90
+
91
+ Required (all present in `combined`):
92
+
93
+ - **Client address** (`%h`) -- the identity everything else groups on, and the
94
+ basis for the network, datacentre, and crawler-verification checks.
95
+ - **Timestamp** (`%t`) -- timing regularity, peak request rate, the reported time
96
+ range, and (with `--quiescent-hours`) freeing memory mid-run.
97
+ - **Request line** (`"%r"`) -- the method and path; the most load-bearing field,
98
+ behind vulnerability probing, feed detection, path coverage, and crawl shape.
99
+ - **Status code** (`%>s`) -- the status mix, 404 storms, `304 Not Modified` (the
100
+ `has-cache` tag), and robots.txt compliance.
101
+ - **User-Agent** (`"%{User-Agent}i"`) -- browser, bot, and declared-crawler
102
+ recognition.
103
+
104
+ Recommended. The first two are already in `combined`; the rest aren't in any
105
+ preset, so add them to a custom `LogFormat` (quoted) -- they're worth it:
106
+
107
+ - **Referer** (`"%{Referer}i"`, in `combined`) -- referer-following, which
108
+ separates crawlers from scrapers and flags fabricated referers.
109
+ - **Bytes sent** (`%b` or `%B`, in `combined`) -- the bandwidth figures in the
110
+ report.
111
+ - **AS organisation and number** (`"%{MM_ASORG}e"` and `"%{MM_ASN}e"`, MaxMind
112
+ `mod_maxminddb`) -- name datacentre clients by their hosting organisation, and
113
+ recognise datacentres and ASN-listed crawlers by AS number. Much of
114
+ [Networks and hosting](#networks-and-hosting) leans on these; log **both** (the
115
+ number drives recognition, the org names it).
116
+ - **Content-Type** (`"%{Content-Type}o"`) -- the response media type, which
117
+ sharpens feed-reader detection (an RSS/Atom type, not just a feed-shaped URL).
118
+ - **X-Forwarded-For** (`"%{X-Forwarded-For}i"`) -- if you're behind a CDN or
119
+ proxy, for `--identity forwarded`.
120
+
121
+ Response time (`%D` / `%T`) is parsed if present but not currently used by the
122
+ analysis. The virtual host (`%v`) is likewise parsed; it is used only to filter lines with `--vhost` (see "Scoping to one site").
123
+
124
+ Output is Markdown by default. Pass `--html` for a self-contained, styled page
125
+ (one file, no external assets) you can open in a browser. Both formats work for
126
+ `analyze` and `inspect`:
127
+
128
+ ```
129
+ agent-census analyze access.log --html -o census.html
130
+ ```
131
+
132
+ The report opens with a summary of each kind, then a cross-tab of where each
133
+ kind's traffic came from (see [Networks and hosting](#networks-and-hosting)),
134
+ then the notable clients in each kind. Within a kind, clients that differ only
135
+ by IP address and origin AS — same User-Agent, same tags — are collapsed into
136
+ one row showing their combined traffic; in the HTML report a disclosure expands
137
+ to the per-IP/ASN breakdown, and `inspect` always lists them individually.
138
+
139
+ ### robots.txt compliance
140
+
141
+ To check `robots.txt` compliance, give agent-census the file. A local copy is the
142
+ default, since it should match the period the log covers:
143
+
144
+ ```
145
+ agent-census analyze access.log --robots-file ./robots.txt
146
+ ```
147
+
148
+ Naming a host or URL instead fetches it over the network. A live `robots.txt` may
149
+ not match the rules that applied when the log was written, so the report flags it:
150
+
151
+ ```
152
+ agent-census analyze access.log --host example.com
153
+ ```
154
+
155
+ The summary's robots column reads `N✓ / M✗ / K?`: respected, ignored, or too few
156
+ requests to tell (a client that hasn't yet requested a disallowed path isn't
157
+ counted either way).
158
+
159
+ ### Verifying declared crawlers
160
+
161
+ A User-Agent claiming Googlebot proves nothing on its own. Verification checks the
162
+ client's IP against the crawler's published address ranges and its reverse/forward
163
+ DNS. It runs by default and makes network calls (DNS lookups, and the occasional
164
+ ranges fetch); turn it off for an offline, faster run:
165
+
166
+ ```
167
+ agent-census analyze access.log --no-verify-bots
168
+ ```
169
+
170
+ A verified crawler's IPs collapse into one entry keyed by its domain. A client
171
+ whose IP is outside the published ranges, or whose reverse DNS doesn't check out,
172
+ is classed `impersonator`, which means a forged identity that verification has
173
+ disproved. Misbehaviour is separate: a "Googlebot" that probes for `/.env` keeps
174
+ its declared kind and gets a `probing` tag (and `ignores-robots` if it earns one),
175
+ because a real crawler can still behave badly. With verification off there's
176
+ nothing to disprove the claim, so it stays a declared crawler with those tags.
177
+
178
+ ### Networks and hosting
179
+
180
+ Where a client comes from matters. A "browser" arriving from a datacentre rather
181
+ than a consumer ISP is usually automation. agent-census recognises the major
182
+ cloud and hosting providers (AWS, Google Cloud, Cloudflare, Hetzner) from their
183
+ published IP ranges, folds shared-egress traffic (iCloud Private Relay, Tor) into
184
+ one entry per network, and breaks the kinds down by origin network in a cross-tab.
185
+ In the HTML report that table is interactive: switch between raw counts, share of
186
+ each kind, and share of each network, with the busier cells shaded.
187
+
188
+ Range lists are fetched and cached weekly by default. `--no-fetch-ranges` stays
189
+ offline on the bundled data.
190
+
191
+ If your log carries the client's autonomous-system details (for example from
192
+ MaxMind's `mod_maxminddb`: `%{MM_ASORG}e` for the organisation and `%{MM_ASN}e`
193
+ for the number, quoted in your `LogFormat`), datacentre clients are named by their
194
+ hosting organisation. You can also list extra AS numbers to treat as datacentres
195
+ in the bundled `datacenter_ranges.toml`.
196
+
197
+ ### Inspecting a client
198
+
199
+ To see why a client was classified the way it was, use `inspect`. It shows every
200
+ signal that fired (including the runners-up), the measured features, the
201
+ `robots.txt` finding, and the request trace:
202
+
203
+ ```
204
+ agent-census inspect access.log --kind vuln_scanner
205
+ agent-census inspect access.log --client 203.0.113.66
206
+ agent-census inspect access.log --kind scraper --network aws
207
+ ```
208
+
209
+ `--network` matches a substring of the origin-network name and composes with
210
+ `--kind`, so the two together select a single cell of the cross-tab.
211
+
212
+ ### Identity
213
+
214
+ How requests are grouped into clients is configurable, since no single rule fits
215
+ every deployment. The default, `ip_ua`, groups by (IP, User-Agent). Behind a CDN,
216
+ use `forwarded` (the left-most `X-Forwarded-For`); for IP-rotating bots in one
217
+ range, `ip_ua_subnet`. The report notes how the chosen strategy fragmented or
218
+ merged the data, so you can judge whether it fit.
219
+
220
+ ```
221
+ agent-census analyze access.log --identity forwarded
222
+ ```
223
+
224
+ ### Scoping to one site
225
+
226
+ If one server's log mixes several virtual hosts, `--vhost SUBSTRING` analyses
227
+ only the lines served for a matching host (matched against the logged `%v`, or
228
+ the `Host` header if you don't log `%v`). The filtered lines are reported as
229
+ excluded, separately from parse skips. `--vhost` is repeatable — a line is kept
230
+ if it matches any of the given hosts.
231
+
232
+ ```
233
+ agent-census analyze access.log --log-format-preset vhost_combined \
234
+ --vhost mnot.net --vhost www.mnot.net
235
+ ```
236
+
237
+ This also sidesteps a CDN artefact: if a slice of your traffic was proxied to
238
+ this origin under another hostname, those requests arrive from the CDN's IPs
239
+ (so they can't be attributed or crawler-verified). Scoping to your own host
240
+ drops that slice cleanly.
241
+
242
+ ### Remembered settings
243
+
244
+ Some options are sticky, so you needn't retype them. `--log-format` /
245
+ `--log-format-preset`, `--identity`, and `--robots-file` / `--robots-url` are
246
+ saved to `~/.config/agent-census/config.json` and reused when a later run omits
247
+ them. Passing one updates the saved value.
248
+
249
+ ## How it works
250
+
251
+ Classification is based on behaviour, not just the User-Agent (which is easy to
252
+ forge). Each client's requests are reduced to measured features: request volume,
253
+ status mix, timing regularity, sub-resource co-loading, path coverage, and the
254
+ like. A set of independent classifiers each vote for a kind, with a confidence and
255
+ the reasons behind it. The strongest vote wins, or `unknown` if nothing clears a
256
+ threshold. Secondary tags such as `verified`, `ignores-robots`, `datacenter`, and
257
+ `has-cache` annotate the result.
258
+
259
+ The confidence weights and the threshold are hand-tuned, so check the
260
+ classifications against your own logs before trusting the headline numbers.
261
+ `inspect` shows why any client landed where it did.
262
+
263
+ ## Contributing
264
+
265
+ Contributions are welcome. See [CONTRIBUTING.md](CONTRIBUTING.md) for the
266
+ development setup, conventions, and an outline of how the code fits together.
@@ -0,0 +1,241 @@
1
+ # agent-census
2
+
3
+ *What's hitting your site, classified by how it behaves -- not just what it claims to be.*
4
+
5
+ Most of the traffic to a typical site isn't people; it's software, and a fair bit
6
+ of it lies about what it is. agent-census reads your access log and sorts the
7
+ clients by what they actually do -- whether they pull a page's sub-resources like
8
+ a browser, walk the site like a crawler, poll a feed on a schedule, or go looking
9
+ for known-vulnerable paths. Anything claiming to be a known crawler is checked
10
+ against DNS and published address ranges, so a Googlebot arriving from some random
11
+ datacentre gets called what it is. What you end up with is your traffic broken
12
+ down by what each client is for. The User-Agent still counts -- it's just treated
13
+ as a claim to weigh against behaviour and origin, not a fact to take on trust.
14
+
15
+ [Here's a sample report](https://projects.mnot.net/agent-census/) generated from a real access log.
16
+
17
+ ## Install
18
+
19
+ ```
20
+ pipx install agent-census
21
+ ```
22
+
23
+ ## Use
24
+
25
+ The simplest case is an Apache log in the default `combined` format:
26
+
27
+ ```
28
+ agent-census analyze /var/log/apache2/access.log
29
+ ```
30
+
31
+ You can pass several rotated logs at once. They're pooled into one analysis, so a
32
+ client that spans the rotation is counted once:
33
+
34
+ ```
35
+ agent-census analyze /var/log/httpd/access.log*
36
+ ```
37
+
38
+ For a custom format, pass the `LogFormat`/`CustomLog` directive string verbatim
39
+ from your Apache config. Tab separators (`\t`), quoted fields with spaces,
40
+ `%{...}x` SSL variables, and `%{...}e` environment variables are all handled:
41
+
42
+ ```
43
+ agent-census analyze access.log \
44
+ --log-format '%h %l %u %t "%r" %>s %b "%{Referer}i" "%{User-Agent}i" %D'
45
+ ```
46
+
47
+ The presets `common`, `combined`, and `vhost_combined` are available via
48
+ `--log-format-preset`. Options may appear before, after, or between the log files.
49
+
50
+ Cloudflare Logpush logs (newline-delimited JSON) are also supported, as another
51
+ preset:
52
+
53
+ ```
54
+ agent-census analyze cloudflare-logs.json --log-format-preset cloudflare
55
+ ```
56
+
57
+ Cloudflare logs carry the client's AS number, so network and ASN-based detection
58
+ work without any extra configuration.
59
+
60
+ ### What to log
61
+
62
+ The Apache `combined` format already carries everything the core analysis needs.
63
+ The `common` preset drops the User-Agent and the Referer, so prefer `combined`,
64
+ or a custom format that includes them.
65
+
66
+ Required (all present in `combined`):
67
+
68
+ - **Client address** (`%h`) -- the identity everything else groups on, and the
69
+ basis for the network, datacentre, and crawler-verification checks.
70
+ - **Timestamp** (`%t`) -- timing regularity, peak request rate, the reported time
71
+ range, and (with `--quiescent-hours`) freeing memory mid-run.
72
+ - **Request line** (`"%r"`) -- the method and path; the most load-bearing field,
73
+ behind vulnerability probing, feed detection, path coverage, and crawl shape.
74
+ - **Status code** (`%>s`) -- the status mix, 404 storms, `304 Not Modified` (the
75
+ `has-cache` tag), and robots.txt compliance.
76
+ - **User-Agent** (`"%{User-Agent}i"`) -- browser, bot, and declared-crawler
77
+ recognition.
78
+
79
+ Recommended. The first two are already in `combined`; the rest aren't in any
80
+ preset, so add them to a custom `LogFormat` (quoted) -- they're worth it:
81
+
82
+ - **Referer** (`"%{Referer}i"`, in `combined`) -- referer-following, which
83
+ separates crawlers from scrapers and flags fabricated referers.
84
+ - **Bytes sent** (`%b` or `%B`, in `combined`) -- the bandwidth figures in the
85
+ report.
86
+ - **AS organisation and number** (`"%{MM_ASORG}e"` and `"%{MM_ASN}e"`, MaxMind
87
+ `mod_maxminddb`) -- name datacentre clients by their hosting organisation, and
88
+ recognise datacentres and ASN-listed crawlers by AS number. Much of
89
+ [Networks and hosting](#networks-and-hosting) leans on these; log **both** (the
90
+ number drives recognition, the org names it).
91
+ - **Content-Type** (`"%{Content-Type}o"`) -- the response media type, which
92
+ sharpens feed-reader detection (an RSS/Atom type, not just a feed-shaped URL).
93
+ - **X-Forwarded-For** (`"%{X-Forwarded-For}i"`) -- if you're behind a CDN or
94
+ proxy, for `--identity forwarded`.
95
+
96
+ Response time (`%D` / `%T`) is parsed if present but not currently used by the
97
+ analysis. The virtual host (`%v`) is likewise parsed; it is used only to filter lines with `--vhost` (see "Scoping to one site").
98
+
99
+ Output is Markdown by default. Pass `--html` for a self-contained, styled page
100
+ (one file, no external assets) you can open in a browser. Both formats work for
101
+ `analyze` and `inspect`:
102
+
103
+ ```
104
+ agent-census analyze access.log --html -o census.html
105
+ ```
106
+
107
+ The report opens with a summary of each kind, then a cross-tab of where each
108
+ kind's traffic came from (see [Networks and hosting](#networks-and-hosting)),
109
+ then the notable clients in each kind. Within a kind, clients that differ only
110
+ by IP address and origin AS — same User-Agent, same tags — are collapsed into
111
+ one row showing their combined traffic; in the HTML report a disclosure expands
112
+ to the per-IP/ASN breakdown, and `inspect` always lists them individually.
113
+
114
+ ### robots.txt compliance
115
+
116
+ To check `robots.txt` compliance, give agent-census the file. A local copy is the
117
+ default, since it should match the period the log covers:
118
+
119
+ ```
120
+ agent-census analyze access.log --robots-file ./robots.txt
121
+ ```
122
+
123
+ Naming a host or URL instead fetches it over the network. A live `robots.txt` may
124
+ not match the rules that applied when the log was written, so the report flags it:
125
+
126
+ ```
127
+ agent-census analyze access.log --host example.com
128
+ ```
129
+
130
+ The summary's robots column reads `N✓ / M✗ / K?`: respected, ignored, or too few
131
+ requests to tell (a client that hasn't yet requested a disallowed path isn't
132
+ counted either way).
133
+
134
+ ### Verifying declared crawlers
135
+
136
+ A User-Agent claiming Googlebot proves nothing on its own. Verification checks the
137
+ client's IP against the crawler's published address ranges and its reverse/forward
138
+ DNS. It runs by default and makes network calls (DNS lookups, and the occasional
139
+ ranges fetch); turn it off for an offline, faster run:
140
+
141
+ ```
142
+ agent-census analyze access.log --no-verify-bots
143
+ ```
144
+
145
+ A verified crawler's IPs collapse into one entry keyed by its domain. A client
146
+ whose IP is outside the published ranges, or whose reverse DNS doesn't check out,
147
+ is classed `impersonator`, which means a forged identity that verification has
148
+ disproved. Misbehaviour is separate: a "Googlebot" that probes for `/.env` keeps
149
+ its declared kind and gets a `probing` tag (and `ignores-robots` if it earns one),
150
+ because a real crawler can still behave badly. With verification off there's
151
+ nothing to disprove the claim, so it stays a declared crawler with those tags.
152
+
153
+ ### Networks and hosting
154
+
155
+ Where a client comes from matters. A "browser" arriving from a datacentre rather
156
+ than a consumer ISP is usually automation. agent-census recognises the major
157
+ cloud and hosting providers (AWS, Google Cloud, Cloudflare, Hetzner) from their
158
+ published IP ranges, folds shared-egress traffic (iCloud Private Relay, Tor) into
159
+ one entry per network, and breaks the kinds down by origin network in a cross-tab.
160
+ In the HTML report that table is interactive: switch between raw counts, share of
161
+ each kind, and share of each network, with the busier cells shaded.
162
+
163
+ Range lists are fetched and cached weekly by default. `--no-fetch-ranges` stays
164
+ offline on the bundled data.
165
+
166
+ If your log carries the client's autonomous-system details (for example from
167
+ MaxMind's `mod_maxminddb`: `%{MM_ASORG}e` for the organisation and `%{MM_ASN}e`
168
+ for the number, quoted in your `LogFormat`), datacentre clients are named by their
169
+ hosting organisation. You can also list extra AS numbers to treat as datacentres
170
+ in the bundled `datacenter_ranges.toml`.
171
+
172
+ ### Inspecting a client
173
+
174
+ To see why a client was classified the way it was, use `inspect`. It shows every
175
+ signal that fired (including the runners-up), the measured features, the
176
+ `robots.txt` finding, and the request trace:
177
+
178
+ ```
179
+ agent-census inspect access.log --kind vuln_scanner
180
+ agent-census inspect access.log --client 203.0.113.66
181
+ agent-census inspect access.log --kind scraper --network aws
182
+ ```
183
+
184
+ `--network` matches a substring of the origin-network name and composes with
185
+ `--kind`, so the two together select a single cell of the cross-tab.
186
+
187
+ ### Identity
188
+
189
+ How requests are grouped into clients is configurable, since no single rule fits
190
+ every deployment. The default, `ip_ua`, groups by (IP, User-Agent). Behind a CDN,
191
+ use `forwarded` (the left-most `X-Forwarded-For`); for IP-rotating bots in one
192
+ range, `ip_ua_subnet`. The report notes how the chosen strategy fragmented or
193
+ merged the data, so you can judge whether it fit.
194
+
195
+ ```
196
+ agent-census analyze access.log --identity forwarded
197
+ ```
198
+
199
+ ### Scoping to one site
200
+
201
+ If one server's log mixes several virtual hosts, `--vhost SUBSTRING` analyses
202
+ only the lines served for a matching host (matched against the logged `%v`, or
203
+ the `Host` header if you don't log `%v`). The filtered lines are reported as
204
+ excluded, separately from parse skips. `--vhost` is repeatable — a line is kept
205
+ if it matches any of the given hosts.
206
+
207
+ ```
208
+ agent-census analyze access.log --log-format-preset vhost_combined \
209
+ --vhost mnot.net --vhost www.mnot.net
210
+ ```
211
+
212
+ This also sidesteps a CDN artefact: if a slice of your traffic was proxied to
213
+ this origin under another hostname, those requests arrive from the CDN's IPs
214
+ (so they can't be attributed or crawler-verified). Scoping to your own host
215
+ drops that slice cleanly.
216
+
217
+ ### Remembered settings
218
+
219
+ Some options are sticky, so you needn't retype them. `--log-format` /
220
+ `--log-format-preset`, `--identity`, and `--robots-file` / `--robots-url` are
221
+ saved to `~/.config/agent-census/config.json` and reused when a later run omits
222
+ them. Passing one updates the saved value.
223
+
224
+ ## How it works
225
+
226
+ Classification is based on behaviour, not just the User-Agent (which is easy to
227
+ forge). Each client's requests are reduced to measured features: request volume,
228
+ status mix, timing regularity, sub-resource co-loading, path coverage, and the
229
+ like. A set of independent classifiers each vote for a kind, with a confidence and
230
+ the reasons behind it. The strongest vote wins, or `unknown` if nothing clears a
231
+ threshold. Secondary tags such as `verified`, `ignores-robots`, `datacenter`, and
232
+ `has-cache` annotate the result.
233
+
234
+ The confidence weights and the threshold are hand-tuned, so check the
235
+ classifications against your own logs before trusting the headline numbers.
236
+ `inspect` shows why any client landed where it did.
237
+
238
+ ## Contributing
239
+
240
+ Contributions are welcome. See [CONTRIBUTING.md](CONTRIBUTING.md) for the
241
+ development setup, conventions, and an outline of how the code fits together.
@@ -0,0 +1,23 @@
1
+ __version__ = "0.0.1"
2
+ __author__ = "Mark Nottingham <mnot@mnot.net>"
3
+ __copyright__ = """\
4
+ Copyright (c) Mark Nottingham
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in
14
+ all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
+ THE SOFTWARE.
23
+ """
@@ -0,0 +1,10 @@
1
+ """Enable ``python -m agent_census``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+
7
+ from .cli import main
8
+
9
# Script entry point for ``python -m agent_census``: run the CLI and hand
# whatever ``main()`` returns to the interpreter as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
@@ -0,0 +1,53 @@
1
+ """Client classification: independent rule-based classifiers + a combiner.
2
+
3
+ The public entry point is :func:`classify_client`, which runs every registered
4
+ classifier over a client's features and combines their signals into a single
5
+ :class:`~agent_census.model.Classification` (primary kind plus tags).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from ..model import BotVerification, Classification, ClientFeatures, ComplianceReport, Signal
11
+ from .base import Classifier
12
+ from .combiner import DEFAULT_UNKNOWN_THRESHOLD, combine
13
+ from .registry import all_classifiers
14
+
15
+
16
def run_classifiers(features: ClientFeatures) -> list[Signal]:
    """Gather the signals every registered classifier emits for one client.

    Each classifier yielded by ``all_classifiers()`` is asked to ``evaluate``
    the same ``features``. The per-classifier results are concatenated into a
    single flat list, in the order the classifiers are yielded.
    """
    return [
        signal
        for classifier in all_classifiers()
        for signal in classifier.evaluate(features)
    ]
22
+
23
+
24
def classify_client(
    features: ClientFeatures,
    *,
    compliance: ComplianceReport | None = None,
    verification: BotVerification | None = None,
    datacenter: bool = False,
    unknown_threshold: float = DEFAULT_UNKNOWN_THRESHOLD,
    keep_signals: bool = True,
) -> Classification:
    """Classify one client: collect classifier signals, then combine them.

    The signals come from :func:`run_classifiers`. Every keyword-only option
    (``compliance``, ``verification``, ``datacenter``, ``unknown_threshold``
    and ``keep_signals``) is forwarded unchanged to :func:`combine`, which
    produces the returned :class:`Classification`.
    """
    return combine(
        run_classifiers(features),
        features,
        compliance=compliance,
        verification=verification,
        datacenter=datacenter,
        unknown_threshold=unknown_threshold,
        keep_signals=keep_signals,
    )
44
+
45
+
46
+ __all__ = [
47
+ "Classifier",
48
+ "classify_client",
49
+ "run_classifiers",
50
+ "combine",
51
+ "all_classifiers",
52
+ "DEFAULT_UNKNOWN_THRESHOLD",
53
+ ]
@@ -0,0 +1,13 @@
1
+ """Declared AI / LLM data-gathering crawlers (GPTBot, ClaudeBot, Google-Extended, ...)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from ..model import Kind
6
+ from .known_bot import KnownBotClassifier
7
+
8
+
9
class AiCrawlerClassifier(KnownBotClassifier):
    """Classifier for declared AI / LLM data-gathering crawlers.

    Only class-level configuration is set here; no methods are overridden,
    so all behaviour comes from ``KnownBotClassifier``.
    """

    # Identifiers for this classifier.
    # NOTE(review): ``category`` presumably selects the bundled ai_crawler
    # data set -- confirm against KnownBotClassifier.
    name = "ai_crawler"
    category = "ai_crawler"

    # Kind attached to this classifier, and its human-readable description.
    label = Kind.AI_CRAWLER
    descriptor = "AI / LLM crawler"