crawler-user-agents 1.48.0 → 1.50.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Build a statically linked clf-filter binary and attach it to the GitHub release for each pushed `v*` tag.
|
|
2
|
+
|
|
3
|
+
name: Release binary
|
|
4
|
+
|
|
5
|
+
on:
|
|
6
|
+
push:
|
|
7
|
+
tags:
|
|
8
|
+
- 'v*'
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
release-binary:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
permissions:
|
|
14
|
+
contents: write
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
- uses: actions/setup-go@v5
|
|
18
|
+
with:
|
|
19
|
+
go-version-file: go.mod
|
|
20
|
+
- name: Build static binary
|
|
21
|
+
run: CGO_ENABLED=0 go build -ldflags="-w -s" -o clf-filter ./cmd/clf-filter/
|
|
22
|
+
- name: Upload binary to release
|
|
23
|
+
uses: softprops/action-gh-release@v2
|
|
24
|
+
with:
|
|
25
|
+
files: clf-filter
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
// clf-filter reads Combined Log Format lines from stdin and writes them to stdout,
|
|
2
|
+
// removing bot/crawler lines by default. Use --bot to keep only bot lines.
|
|
3
|
+
package main
|
|
4
|
+
|
|
5
|
+
import (
|
|
6
|
+
"bufio"
|
|
7
|
+
"flag"
|
|
8
|
+
"fmt"
|
|
9
|
+
"os"
|
|
10
|
+
"strings"
|
|
11
|
+
|
|
12
|
+
agents "github.com/monperrus/crawler-user-agents"
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
// extractUserAgent returns the contents of the last double-quoted field on a
// Combined Log Format line. In that format the line ends with
// "referer" "user-agent", so the last quoted field is the user agent.
//
// The field is returned exactly as logged: escape sequences such as \" are
// not decoded. The boolean result is false when the line does not contain a
// complete quoted field.
func extractUserAgent(line string) (string, bool) {
	// The last quote on the line closes the user-agent field.
	end := strings.LastIndex(line, "\"")
	if end < 1 {
		return "", false
	}

	// nearest is the quote closest to the closing one; it is the answer the
	// simple "previous quote" search would give and serves as the fallback.
	nearest := strings.LastIndex(line[:end], "\"")
	if nearest < 0 {
		return "", false
	}

	// Walk backwards to the real opening quote. Servers such as Apache write
	// a quote embedded in a header value as \" (and a backslash as \\), so a
	// quote preceded by an odd number of backslashes belongs to the field's
	// content and must not be mistaken for its start.
	for start := nearest; start >= 0; start = strings.LastIndex(line[:start], "\"") {
		backslashes := 0
		for i := start - 1; i >= 0 && line[i] == '\\'; i-- {
			backslashes++
		}
		if backslashes%2 == 0 {
			return line[start+1 : end], true
		}
	}

	// Every earlier quote looked escaped (malformed line): keep the lenient
	// behaviour of returning the text after the nearest quote.
	return line[nearest+1 : end], true
}
|
|
28
|
+
|
|
29
|
+
func main() {
|
|
30
|
+
botOnly := flag.Bool("bot", false, "keep only bot/crawler lines (default: remove bots)")
|
|
31
|
+
flag.Parse()
|
|
32
|
+
|
|
33
|
+
scanner := bufio.NewScanner(os.Stdin)
|
|
34
|
+
// Support long lines (e.g. large URLs).
|
|
35
|
+
scanner.Buffer(make([]byte, 1024*1024), 1024*1024)
|
|
36
|
+
|
|
37
|
+
for scanner.Scan() {
|
|
38
|
+
line := scanner.Text()
|
|
39
|
+
ua, ok := extractUserAgent(line)
|
|
40
|
+
isBot := ok && agents.IsCrawler(ua)
|
|
41
|
+
|
|
42
|
+
if *botOnly == isBot {
|
|
43
|
+
fmt.Println(line)
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
if err := scanner.Err(); err != nil {
|
|
48
|
+
fmt.Fprintln(os.Stderr, "clf-filter: read error:", err)
|
|
49
|
+
os.Exit(1)
|
|
50
|
+
}
|
|
51
|
+
}
|
package/crawler-user-agents.json
CHANGED
|
@@ -18351,5 +18351,17 @@
|
|
|
18351
18351
|
"tags": [
|
|
18352
18352
|
"scanner"
|
|
18353
18353
|
]
|
|
18354
|
+
},
|
|
18355
|
+
{
|
|
18356
|
+
"pattern": "PRTG Network Monitor",
|
|
18357
|
+
"addition_date": "2026/05/18",
|
|
18358
|
+
"url": "https://www.paessler.com/manuals/prtg/http_transaction_sensor",
|
|
18359
|
+
"instances": [
|
|
18360
|
+
"Mozilla/5.0 (compatible; PRTG Network Monitor (www.paessler.com ); Windows)"
|
|
18361
|
+
],
|
|
18362
|
+
"description": "PRTG HTTP Transaction Sensor",
|
|
18363
|
+
"tags": [
|
|
18364
|
+
"monitoring"
|
|
18365
|
+
]
|
|
18354
18366
|
}
|
|
18355
18367
|
]
|