crawler-user-agents 1.49.0 → 1.50.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Build a statically linked clf-filter binary and attach it to the GitHub release for each tag.
|
|
2
|
+
|
|
3
|
+
name: Release binary
|
|
4
|
+
|
|
5
|
+
on:
|
|
6
|
+
push:
|
|
7
|
+
tags:
|
|
8
|
+
- 'v*'
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
release-binary:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
permissions:
|
|
14
|
+
contents: write
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
- uses: actions/setup-go@v5
|
|
18
|
+
with:
|
|
19
|
+
go-version-file: go.mod
|
|
20
|
+
- name: Build static binary
|
|
21
|
+
run: CGO_ENABLED=0 go build -ldflags="-w -s" -o clf-filter ./cmd/clf-filter/
|
|
22
|
+
- name: Upload binary to release
|
|
23
|
+
uses: softprops/action-gh-release@v2
|
|
24
|
+
with:
|
|
25
|
+
files: clf-filter
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
// clf-filter reads Combined Log Format lines from stdin and writes them to stdout,
|
|
2
|
+
// removing bot/crawler lines by default. Use --bot to keep only bot lines.
|
|
3
|
+
package main
|
|
4
|
+
|
|
5
|
+
import (
|
|
6
|
+
"bufio"
|
|
7
|
+
"flag"
|
|
8
|
+
"fmt"
|
|
9
|
+
"os"
|
|
10
|
+
"strings"
|
|
11
|
+
|
|
12
|
+
agents "github.com/monperrus/crawler-user-agents"
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
// extractUserAgent returns the contents of the last double-quoted field of a
// Combined Log Format line, which by convention is the user agent
// (the line ends with: "referer" "user-agent").
//
// The closing quote is the last '"' in the line. The opening quote is the
// nearest preceding '"' that is not backslash-escaped, so a user agent that
// itself contains escaped quotes (logged as \") is returned whole rather than
// truncated at the inner quote. The returned text is raw: escape sequences are
// left exactly as they appear in the line.
//
// The boolean is false when the line does not contain two double quotes.
func extractUserAgent(line string) (string, bool) {
	end := strings.LastIndex(line, "\"")
	if end < 1 {
		return "", false
	}
	nearest := strings.LastIndex(line[:end], "\"")
	if nearest < 0 {
		return "", false
	}
	// Walk backwards over candidate opening quotes, skipping escaped ones.
	for start := nearest; start >= 0; start = strings.LastIndex(line[:start], "\"") {
		// A quote is escaped when preceded by an odd number of backslashes
		// (an even run is a sequence of escaped backslashes).
		backslashes := 0
		for i := start - 1; i >= 0 && line[i] == '\\'; i-- {
			backslashes++
		}
		if backslashes%2 == 0 {
			return line[start+1 : end], true
		}
	}
	// Every earlier quote is escaped (malformed line): fall back to the
	// nearest quote so such lines are treated as they were before.
	return line[nearest+1 : end], true
}
|
|
28
|
+
|
|
29
|
+
func main() {
|
|
30
|
+
botOnly := flag.Bool("bot", false, "keep only bot/crawler lines (default: remove bots)")
|
|
31
|
+
flag.Parse()
|
|
32
|
+
|
|
33
|
+
scanner := bufio.NewScanner(os.Stdin)
|
|
34
|
+
// Support long lines (e.g. large URLs).
|
|
35
|
+
scanner.Buffer(make([]byte, 1024*1024), 1024*1024)
|
|
36
|
+
|
|
37
|
+
for scanner.Scan() {
|
|
38
|
+
line := scanner.Text()
|
|
39
|
+
ua, ok := extractUserAgent(line)
|
|
40
|
+
isBot := ok && agents.IsCrawler(ua)
|
|
41
|
+
|
|
42
|
+
if *botOnly == isBot {
|
|
43
|
+
fmt.Println(line)
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
if err := scanner.Err(); err != nil {
|
|
48
|
+
fmt.Fprintln(os.Stderr, "clf-filter: read error:", err)
|
|
49
|
+
os.Exit(1)
|
|
50
|
+
}
|
|
51
|
+
}
|