crawler-user-agents 1.49.0 → 1.51.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Build a statically linked clf-filter binary (for the ubuntu-latest runner's platform) and attach it to the GitHub release whenever a tag matching 'v*' is pushed.
|
|
2
|
+
|
|
3
|
+
name: Release binary
|
|
4
|
+
|
|
5
|
+
on:
|
|
6
|
+
push:
|
|
7
|
+
tags:
|
|
8
|
+
- 'v*'
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
release-binary:
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
permissions:
|
|
14
|
+
contents: write
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
- uses: actions/setup-go@v5
|
|
18
|
+
with:
|
|
19
|
+
go-version-file: go.mod
|
|
20
|
+
- name: Build static binary
|
|
21
|
+
run: CGO_ENABLED=0 go build -ldflags="-w -s" -o clf-filter ./cmd/clf-filter/
|
|
22
|
+
- name: Upload binary to release
|
|
23
|
+
uses: softprops/action-gh-release@v2
|
|
24
|
+
with:
|
|
25
|
+
files: clf-filter
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
// clf-filter reads Combined Log Format lines from stdin and writes them to stdout,
|
|
2
|
+
// removing bot/crawler lines by default. Use --bot to keep only bot lines.
|
|
3
|
+
package main
|
|
4
|
+
|
|
5
|
+
import (
|
|
6
|
+
"bufio"
|
|
7
|
+
"flag"
|
|
8
|
+
"fmt"
|
|
9
|
+
"os"
|
|
10
|
+
"strings"
|
|
11
|
+
|
|
12
|
+
agents "github.com/monperrus/crawler-user-agents"
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
// extractUserAgent returns the contents of the last double-quoted field of
// line, which in Combined Log Format (`... "referer" "user-agent"`) is the
// User-Agent.
//
// The second result is false when line does not contain a complete quoted
// field. The returned value is the raw text between the quotes; escape
// sequences such as `\"` are left as written in the log.
func extractUserAgent(line string) (string, bool) {
	// The closing quote of the field is the last quote on the line.
	end := strings.LastIndexByte(line, '"')
	if end < 1 {
		return "", false
	}

	// Walk backwards to the matching opening quote. A quote preceded by an
	// odd number of backslashes is an escaped literal quote inside the value
	// (Apache logs `"` as `\"` and `\` as `\\`), so it does not delimit the
	// field and must be skipped.
	for start := end - 1; start >= 0; start-- {
		if line[start] != '"' {
			continue
		}
		backslashes := 0
		for i := start - 1; i >= 0 && line[i] == '\\'; i-- {
			backslashes++
		}
		if backslashes%2 == 0 {
			return line[start+1 : end], true
		}
	}
	return "", false
}
|
|
28
|
+
|
|
29
|
+
func main() {
|
|
30
|
+
botOnly := flag.Bool("bot", false, "keep only bot/crawler lines (default: remove bots)")
|
|
31
|
+
flag.Parse()
|
|
32
|
+
|
|
33
|
+
scanner := bufio.NewScanner(os.Stdin)
|
|
34
|
+
// Support long lines (e.g. large URLs).
|
|
35
|
+
scanner.Buffer(make([]byte, 1024*1024), 1024*1024)
|
|
36
|
+
|
|
37
|
+
for scanner.Scan() {
|
|
38
|
+
line := scanner.Text()
|
|
39
|
+
ua, ok := extractUserAgent(line)
|
|
40
|
+
isBot := ok && agents.IsCrawler(ua)
|
|
41
|
+
|
|
42
|
+
if *botOnly == isBot {
|
|
43
|
+
fmt.Println(line)
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
if err := scanner.Err(); err != nil {
|
|
48
|
+
fmt.Fprintln(os.Stderr, "clf-filter: read error:", err)
|
|
49
|
+
os.Exit(1)
|
|
50
|
+
}
|
|
51
|
+
}
|
package/crawler-user-agents.json
CHANGED
|
@@ -18363,5 +18363,17 @@
|
|
|
18363
18363
|
"tags": [
|
|
18364
18364
|
"monitoring"
|
|
18365
18365
|
]
|
|
18366
|
+
},
|
|
18367
|
+
{
|
|
18368
|
+
"pattern": "GeedoShopProductFinder",
|
|
18369
|
+
"addition_date": "2026/06/17",
|
|
18370
|
+
"url": "https://geedo.com/product-finder/",
|
|
18371
|
+
"instances": [
|
|
18372
|
+
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko; GeedoShopProductFinder) Chrome/142.0.0.0 Safari/537.36"
|
|
18373
|
+
],
|
|
18374
|
+
"description": "GeedoShopProductFinder is the automated crawler used by Geedo, a global product-search engine specialized in online retail.",
|
|
18375
|
+
"tags": [
|
|
18376
|
+
"search-engine"
|
|
18377
|
+
]
|
|
18366
18378
|
}
|
|
18367
18379
|
]
|