crawler-user-agents 1.0.129 → 1.0.130

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -23,3 +23,4 @@ jobs:
  - run: py.test -vv
  - run: python3 validate.py
  - run: php validate.php
+ - run: go test
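The added `go test` step exercises the new Go sources shipped in this release (see `package/validate.go` and the test file below). Assuming a Go toolchain of at least the version declared in `go.mod` is installed, the same check can be reproduced locally from the package root:

```
go test
```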
package/README.md CHANGED
@@ -34,6 +34,40 @@ Each `pattern` is a regular expression. It should work out-of-the-box with your f
  * JavaScript: `if (RegExp(entry.pattern).test(req.headers['user-agent'])) { ... }`
  * PHP: add a slash before and after the pattern: `if (preg_match('/'.$entry['pattern'].'/', $_SERVER['HTTP_USER_AGENT'])): ...`
  * Python: `if re.search(entry['pattern'], ua): ...`
+ * Go: use [this package](https://pkg.go.dev/github.com/monperrus/crawler-user-agents);
+ it provides the global variable `Crawlers` (kept in sync with `crawler-user-agents.json`)
+ and the functions `IsCrawler` and `MatchingCrawlers`.
+
+ Example Go program:
+
+ ```go
+ package main
+
+ import (
+     "fmt"
+
+     "github.com/monperrus/crawler-user-agents"
+ )
+
+ func main() {
+     userAgent := "Mozilla/5.0 (compatible; Discordbot/2.0; +https://discordapp.com)"
+
+     isCrawler := agents.IsCrawler(userAgent)
+     fmt.Println("isCrawler:", isCrawler)
+
+     indices := agents.MatchingCrawlers(userAgent)
+     fmt.Println("crawlers' indices:", indices)
+     fmt.Println("crawler's URL:", agents.Crawlers[indices[0]].URL)
+ }
+ ```
+
+ Output:
+
+ ```
+ isCrawler: true
+ crawlers' indices: [237]
+ crawler's URL: https://discordapp.com
+ ```
 
  ## Contributing
 
@@ -66,7 +100,6 @@ There are a few wrapper libraries that use this data to detect bots:
  * [Voight-Kampff](https://github.com/biola/Voight-Kampff) (Ruby)
  * [isbot](https://github.com/Hentioe/isbot) (Ruby)
  * [crawlers](https://github.com/Olical/crawlers) (Clojure)
- * [crawlerflagger](https://godoc.org/go.kelfa.io/kelfa/pkg/crawlerflagger) (Go)
  * [isBot](https://github.com/omrilotan/isbot) (Node.JS)
 
  Other systems for spotting robots, crawlers, and spiders that you may want to consider are:
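One caveat about the README example above: it indexes `indices[0]` unconditionally, which panics when the User Agent matches no crawler, since `MatchingCrawlers` then returns an empty slice. A minimal defensive sketch (hypothetical caller code, not part of the package; the UA string is arbitrary):

```go
package main

import (
	"fmt"

	agents "github.com/monperrus/crawler-user-agents"
)

func main() {
	userAgent := "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"

	// MatchingCrawlers returns an empty slice for non-crawler UAs,
	// so guard before indexing into agents.Crawlers.
	indices := agents.MatchingCrawlers(userAgent)
	if len(indices) == 0 {
		fmt.Println("not a crawler")
		return
	}
	for _, i := range indices {
		fmt.Println("matched crawler URL:", agents.Crawlers[i].URL)
	}
}
```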
package/go.mod ADDED
@@ -0,0 +1,3 @@
+ module github.com/monperrus/crawler-user-agents
+
+ go 1.19
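With this file the repository becomes importable as a Go module with no third-party dependencies (hence the empty `go.sum` below). A hypothetical downstream `go.mod` might look like this; the module name and the version tag are illustrative:

```
module example.com/myapp

go 1.19

require github.com/monperrus/crawler-user-agents v1.0.130
```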
package/go.sum ADDED
File without changes
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "crawler-user-agents",
-   "version": "1.0.129",
+   "version": "1.0.130",
    "main": "crawler-user-agents.json",
    "typings": "./index.d.ts",
    "author": "Martin Monperrus <martin.monperrus@gnieh.org>",
package/validate.go ADDED
@@ -0,0 +1,110 @@
+ package agents
+
+ import (
+     _ "embed"
+     "encoding/json"
+     "fmt"
+     "regexp"
+     "time"
+ )
+
+ //go:embed crawler-user-agents.json
+ var crawlersJson []byte
+
+ // Crawler contains information about one crawler.
+ type Crawler struct {
+     // Regexp of User Agent of the crawler.
+     Pattern string `json:"pattern"`
+
+     // Discovery date.
+     AdditionDate time.Time `json:"addition_date"`
+
+     // Official URL of the robot.
+     URL string `json:"url"`
+
+     // Examples of full User Agent strings.
+     Instances []string `json:"instances"`
+ }
+
+ // Private type needed to convert addition_date from/to the format used in JSON.
+ type jsonCrawler struct {
+     Pattern      string   `json:"pattern"`
+     AdditionDate string   `json:"addition_date"`
+     URL          string   `json:"url"`
+     Instances    []string `json:"instances"`
+ }
+
+ const timeLayout = "2006/01/02"
+
+ func (c Crawler) MarshalJSON() ([]byte, error) {
+     jc := jsonCrawler{
+         Pattern:      c.Pattern,
+         AdditionDate: c.AdditionDate.Format(timeLayout),
+         URL:          c.URL,
+         Instances:    c.Instances,
+     }
+     return json.Marshal(jc)
+ }
+
+ func (c *Crawler) UnmarshalJSON(b []byte) error {
+     var jc jsonCrawler
+     if err := json.Unmarshal(b, &jc); err != nil {
+         return err
+     }
+
+     c.Pattern = jc.Pattern
+     c.URL = jc.URL
+     c.Instances = jc.Instances
+
+     if c.Pattern == "" {
+         return fmt.Errorf("empty pattern in record %s", string(b))
+     }
+
+     if jc.AdditionDate != "" {
+         tim, err := time.ParseInLocation(timeLayout, jc.AdditionDate, time.UTC)
+         if err != nil {
+             return err
+         }
+         c.AdditionDate = tim
+     }
+
+     return nil
+ }
+
+ // Crawlers is the list of crawlers, built from the contents of crawler-user-agents.json.
+ var Crawlers = func() []Crawler {
+     var crawlers []Crawler
+     if err := json.Unmarshal(crawlersJson, &crawlers); err != nil {
+         panic(err)
+     }
+     return crawlers
+ }()
+
+ var regexps = func() []*regexp.Regexp {
+     regexps := make([]*regexp.Regexp, len(Crawlers))
+     for i, crawler := range Crawlers {
+         regexps[i] = regexp.MustCompile(crawler.Pattern)
+     }
+     return regexps
+ }()
+
+ // IsCrawler reports whether the User Agent string matches any of the crawler patterns.
+ func IsCrawler(userAgent string) bool {
+     for _, re := range regexps {
+         if re.MatchString(userAgent) {
+             return true
+         }
+     }
+     return false
+ }
+
+ // MatchingCrawlers finds all crawlers matching the User Agent and returns the list of their indices in Crawlers.
+ func MatchingCrawlers(userAgent string) []int {
+     indices := []int{}
+     for i, re := range regexps {
+         if re.MatchString(userAgent) {
+             indices = append(indices, i)
+         }
+     }
+     return indices
+ }
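Since `IsCrawler` is a pure function over the embedded pattern list (each call scans all compiled regexps linearly), a natural use is request filtering. A minimal sketch of HTTP middleware built on the new API; the middleware name and port are illustrative, not part of the package:

```go
package main

import (
	"fmt"
	"log"
	"net/http"

	agents "github.com/monperrus/crawler-user-agents"
)

// blockCrawlers rejects requests whose User-Agent matches any known
// crawler pattern; everything else is passed through to next.
func blockCrawlers(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if agents.IsCrawler(r.UserAgent()) {
			http.Error(w, "crawlers not allowed", http.StatusForbidden)
			return
		}
		next.ServeHTTP(w, r)
	})
}

func main() {
	mux := http.NewServeMux()
	mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprintln(w, "hello, human")
	})
	log.Fatal(http.ListenAndServe(":8080", blockCrawlers(mux)))
}
```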
package/validate_test.go ADDED
@@ -0,0 +1,121 @@
+ package agents
+
+ import (
+     "encoding/json"
+     "fmt"
+     "net/http"
+     "testing"
+ )
+
+ func contains(list []int, value int) bool {
+     for _, elem := range list {
+         if elem == value {
+             return true
+         }
+     }
+     return false
+ }
+
+ func TestPatterns(t *testing.T) {
+     // Loading all crawlers with go:embed;
+     // some validation happens in UnmarshalJSON.
+     allCrawlers := Crawlers
+
+     // There are at least 10 crawlers.
+     if len(allCrawlers) < 10 {
+         t.Errorf("Number of crawlers must be at least 10, got %d.", len(allCrawlers))
+     }
+
+     if IsCrawler(browserUA) {
+         t.Errorf("Browser UA %q was detected as a crawler.", browserUA)
+     }
+     if len(MatchingCrawlers(browserUA)) != 0 {
+         t.Errorf("MatchingCrawlers found crawlers matching Browser UA %q.", browserUA)
+     }
+
+     for i, crawler := range allCrawlers {
+         t.Run(crawler.Pattern, func(t *testing.T) {
+             fmt.Println(crawler.Pattern)
+
+             for _, instance := range crawler.Instances {
+                 if !IsCrawler(instance) {
+                     t.Errorf("Instance %q is not detected as a crawler.", instance)
+                 }
+                 hits := MatchingCrawlers(instance)
+                 if !contains(hits, i) {
+                     t.Errorf("Crawler with index %d (pattern %q) is not in the list returned by MatchingCrawlers(%q): %v.", i, crawler.Pattern, instance, hits)
+                 }
+             }
+         })
+     }
+ }
+
+ func TestFalseNegatives(t *testing.T) {
+     const browsersURL = "https://raw.githubusercontent.com/microlinkhq/top-user-agents/master/src/index.json"
+     resp, err := http.Get(browsersURL)
+     if err != nil {
+         t.Fatalf("Failed to fetch the list of browser User Agents from %s: %v.", browsersURL, err)
+     }
+
+     t.Cleanup(func() {
+         if err := resp.Body.Close(); err != nil {
+             t.Fatal(err)
+         }
+     })
+
+     var browsers []string
+     if err := json.NewDecoder(resp.Body).Decode(&browsers); err != nil {
+         t.Fatalf("Failed to parse the list of browser User Agents: %v.", err)
+     }
+
+     for _, userAgent := range browsers {
+         if IsCrawler(userAgent) {
+             t.Errorf("Browser User Agent %q is recognized as a crawler.", userAgent)
+         }
+         indices := MatchingCrawlers(userAgent)
+         if len(indices) != 0 {
+             t.Errorf("Browser User Agent %q matches with crawlers %v.", userAgent, indices)
+         }
+     }
+ }
+
+ const (
+     crawlerUA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 Google (+https://developers.google.com/+/web/snippet/"
+     browserUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.5.3 Chrome/114.0.5735.289 Electron/25.8.1 Safari/537.36"
+ )
+
+ func BenchmarkIsCrawlerPositive(b *testing.B) {
+     b.SetBytes(int64(len(crawlerUA)))
+     for n := 0; n < b.N; n++ {
+         if !IsCrawler(crawlerUA) {
+             b.Fail()
+         }
+     }
+ }
+
+ func BenchmarkMatchingCrawlersPositive(b *testing.B) {
+     b.SetBytes(int64(len(crawlerUA)))
+     for n := 0; n < b.N; n++ {
+         if len(MatchingCrawlers(crawlerUA)) == 0 {
+             b.Fail()
+         }
+     }
+ }
+
+ func BenchmarkIsCrawlerNegative(b *testing.B) {
+     b.SetBytes(int64(len(browserUA)))
+     for n := 0; n < b.N; n++ {
+         if IsCrawler(browserUA) {
+             b.Fail()
+         }
+     }
+ }
+
+ func BenchmarkMatchingCrawlersNegative(b *testing.B) {
+     b.SetBytes(int64(len(browserUA)))
+     for n := 0; n < b.N; n++ {
+         if len(MatchingCrawlers(browserUA)) != 0 {
+             b.Fail()
+         }
+     }
+ }
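The suite can be run locally with standard Go tooling; note that `TestFalseNegatives` downloads a browser User Agent list from GitHub, so it needs network access:

```
go test -v                    # tests only, as in the new CI step (plus -v)
go test -bench=. -benchmem    # also run the four throughput benchmarks
```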