crawler_detect 0.1.12 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/build.yml +38 -0
- data/.rubocop.yml +13 -168
- data/CHANGELOG.md +36 -0
- data/Gemfile.lock +42 -6
- data/README.md +27 -12
- data/bin/update_raw_files +10 -0
- data/crawler_detect.gemspec +6 -3
- data/lib/crawler_detect.rb +29 -7
- data/lib/crawler_detect/config.rb +29 -0
- data/lib/crawler_detect/detector.rb +27 -14
- data/lib/crawler_detect/library.rb +9 -3
- data/lib/crawler_detect/library/crawlers.rb +6 -1277
- data/lib/crawler_detect/library/exclusions.rb +6 -50
- data/lib/crawler_detect/library/headers.rb +6 -17
- data/lib/crawler_detect/library/loader.rb +18 -0
- data/lib/crawler_detect/library/raw/Crawlers.json +1 -0
- data/lib/crawler_detect/library/raw/Exclusions.json +1 -0
- data/lib/crawler_detect/library/raw/Headers.json +1 -0
- data/lib/crawler_detect/version.rb +2 -1
- data/lib/rack/crawler_detect.rb +20 -17
- metadata +41 -6
- data/.travis.yml +0 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6535ee2f876e0b1e05444bf0bb4a7a9082fd70ba4a454d7b1e2e9737b4b84d41
|
4
|
+
data.tar.gz: ff05c16b5cd08416dfded4a8ca3c83ee73d2599dc2641aa8d212a7d3d72f3bd0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 384c92b53a77f3b8280259897060f7fc74e6d0ac214d6469cfdd87a84c290286ae9bacace2875c2c033d69a77eb863cb7eee5053552d0c619b85f1f15ba9a79d
|
7
|
+
data.tar.gz: 3904d9beef1ff2211881408d7ab2c60caf44c2103d07f8188a5c8d9454e58d274f1779ca4d0ad0ca53a548631062dd9f4f97c32c0ee9e811191b17c117b59db5
|
@@ -0,0 +1,38 @@
|
|
1
|
+
name: build
|
2
|
+
|
3
|
+
on: [push]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
lint:
|
7
|
+
runs-on: ubuntu-latest
|
8
|
+
steps:
|
9
|
+
- uses: actions/checkout@v2
|
10
|
+
- uses: ruby/setup-ruby@v1
|
11
|
+
with:
|
12
|
+
ruby-version: 2.7
|
13
|
+
- name: install gems
|
14
|
+
run: |
|
15
|
+
gem install bundler
|
16
|
+
bundle install --jobs 4 --retry 3
|
17
|
+
- run: bundle exec rubocop
|
18
|
+
test:
|
19
|
+
needs: lint
|
20
|
+
runs-on: ubuntu-latest
|
21
|
+
strategy:
|
22
|
+
fail-fast: false
|
23
|
+
matrix:
|
24
|
+
ruby: [2.5, 2.6, 2.7]
|
25
|
+
test-group: [1, 2, 3, 4]
|
26
|
+
name: Ruby ${{ matrix.ruby }}, test-group ${{ matrix.test-group }}
|
27
|
+
steps:
|
28
|
+
- uses: actions/checkout@v2
|
29
|
+
- uses: ruby/setup-ruby@v1
|
30
|
+
with:
|
31
|
+
ruby-version: ${{ matrix.ruby }}
|
32
|
+
- name: install gems
|
33
|
+
run: |
|
34
|
+
gem install bundler
|
35
|
+
bundle install --jobs 4 --retry 3
|
36
|
+
- name: test
|
37
|
+
run: |
|
38
|
+
bundle exec parallel_rspec spec/ -n 4 --only-group ${{ matrix.test-group }} --group-by runtime --runtime-log spec/fixtures/parallel_runtime_rspec.log
|
data/.rubocop.yml
CHANGED
@@ -1,174 +1,19 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
Exclude:
|
7
|
-
- '**/templates/**/*'
|
8
|
-
- '**/vendor/**/*'
|
9
|
-
- '**/vendor/**/.*'
|
10
|
-
- '**/node_modules/**/*'
|
11
|
-
- 'actionpack/lib/action_dispatch/journey/parser.rb'
|
1
|
+
inherit_gem:
|
2
|
+
armitage-rubocop:
|
3
|
+
- lib/rubocop.general.yml
|
4
|
+
- lib/rubocop.rake.yml
|
5
|
+
- lib/rubocop.rspec.yml
|
12
6
|
|
13
|
-
|
14
|
-
|
7
|
+
AllCops:
|
8
|
+
TargetRubyVersion: 2.7.1
|
15
9
|
Include:
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
# Do not use braces for hash literals when they are the last argument of a
|
23
|
-
# method call.
|
24
|
-
Style/BracesAroundHashParameters:
|
25
|
-
Enabled: true
|
26
|
-
EnforcedStyle: context_dependent
|
27
|
-
|
28
|
-
# Align `when` with `case`.
|
29
|
-
Layout/CaseIndentation:
|
30
|
-
Enabled: true
|
31
|
-
|
32
|
-
# Align comments with method definitions.
|
33
|
-
Layout/CommentIndentation:
|
34
|
-
Enabled: true
|
35
|
-
|
36
|
-
Layout/ElseAlignment:
|
37
|
-
Enabled: true
|
38
|
-
|
39
|
-
# Align `end` with the matching keyword or starting expression except for
|
40
|
-
# assignments, where it should be aligned with the LHS.
|
41
|
-
Layout/EndAlignment:
|
42
|
-
Enabled: true
|
43
|
-
EnforcedStyleAlignWith: variable
|
44
|
-
AutoCorrect: true
|
45
|
-
|
46
|
-
Layout/EmptyLineAfterMagicComment:
|
47
|
-
Enabled: true
|
48
|
-
|
49
|
-
# In a regular class definition, no empty lines around the body.
|
50
|
-
Layout/EmptyLinesAroundClassBody:
|
51
|
-
Enabled: true
|
52
|
-
|
53
|
-
# In a regular method definition, no empty lines around the body.
|
54
|
-
Layout/EmptyLinesAroundMethodBody:
|
55
|
-
Enabled: true
|
10
|
+
- lib/**/*.rb
|
11
|
+
- spec/**/*.rb
|
12
|
+
- Gemfile
|
13
|
+
- Rakefile
|
14
|
+
- crawler_detect.gemspec
|
15
|
+
- bin/console
|
56
16
|
|
57
|
-
# In a regular module definition, no empty lines around the body.
|
58
|
-
Layout/EmptyLinesAroundModuleBody:
|
59
|
-
Enabled: true
|
60
|
-
|
61
|
-
Layout/FirstParameterIndentation:
|
62
|
-
Enabled: true
|
63
|
-
|
64
|
-
# Use Ruby >= 1.9 syntax for hashes. Prefer { a: :b } over { :a => :b }.
|
65
|
-
Style/HashSyntax:
|
66
|
-
Enabled: true
|
67
|
-
|
68
|
-
# Method definitions after `private` or `protected` isolated calls need one
|
69
|
-
# extra level of indentation.
|
70
|
-
Layout/IndentationConsistency:
|
71
|
-
Enabled: true
|
72
|
-
EnforcedStyle: rails
|
73
|
-
|
74
|
-
# Two spaces, no tabs (for indentation).
|
75
|
-
Layout/IndentationWidth:
|
76
|
-
Enabled: true
|
77
|
-
|
78
|
-
Layout/LeadingCommentSpace:
|
79
|
-
Enabled: true
|
80
|
-
|
81
|
-
Layout/SpaceAfterColon:
|
82
|
-
Enabled: true
|
83
|
-
|
84
|
-
Layout/SpaceAfterComma:
|
85
|
-
Enabled: true
|
86
|
-
|
87
|
-
Layout/SpaceAroundEqualsInParameterDefault:
|
88
|
-
Enabled: true
|
89
|
-
|
90
|
-
Layout/SpaceAroundKeyword:
|
91
|
-
Enabled: true
|
92
|
-
|
93
|
-
Layout/SpaceAroundOperators:
|
94
|
-
Enabled: true
|
95
|
-
|
96
|
-
Layout/SpaceBeforeComma:
|
97
|
-
Enabled: true
|
98
|
-
|
99
|
-
Layout/SpaceBeforeFirstArg:
|
100
|
-
Enabled: true
|
101
|
-
|
102
|
-
Style/DefWithParentheses:
|
103
|
-
Enabled: true
|
104
|
-
|
105
|
-
# Defining a method with parameters needs parentheses.
|
106
|
-
Style/MethodDefParentheses:
|
107
|
-
Enabled: true
|
108
|
-
|
109
|
-
Style/FrozenStringLiteralComment:
|
110
|
-
Enabled: true
|
111
|
-
EnforcedStyle: always
|
112
|
-
Exclude:
|
113
|
-
- 'actionview/test/**/*.builder'
|
114
|
-
- 'actionview/test/**/*.ruby'
|
115
|
-
- 'actionpack/test/**/*.builder'
|
116
|
-
- 'actionpack/test/**/*.ruby'
|
117
|
-
- 'activestorage/db/migrate/**/*.rb'
|
118
|
-
- 'db/migrate/**/*.rb'
|
119
|
-
- 'db/*.rb'
|
120
|
-
|
121
|
-
# Use `foo {}` not `foo{}`.
|
122
|
-
Layout/SpaceBeforeBlockBraces:
|
123
|
-
Enabled: true
|
124
|
-
|
125
|
-
# Use `foo { bar }` not `foo {bar}`.
|
126
|
-
Layout/SpaceInsideBlockBraces:
|
127
|
-
Enabled: true
|
128
|
-
|
129
|
-
# Use `{ a: 1 }` not `{a:1}`.
|
130
|
-
Layout/SpaceInsideHashLiteralBraces:
|
131
|
-
Enabled: true
|
132
|
-
|
133
|
-
Layout/SpaceInsideParens:
|
134
|
-
Enabled: true
|
135
|
-
|
136
|
-
# Check quotes usage according to lint rule below.
|
137
17
|
Style/StringLiterals:
|
138
18
|
Enabled: true
|
139
19
|
EnforcedStyle: double_quotes
|
140
|
-
|
141
|
-
# Detect hard tabs, no hard tabs.
|
142
|
-
Layout/Tab:
|
143
|
-
Enabled: true
|
144
|
-
|
145
|
-
# Blank lines should not have any spaces.
|
146
|
-
Layout/TrailingBlankLines:
|
147
|
-
Enabled: true
|
148
|
-
|
149
|
-
# No trailing whitespace.
|
150
|
-
Layout/TrailingWhitespace:
|
151
|
-
Enabled: true
|
152
|
-
|
153
|
-
# Use quotes for string literals when they are enough.
|
154
|
-
Style/UnneededPercentQ:
|
155
|
-
Enabled: true
|
156
|
-
|
157
|
-
# Use my_method(my_arg) not my_method( my_arg ) or my_method my_arg.
|
158
|
-
Lint/RequireParentheses:
|
159
|
-
Enabled: true
|
160
|
-
|
161
|
-
Lint/StringConversionInInterpolation:
|
162
|
-
Enabled: true
|
163
|
-
|
164
|
-
Style/RedundantReturn:
|
165
|
-
Enabled: true
|
166
|
-
AllowMultipleReturnValues: true
|
167
|
-
|
168
|
-
Style/Semicolon:
|
169
|
-
Enabled: true
|
170
|
-
AllowAsExpressionSeparator: true
|
171
|
-
|
172
|
-
# Prefer Foo.method over Foo::method
|
173
|
-
Style/ColonMethodCall:
|
174
|
-
Enabled: true
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# CrawlerDetect major changes
|
2
|
+
|
3
|
+
This changelog **does not contain** raw data updates
|
4
|
+
but only major changes.
|
5
|
+
|
6
|
+
1.1.0
|
7
|
+
---------
|
8
|
+
- Moves to Ruby's Stdlib JSON implementation to reduce dependencies
|
9
|
+
|
10
|
+
1.0.0
|
11
|
+
---------
|
12
|
+
- Use raw JSON files instead of copying them to rb [#8]
|
13
|
+
- Add CrawlerDetect::Config to make it possible to use your own raw files [#8]
|
14
|
+
- Add bin/update_raw_files to update raw files from PHP lib [#8]
|
15
|
+
- Add Changelog
|
16
|
+
|
17
|
+
0.1.11
|
18
|
+
---------
|
19
|
+
- Add thread safety [#19]
|
20
|
+
|
21
|
+
0.1.6
|
22
|
+
---------
|
23
|
+
- Strip crawler name [#10]
|
24
|
+
|
25
|
+
0.1.2
|
26
|
+
---------
|
27
|
+
- Add parallel tests [#2]
|
28
|
+
|
29
|
+
0.1.1
|
30
|
+
---------
|
31
|
+
- Fix: rack request
|
32
|
+
|
33
|
+
0.1.0
|
34
|
+
---------
|
35
|
+
- init
|
36
|
+
|
data/Gemfile.lock
CHANGED
@@ -1,16 +1,25 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
crawler_detect (
|
4
|
+
crawler_detect (1.1.0)
|
5
|
+
qonfig (~> 0.24)
|
5
6
|
|
6
7
|
GEM
|
7
8
|
remote: https://rubygems.org/
|
8
9
|
specs:
|
9
|
-
activesupport (
|
10
|
+
activesupport (6.0.3.1)
|
10
11
|
concurrent-ruby (~> 1.0, >= 1.0.2)
|
11
12
|
i18n (>= 0.7, < 2)
|
12
13
|
minitest (~> 5.1)
|
13
14
|
tzinfo (~> 1.1)
|
15
|
+
zeitwerk (~> 2.2, >= 2.2.2)
|
16
|
+
armitage-rubocop (0.82.0)
|
17
|
+
rubocop (= 0.82.0)
|
18
|
+
rubocop-performance (= 1.5.2)
|
19
|
+
rubocop-rails (= 2.5.2)
|
20
|
+
rubocop-rake (= 0.5.1)
|
21
|
+
rubocop-rspec (= 1.38.1)
|
22
|
+
ast (2.4.0)
|
14
23
|
awesome_print (1.8.0)
|
15
24
|
byebug (11.1.2)
|
16
25
|
coderay (1.1.2)
|
@@ -21,11 +30,14 @@ GEM
|
|
21
30
|
ruby-progressbar (~> 1.4)
|
22
31
|
i18n (1.8.2)
|
23
32
|
concurrent-ruby (~> 1.0)
|
33
|
+
jaro_winkler (1.5.4)
|
24
34
|
method_source (1.0.0)
|
25
|
-
minitest (5.14.
|
35
|
+
minitest (5.14.1)
|
26
36
|
parallel (1.19.1)
|
27
37
|
parallel_tests (2.32.0)
|
28
38
|
parallel
|
39
|
+
parser (2.7.1.2)
|
40
|
+
ast (~> 2.4.0)
|
29
41
|
pry (0.13.1)
|
30
42
|
coderay (~> 1.1)
|
31
43
|
method_source (~> 1.0)
|
@@ -40,10 +52,13 @@ GEM
|
|
40
52
|
pry-remote (0.1.8)
|
41
53
|
pry (~> 0.9)
|
42
54
|
slop (~> 3.0)
|
43
|
-
|
55
|
+
qonfig (0.24.1)
|
56
|
+
rack (2.2.3)
|
44
57
|
rack-test (1.1.0)
|
45
58
|
rack (>= 1.0, < 3)
|
59
|
+
rainbow (3.0.0)
|
46
60
|
rake (13.0.1)
|
61
|
+
rexml (3.2.4)
|
47
62
|
rspec (3.9.0)
|
48
63
|
rspec-core (~> 3.9.0)
|
49
64
|
rspec-expectations (~> 3.9.0)
|
@@ -57,17 +72,38 @@ GEM
|
|
57
72
|
diff-lcs (>= 1.2.0, < 2.0)
|
58
73
|
rspec-support (~> 3.9.0)
|
59
74
|
rspec-support (3.9.2)
|
75
|
+
rubocop (0.82.0)
|
76
|
+
jaro_winkler (~> 1.5.1)
|
77
|
+
parallel (~> 1.10)
|
78
|
+
parser (>= 2.7.0.1)
|
79
|
+
rainbow (>= 2.2.2, < 4.0)
|
80
|
+
rexml
|
81
|
+
ruby-progressbar (~> 1.7)
|
82
|
+
unicode-display_width (>= 1.4.0, < 2.0)
|
83
|
+
rubocop-performance (1.5.2)
|
84
|
+
rubocop (>= 0.71.0)
|
85
|
+
rubocop-rails (2.5.2)
|
86
|
+
activesupport
|
87
|
+
rack (>= 1.1)
|
88
|
+
rubocop (>= 0.72.0)
|
89
|
+
rubocop-rake (0.5.1)
|
90
|
+
rubocop
|
91
|
+
rubocop-rspec (1.38.1)
|
92
|
+
rubocop (>= 0.68.1)
|
60
93
|
ruby-progressbar (1.10.1)
|
61
94
|
slop (3.6.0)
|
62
95
|
thread_safe (0.3.6)
|
63
96
|
tzinfo (1.2.7)
|
64
97
|
thread_safe (~> 0.1)
|
98
|
+
unicode-display_width (1.7.0)
|
99
|
+
zeitwerk (2.3.0)
|
65
100
|
|
66
101
|
PLATFORMS
|
67
102
|
ruby
|
68
103
|
|
69
104
|
DEPENDENCIES
|
70
|
-
activesupport (~>
|
105
|
+
activesupport (~> 6.0.3)
|
106
|
+
armitage-rubocop (= 0.82)
|
71
107
|
bundler (>= 1.15)
|
72
108
|
crawler_detect!
|
73
109
|
fuubar (~> 2.0)
|
@@ -78,4 +114,4 @@ DEPENDENCIES
|
|
78
114
|
rspec (~> 3.0)
|
79
115
|
|
80
116
|
BUNDLED WITH
|
81
|
-
2.1.
|
117
|
+
2.1.4
|
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# CrawlerDetect
|
2
2
|
|
3
|
-
|
3
|
+
 [](https://badge.fury.io/rb/crawler_detect)
|
4
4
|
|
5
5
|
## About
|
6
6
|
**CrawlerDetect** is a Ruby version of PHP class @[CrawlerDetect](https://github.com/JayBizzle/Crawler-Detect).
|
@@ -15,44 +15,59 @@ Comparing with other popular bot-detection gems:
|
|
15
15
|
| Number of checked HTTP-headers | 10 | 1 | 1 |
|
16
16
|
| Number of updates of bot-list *(1st half of 2018)* | 14 | 1 | 7 |
|
17
17
|
|
18
|
+
In order to remain up-to-date, this gem does not accept any crawler data updates – any PRs to edit the crawler data should be offered to the original [JayBizzle/CrawlerDetect](https://github.com/JayBizzle/Crawler-Detect) project.
|
19
|
+
|
18
20
|
## Installation
|
19
21
|
Add this line to your application's Gemfile:
|
20
22
|
|
21
23
|
`gem 'crawler_detect'`
|
22
24
|
## Basic Usage
|
23
|
-
```
|
25
|
+
```ruby
|
24
26
|
CrawlerDetect.is_crawler?("Bot user agent")
|
25
27
|
=> true
|
26
28
|
```
|
27
29
|
Or if you need crawler name:
|
28
|
-
```
|
30
|
+
```ruby
|
29
31
|
detector = CrawlerDetect.new("Googlebot/2.1 (http://www.google.com/bot.html)")
|
30
32
|
detector.is_crawler?
|
31
|
-
=> true
|
33
|
+
# => true
|
32
34
|
detector.crawler_name
|
33
|
-
=> "Googlebot"
|
35
|
+
# => "Googlebot"
|
34
36
|
```
|
35
37
|
## Rack::Request extension
|
36
38
|
**Optionally** you can add additional methods for `request`:
|
37
|
-
```
|
39
|
+
```ruby
|
38
40
|
request.is_crawler?
|
39
|
-
=> false
|
41
|
+
# => false
|
40
42
|
request.crawler_name
|
41
|
-
=> nil
|
43
|
+
# => nil
|
42
44
|
```
|
43
45
|
It's more flexible to use `request.is_crawler?` rather than `CrawlerDetect.is_crawler?` because it automatically checks 10 HTTP-headers, not only `HTTP_USER_AGENT`.
|
44
46
|
|
45
47
|
The only thing you have to do is configure the `Rack::CrawlerDetect` middleware:
|
46
48
|
### Rails
|
47
|
-
```
|
49
|
+
```ruby
|
48
50
|
class Application < Rails::Application
|
49
|
-
...
|
51
|
+
# ...
|
50
52
|
config.middleware.use Rack::CrawlerDetect
|
51
53
|
end
|
52
54
|
```
|
53
|
-
###
|
54
|
-
```
|
55
|
+
### Rack
|
56
|
+
```ruby
|
55
57
|
use Rack::CrawlerDetect
|
56
58
|
```
|
59
|
+
## Configuration
|
60
|
+
In some cases you may want to use your own whitelist, blacklist, or list of HTTP headers to detect the user agent.
|
61
|
+
|
62
|
+
This is possible via `CrawlerDetect::Config`. For example, you may have an initializer like this:
|
63
|
+
```ruby
|
64
|
+
CrawlerDetect.setup! do |config|
|
65
|
+
config.raw_headers_path = File.expand_path("crawlers/MyHeaders.json", __dir__)
|
66
|
+
config.raw_crawlers_path = File.expand_path("crawlers/MyCrawlers.json", __dir__)
|
67
|
+
config.raw_exclusions_path = File.expand_path("crawlers/MyExclusions.json", __dir__)
|
68
|
+
end
|
69
|
+
```
|
70
|
+
Make sure that your files are correct JSON files.
|
71
|
+
Look at [the raw files](https://github.com/loadkpi/crawler_detect/tree/master/lib/crawler_detect/library/raw) which are used by default for more information.
|
57
72
|
## License
|
58
73
|
MIT License
|