crawler_detect 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +11 -0
- data/.rspec +3 -0
- data/.rubocop.yml +174 -0
- data/.travis.yml +12 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +56 -0
- data/Rakefile +8 -0
- data/crawler_detect.gemspec +35 -0
- data/lib/crawler_detect.rb +28 -0
- data/lib/crawler_detect/detector.rb +40 -0
- data/lib/crawler_detect/library.rb +17 -0
- data/lib/crawler_detect/library/crawlers.rb +1170 -0
- data/lib/crawler_detect/library/exclusions.rb +57 -0
- data/lib/crawler_detect/library/headers.rb +25 -0
- data/lib/crawler_detect/version.rb +5 -0
- data/lib/rack/crawler_detect.rb +47 -0
- metadata +159 -0
@@ -0,0 +1,57 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CrawlerDetect
|
4
|
+
module Library
|
5
|
+
# Namespace for the EXCLUSIONS pattern list.
module Exclusions
|
6
|
+
# Frozen Array of Strings, one entry per line of the %q literal below.
# The literal is stripped as a whole and then split on runs of newlines, so
# blank lines are dropped but any whitespace at the start or end of an
# individual line stays part of that entry (which is presumably why a
# leading space is spelled `[ ]`, as in `[ ]Intel`).
# Only the Array is frozen by `.freeze`; the entries come from `split`.
# The entries read as regular-expression fragments for common browser / OS
# tokens found in user-agent strings; how they are applied is not visible
# here -- presumably by CrawlerDetect::Detector, confirm in detector.rb.
# NOTE(review): `MSIE.[\d\.]` has no trailing `*`, unlike the neighbouring
# version patterns -- confirm whether that is intentional.
EXCLUSIONS = %q[
|
7
|
+
Safari.[\d\.]*
|
8
|
+
Firefox.[\d\.]*
|
9
|
+
Chrome.[\d\.]*
|
10
|
+
Chromium.[\d\.]*
|
11
|
+
MSIE.[\d\.]
|
12
|
+
Opera\/[\d\.]*
|
13
|
+
Mozilla.[\d\.]*
|
14
|
+
AppleWebKit.[\d\.]*
|
15
|
+
Trident.[\d\.]*
|
16
|
+
Windows NT.[\d\.]*
|
17
|
+
Android [\d\.]*
|
18
|
+
Macintosh.
|
19
|
+
Ubuntu
|
20
|
+
Linux
|
21
|
+
[ ]Intel
|
22
|
+
Mac OS X [\d_]*
|
23
|
+
(like )?Gecko(.[\d\.]*)?
|
24
|
+
KHTML,
|
25
|
+
CriOS.[\d\.]*
|
26
|
+
CPU iPhone OS ([0-9_])* like Mac OS X
|
27
|
+
CPU OS ([0-9_])* like Mac OS X
|
28
|
+
iPod
|
29
|
+
compatible
|
30
|
+
x86_..
|
31
|
+
i686
|
32
|
+
x64
|
33
|
+
X11
|
34
|
+
rv:[\d\.]*
|
35
|
+
Version.[\d\.]*
|
36
|
+
WOW64
|
37
|
+
Win64
|
38
|
+
Dalvik.[\d\.]*
|
39
|
+
\.NET CLR [\d\.]*
|
40
|
+
Presto.[\d\.]*
|
41
|
+
Media Center PC
|
42
|
+
BlackBerry
|
43
|
+
Build
|
44
|
+
Opera Mini\/\d{1,2}\.\d{1,2}\.[\d\.]*\/\d{1,2}\.
|
45
|
+
Opera
|
46
|
+
\.NET[\d\.]*
|
47
|
+
cubot
|
48
|
+
; M bot
|
49
|
+
; B bot
|
50
|
+
; IDbot
|
51
|
+
; ID bot
|
52
|
+
; POWER BOT
|
53
|
+
;
|
54
|
+
].strip.split(/\n+/).freeze
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module CrawlerDetect
|
4
|
+
module Library
|
5
|
+
# Namespace for the HEADERS key list.
module Headers
|
6
|
+
# Frozen Array of header names written in CGI / Rack env form (HTTP_*).
# Presumably this is what Library.get_array("headers") returns to the Rack
# middleware, which looks each name up in the request env -- confirm in
# library.rb.
HEADERS = [
|
7
|
+
# The default User-Agent header.
|
8
|
+
"HTTP_USER_AGENT",
|
9
|
+
# Header that can occur on devices using Opera Mini.
|
10
|
+
"HTTP_X_OPERAMINI_PHONE_UA",
|
11
|
+
# Vodafone-specific header: http://www.seoprinciple.com/mobile-web-community-still-angry-at-vodafone/24/
|
12
|
+
"HTTP_X_DEVICE_USER_AGENT",
|
13
|
+
"HTTP_X_ORIGINAL_USER_AGENT",
|
14
|
+
"HTTP_X_SKYFIRE_PHONE",
|
15
|
+
"HTTP_X_BOLT_PHONE_UA",
|
16
|
+
"HTTP_DEVICE_STOCK_UA",
|
17
|
+
"HTTP_X_UCBROWSER_DEVICE_UA",
|
18
|
+
# Sometimes bots (especially Google) send a genuine user agent but fill this header in with their email address.
|
19
|
+
"HTTP_FROM",
|
20
|
+
# Seen in use by Netsparker.
|
21
|
+
"HTTP_X_SCANNER",
|
22
|
+
].freeze
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Rack
  # Rack middleware that runs crawler detection on each request.
  #
  # For every call it joins the values of the configured user-agent headers
  # (from ::CrawlerDetect::Library.get_array("headers")) with a single space,
  # feeds the result to ::CrawlerDetect::Detector and stores the outcome in
  # env["rack.crawler_detect"] as { is_crawler: ..., crawler_name: ... }
  # before handing the request to the wrapped app.
  #
  # Creating an instance also adds #is_crawler? and #crawler_name to
  # Rack::Request::Helpers so the result can be read from a request object.
  class CrawlerDetect
    # @param app [#call] the next Rack application in the stack
    # @param options [Hash] accepted for interface compatibility; not used
    def initialize(app, options = {})
      Rack::Request::Helpers.module_eval do
        # Detection flag stored by the middleware, or nil when this env was
        # never processed by it.
        def is_crawler?
          env.dig("rack.crawler_detect", :is_crawler)
        end

        # Crawler name stored by the middleware, or nil when none was stored.
        def crawler_name
          env.dig("rack.crawler_detect", :crawler_name)
        end
      end
      @app = app
    end

    # Annotates env with the detection result and calls the wrapped app.
    #
    # One middleware instance typically serves many requests, so nothing
    # request-specific is kept in instance variables: env is passed along as
    # an argument and the user agent is recomputed on every call.
    #
    # @param env [Hash] the Rack environment
    # @return [Object] whatever the wrapped app returns
    def call(env)
      detector = ::CrawlerDetect::Detector.new(user_agent(env))
      env["rack.crawler_detect"] = {
        is_crawler: detector.is_crawler?,
        crawler_name: detector.crawler_name,
      }
      @app.call(env)
    end

    private

    # Values of all configured headers present in env, joined by a space.
    # Returns "" (never nil) when none of the headers is set.
    def user_agent(env)
      user_agent_headers.map { |header| env[header] }.compact.join(" ")
    end

    # Names of the env keys to inspect.
    def user_agent_headers
      ::CrawlerDetect::Library.get_array("headers")
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,159 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: crawler_detect
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Pavel Kozlov
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2018-08-05 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.15'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.15'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '3.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '3.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: fuubar
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '2.0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '2.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: parallel_tests
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '2.0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '2.0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: pry-meta
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.0.10
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.0.10
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rack-test
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '1.1'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '1.1'
|
111
|
+
description: CrawlerDetect is a library to detect bots/crawlers via the user agent
|
112
|
+
email:
|
113
|
+
- loadkpi@gmail.com
|
114
|
+
executables: []
|
115
|
+
extensions: []
|
116
|
+
extra_rdoc_files: []
|
117
|
+
files:
|
118
|
+
- ".gitignore"
|
119
|
+
- ".rspec"
|
120
|
+
- ".rubocop.yml"
|
121
|
+
- ".travis.yml"
|
122
|
+
- Gemfile
|
123
|
+
- LICENSE.txt
|
124
|
+
- README.md
|
125
|
+
- Rakefile
|
126
|
+
- crawler_detect.gemspec
|
127
|
+
- lib/crawler_detect.rb
|
128
|
+
- lib/crawler_detect/detector.rb
|
129
|
+
- lib/crawler_detect/library.rb
|
130
|
+
- lib/crawler_detect/library/crawlers.rb
|
131
|
+
- lib/crawler_detect/library/exclusions.rb
|
132
|
+
- lib/crawler_detect/library/headers.rb
|
133
|
+
- lib/crawler_detect/version.rb
|
134
|
+
- lib/rack/crawler_detect.rb
|
135
|
+
homepage: https://github.com/loadkpi/crawler_detect
|
136
|
+
licenses:
|
137
|
+
- MIT
|
138
|
+
metadata: {}
|
139
|
+
post_install_message:
|
140
|
+
rdoc_options: []
|
141
|
+
require_paths:
|
142
|
+
- lib
|
143
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
144
|
+
requirements:
|
145
|
+
- - ">="
|
146
|
+
- !ruby/object:Gem::Version
|
147
|
+
version: '0'
|
148
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - ">="
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
153
|
+
requirements: []
|
154
|
+
rubyforge_project:
|
155
|
+
rubygems_version: 2.7.7
|
156
|
+
signing_key:
|
157
|
+
specification_version: 4
|
158
|
+
summary: 'CrawlerDetect: detect bots/crawlers'
|
159
|
+
test_files: []
|