bot_detection 0.9.9 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2d3eb0c816dab6abd7bedf902b7064e55d87b6b3
4
+ data.tar.gz: 31db977a5485d47cf579822b3e6a46297730ff91
5
+ SHA512:
6
+ metadata.gz: 9dec5f7768f8a9d382c7a6faa045977e649b89e8e05a0f25f90224f97333e65a33edf023960162c91a11e21c0ebc78ec0f145cba22b06f20c8f71687e670f81a
7
+ data.tar.gz: ce183a9aa5f595fe46eaa1e9a3a7de8972b4a86d22cb66c4c29e3bf591e80f6f3b80790ad12808b8fb739d1a1d0d8732d1cc7d1815b6cd09477a9eeff0816f74
data/CHANGELOG.md ADDED
@@ -0,0 +1,14 @@
1
+ # Changelog
2
+
3
+ #### Release 1.0.1
4
+
5
+ - Yandex support added
6
+ - Baidu support added
7
+ - better check for hostnames on reverse lookup
8
+ - match against the full user agent string for MSN, Bing and Yahoo
9
+ - all Google bots should now be detected properly
10
+ - the methods is_msn?, is_bing? and is_yahoo? have been removed
11
+
12
+ #### Release 0.9.9
13
+
14
+ - Initial version
data/README.md CHANGED
@@ -16,6 +16,10 @@ Or install it yourself as:
16
16
 
17
17
  $ gem install bot_detection
18
18
 
19
+ ## Changes
20
+
21
+ See the [CHANGELOG.md](CHANGELOG.md) file for details.
22
+
19
23
  ## Usage
20
24
 
21
25
  TODO: Write usage instructions here
data/bot_detection.gemspec CHANGED
@@ -16,6 +16,8 @@ Gem::Specification.new do |spec|
16
16
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
17
17
  spec.require_paths = ["lib"]
18
18
 
19
+ spec.add_runtime_dependency "public_suffix", "~> 1.4.6"
20
+
19
21
  spec.add_development_dependency "bundler", "~> 1.6"
20
22
  spec.add_development_dependency "rake"
21
23
  end
data/lib/bot_detection/bot_user_agents.rb ADDED
@@ -0,0 +1,7 @@
1
+ BotDetection::BotUserAgents = [
2
+ "FacebookExternalHit/1.1",
3
+ "FacebookExternalHit/1.0",
4
+ "facebookexternalhit/1.0 (+http://www.facebook.com/externalhit_uatext.php)",
5
+ "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)",
6
+ "facebookplatform/1.0 (+http://developers.facebook.com)"
7
+ ]
data/lib/bot_detection/google_user_agents.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  BotDetection::GOOGLE_USER_AGENTS = [
2
2
  "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
3
+ "Googlebot/2.1 (+http://www.googlebot.com/bot.html)",
3
4
  "Googlebot/2.1 (+http://www.google.com/bot.html)",
4
5
  "Googlebot-News",
5
6
  "Googlebot-Image/1.0",
data/lib/bot_detection/instance_methods.rb CHANGED
@@ -1,40 +1,39 @@
1
1
  require 'bot_detection'
2
2
  require_relative 'google_user_agents.rb'
3
+ require_relative 'bot_user_agents.rb'
4
+ require_relative 'search_engine_user_agents.rb'
3
5
 
4
6
  module BotDetection::InstanceMethods
7
+ def is_known_crawler?
8
+ BotDetection::BotUserAgents.include?(user_agent) || is_search_engine_crawler?(reverse_lookup: false)
9
+ end
10
+
5
11
  def is_search_engine_crawler? options = {}
6
12
  remote_ip = options.delete(:ip) || options.delete(:ip_address) || request.remote_ip
7
- return false if remote_ip.blank? || (!is_google? && !is_yahoo? && !is_msn? && !is_bing?)
8
-
13
+ return false if remote_ip.blank?
14
+
15
+ reverse_lookup = options.delete(:reverse_lookup)
16
+ reverse_lookup = true if reverse_lookup.nil?
17
+
18
+ return false unless is_known_search_engine_crawler?
19
+ return true unless reverse_lookup
9
20
  return true if options.delete(:development)
10
21
 
11
- found = false
12
- host = get_hostname(remote_ip)
13
- ["crawl.yahoo.net", "googlebot", "search.msn.com", "ask.com"].each do |h|
14
- found = true and break if host.include?(h)
15
- end
16
-
17
- return false unless found
22
+ host = get_hostname(remote_ip)
23
+ domain = PublicSuffix.parse(host) rescue nil
24
+ return false if domain.nil?
18
25
 
19
- host_ip = get_hostip(host)
20
- return host_ip == remote_ip
21
- false
26
+ return false unless ["crawl.yahoo.net", "googlebot.com", "google.com", "search.msn.com", "ask.com", "yandex.net", "yandex.com", "yandex.ru", "baidu.com", "baidu.jp"].include?(domain.domain.downcase)
27
+ get_hostip(host) == remote_ip
22
28
  end
23
29
 
24
30
  def is_google?
25
- BotDetection::GOOGLE_USER_AGENTS.include?(request.env['HTTP_USER_AGENT']) || request.user_agent.to_s.downcase.include?("googlebot")
26
- end
27
-
28
- def is_yahoo?
29
- request.user_agent.to_s.downcase.include?("yahoo! slurp")
30
- end
31
-
32
- def is_msn?
33
- request.user_agent.to_s.downcase.include?("msnbot")
31
+ BotDetection::GOOGLE_USER_AGENTS.include?(user_agent)
34
32
  end
35
-
36
- def is_bing?
37
- request.user_agent.to_s.downcase.include?("bing")
33
+
34
+ protected
35
+ def is_known_search_engine_crawler?
36
+ is_google? || BotDetection::SearchEngineUserAgents.include?(user_agent)
38
37
  end
39
38
 
40
39
  def get_hostname(ip_address)
@@ -44,4 +43,8 @@ module BotDetection::InstanceMethods
44
43
  def get_hostip(host)
45
44
  Socket.gethostbyname(host).last.unpack("C*").join(".")
46
45
  end
46
+
47
+ def user_agent
48
+ (request.env['HTTP_USER_AGENT'] || request.user_agent).to_s
49
+ end
47
50
  end
data/lib/bot_detection/search_engine_user_agents.rb ADDED
@@ -0,0 +1,36 @@
1
+ BotDetection::SearchEngineUserAgents = [
2
+ "Baiduspider+(+http://www.baidu.com/search/spider.htm)",
3
+ "Baiduspider+(+http://www.baidu.com/search/spider_jp.html)",
4
+ "BaiDuSpider",
5
+ "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)",
6
+ "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)",
7
+ "Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots)",
8
+ "Mozilla/5.0 (compatible; YandexVideo/3.0; +http://yandex.com/bots)",
9
+ "Mozilla/5.0 (compatible; YandexMedia/3.0; +http://yandex.com/bots)",
10
+ "Mozilla/5.0 (compatible; YandexBlogs/0.99; robot; +http://yandex.com/bots)",
11
+ "Mozilla/5.0 (compatible; YandexFavicons/1.0; +http://yandex.com/bots)",
12
+ "Mozilla/5.0 (compatible; YandexWebmaster/2.0; +http://yandex.com/bots)",
13
+ "Mozilla/5.0 (compatible; YandexPagechecker/1.0; +http://yandex.com/bots)",
14
+ "Mozilla/5.0 (compatible; YandexImageResizer/2.0; +http://yandex.com/bots)",
15
+ "Mozilla/5.0 (compatible; YandexDirect/3.0; +http://yandex.com/bots)",
16
+ "Mozilla/5.0 (compatible; YandexDirect/2.0; Dyatel; +http://yandex.com/bots)",
17
+ "Mozilla/5.0 (compatible; YandexMetrika/2.0; +http://yandex.com/bots)",
18
+ "Mozilla/5.0 (compatible; YandexNews/3.0; +http://yandex.com/bots)",
19
+ "Mozilla/5.0 (compatible; YandexCatalog/3.0; +http://yandex.com/bots)",
20
+ "Mozilla/5.0 (compatible; YandexAntivirus/2.0; +http://yandex.com/bots)",
21
+ "Mozilla/5.0 (compatible; YandexZakladki/3.0; +http://yandex.com/bots)",
22
+ "Mozilla/5.0 (compatible; YandexMarket/1.0; +http://yandex.com/bots)",
23
+ "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
24
+ "Mozilla/5.0 (compatible; Yahoo Slurp; http://help.yahoo.com/help/us/ysearch/slurp)",
25
+ "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)",
26
+ "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
27
+ "Mozilla/5.0 (compatible; bingbot/2.0 +http://www.bing.com/bingbot.htm)",
28
+ "msnbot/2.1",
29
+ "msnbot/2.0b",
30
+ "msnbot/1.1 (+http://search.msn.com/msnbot.htm)",
31
+ "msnbot/1.1",
32
+ "msnbot/1.0 (+http://search.msn.com/msnbot.htm)",
33
+ "msnbot/0.9 (+http://search.msn.com/msnbot.htm)",
34
+ "msnbot/0.11 ( http://search.msn.com/msnbot.htm)",
35
+ "MSNBOT/0.1 (http://search.msn.com/msnbot.htm)",
36
+ ]
data/lib/bot_detection/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module BotDetection
2
- VERSION = "0.9.9"
2
+ VERSION = "1.0.1"
3
3
  end
data/lib/bot_detection.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require "bot_detection/version"
2
+ require "public_suffix"
2
3
 
3
4
  module BotDetection
4
5
  def self.included(target)
metadata CHANGED
@@ -1,20 +1,32 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bot_detection
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.9
5
- prerelease:
4
+ version: 1.0.1
6
5
  platform: ruby
7
6
  authors:
8
7
  - Nils Berenbold
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2014-10-07 00:00:00.000000000 Z
11
+ date: 2014-11-13 00:00:00.000000000 Z
13
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: public_suffix
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: 1.4.6
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: 1.4.6
14
27
  - !ruby/object:Gem::Dependency
15
28
  name: bundler
16
29
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
30
  requirements:
19
31
  - - ~>
20
32
  - !ruby/object:Gem::Version
@@ -22,7 +34,6 @@ dependencies:
22
34
  type: :development
23
35
  prerelease: false
24
36
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
37
  requirements:
27
38
  - - ~>
28
39
  - !ruby/object:Gem::Version
@@ -30,17 +41,15 @@ dependencies:
30
41
  - !ruby/object:Gem::Dependency
31
42
  name: rake
32
43
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
44
  requirements:
35
- - - ! '>='
45
+ - - '>='
36
46
  - !ruby/object:Gem::Version
37
47
  version: '0'
38
48
  type: :development
39
49
  prerelease: false
40
50
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
51
  requirements:
43
- - - ! '>='
52
+ - - '>='
44
53
  - !ruby/object:Gem::Version
45
54
  version: '0'
46
55
  description:
@@ -51,38 +60,40 @@ extensions: []
51
60
  extra_rdoc_files: []
52
61
  files:
53
62
  - .gitignore
63
+ - CHANGELOG.md
54
64
  - Gemfile
55
65
  - LICENSE.txt
56
66
  - README.md
57
67
  - Rakefile
58
68
  - bot_detection.gemspec
59
69
  - lib/bot_detection.rb
70
+ - lib/bot_detection/bot_user_agents.rb
60
71
  - lib/bot_detection/google_user_agents.rb
61
72
  - lib/bot_detection/instance_methods.rb
73
+ - lib/bot_detection/search_engine_user_agents.rb
62
74
  - lib/bot_detection/version.rb
63
75
  homepage: http://www.nilsberenbold.de
64
76
  licenses:
65
77
  - MIT
78
+ metadata: {}
66
79
  post_install_message:
67
80
  rdoc_options: []
68
81
  require_paths:
69
82
  - lib
70
83
  required_ruby_version: !ruby/object:Gem::Requirement
71
- none: false
72
84
  requirements:
73
- - - ! '>='
85
+ - - '>='
74
86
  - !ruby/object:Gem::Version
75
87
  version: '0'
76
88
  required_rubygems_version: !ruby/object:Gem::Requirement
77
- none: false
78
89
  requirements:
79
- - - ! '>='
90
+ - - '>='
80
91
  - !ruby/object:Gem::Version
81
92
  version: '0'
82
93
  requirements: []
83
94
  rubyforge_project:
84
- rubygems_version: 1.8.24
95
+ rubygems_version: 2.0.14
85
96
  signing_key:
86
- specification_version: 3
97
+ specification_version: 4
87
98
  summary: Detects Search Engine crawlers by reverse DNS lookups.
88
99
  test_files: []