iGEL-ua_parser 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +50 -0
- data/Rakefile +12 -0
- data/apache_log_tester.rb +138 -0
- data/lib/ua_parser/user_agent.rb +492 -0
- data/lib/ua_parser/version.rb +85 -0
- data/lib/ua_parser.rb +2 -0
- data/test/crawler_test.rb +620 -0
- data/test/gecko_test.rb +493 -0
- data/test/other_test.rb +146 -0
- data/test/presto_test.rb +169 -0
- data/test/trident_test.rb +146 -0
- data/test/version_test.rb +130 -0
- data/test/webkit_test.rb +242 -0
- data/ua_parser.gemspec +32 -0
- metadata +81 -0
data/README.rdoc
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
== About ua_parser
|
2
|
+
|
3
|
+
ua_parser will become a ruby gem to identify user agents like browsers or
|
4
|
+
crawlers by the provided user agent string. I'm planning to get most of
|
5
|
+
the available information like GUI language of the browser or email addresses
|
6
|
+
provided by a bot out of it.
|
7
|
+
|
8
|
+
I tried to identify common user agents first, reducing the necessary regexps for
|
9
|
+
them. But I guess it could be improved a lot. Of course, I'd like to get feedback.
|
10
|
+
Even if you just revise my crappy English, send me an e-mail. ;-)
|
11
|
+
|
12
|
+
=== Project status (as of 2009-01-25):
|
13
|
+
|
14
|
+
Right now, the project is at a very early stage. Of my 14 million hits sample,
|
15
|
+
ua_parser can identify about 96.5 % of all hits.
|
16
|
+
|
17
|
+
I tried to cover as much as possible with tests. At the moment, I have 99 tests
|
18
|
+
implemented.
|
19
|
+
|
20
|
+
Known browsers:
|
21
|
+
* Chrome
|
22
|
+
* Firefox and most other gecko based browsers
|
23
|
+
* Internet Explorer
|
24
|
+
* Opera, pure and pretending to be an Internet Explorer or Firefox
|
25
|
+
* Safari >= Version 3
|
26
|
+
|
27
|
+
Known bots:
|
28
|
+
* Baiduspider
|
29
|
+
* gigabot
|
30
|
+
* gonzo (of suchen.de)
|
31
|
+
* Googlebot, Googlebot-Images, Mediapartners-Google
|
32
|
+
* mj12bot
|
33
|
+
* msnbot and msnbot-media
|
34
|
+
* seekbot
|
35
|
+
* speedy spider
|
36
|
+
* twiceler (of cuil.com)
|
37
|
+
* Yahoo! Slurp
|
38
|
+
* yeti (of naver.com)
|
39
|
+
|
40
|
+
Other known agents
|
41
|
+
* Apache httpd
|
42
|
+
* Jakarta Commons httpclient
|
43
|
+
* Java
|
44
|
+
* libwww-perl
|
45
|
+
* SVN
|
46
|
+
* TortoiseSVN
|
47
|
+
* veoh service
|
48
|
+
|
49
|
+
Also, ua_parser tries to identify bots and feed readers, even if it doesn't know
|
50
|
+
about them. That way, the results should be close to 100%.
|
data/Rakefile
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
require 'echoe'
|
4
|
+
|
5
|
+
# Gem definition handed to Echoe: name, version and the metadata fields below.
Echoe.new('ua_parser', '0.0.1') do |gem|
  gem.description = "Parses the user agent of browsers and bots."
  gem.url = "http://github.com/iGEL/ua_parser"
  gem.author = "Johannes Barre"
  gem.email = "igel@igels.net"
  # Paths listed here are passed to Echoe as its ignore pattern.
  gem.ignore_pattern = [
    "pkg/*",
    "logs/*",
    "output*",
    "nbproject/**",
    "nbproject/private/*"
  ]
  gem.development_dependencies = []
end
|
@@ -0,0 +1,138 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "lib/ua_parser"
|
4
|
+
|
5
|
+
# Prints a progress summary to $stderr.
#
# == Parameters
# * stats: Hash with the keys :parsed (number of files handled so far),
#   :files (Array of all file names), :unknown, :known and :errors
#   (collections whose size is reported), :known_count and :total_count
#   (hit counters).
def stats(stats)
  total = stats[:total_count]
  # Without any counted hit the ratio would be 0/0.0 and print "NaN%".
  percentage = total == 0 ? 0.0 : stats[:known_count] / total.to_f * 100

  $stderr.puts
  $stderr.puts "Parsed #{stats[:parsed]} of #{stats[:files].size} files"
  $stderr.puts "#{stats[:unknown].size} unknown user agents, #{stats[:known].size} known and #{stats[:errors].size} errors"
  $stderr.puts "#{stats[:known_count]} of #{total} hits (#{sprintf("%.3f", percentage)}%) identified"
  $stderr.puts
end
|
12
|
+
|
13
|
+
# tmp_log is used as scratch file for decompressed logs; refuse to start if
# one is already present so it is never clobbered.
raise "File tmp_log exists" if File.exist?("tmp_log")

# On SIGUSR1, remove the scratch file (if any) and terminate.
Kernel.trap("SIGUSR1") do
  # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
  File.delete("tmp_log") if File.exist?("tmp_log")
  exit
end
|
18
|
+
|
19
|
+
require "shellwords"

gunzip = `which gunzip`.strip
bunzip = `which bunzip2`.strip

# Month abbreviations used in the log timestamps, mapped to month numbers.
month_numbers = {"Jan" => 1, "Feb" => 2, "Mar" => 3, "Apr" => 4, "May" => 5, "Jun" => 6, "Jul" => 7, "Aug" => 8, "Sep" => 9, "Oct" => 10, "Nov" => 11, "Dec" => 12}

# Captures day, month, year, hour, minute, second and the last quoted field
# of a log line (which is fed to the parser as user agent string).
log_line = /\[([0-9]+)\/([a-z]+)\/([0-9]+):([0-9]+):([0-9]+):([0-9]+) .*\] .* "([^"]+)"\Z/i

unknown = {}
known = {}
errors = {}

files = Dir["logs/*access*"]
i = 0
known_count = 0
unknown_count = 0
total_count = 0

files.each do |filename|
  # Print an intermediate summary every 25 files.
  if i % 25 == 0 && i > 0
    stats(:parsed => i, :files => files, :unknown => unknown, :known => known, :errors => errors, :known_count => known_count, :total_count => total_count)
  end
  i += 1
  org_name = filename

  tool, tool_name =
    case filename
    when /\.gz$/ then [gunzip, "gunzip"]
    when /\.bz2$/ then [bunzip, "bunzip2"]
    end

  if tool
    if tool == ""
      $stderr.puts "SKIPPING #{filename}, #{tool_name} not found"
      next
    end
    $stderr.puts "Decompressing #{filename}"
    raise "File tmp_log exists" if File.exist?("tmp_log")
    # Escape both parts so file names with spaces or shell metacharacters
    # cannot break (or alter) the command line.
    unless system("#{Shellwords.escape(tool)} --stdout #{Shellwords.escape(filename)} > tmp_log")
      $stderr.puts "SKIPPING #{filename}, decompression failed"
      File.delete("tmp_log") if File.exist?("tmp_log")
      next
    end
    filename = "tmp_log"
  end

  $stderr.puts "Parsing #{org_name}"
  begin
    File.open(filename) do |file|
      file.each_line do |line|
        match = log_line.match(line)
        next unless match
        ua_string = match[7].strip
        next if ua_string == "" || ua_string == "-"

        begin
          # The regexp is case insensitive, so normalize the month before the lookup.
          month = month_numbers[match[2].capitalize]
          access = Time.mktime(match[3].to_i, month, match[1].to_i, match[4].to_i, match[5].to_i, match[6].to_i)
          ua = UaParser::UserAgent.new(ua_string)
          total_count += 1
          if ua.known?
            known_count += 1
            name = "#{ua.name}"
            name = "#{ua.name}/#{ua.version.major}" unless ua.version.nil?
            unless known.has_key?(name)
              known[name] = {:count => 0, :count_30d => 0, :regexps => 0, :last_access => Time.at(0), :first_seen => Time.now}
            end
            obj = known[name]
          else
            unknown_count += 1
            unless unknown.has_key?(ua_string)
              unknown[ua_string] = {:count => 0, :count_30d => 0, :guess => ua.type, :regexps => 0, :last_access => Time.at(0), :first_seen => Time.now}
            end
            obj = unknown[ua_string]
          end
          obj[:count] += 1
          obj[:regexps] += ua.regexps
          obj[:last_access] = access if access > obj[:last_access]
          obj[:first_seen] = access if access < obj[:first_seen]
          obj[:count_30d] += 1 if access > Time.now - 30 * 24 * 60 * 60
        rescue StandardError => e
          # Collect any failure per user agent, so a single bad agent ends
          # up in the error report instead of aborting the whole run.
          if errors[ua_string]
            errors[ua_string][:count] += 1
          else
            errors[ua_string] = {:exception => e, :count => 1}
          end
        end
      end
    end
  ensure
    # Always remove the scratch file, even if reading the log failed.
    File.delete("tmp_log") if filename == "tmp_log" && File.exist?("tmp_log")
  end
  break if errors.size > 100 || unknown.size > 5000 || known.size > 5000
  GC.start
end
stats(:parsed => i, :files => files, :unknown => unknown, :known => known, :errors => errors, :known_count => known_count, :total_count => total_count)
|
101
|
+
|
102
|
+
# Timestamp layout shared by all "first seen" / "last access" columns.
timestamp = "%Y-%m-%d %H:%M:%S"

puts
puts "UNKNOWN USER AGENTS:"
puts "===================="
# Most hits within the last 30 days first; ties are ordered by total hits.
unknown_order = unknown.keys.sort do |one, other|
  recent = unknown[other][:count_30d] <=> unknown[one][:count_30d]
  recent == 0 ? unknown[other][:count] <=> unknown[one][:count] : recent
end
unknown_order.each do |agent|
  entry = unknown[agent]
  puts "#{agent}: #{entry[:count_30d]} hits in the last 30 days (Hits total: #{entry[:count]}, my guess: It's a #{entry[:guess]}, first seen: #{entry[:first_seen].strftime(timestamp)}, last access: #{entry[:last_access].strftime(timestamp)})"
end

puts
puts "KNOWN USER AGENTS:"
puts "=================="
# Same ordering as above.
known_order = known.keys.sort do |one, other|
  recent = known[other][:count_30d] <=> known[one][:count_30d]
  recent == 0 ? known[other][:count] <=> known[one][:count] : recent
end
known_order.each do |agent|
  entry = known[agent]
  puts "#{agent}: #{entry[:count_30d]} hits in the last 30 days (Hits total: #{entry[:count]}, regexps: #{sprintf("%.3f", entry[:regexps].to_f/entry[:count])}/hit, first seen: #{entry[:first_seen].strftime(timestamp)}, last access: #{entry[:last_access].strftime(timestamp)})"
end

puts
puts "ERROR CAUSING AGENTS:"
puts "====================="
# One entry per agent: the exception that was recorded, followed by its backtrace.
errors.each do |agent, failure|
  exception = failure[:exception]
  puts "#{agent}: #{exception.class}: #{exception.message}"
  exception.backtrace.each { |frame| puts " #{frame}" }
end
|
@@ -0,0 +1,492 @@
|
|
1
|
+
module UaParser
  # Identifies a user agent (browser, bot, feed reader or other HTTP client)
  # by matching its user agent string against a list of regexps, ordered so
  # that the agents checked first need the fewest regexps.
  class UserAgent
    # Some bots provide one or more contact email addresses. Returns all
    # identified addresses as an Array. It'll be empty, if no addresses could
    # be identified.
    attr_reader :emails

    # Returns the name of the user agent as a Symbol.
    attr_reader :name

    # Returns the number of regexps, which were executed to identify this agent.
    # Regexps for identifications of versions are not counted.
    attr_reader :regexps

    # Returns the name of the render engine as a Symbol if known, otherwise nil.
    attr_reader :render_engine

    # Returns the type of the user agent as a Symbol. Returns :browser, if the
    # user agent couldn't be recognized, since this seems the safest choice. Use
    # #known? if you want to know, if the user agent could be recognized.
    attr_reader :type

    # Some bots provide one or more urls. Returns all identified urls as an
    # Array. It'll be empty, if no urls could be identified.
    attr_reader :urls

    # Creates a new UserAgent object.
    #
    # == Parameters
    # * ua_string: The user agent as you got it
    def initialize(ua_string)
      @ua_string = ua_string.strip.downcase
      @known = false
      @type = :browser
      @name = :unknown_browser
      @urls = []
      @emails = []
      @dot_net_versions = []
      @regexps = 0
      @details = []
      @details_parsed = false
      @render_engine = nil
      @render_engine_version = nil
      @version = nil

      if @ua_string == "" || @ua_string == "-"
        @name = :no_agent_given
        return
      end

      # The order of these calls is significant: several agents pretend to be
      # another one and have to be checked before the agent they imitate.
      identify_msie_like
      identify_gecko_like
      identify_common_bots
      identify_webkit_browsers
      identify_other_bots
      identify_opera
      identify_other_agents

      # Still not known? Try to guess
      guess_unknown_agent unless @known
    end

    # Returns the architecture as a lower case symbol like :i686 or :ppc
    def architecture
      raise NotImplementedError
    end

    # Returns true, if the user agent is suspected or known to be a bot.
    def bot?
      @type == :bot
    end

    # Returns true, if the user agent is known or suspected to be a browser.
    # Completely unknown user agents are also suspected browsers. Check known?,
    # if you want to make sure the browser is known.
    def browser?
      @type == :browser
    end

    # Returns an Array of all available .NET-Versions.
    def dotnet_versions
      raise NotImplementedError
    end

    # Some bots provide one or more contact email addresses. This method will
    # return the first identified address as a String or nil, if no address could
    # be identified.
    def email
      @emails.first
    end

    # Returns true, if the given user agent is suspected or known to be a feed
    # reader.
    def feed_reader?
      @type == :feed_reader
    end

    # Returns true, if the user agent is a tool to grab the contents of a
    # webpage like wget
    def grabber?
      @type == :grabber
    end
    # The method was originally published under this misspelled name; keep it
    # working for existing callers.
    alias_method :grapper?, :grabber?

    # Returns true, if the given user agent is known, otherwise false
    def known?
      @known
    end

    # Returns the operating system as a string. Returns nil if the operating
    # system is unknown.
    # For Linux systems, the name of the distribution will be returned, or
    # "Unknown Linux Distribution"
    def os
      raise NotImplementedError
    end

    # Returns the type of the operating system as a String, like "Linux" or
    # "Windows". Returns nil, if the os is unknown.
    def os_type
      raise NotImplementedError
    end

    # Returns the operating system version as a string. Returns nil if the
    # operating system version is unknown
    def os_version
      raise NotImplementedError
    end

    # Returns true, if the user agent is neither a browser, bot nor feed reader.
    # Examples: SVN Client, Apache-Browser, libraries like libwww-perl
    def other?
      @type == :other
    end

    # Returns a UaParser::Version object with the version of the render engine
    # if known, otherwise nil
    def render_engine_version
      # For trident the version is only available in the details.
      parse_details if @render_engine == :trident
      return nil if @render_engine_version.nil?
      @render_engine_version_object ||= Version.new(@render_engine_version, @render_engine)
    end

    # Returns the user interface language as a symbol like :en or :de. Returns nil
    # if the user interface language is not known.
    def ui_lang
      raise NotImplementedError
    end

    # Returns the country code from the user interface language as a lower case
    # symbol. For example, en_US will return :us. Returns nil if the country is
    # unknown.
    def ui_lang_country
      raise NotImplementedError
    end

    # Some bots provide one or more urls. This method will return the first
    # identified url as a String or nil, if no url could be identified.
    def url
      @urls.first
    end

    # Returns the vendor of the user agent, if known. Otherwise nil will be
    # returned.
    def vendor
      raise NotImplementedError
    end

    # Returns a UaParser::Version object with the version number, or nil if
    # no version was identified.
    def version
      return nil if @version.nil?
      @version_object ||= Version.new(@version, @name)
    end

    private

    # Matches the user agent string against regexp, unless the agent is
    # already known. Counts the executed regexp and marks the agent as known
    # on success. Returns the MatchData, or nil if nothing was matched.
    def try_match(regexp)
      return nil if @known
      match = regexp.match(@ua_string)
      @regexps += 1
      @known = true if match
      match
    end

    # Identifies Internet Explorers and Operas pretending to be one.
    def identify_msie_like
      # Identify Operas pretending to be an Internet Explorer
      if (match = try_match(/^mozilla\/4\.0 \(compatible; msie [4-6].0;(.*)\) opera ([0-9]+\.[0-9]+)(.*)$/))
        @name = :opera
        @render_engine = :presto
        @version = match[2]
        @details = match[1].split(/;\s?/) + match[3].split(/\s/)
      end

      # Identify Internet Explorers
      if (match = try_match(/^mozilla\/4.0 \(compatible; msie ([0-9]\.[0-9]); ([^)]+)\)(.*)$/))
        @name = :internet_explorer
        @render_engine = :trident
        @version = match[1]
        @details = match[2].split(/;\s?/) + match[3].split(/\s/)
      end
    end

    # Identifies gecko based browsers and Operas pretending to be a Firefox.
    def identify_gecko_like
      # Identify Opera pretending to be a Firefox
      if (match = try_match(/\Amozilla\/5.0 \((.+) rv:[0-9.]+\) gecko\/[0-9]+ firefox\/[0-9.]+ opera ([0-9]+\.[0-9]+)(.*)\Z/))
        @name = :opera
        @render_engine = :presto
        @version = match[2]
        @details = match[1].split(/;\s?/) + match[3].split(/\s/)
      end

      # Flock and some versions of the Netscape Navigator try to identify
      # themselves also as a Firefox, so we have to identify them first.
      if (match = try_match(/^mozilla\/5\.0 \(([^)]+); rv:([^; )]+)\) gecko\/20[0-2][0-9][01][0-9][0-3][0-9][0-9]* firefox\/[^ ]+ (flock|navigator)\/([^ ]+)(.*)$/))
        @name = match[3].to_sym
        @render_engine = :gecko
        @details = match[1].split(/;\s?/) + match[5].split(/\s/)
        @render_engine_version = match[2]
        @version = match[4]
      end

      # Identify all other gecko based browsers except the original mozilla
      if (match = try_match(/\Amozilla\/5\.0 \((.+); rv:([^; )]+)\) gecko\/20[0-2][0-9][01][0-9][0-3][0-9][0-9]*( [^ ]+)?( \([^)]+\))? (bonecho|camino|epiphany|firefox|granparadiso|iceweasel|k-meleon|minefield|netscape6?|phoenix|seamonkey|songbird|thunderbird)\/([^ ]+)(.*)\Z/))
        @name = match[5].to_sym
        @name = :netscape if @name == :netscape6
        @render_engine = :gecko
        @details = match[1].split(/;\s?/) + "#{match[3]} #{match[7]}".split(/\s/)
        @render_engine_version = match[2]
        @version = match[6]
      end
    end

    # Identifies the bots seen most often.
    def identify_common_bots
      # Identify the Googlebot, mj12bot and yahoo slurp
      if (match = try_match(/^mozilla\/5.0 \(compatible; (googlebot|mj12bot|yahoo! slurp)(\/v?([^)]+))?; \+?(http:\/\/[^)]+)\)$/))
        @name = match[1].to_sym
        @type = :bot
        @urls << match[4]
        @version = match[3]
      end

      # Identify the baiduspider, gigabot, msnbot, msnbot-media, seekbot, speedy spider and naver yeti > 1.0
      if (match = try_match(/\A(baiduspider|gigabot|msnbot|msnbot-media|seekbot|speedy spider|yeti)(\/([^+ ]+))?[ +]\(.*(http:\/\/[^)]+)\)/))
        @name = match[1].to_sym
        @type = :bot
        @urls << match[4]
        @version = match[3]
      end

      # Identify Mediapartners-Google and Googlebot-Image
      if (match = try_match(/\A(mediapartners-google|googlebot-image)(\/([^ ]+))?\Z/))
        @name = match[1].to_sym
        @type = :bot
        @version = match[3]
      end
    end

    # Identifies Chrome and Safari.
    def identify_webkit_browsers
      # Identify Chrome
      if (match = try_match(/\Amozilla\/5\.0 \(([^)]+)\) applewebkit\/([0-9]+\.[0-9]+) \(khtml, like gecko\) chrome\/([0-9.]+) safari\/[0-9]+\.[0-9]+(.*)\Z/))
        @name = :chrome
        @render_engine = :webkit
        @render_engine_version = match[2]
        @version = match[3]
        @details = match[1].split(/;\s?/) + match[4].split(/\s/)
      end

      # Identify Safari >= version 3
      if (match = try_match(/\Amozilla\/5.0 \(([^)]+)\) applewebkit\/([^ ]+) \(khtml, like gecko\) version\/([^ ]+) safari\/[0-9]+\.[0-9]+(.*)\Z/))
        @name = :safari
        @render_engine = :webkit
        @render_engine_version = match[2]
        @version = match[3]
        @details = match[1].split(/;\s?/) + match[4].split(/\s/)
      end

      # Safari < version 3 is disabled, since we cannot identify the browser
      # versions. If you know a table with all released versions of safari
      # with the used webkit versions, please report it! The regexp was:
      #
      #   /\Amozilla\/5\.0 \((.+)\) applewebkit\/([^ ]+) \(khtml, like gecko\) safari\/([^ ]+)(.*)\Z/
      #
      # with match[2] as render engine version, match[3] as version and
      # "#{match[1]} #{match[4]}" as details.
    end

    # Identifies bots which need a regexp of their own.
    def identify_other_bots
      # Identify twiceler (of cuil)
      if (match = try_match(/^mozilla\/5.0 \((twiceler)-([^ ]+) \+?(http:\/\/[^)]+)\)$/))
        @name = match[1].to_sym
        @type = :bot
        @urls << match[3]
        @version = match[2]
      end

      # Identify gonzo (of suchen.de)
      if (match = try_match(/\Agonzo([0-9]+)\[[a-z]\] \+([^ ]+)\Z/))
        @name = :gonzo
        @type = :bot
        @version = match[1]
        @urls << match[2]
      end

      # Identify the yeti 0.01
      if (match = try_match(/\Ayeti\/([^ ]+) \(nhn\/1noon, yetibot@naver.com, check robots.txt daily and follow it\)\Z/))
        @name = :yeti
        @type = :bot
        @emails << "yetibot@naver.com"
        @version = match[1]
      end
    end

    # Identify now Operas, which do not try to pretend another browser
    def identify_opera
      if (match = try_match(/\Aopera\/([0-9]+\.[0-9]+) \(([^)]+)\) ?(presto\/([0-9.]+))?(.*)\Z/))
        @name = :opera
        @version = match[1]
        @render_engine = :presto
        @render_engine_version = match[4]
        @details = match[2].split(/;\s?/) + match[5].split(/\s/)
      end
    end

    # Identify rarely used agents
    def identify_other_agents
      # Identify the veoh service. The name part is matched in the escaped
      # form (a literal backslash followed by "xe2" and so on). The content
      # of the parentheses is captured as details.
      if (match = try_match(/veoh-\\xe2\\xb8\\xb3\\xe2\\xb8\\xb40 service \(([^)]+)\)/))
        @name = :veoh_service
        @type = :other
        @details = match[1].split(/;\s?/)
      end

      # Identify tortoise svn
      if (match = try_match(/\Asvn\/[^ ]+ \(r[0-9]+\)\/tortoisesvn-([^ ]+)/))
        @name = :tortoisesvn
        @type = :other
        @version = match[1]
      end

      # Identify other svn clients
      if (match = try_match(/\Asvn\/([^ ]+) \(r[0-9]+\)/))
        @name = :svn_client
        @type = :other
        @version = match[1]
      end

      # Identify libwww-perl
      if (match = try_match(/\Alibwww-perl\/([^ ]+)\Z/))
        @type = :other
        @name = :"libwww-perl"
        @version = match[1]
      end

      # Identify Jakarta Commons HttpClient library
      if (match = try_match(/\Ajakarta commons-httpclient\/([^ ]+)\Z/))
        @type = :other
        @name = :jakarta_commons_httpclient
        @version = match[1]
      end

      # Identify Java
      if (match = try_match(/\Ajava\/([^ ]+)\Z/))
        @type = :other
        @name = :java
        @version = match[1]
      end

      # Identify connections from the Apache httpd.
      if (match = try_match(/\Aapache\/([^ ]+)/))
        @name = :apache_httpd
        @version = match[1]
        @type = :other
      end
    end

    # Guesses the type of an unidentified agent from keywords in its user
    # agent string and collects all urls and email addresses found in it.
    def guess_unknown_agent
      if @ua_string =~ /bot|spider|crawler/
        @regexps += 1
        @type = :bot
        @name = :unknown_bot
      elsif @ua_string =~ /feed|rss|atom/
        @regexps += 2
        @type = :feed_reader
        @name = :unknown_feed_reader
      end
      @urls = @ua_string.scan(/(((f|ht)tps?:\/\/|www\.)[a-z0-9.-]+(:[0-9]+)?(\/([a-z0-9.\-_+\/%?=;,&]*[a-z0-9.\-_+\/%?=,&])?)?)/).collect { |url| url.first }
      @emails = @ua_string.scan(/([0-9a-z]([0-9a-z_\-.]*[0-9a-z])?@[0-9a-z.-]+)/).collect { |mail| mail.first }
    end

    # Extracts information from the details collected during identification.
    # Currently only the trident version is looked up. Runs at most once per
    # object.
    def parse_details
      return if @details_parsed || @details.empty?

      if @render_engine == :trident
        @details.each do |info|
          match = /trident\/(.+)/.match(info)
          @regexps += 1
          @render_engine_version = match[1] if match
        end
      end
      @details_parsed = true
    end
  end
end
|