iGEL-ua_parser 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +50 -0
- data/Rakefile +12 -0
- data/apache_log_tester.rb +138 -0
- data/lib/ua_parser/user_agent.rb +492 -0
- data/lib/ua_parser/version.rb +85 -0
- data/lib/ua_parser.rb +2 -0
- data/test/crawler_test.rb +620 -0
- data/test/gecko_test.rb +493 -0
- data/test/other_test.rb +146 -0
- data/test/presto_test.rb +169 -0
- data/test/trident_test.rb +146 -0
- data/test/version_test.rb +130 -0
- data/test/webkit_test.rb +242 -0
- data/ua_parser.gemspec +32 -0
- metadata +81 -0
data/README.rdoc
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
== About ua_parser
|
2
|
+
|
3
|
+
ua_parser will become a ruby gem to identify user agents like browsers or
|
4
|
+
crawlers by the provided user agent string. I'm planning to try to get most of
|
5
|
+
the available information like GUI language of the browser or email addresses
|
6
|
+
provided by a bot out of it.
|
7
|
+
|
8
|
+
I tried to identify common user agents first, reducing the necessary regexps for
|
9
|
+
them. But I guess it could be improved a lot. Of course I'd like to get feedback.
|
10
|
+
Even if you just revise my crappy English, send me an e-mail. ;-)
|
11
|
+
|
12
|
+
=== Project status (as of 2009-01-25):
|
13
|
+
|
14
|
+
Right now, the project is at a very early stage. Of my 14 million hits sample,
|
15
|
+
ua_parser can identify about 96.5 % of all hits.
|
16
|
+
|
17
|
+
I tried to cover as much as possible with tests. At the moment, I have 99 tests
|
18
|
+
implemented.
|
19
|
+
|
20
|
+
Known browsers:
|
21
|
+
* Chrome
|
22
|
+
* Firefox and most other gecko based browsers
|
23
|
+
* Internet Explorer
|
24
|
+
* Opera, pure and pretending to be an Internet Explorer or Firefox
|
25
|
+
* Safari >= Version 3
|
26
|
+
|
27
|
+
Known bots:
|
28
|
+
* Baidubot
|
29
|
+
* gigabot
|
30
|
+
* gonzo (of suchen.de)
|
31
|
+
* Googlebot, Googlebot-Images, Mediapartners-Google
|
32
|
+
* mj12bot
|
33
|
+
* msnbot and msnbot-media
|
34
|
+
* seekbot
|
35
|
+
* speedy spider
|
36
|
+
* twiceler (of cuil.com)
|
37
|
+
* Yahoo! Slurp
|
38
|
+
* yeti (of naver.com)
|
39
|
+
|
40
|
+
Other known agents
|
41
|
+
* Apache httpd
|
42
|
+
* Jakarta Commons httpclient
|
43
|
+
* Java
|
44
|
+
* libwww-perl
|
45
|
+
* SVN
|
46
|
+
* TortoiseSVN
|
47
|
+
* veoh service
|
48
|
+
|
49
|
+
Also, ua_parser tries to identify bots and feed readers, even if it doesn't know
|
50
|
+
about them. That way, the results should be close to 100%.
|
data/Rakefile
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
require 'echoe'
|
4
|
+
|
5
|
+
# Gem specification for ua_parser 0.0.1, declared through Echoe. The block
# argument is the Echoe configuration object being filled in.
Echoe.new('ua_parser', '0.0.1') do |gem|
  # Package metadata.
  gem.description = "Parses the user agent of browsers and bots."
  gem.url = "http://github.com/iGEL/ua_parser"
  gem.author = "Johannes Barre"
  gem.email = "igel@igels.net"

  # Paths that must not end up in the packaged gem.
  gem.ignore_pattern = [
    "pkg/*",
    "logs/*",
    "output*",
    "nbproject/**",
    "nbproject/private/*"
  ]

  # No development dependencies are declared.
  gem.development_dependencies = []
end
|
@@ -0,0 +1,138 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "lib/ua_parser"
|
4
|
+
|
5
|
+
# Prints a progress summary to $stderr, framed by an empty line before and
# after.
#
# == Parameters
# * stats: Hash with the keys :parsed (number of files handled so far),
#   :files, :unknown, :known and :errors (collections, only their size is
#   printed), :known_count and :total_count (hit counters).
def stats(stats)
  total = stats[:total_count]
  # Without this guard 0 / 0.0 yields NaN and the summary would read "NaN%"
  # as long as no hit has been counted.
  share = total.zero? ? 0.0 : stats[:known_count] / total.to_f * 100

  $stderr.puts
  $stderr.puts "Parsed #{stats[:parsed]} of #{stats[:files].size} files"
  $stderr.puts "#{stats[:unknown].size} unknown user agents, #{stats[:known].size} known and #{stats[:errors].size} errors"
  $stderr.puts "#{stats[:known_count]} of #{total} hits (#{sprintf("%.3f", share)}%) identified"
  $stderr.puts
end
|
12
|
+
|
13
|
+
# tmp_log is the scratch file compressed logs are unpacked into. Refuse to
# start if one is already present, so that no foreign file gets overwritten
# or deleted.
raise "File tmp_log exists" if File.exist?("tmp_log")

# SIGUSR1 stops the run early: remove the scratch file and quit.
Kernel.trap("SIGUSR1") do
  # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
  File.delete("tmp_log") if File.exist?("tmp_log")
  exit
end

# Paths of the decompression tools; an empty string means "not installed".
gunzip = `which gunzip`.strip
bunzip = `which bunzip2`.strip

# Per-agent statistics: unknown and errors are keyed by the raw user agent
# string, known by "name" or "name/major version".
unknown = {}
known = {}
errors = {}

# Log files to analyse and the running counters.
files = Dir["logs/*access*"]
i = 0
known_count = 0
unknown_count = 0
total_count = 0
|
31
|
+
|
32
|
+
# Month abbreviations as they appear in Apache access log timestamps. Built
# once instead of once per log line.
months = {"Jan" => 1, "Feb" => 2, "Mar" => 3, "Apr" => 4, "May" => 5, "Jun" => 6, "Jul" => 7, "Aug" => 8, "Sep" => 9, "Oct" => 10, "Nov" => 11, "Dec" => 12}

# Walk through all log files, unpack compressed ones into tmp_log and feed
# the user agent of every log line to the parser.
files.each do |filename|
  # Print an intermediate summary every 25 files.
  if i % 25 == 0 && i > 0
    stats(:parsed => i, :files => files, :unknown => unknown, :known => known, :errors => errors, :known_count => known_count, :total_count => total_count)
  end
  i += 1
  org_name = filename

  # Pick the decompression tool by file extension; both stay nil for plain
  # log files.
  tool_name, tool_path =
    if /\.gz$/ =~ filename
      ["gunzip", gunzip]
    elsif /\.bz2$/ =~ filename
      ["bunzip2", bunzip]
    end

  if tool_name
    if tool_path == ""
      $stderr.puts "SKIPPING #{filename}, #{tool_name} not found"
      next
    end
    $stderr.puts "Decompressing #{filename}"
    # Applies to both formats now; previously only the bz2 branch checked.
    raise "File tmp_log exists" if File.exist?("tmp_log")
    # Pass the arguments as a list so that the file name never goes through
    # a shell (names with spaces or shell metacharacters used to break).
    unless system(tool_path, "--stdout", filename, :out => "tmp_log")
      $stderr.puts "SKIPPING #{filename}, #{tool_name} failed"
      File.delete("tmp_log") if File.exist?("tmp_log")
      next
    end
    filename = "tmp_log"
  end

  $stderr.puts "Parsing #{org_name}"
  begin
    File.open(filename) do |file|
      file.each_line do |line|
        # Captures: 1 day, 2 month name, 3 year, 4 hour, 5 minute, 6 second,
        # 7 the last quoted field of the line (the user agent).
        match = /\[([0-9]+)\/([a-z]+)\/([0-9]+):([0-9]+):([0-9]+):([0-9]+) .*\] .* "([^"]+)"\Z/i.match(line)
        next unless match

        ua_string = match[7].strip
        next if ua_string == "" || ua_string == "-"

        begin
          # The timestamp regexp is case insensitive, the month table is not.
          access = Time.mktime(match[3].to_i, months[match[2].capitalize], match[1].to_i, match[4].to_i, match[5].to_i, match[6].to_i)
          ua = UaParser::UserAgent.new(ua_string)
          total_count += 1
          if ua.known?
            known_count += 1
            name = "#{ua.name}"
            name = "#{ua.name}/#{ua.version.major}" unless ua.version.nil?
            unless known.has_key?(name)
              known[name] = {:count => 0, :count_30d => 0, :regexps => 0, :last_access => Time.at(0), :first_seen => Time.now}
            end
            obj = known[name]
          else
            unknown_count += 1
            unless unknown.has_key?(ua_string)
              unknown[ua_string] = {:count => 0, :count_30d => 0, :guess => ua.type, :regexps => 0, :last_access => Time.at(0), :first_seen => Time.now}
            end
            obj = unknown[ua_string]
          end
          obj[:count] += 1
          obj[:regexps] += ua.regexps
          obj[:last_access] = access if access > obj[:last_access]
          obj[:first_seen] = access if access < obj[:first_seen]
          obj[:count_30d] += 1 if access > Time.now - 30 * 24 * 60 * 60
        rescue StandardError => e
          # Record the failure per user agent instead of aborting the run.
          # StandardError rather than RuntimeError, so that e.g. a
          # NoMethodError inside the parser is reported as well.
          if errors[ua_string]
            errors[ua_string][:count] += 1
          else
            errors[ua_string] = {:exception => e, :count => 1}
          end
        end
      end
    end
  ensure
    # Remove the scratch file even if reading it failed; otherwise the next
    # start would stop at the "File tmp_log exists" check.
    File.delete("tmp_log") if filename == "tmp_log" && File.exist?("tmp_log")
  end

  # Stop early once there is more than enough material to look at.
  break if errors.size > 100 || unknown.size > 5000 || known.size > 5000
  GC.start
end
|
100
|
+
# Final summary on $stderr, followed by the detailed report on $stdout.
stats(:parsed => i, :files => files, :unknown => unknown, :known => known, :errors => errors, :known_count => known_count, :total_count => total_count)

# Returns the keys of the given statistics table, ordered by hits within the
# last 30 days and then by total hits, both descending.
ranked = lambda do |table|
  table.keys.sort do |one, other|
    [table[other][:count_30d], table[other][:count]] <=> [table[one][:count_30d], table[one][:count]]
  end
end
stamp = "%Y-%m-%d %H:%M:%S"

puts
puts "UNKNOWN USER AGENTS:"
puts "===================="
ranked.call(unknown).each do |ua|
  entry = unknown[ua]
  puts "#{ua}: #{entry[:count_30d]} hits in the last 30 days (Hits total: #{entry[:count]}, my guess: It's a #{entry[:guess]}, first seen: #{entry[:first_seen].strftime(stamp)}, last access: #{entry[:last_access].strftime(stamp)})"
end

puts
puts "KNOWN USER AGENTS:"
puts "=================="
ranked.call(known).each do |ua|
  entry = known[ua]
  # Average number of regexps the parser needed per hit of this agent.
  per_hit = sprintf("%.3f", entry[:regexps].to_f / entry[:count])
  puts "#{ua}: #{entry[:count_30d]} hits in the last 30 days (Hits total: #{entry[:count]}, regexps: #{per_hit}/hit, first seen: #{entry[:first_seen].strftime(stamp)}, last access: #{entry[:last_access].strftime(stamp)})"
end

puts
puts "ERROR CAUSING AGENTS:"
puts "====================="
errors.each do |ua, failure|
  exception = failure[:exception]
  puts "#{ua}: #{exception.class}: #{exception.message}"
  exception.backtrace.each { |frame| puts " #{frame}" }
end
|
@@ -0,0 +1,492 @@
|
|
1
|
+
module UaParser
  # Identifies a user agent (browser, bot, feed reader or another HTTP
  # client) from its user agent string. All work happens in the constructor;
  # the other methods only report what was found.
  class UserAgent
    # Identification steps in the order they are tried. The order matters:
    # agents that pretend to be another browser (Opera, Flock, Navigator)
    # have to be tested before the browser they imitate.
    IDENTIFICATION_STEPS = [
      :identify_opera_as_msie,
      :identify_internet_explorer,
      :identify_opera_as_firefox,
      :identify_firefox_derivative,
      :identify_gecko_browser,
      :identify_mozilla_style_bot,
      :identify_plain_bot,
      :identify_google_special_bot,
      :identify_chrome,
      :identify_safari,
      :identify_twiceler,
      :identify_gonzo,
      :identify_legacy_yeti,
      :identify_opera,
      :identify_veoh_service,
      :identify_other_client
    ].freeze

    # Rarely seen HTTP clients of type :other, as pairs of regexp and name.
    # The first capture of every regexp is the version. TortoiseSVN has to
    # come before the plain SVN client, since its agent starts the same way.
    OTHER_CLIENTS = [
      [/\Asvn\/[^ ]+ \(r[0-9]+\)\/tortoisesvn-([^ ]+)/, :tortoisesvn],
      [/\Asvn\/([^ ]+) \(r[0-9]+\)/, :svn_client],
      [/\Alibwww-perl\/([^ ]+)\Z/, :"libwww-perl"],
      [/\Ajakarta commons-httpclient\/([^ ]+)\Z/, :jakarta_commons_httpclient],
      [/\Ajava\/([^ ]+)\Z/, :java],
      [/\Aapache\/([^ ]+)/, :apache_httpd]
    ].freeze

    # Some bots provide one or more contact email addresses. Returns all
    # identified addresses as an Array. It'll be empty, if no addresses
    # could be identified.
    attr_reader :emails

    # Returns the name of the user agent as a Symbol.
    attr_reader :name

    # Returns the number of regexps, which were executed to identify this
    # agent. Looking up the Trident version (see #render_engine_version)
    # adds to this counter as well.
    attr_reader :regexps

    # Returns the name of the render engine as a Symbol if known, otherwise
    # nil.
    attr_reader :render_engine

    # Returns the type of the user agent as a Symbol. Returns :browser, if
    # the user agent couldn't be recognized, since this seems the safest
    # choice. Use #known? if you want to know, if the user agent could be
    # recognized.
    attr_reader :type

    # Some bots provide one or more urls. Returns all identified urls as an
    # Array. It'll be empty, if no urls could be identified.
    attr_reader :urls

    # Creates a new UserAgent object.
    #
    # == Parameters
    # * ua_string: The user agent as you got it. nil is treated like an
    #   empty string.
    def initialize(ua_string)
      # The string is matched in lower case, so all extracted details are
      # lower case as well.
      @ua_string = ua_string.to_s.strip.downcase
      @known = false
      @type = :browser
      @name = :unknown_browser
      @urls = []
      @emails = []
      @dot_net_versions = []
      @regexps = 0
      @details = []
      @details_parsed = false
      @render_engine = nil
      @render_engine_version = nil
      @render_engine_version_object = nil
      @version = nil
      @version_object = nil

      if @ua_string == "" || @ua_string == "-"
        @name = :no_agent_given
        return
      end

      IDENTIFICATION_STEPS.each do |step|
        break if @known
        send(step)
      end

      # Still not known? Try to guess
      guess_unknown_agent unless @known
    end

    # Returns the architecture as a lower case symbol like :i686 or :ppc
    def architecture
      raise NotImplementedError
    end

    # Returns true, if the user agent is suspected or known to be a bot.
    def bot?
      @type == :bot
    end

    # Returns true, if the user agent is known or suspected to be a browser.
    # Completely unknown user agents are also suspected browsers. Check
    # known?, if you want to make sure the browser is known.
    def browser?
      @type == :browser
    end

    # Returns an Array of all available .NET-Versions.
    def dotnet_versions
      raise NotImplementedError
    end

    # Some bots provide one or more contact email addresses. This method
    # will return the first identified address as a String or nil, if no
    # address could be identified.
    def email
      @emails.first
    end

    # Returns true, if the given user agent is suspected or known to be a
    # feed reader.
    def feed_reader?
      @type == :feed_reader
    end

    # Returns true, if the user agent is a tool to grab the contents of a
    # webpage like wget
    def grabber?
      @type == :grabber
    end

    # The method was originally published under this misspelled name; it is
    # kept so that existing callers continue to work.
    alias_method :grapper?, :grabber?

    # Returns true, if the given user agent is known, otherwise false
    def known?
      @known
    end

    # Returns the operating system as a string. Returns nil if the operating
    # system is unknown.
    # For Linux systems, the name of the distribution will be returned, or
    # "Unknown Linux Distribution"
    def os
      raise NotImplementedError
    end

    # Returns the type of the operating system as a String, like "Linux" or
    # "Windows". Returns nil, if the os is unknown.
    def os_type
      raise NotImplementedError
    end

    # Returns the version of the operating system as a string. Returns nil
    # if the operating system version is unknown
    def os_version
      raise NotImplementedError
    end

    # Returns true, if the user agent is neither a browser, bot nor feed
    # reader.
    # Examples: SVN Client, Apache-Browser, libraries like libwww-perl
    def other?
      @type == :other
    end

    # Returns a UaParser::Version object with the version of the render
    # engine if known, otherwise nil
    def render_engine_version
      # Internet Explorer announces the Trident version only inside its
      # detail list, so it is looked up on demand.
      parse_details if @render_engine == :trident
      return nil if @render_engine_version.nil?
      @render_engine_version_object ||= Version.new(@render_engine_version, @render_engine)
    end

    # Returns the user interface language as a symbol like :en or :de.
    # Returns nil if the user interface language is not known.
    def ui_lang
      raise NotImplementedError
    end

    # Returns the country code from the user interface language as a lower
    # case symbol. For example, en_US will return :us. Returns nil if the
    # country is unknown.
    def ui_lang_country
      raise NotImplementedError
    end

    # Some bots provide one or more urls. This method will return the first
    # identified url as a String or nil, if no url could be identified.
    def url
      @urls.first
    end

    # Returns the vendor of the user agent, if known. Otherwise nil will be
    # returned.
    def vendor
      raise NotImplementedError
    end

    # Returns a UaParser::Version object with the version number, or nil if
    # no version could be identified.
    def version
      return nil if @version.nil?
      @version_object ||= Version.new(@version, @name)
    end

    private

    # Runs the regexp against the user agent string and counts the attempt.
    # Marks the agent as known on success. Returns the MatchData or nil.
    def attempt(regexp)
      @regexps += 1
      match = regexp.match(@ua_string)
      @known = true if match
      match
    end

    # Builds the detail list from the semicolon separated part inside the
    # parentheses and the whitespace separated remainder of the string.
    def split_details(in_parentheses, remainder)
      in_parentheses.split(/;\s?/) + remainder.split(/\s/)
    end

    # Identify Operas pretending to be an Internet Explorer
    def identify_opera_as_msie
      match = attempt(/^mozilla\/4\.0 \(compatible; msie [4-6].0;(.*)\) opera ([0-9]+\.[0-9]+)(.*)$/)
      return unless match
      @name = :opera
      @render_engine = :presto
      @version = match[2]
      @details = split_details(match[1], match[3])
    end

    # Identify Internet Explorers
    def identify_internet_explorer
      match = attempt(/^mozilla\/4.0 \(compatible; msie ([0-9]\.[0-9]); ([^)]+)\)(.*)$/)
      return unless match
      @name = :internet_explorer
      @render_engine = :trident
      @version = match[1]
      @details = split_details(match[2], match[3])
    end

    # Identify Opera pretending to be a Firefox
    def identify_opera_as_firefox
      match = attempt(/\Amozilla\/5.0 \((.+) rv:[0-9.]+\) gecko\/[0-9]+ firefox\/[0-9.]+ opera ([0-9]+\.[0-9]+)(.*)\Z/)
      return unless match
      @name = :opera
      @render_engine = :presto
      @version = match[2]
      @details = split_details(match[1], match[3])
    end

    # Flock and some versions of the Netscape Navigator try to identify
    # themselves also as a Firefox, so we have to identify them first.
    def identify_firefox_derivative
      match = attempt(/^mozilla\/5\.0 \(([^)]+); rv:([^; )]+)\) gecko\/20[0-2][0-9][01][0-9][0-3][0-9][0-9]* firefox\/[^ ]+ (flock|navigator)\/([^ ]+)(.*)$/)
      return unless match
      @name = match[3].to_sym
      @render_engine = :gecko
      @details = split_details(match[1], match[5])
      @render_engine_version = match[2]
      @version = match[4]
    end

    # Identify all other gecko based browsers except the original mozilla
    def identify_gecko_browser
      match = attempt(/\Amozilla\/5\.0 \((.+); rv:([^; )]+)\) gecko\/20[0-2][0-9][01][0-9][0-3][0-9][0-9]*( [^ ]+)?( \([^)]+\))? (bonecho|camino|epiphany|firefox|granparadiso|iceweasel|k-meleon|minefield|netscape6?|phoenix|seamonkey|songbird|thunderbird)\/([^ ]+)(.*)\Z/)
      return unless match
      @name = match[5].to_sym
      @name = :netscape if @name == :netscape6
      @render_engine = :gecko
      # match[3] is the optional token between the gecko date and the
      # browser name; it is an empty string here if it is missing.
      @details = split_details(match[1], "#{match[3]} #{match[7]}")
      @render_engine_version = match[2]
      @version = match[6]
    end

    # Identify the Googlebot, mj12bot and yahoo slurp
    def identify_mozilla_style_bot
      match = attempt(/^mozilla\/5.0 \(compatible; (googlebot|mj12bot|yahoo! slurp)(\/v?([^)]+))?; \+?(http:\/\/[^)]+)\)$/)
      return unless match
      @name = match[1].to_sym
      @type = :bot
      @urls << match[4]
      @version = match[3]
    end

    # Identify the baiduspider, gigabot, msnbot, msnbot-media, seekbot,
    # speedy spider and naver yeti > 1.0
    def identify_plain_bot
      match = attempt(/\A(baiduspider|gigabot|msnbot|msnbot-media|seekbot|speedy spider|yeti)(\/([^+ ]+))?[ +]\(.*(http:\/\/[^)]+)\)/)
      return unless match
      @name = match[1].to_sym
      @type = :bot
      @urls << match[4]
      @version = match[3]
    end

    # Identify Mediapartners-Google and Googlebot-Image
    def identify_google_special_bot
      match = attempt(/\A(mediapartners-google|googlebot-image)(\/([^ ]+))?\Z/)
      return unless match
      @name = match[1].to_sym
      @type = :bot
      @version = match[3]
    end

    # Identify Chrome
    def identify_chrome
      match = attempt(/\Amozilla\/5\.0 \(([^)]+)\) applewebkit\/([0-9]+\.[0-9]+) \(khtml, like gecko\) chrome\/([0-9.]+) safari\/[0-9]+\.[0-9]+(.*)\Z/)
      return unless match
      @name = :chrome
      @render_engine = :webkit
      @render_engine_version = match[2]
      @version = match[3]
      @details = split_details(match[1], match[4])
    end

    # Identify Safari >= version 3
    #
    # Safari < version 3 is not identified, since its browser version cannot
    # be derived from the user agent. If you know a table with all released
    # versions of safari with the used webkit versions, please report it!
    # The regexp prepared for these versions is:
    #
    #   /\Amozilla\/5\.0 \((.+)\) applewebkit\/([^ ]+) \(khtml, like gecko\) safari\/([^ ]+)(.*)\Z/
    def identify_safari
      match = attempt(/\Amozilla\/5.0 \(([^)]+)\) applewebkit\/([^ ]+) \(khtml, like gecko\) version\/([^ ]+) safari\/[0-9]+\.[0-9]+(.*)\Z/)
      return unless match
      @name = :safari
      @render_engine = :webkit
      @render_engine_version = match[2]
      @version = match[3]
      @details = split_details(match[1], match[4])
    end

    # Identify twiceler (of cuil)
    def identify_twiceler
      match = attempt(/^mozilla\/5.0 \((twiceler)-([^ ]+) \+?(http:\/\/[^)]+)\)$/)
      return unless match
      @name = match[1].to_sym
      @type = :bot
      @urls << match[3]
      @version = match[2]
    end

    # Identify gonzo (of suchen.de)
    def identify_gonzo
      match = attempt(/\Agonzo([0-9]+)\[[a-z]\] \+([^ ]+)\Z/)
      return unless match
      @name = :gonzo
      @type = :bot
      @version = match[1]
      @urls << match[2]
    end

    # Identify the yeti 0.01
    def identify_legacy_yeti
      match = attempt(/\Ayeti\/([^ ]+) \(nhn\/1noon, yetibot@naver.com, check robots.txt daily and follow it\)\Z/)
      return unless match
      @name = :yeti
      @type = :bot
      @emails << "yetibot@naver.com"
      @version = match[1]
    end

    # Identify now Operas, which do not try to pretend another browser
    def identify_opera
      match = attempt(/\Aopera\/([0-9]+\.[0-9]+) \(([^)]+)\) ?(presto\/([0-9.]+))?(.*)\Z/)
      return unless match
      @name = :opera
      @version = match[1]
      @render_engine = :presto
      @render_engine_version = match[4]
      @details = split_details(match[2], match[5])
    end

    # Identify the veoh service. The regexp expects the non-ASCII bytes of
    # its name as literal \x.. escape sequences.
    def identify_veoh_service
      # The content of the parentheses has to be captured and may be of any
      # length; the former pattern allowed a single character only and had
      # no capture group, so match[1] was always nil.
      match = attempt(/veoh-\\xe2\\xb8\\xb3\\xe2\\xb8\\xb40 service \(([^)]*)\)/)
      return unless match
      @name = :veoh_service
      @type = :other
      @details = match[1].split(/;\s?/)
    end

    # Identify TortoiseSVN, the SVN client, libwww-perl, the Jakarta Commons
    # HttpClient library, Java and connections from the Apache httpd.
    def identify_other_client
      OTHER_CLIENTS.each do |regexp, client_name|
        match = attempt(regexp)
        next unless match
        @name = client_name
        @type = :other
        @version = match[1]
        break
      end
    end

    # Guesses the type of an unrecognized agent from typical keywords and
    # collects all urls and email addresses found in the string. The agent
    # stays unknown.
    def guess_unknown_agent
      if @ua_string =~ /bot|spider|crawler/
        @regexps += 1
        @type = :bot
        @name = :unknown_bot
      elsif @ua_string =~ /feed|rss|atom/
        @regexps += 2
        @type = :feed_reader
        @name = :unknown_feed_reader
      end
      # scan returns one Array per hit, the complete hit is its first entry.
      @urls = @ua_string.scan(/(((f|ht)tps?:\/\/|www\.)[a-z0-9.-]+(:[0-9]+)?(\/([a-z0-9.\-_+\/%?=;,&]*[a-z0-9.\-_+\/%?=,&])?)?)/).collect { |hit| hit.first }
      @emails = @ua_string.scan(/([0-9a-z]([0-9a-z_\-.]*[0-9a-z])?@[0-9a-z.-]+)/).collect { |hit| hit.first }
    end

    # Extracts further information from the detail list. At the moment this
    # is only the Trident version of Internet Explorers. Runs at most once
    # per object.
    def parse_details
      return if @details_parsed || @details.empty?
      if @render_engine == :trident
        @details.each do |info|
          match = /trident\/(.+)/.match(info)
          @regexps += 1
          @render_engine_version = match[1] if match
        end
      end
      @details_parsed = true
    end
  end
end
|