google_safe_browsing_redis 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,99 @@
1
+ require 'ip'
2
+ require 'uri'
3
+
4
# Canonicalizes URLs following the Google Safe Browsing API v2 rules so that
# equivalent URLs produce the same string (and therefore the same hashes).
class Canonicalize
  # URI.escape / URI.unescape were removed in Ruby 3.0; the RFC 2396 parser
  # object still provides the same escaping behaviour on old and new rubies.
  PARSER = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

  # Canonicalizes +url+.
  #
  # @param url [String] the raw URL; the caller's string is not modified
  # @return [Array(String, Hash)] the canonical URL, and a hash holding its
  #   'protocol', 'host', 'path' and 'query' components
  def self.canonicalize(url)
    # Work on a copy so frozen or shared strings can be passed in safely
    url = url.strip

    # Remove any tab (0x09), CR (0x0d), and LF (0x0a) characters from the URL
    url = url.delete("\t\r\n")

    # If the URL ends in a fragment, the fragment should be removed
    url = url.split('#', 2)[0] || ''

    # Repeatedly URL-unescape the URL until it has no more hex-encodings
    loop do
      unescaped = PARSER.unescape(url)
      break if unescaped == url
      url = unescaped
    end

    # Split off the protocol; default to http:// when none is given
    separator = url.index('://')
    if separator
      protocol = url[0, separator + 3]
      remainder = url[(separator + 3)..-1]
    else
      protocol = 'http://'
      remainder = url
    end

    # The host runs up to the first "/" or "?"; everything after is path/query
    host_end = remainder.index(/[\/?]/)
    host = host_end ? remainder[0, host_end] : remainder
    path = host_end ? remainder[host_end..-1] : ''

    query = ''
    query_start = path.index('?')
    if query_start
      query = path[query_start..-1]
      path = path[0, query_start]
    end

    # Drop user:password@ and :port before normalizing the bare hostname
    host = remove_user_password_and_port(host)

    # Remove all leading and trailing dots
    host = host.gsub(/\A\.+|\.+\z/, '')

    # Replace consecutive dots with a single dot
    host = host.gsub(/\.+/, '.')

    # If the hostname is a single decimal number, normalize it to 4
    # dot-separated decimal values.
    # NOTE(review): the protocol also asks for octal, hex and short dotted
    # forms to be normalized; only the plain decimal form is handled here.
    if host.match(/\A\d+\z/)
      host = IP::V4.new(host.to_i).to_addr
    end

    # Lowercase the whole string
    protocol = protocol.downcase
    host = host.downcase

    # The sequences "/../" and "/./" in the path should be resolved,
    # by replacing "/./" with "/", and removing "/../" along with the preceding path component.
    # gsub does not see overlapping matches, so repeat until none are left.
    path = path.gsub('/./', '/') while path.include?('/./')
    trailing = path.end_with?('/')
    resolved = []
    path.split('/').each do |part|
      if part == '..'
        # keep the leading empty component: it stands for the root "/"
        resolved.pop if resolved.length > 1
      else
        resolved.push(part)
      end
    end
    path = resolved.join('/')
    path += '/' if path == '' || trailing

    # Runs of consecutive slashes should be replaced with a single slash character
    path = path.gsub(/\/+/, '/')

    # Percent-escape unsafe characters using uppercase hex; "^" is left literal
    protocol, host, path, query = [protocol, host, path, query].map do |part|
      PARSER.escape(part).gsub('%5E', '^')
    end

    url = protocol + host + path + query

    return url, { 'protocol' => protocol, 'host' => host, 'path' => path, 'query' => query }
  end

  # Strips a leading "user:password@" section and a trailing ":port" from a
  # host string.
  #
  # @param host [String] the authority part of a URL
  # @return [String] the bare host name (possibly empty)
  def self.remove_user_password_and_port(host)
    # the user info ends at the LAST "@"; anything before it is not the host
    at = host.rindex('@')
    host = host[(at + 1)..-1] if at

    colon = host.index(':')
    host = host[0, colon] if colon

    host
  end
end
@@ -0,0 +1,437 @@
1
+ require 'digest/sha2'
2
+ require 'ip'
3
+ require 'net/http'
4
+ require 'redis'
5
+ require 'resolv'
6
+ require 'uri'
7
+
8
+ require_relative './canonicalize'
9
+
10
+ class GoogleSafeBrowsing
11
# process-wide settings; initialize overwrites the first three
$api_key, $redis, $debug = '', nil, false

# client and protocol versions sent with every api request
$appver, $pver = '0.1', '2.2'

# the lists we care about
$lists = %w[goog-malware-shavar googpub-phish-shavar]

@delay = Time.now
22
+
23
# Stores the api key, the redis client and the debug flag in the globals
# used by every other method.
#
# api_key - the Safe Browsing api key
# options - :redis is the client to use (a new Redis.new when absent),
#           :debug turns on the output of say (false when absent)
def initialize(api_key, options = {})
  redis_client = options[:redis]
  debug_flag = options[:debug]

  $api_key = api_key
  $redis = redis_client ? redis_client : Redis.new
  $debug = debug_flag ? debug_flag : false
end
29
+
30
# Requests list data from google's servers, unless a "delay" value is
# still stored in redis, in which case nothing is requested.
def update
  say('Updating...')

  # a stored, non-empty delay means we have to wait longer before updating
  delay = $redis.get("delay")
  unless delay.nil? || delay == ''
    say("Error: must wait #{delay.to_i - Time.now.to_i} more seconds before updating! (#{delay})")
    return
  end

  # check what lists we have access to
  available_lists = get_lists
  say("Available lists: #{available_lists.inspect}")

  # only download from lists we care about and have access to
  get_data(available_lists & $lists)
end
50
+
51
# Checks a url against the stored lists.
#
# Returns the name of the first list in $lists that matches, or '' when
# no list matches.
def lookup(url)
  say("Checking url: #{url}")
  url, parts = Canonicalize::canonicalize(url)
  candidate_hosts, candidate_paths = get_possible_hosts_paths(parts)

  # hash prefixes of every possible host+path combination
  combined = candidate_hosts.product(candidate_paths).collect { |host, path| host + path }
  hostpaths = get_hash_prefixes(combined)

  # hash prefixes of every possible host with a trailing slash added
  host_keys = get_hash_prefixes(candidate_hosts.collect { |host| host + '/' })

  $lists.each do |list|
    host_keys.each do |host_key|
      next unless $redis.sismember("#{list}:hosts", host_key)

      # a listed host matches when it has no stored suffixes at all, or
      # when one of its suffixes is among our host+path prefixes
      suffixes = $redis.smembers("#{list}:host_#{host_key}")
      next unless suffixes.length == 0 || !(suffixes & hostpaths).empty?

      say("URL matches a list: #{list} (#{url})")
      return list
    end
  end

  say("URL does not match any lists (#{url})")
  ''
end
79
+
80
# Maps each string to the first 8 hex characters (32 bits) of its SHA-256
# digest and returns the prefixes in the same order.
def get_hash_prefixes(items)
  items.map { |item| Digest::SHA2.new.hexdigest(item)[0, 8] }
end
89
+
90
# Expands the parts of a canonicalized url into the host and path strings
# that have to be checked according to the Google API.
#
# parts - hash with the string keys 'host', 'path' and 'query'
#
# Returns [possible_hosts, possible_paths]; the exact host and the most
# specific path come first.
def get_possible_hosts_paths(parts)
  host = parts['host']
  path = parts['path']
  query = parts['query']

  ip = case host
       when Resolv::IPv4::Regex, Resolv::IPv6::Regex then true
       else false
       end

  # For the hostname, the client will try at most 5 different strings. They are:
  # - the exact hostname in the url
  # - up to 4 hostnames formed by starting with the last 5 components and successively removing the leading component.
  # The top-level domain can be skipped. These additional hostnames should not be checked if the host is an IP address.
  possible_hosts = [host]
  unless ip
    labels = host.split('.')
    extra_hosts = [labels.length - 2, 4].min
    # longest suffix first, down to the last two labels
    extra_hosts.downto(1) do |n|
      possible_hosts.push(labels.last(n + 1).join('.'))
    end
  end

  # For the path, the client will also try at most 6 different strings. They are:
  # - the exact path of the url, including query parameters
  # - the exact path of the url, without query parameters
  # - the 4 paths formed by starting at the root (/) and successively appending path components, including a trailing slash.
  possible_paths = []
  possible_paths.push(path + query) if query != ''
  possible_paths.push(path)

  components = path.split('/')
  # at most 4 root-anchored prefixes; the full path itself is already listed
  prefix_count = [components.length - 1, 4].min
  prefix_count.times do |i|
    possible_paths.push(components[0..i].join('/') + '/')
  end

  return possible_hosts, possible_paths
end
139
+
140
# Asks the api for the "list" function and returns the response split
# into one array element per line.
def get_lists
  api_request("list").split("\n")
end
145
+
146
# Performs a "downloads" request for the given lists, parses the response
# and then fetches every redirect url the response named.
#
# lists - array of list names to request data for
#
# For each list the request body gets one line of the form
# "LIST;a:ADD_CHUNKS:s:SUB_CHUNKS", where either chunk section is left
# out when we hold no chunks of that kind.
def get_data(lists)
  say('Getting data...')

  # build the request
  request_body = lists.map do |list|
    add = get_add_chunks(list)
    sub = get_sub_chunks(list)

    chunk_specs = []
    chunk_specs.push("a:#{add}") unless add.nil? || add == ''
    chunk_specs.push("s:#{sub}") unless sub.nil? || sub == ''

    "#{list};#{chunk_specs.join(':')}\n"
  end.join

  response = api_request("downloads", request_body)

  # parse the response
  say('Handling response...')
  cur_list = ''
  redirects = {}
  response.to_s.split("\n").each do |line|
    # split on the first colon only: redirect urls may contain colons
    type, data = line.split(':', 2)

    case type
    when 'n'
      # set the next allowed time to poll
      delay = Time.now + data.to_i
      say("Time until next request: #{data}")
      $redis.setex("delay", data.to_i, delay.to_i)
    when 'i'
      # set the current list
      cur_list = data
      redirects[cur_list] = []
      say("Current list: #{cur_list}")
    when 'u'
      # store the redirect, even if no list line came before it
      say("Redirect: #{data}")
      (redirects[cur_list] ||= []).push(data)
    when 'ad'
      say("Delete chunks: #{data}")
      delete_add_chunks(cur_list, expand_ranges(data))
    when 'sd'
      say("Don't report chunks: #{data}")
      delete_sub_chunks(cur_list, expand_ranges(data))
    else
      say("I don't know how to handle this!")
      say(line.split(':').inspect)
    end
  end

  # handle the redirects
  say('Handling redirects...')
  redirects.each do |list, urls|
    say("Handling #{list} redirects...")
    urls.each_with_index do |url, index|
      say("Handling #{list} redirect #{index + 1} of #{urls.length}...")
      handle_redirect(list, url)
    end
  end
end
225
+
226
# Deletes the given chunks of type 'add' from a list.
def delete_add_chunks(list, chunks)
  chunk_type = 'add'
  delete_chunks(list, chunk_type, chunks)
end
229
+
230
# Deletes the given chunks of type 'sub' from a list.
def delete_sub_chunks(list, chunks)
  chunk_type = 'sub'
  delete_chunks(list, chunk_type, chunks)
end
233
+
234
# Removes chunks from a list's bookkeeping in redis.
#
# list   - the list name used as key prefix
# type   - 'add' or 'sub'
# chunks - the chunk numbers to delete
#
# For 'add' chunks every host recorded for the chunk is removed together
# with its stored suffixes; for both types the chunk number is removed
# from the "<list>:<type>_chunks" set.
def delete_chunks(list, type, chunks)
  chunks.each do |chunk|
    if type == 'add'
      chunk_key = "#{list}:chunk_#{chunk}"

      # delete each of the prefixes
      $redis.smembers(chunk_key).each do |host|
        $redis.del("#{list}:host_#{host}")
        $redis.srem("#{list}:hosts", host)
      end

      # delete the list of prefixes
      $redis.del(chunk_key)
    end

    # delete from our chunk list
    $redis.srem("#{list}:#{type}_chunks", chunk)
  end
end
252
+
253
# Returns the stored 'add' chunk numbers of a list as a range string.
def get_add_chunks(list)
  get_chunks(list, "add")
end
256
+
257
# Returns the stored 'sub' chunk numbers of a list as a range string.
def get_sub_chunks(list)
  get_chunks(list, "sub")
end
260
+
261
# Reads the members of the "<list>:<type>_chunks" set from redis and
# returns them converted by convert_list_to_ranges.
def get_chunks(list, type)
  stored_chunks = $redis.smembers("#{list}:#{type}_chunks")
  convert_list_to_ranges(stored_chunks)
end
265
+
266
# Fetches a redirect url and applies every chunk found in the response.
#
# The response is read as a sequence of header lines of the form
# "TYPE:CHUNKNUM:HASHLEN:CHUNKLEN", each followed by CHUNKLEN bytes of
# encoded data. Type 'a' chunks are added to the list, type 's' chunks
# are subtracted from it.
def handle_redirect(list, url)
  stream = StringIO.new(http_post_request("http://#{url}"))

  while (header = stream.gets)
    fields = header.split(':')
    type = fields[0]
    chunk_num, hash_len, chunk_len = fields[1..3].map { |field| field.to_i }

    # TODO: chunks with a chunk_len of 0 get no special treatment yet
    data = stream.read(chunk_len)

    case type
    when 'a'
      # store the chunk number in the add list, then add all its prefixes
      store_add_chunk(list, chunk_num)
      add_entries(list, chunk_num, read_add_data(hash_len, data))
    when 's'
      # store the chunk number in the sub list, then delete all its prefixes
      store_sub_chunk(list, chunk_num)
      sub_entries(list, chunk_num, read_sub_data(hash_len, data))
    else
      say "I don't know how to handle this!"
      say fields.inspect
    end
  end
end
310
+
311
# Stores the entries of an add chunk in redis.
#
# For every entry the host is added to the chunk's host set and to the
# list's host index, and the path is added to the host's suffix set.
def add_entries(list, chunk, entries)
  chunk_key = "#{list}:chunk_#{chunk}"
  hosts_key = "#{list}:hosts"

  entries.each do |entry|
    host = entry['host']
    $redis.sadd(chunk_key, host)
    $redis.sadd("#{list}:host_#{host}", entry['path'])
    $redis.sadd(hosts_key, host)
  end
end
318
+
319
# Removes the entries of a sub chunk from redis.
#
# The entry's path is removed from the host's suffix set. The host itself
# is only dropped from the chunk's host set and from the list's host index
# once no suffix is left for it; otherwise the remaining suffixes of that
# host could no longer be found by a lookup.
def sub_entries(list, chunk, entries)
  entries.each do |entry|
    host = entry['host']
    host_key = "#{list}:host_#{host}"

    $redis.srem(host_key, entry['path'])

    # other prefixes of this host are still listed: keep it indexed
    next unless $redis.scard(host_key).to_i == 0

    $redis.srem("#{list}:chunk_#{chunk}", host)
    $redis.srem("#{list}:hosts", host)
  end
end
326
+
327
# Records a chunk number as a stored 'add' chunk of the list.
def store_add_chunk(list, chunk)
  chunk_type = 'add'
  store_chunk(list, chunk_type, chunk)
end
330
+
331
# Records a chunk number as a stored 'sub' chunk of the list.
def store_sub_chunk(list, chunk)
  chunk_type = 'sub'
  store_chunk(list, chunk_type, chunk)
end
334
+
335
# Adds a chunk number to the "<list>:<type>_chunks" set in redis.
def store_chunk(list, type, chunk)
  chunks_key = "#{list}:#{type}_chunks"
  $redis.sadd(chunks_key, chunk)
end
338
+
339
# Parses the encoded data of an add chunk (read_data with sub = false).
def read_add_data(hash_len, data)
  read_data(hash_len, data, false)
end
342
+
343
# Parses the encoded data of a sub chunk (read_data with sub = true).
def read_sub_data(hash_len, data)
  read_data(hash_len, data, true)
end
346
+
347
# Reads a chunk of encoded data and converts it into a list of entries.
#
# hash_len - number of bytes of every path prefix
# data     - the binary chunk data
# sub      - true for sub chunks, false for add chunks
#
# The data is a sequence of records: a 4 byte host key, a 1 byte count and
# then count prefixes of hash_len bytes. In sub chunks every prefix is
# preceded by a 4 byte add chunk number. A count of 0 stands for the host
# key itself being the only prefix (in sub chunks it is still followed by
# one add chunk number).
#
# Returns an array of hashes of the form { 'host', 'path', 'chunk' }, all
# values hex encoded; 'chunk' is '' for add chunks.
def read_data(hash_len, data, sub)
  entry_list = []
  stream = StringIO.new(data.to_s)
  read_hex = lambda { |length| stream.read(length).unpack("H*")[0] }

  while (hostkey = stream.read(4))
    hostkey = hostkey.unpack("H*")[0]

    count_byte = stream.read(1)
    # a record cut off right after its host key carries no usable data
    break if count_byte.nil?
    count = count_byte.unpack("C")[0]

    if count == 0
      # whole-host record: the host key doubles as the prefix
      addchunknum = sub ? read_hex.call(4) : ""
      entry_list.push('host' => hostkey, 'path' => hostkey, 'chunk' => addchunknum)
      next
    end

    count.times do
      # in sub chunks each prefix comes right after its own add chunk number
      addchunknum = sub ? read_hex.call(4) : ""
      path_prefix = read_hex.call(hash_len)
      entry_list.push('host' => hostkey, 'path' => path_prefix, 'chunk' => addchunknum)
    end
  end

  return entry_list
end
381
+
382
# transforms "1-2,4-6,8" into [1,2,4,5,6,8]
#
# ranges - comma separated chunk numbers and "first-last" ranges
#
# Returns an array of Integers (empty for nil or an empty string).
def expand_ranges(ranges)
  pieces = ranges.to_s.split(',').reject { |piece| piece.empty? }

  pieces.flat_map do |piece|
    first, last = piece.split('-', 2)
    last ? (first.to_i..last.to_i).to_a : [first.to_i]
  end
end
401
+
402
# transforms [1,2,4,5,6,8] into "1-2,4-6,8"
#
# list - chunk numbers as Integers or numeric Strings, in any order and
#        possibly with duplicates
#
# Returns the compact range string ('' for an empty list). A run of one
# number is written as the number alone.
def convert_list_to_ranges(list)
  spans = []

  list.collect { |s| s.to_i }.sort.uniq.each do |n|
    if spans.empty? || spans.last[1] != n - 1
      # start a new run
      spans.push([n, n])
    else
      # n directly follows the current run: extend it
      spans.last[1] = n
    end
  end

  spans.collect { |first, last| first == last ? first.to_s : "#{first}-#{last}" }.join(',')
end
414
+
415
# Posts to a function of the google safe browsing api v2 and returns the
# response body.
#
# function - name of the api function, appended to the base url
# body     - optional request body
#
# The api key and the client/protocol versions are taken from the globals
# and sent as query parameters.
def api_request(function, body = nil)
  endpoint = "http://safebrowsing.clients.google.com/safebrowsing/#{function}"
  params = "?client=api&apikey=#{$api_key}&appver=#{$appver}&pver=#{$pver}"
  http_post_request(endpoint + params, body)
end
421
+
422
# Makes an http post request and returns the response body.
#
# url  - the url to post to; https urls are sent over TLS
# body - the request body, an empty body is sent when nil
def http_post_request(url, body = nil)
  uri = URI.parse(url)
  http = Net::HTTP.new(uri.host, uri.port)
  # without this an https url would be spoken to in plain http on port 443
  http.use_ssl = true if uri.scheme == 'https'

  request = Net::HTTP::Post.new(uri.request_uri)
  request.body = body || ''
  http.request(request).body
end
431
+
432
# Prints a message prefixed with the current UTC time, but only while the
# $debug flag is set.
def say(msg)
  return unless $debug

  puts "#{Time.now.utc}: #{msg}"
end
437
+ end
metadata ADDED
@@ -0,0 +1,46 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: google_safe_browsing_redis
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Brad Jewell
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-06-20 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: A ruby implementation of the Google Safe Browsing API v2 that uses Redis
15
+ email: brad811@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/canonicalize.rb
21
+ - lib/google_safe_browsing.rb
22
+ homepage: https://github.com/brad811/GoogleSafeBrowsing
23
+ licenses: []
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ none: false
36
+ requirements:
37
+ - - ! '>='
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 1.8.23
43
+ signing_key:
44
+ specification_version: 3
45
+ summary: A ruby implementation of the Google Safe Browsing API v2 that uses Redis
46
+ test_files: []