google_safe_browsing_redis 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,99 @@
1
+ require 'ip'
2
+ require 'uri'
3
+
4
# Canonicalizes URLs following the Google Safe Browsing API v2 rules, so that
# hashes computed from the result line up with the published list entries.
class Canonicalize
  # Canonicalizes +url+. The caller's string is never modified.
  #
  # Returns a two element array: the canonical URL string, and a hash holding
  # the 'protocol', 'host', 'path' and 'query' strings it was assembled from.
  # User name, password and port are dropped from the host.
  def self.canonicalize(url)
    # Work on a private binary copy: the input may be frozen, and unescaping
    # can produce arbitrary bytes that are not valid UTF-8.
    url = url.dup.force_encoding(Encoding::BINARY).strip

    # Remove any tab (0x09), CR (0x0d), and LF (0x0a) characters from the URL.
    # Spaces are kept on purpose; they are percent-escaped at the end.
    url = url.delete("\t\r\n")

    # If the URL ends in a fragment, the fragment should be removed
    fragment_at = url.index('#')
    url = url[0, fragment_at] if fragment_at

    # Repeatedly URL-unescape the URL until it has no more hex-encodings
    url = unescape_fully(url)

    # Split off the scheme. Only a scheme at the very start counts, so a
    # "://" inside the path or query is not mistaken for one.
    if url =~ %r{\A([A-Za-z][A-Za-z0-9+.\-]*)://}
      protocol = Regexp.last_match(1) + '://'
      rest = Regexp.last_match.post_match
    else
      protocol = 'http://'
      rest = url
    end

    # The host runs up to the first "/" or "?"
    boundary = rest.index(/[\/?]/) || rest.length
    host = rest[0, boundary]
    remainder = rest[boundary..-1]

    query_at = remainder.index('?')
    if query_at
      path = remainder[0, query_at]
      query = remainder[query_at..-1]
    else
      path = remainder
      query = ''
    end

    # After performing these steps, percent-escape all characters in the URL
    # which are <= ASCII 32, >= 127, "#", or "%".
    protocol = escape(protocol.downcase)
    host = escape(normalize_host(host))
    path = escape(normalize_path(path))
    query = escape(query)

    url = protocol + host + path + query

    [url, { 'protocol' => protocol, 'host' => host, 'path' => path, 'query' => query }]
  end

  # Strips "user:password@" and ":port" from a host string and returns the
  # bare host. A bracketed IPv6 literal is returned with its brackets.
  def self.remove_user_password_and_port(host)
    # Everything up to the last "@" is user information
    host = host.rpartition('@').last

    if host.start_with?('[')
      closing = host.index(']')
      return closing ? host[0..closing] : host
    end

    host.partition(':').first
  end

  # Applies the host rules: drop user/password/port, trim and collapse dots,
  # lowercase, and rewrite numeric IPv4 forms as dotted decimal.
  def self.normalize_host(host)
    host = remove_user_password_and_port(host)

    # Remove all leading and trailing dots, then collapse runs of dots
    host = host.gsub(/\A\.+|\.+\z/, '').gsub(/\.{2,}/, '.')

    normalize_ip(host.downcase)
  end

  # Returns +host+ as four dot-separated decimal values when it is an IPv4
  # address written with 1 to 4 decimal, octal (leading 0) or hex (leading 0x)
  # components; otherwise returns +host+ unchanged.
  def self.normalize_ip(host)
    parts = host.split('.', -1)
    return host if parts.empty? || parts.length > 4

    values = []
    parts.each do |part|
      case part
      when /\A0x[0-9a-f]+\z/ then values.push(part.hex)
      when /\A0[0-7]*\z/ then values.push(part.oct)
      when /\A[1-9][0-9]*\z/ then values.push(part.to_i)
      else return host
      end
    end

    # With fewer than 4 components the last one fills all remaining bytes
    leading = values[0...-1]
    last = values.last
    spare = 4 - leading.length
    return host if leading.any? { |value| value > 255 } || last >= 256**spare

    octets = leading + (spare - 1).downto(0).map { |shift| (last >> (8 * shift)) & 0xFF }
    octets.join('.')
  end

  # Resolves "." and ".." segments, collapses runs of slashes, and makes sure
  # the result is at least "/".
  def self.normalize_path(path)
    # A path ending in "/", "/." or "/.." names a directory and keeps its slash
    directory = path.end_with?('/', '/.', '/..')

    kept = []
    path.split('/').each do |segment|
      if segment == '.'
        next
      elsif segment == '..'
        # kept[0] is the empty string in front of the leading slash; never
        # pop it, or the path would lose its root
        kept.pop if kept.length > 1
      else
        kept.push(segment)
      end
    end

    resolved = kept.join('/')
    resolved += '/' if resolved.empty? || (directory && !resolved.end_with?('/'))

    # Runs of consecutive slashes should be replaced with a single slash character
    resolved.gsub(%r{/+}, '/')
  end

  # Percent-decodes +text+ over and over until a pass changes nothing.
  # Expects a binary-encoded string.
  def self.unescape_fully(text)
    loop do
      decoded = text.gsub(/%([0-9A-Fa-f]{2})/) { Regexp.last_match(1).hex.chr }
      return decoded if decoded == text
      text = decoded
    end
  end

  # Percent-escapes bytes <= 0x20, >= 0x7F, "#" and "%" using uppercase hex.
  # The result contains only ASCII and is tagged as UTF-8.
  def self.escape(text)
    bytes = text.dup.force_encoding(Encoding::BINARY)
    escaped = bytes.gsub(/[\x00-\x20\x7F-\xFF#%]/n) { |char| format('%%%02X', char.ord) }
    escaped.force_encoding(Encoding::UTF_8)
  end

  private_class_method :normalize_host, :normalize_ip, :normalize_path,
                       :unescape_fully, :escape
end
@@ -0,0 +1,437 @@
1
require 'digest/sha2'
require 'ip'
require 'net/http'
require 'redis'
require 'resolv'
require 'stringio'
require 'uri'

require_relative './canonicalize'
9
+
10
+ class GoogleSafeBrowsing
11
# Shared settings. These are Ruby globals, so they are visible to (and
# overwritten by) every GoogleSafeBrowsing instance in the process.
$api_key, $redis, $debug = '', nil, false

# client application version and protocol version sent with each API request
$appver, $pver = '0.1', '2.2'

# the lists we care about
$lists = %w[goog-malware-shavar googpub-phish-shavar]

# class-level instance variable; nothing in this file reads it
@delay = Time.now
22
+
23
# Stores the API key, the redis connection and the debug flag in the
# process-wide globals used by the rest of the class.
#
# options[:redis] - redis client to use; a new Redis.new is created if absent
# options[:debug] - truthy to make #say print progress messages
def initialize(api_key, options = {})
  redis, debug = options.values_at(:redis, :debug)

  $api_key = api_key
  $redis = redis || Redis.new
  $debug = debug || false
end
29
+
30
# Requests fresh list data from Google's servers.
#
# Does nothing (beyond logging) while the "delay" key is still set in redis.
# Otherwise downloads data for every list that is both available to this API
# key and named in $lists.
def update
  say('Updating...')

  # the "delay" key exists only while we are still required to wait
  delay = $redis.get("delay")
  unless delay.nil? || delay == ''
    say("Error: must wait #{delay.to_i - Time.now.to_i} more seconds before updating! (#{delay})")
    return
  end

  # check what lists we have access to
  available_lists = get_lists
  say("Available lists: #{available_lists.inspect}")

  # only download from lists we care about and have access to
  get_data(available_lists & $lists)
end
50
+
51
# Checks a url against the stored lists.
#
# Returns the name of the first list in $lists that matches, or '' when no
# list matches.
def lookup(url)
  say("Checking url: #{url}")
  url, parts = Canonicalize::canonicalize(url)
  hosts, paths = get_possible_hosts_paths(parts)

  # hash prefixes of every host+path combination
  combinations = hosts.flat_map { |host| paths.map { |path| host + path } }
  hostpaths = get_hash_prefixes(combinations)

  # host keys are the hash prefixes of each host with a trailing slash
  host_keys = get_hash_prefixes(hosts.map { |host| host + '/' })

  $lists.each do |list|
    host_keys.each do |key|
      next unless $redis.sismember("#{list}:hosts", key)

      # a listed host matches when it has no stored suffixes at all, or when
      # one of them equals a host+path prefix of this url
      suffixes = $redis.smembers("#{list}:host_#{key}")
      next unless suffixes.empty? || !(suffixes & hostpaths).empty?

      say("URL matches a list: #{list} (#{url})")
      return list
    end
  end

  say("URL does not match any lists (#{url})")
  ''
end
79
+
80
# Maps each string to the first 8 hex digits (32 bits) of its SHA-256 digest.
def get_hash_prefixes(items)
  items.map { |item| Digest::SHA256.hexdigest(item)[0, 8] }
end
89
+
90
# Expands the parts of a canonicalized url into the host and path strings
# that have to be checked, according to the Google API.
#
# parts - hash with 'host', 'path' and 'query' strings
#
# Returns two arrays: the possible hosts (exact host first, then from the
# longest suffix to the shortest) and the possible paths (exact paths first,
# then the prefixes starting at the root).
def get_possible_hosts_paths(parts)
  host = parts['host']
  path = parts['path']
  query = parts['query']

  ip = case host
       when Resolv::IPv4::Regex, Resolv::IPv6::Regex then true
       else false
       end

  # For the hostname, the client will try at most 5 different strings. They are:
  # - the exact hostname in the url
  # - up to 4 hostnames formed by starting with the last 5 components and successively removing the leading component.
  # The top-level domain can be skipped. These additional hostnames should not be checked if the host is an IP address.
  max_extra_hosts = 4
  possible_hosts = [host]

  unless ip
    labels = host.split('.')
    extra = [labels.length - 2, max_extra_hosts].min
    # extra is zero or negative for hosts with one or two labels, in which
    # case nothing is added
    (extra - 1).downto(0) do |i|
      possible_hosts.push(labels[labels.length - 2 - i..-1].join('.'))
    end
  end

  # For the path, the client will also try at most 6 different strings. They are:
  # - the exact path of the url, including query parameters
  # - the exact path of the url, without query parameters
  # - the 4 paths formed by starting at the root (/) and successively appending path components, including a trailing slash.
  max_prefix_paths = 4
  possible_paths = []
  possible_paths.push(path + query) if query != ''
  possible_paths.push(path)

  # segments[0] is the empty string in front of the leading slash, so the
  # first prefix is "/" itself; the last segment is never used as a prefix
  segments = path.split('/')
  [segments.length - 1, max_prefix_paths].min.times do |i|
    possible_paths.push(segments[0..i].join('/') + '/')
  end

  return possible_hosts, possible_paths
end
139
+
140
# Asks the API for the available lists and returns their names as an array,
# one element per line of the response.
def get_lists
  api_request("list").split("\n")
end
145
+
146
# Performs a request for data from Google, and parses the response.
#
# lists - names of the lists to download
#
# The request names, for each list, the add and sub chunks already stored.
# Each response line is "type:data"; redirect urls are collected per list and
# fetched once the whole response has been read.
def get_data(lists)
  say('Getting data...')

  # build the request, one "list;a:1,2,3:s:6,7" line per list
  request_body = lists.map do |list|
    add = get_add_chunks(list)
    sub = get_sub_chunks(list)

    chunk_specs = []
    chunk_specs.push("a:#{add}") unless add.nil? || add == ''
    chunk_specs.push("s:#{sub}") unless sub.nil? || sub == ''

    "#{list};#{chunk_specs.join(':')}\n"
  end.join

  response = api_request("downloads", request_body)

  # parse the response
  say('Handling response...')
  cur_list = ''
  # default block: a redirect seen before any list header is still collected
  redirects = Hash.new { |hash, key| hash[key] = [] }
  response.split("\n").each do |line|
    # split on the first colon only; redirect urls may contain colons
    fields = line.split(':', 2)
    type, data = fields

    case type
    when 'n'
      # set the next allowed time to poll
      delay = Time.now + data.to_i
      say("Time until next request: #{data}")
      $redis.setex("delay", data.to_i, delay.to_i)
    when 'i'
      # set the current list
      cur_list = data
      redirects[cur_list] = []
      say("Current list: #{cur_list}")
    when 'u'
      # store the redirect
      say("Redirect: #{data}")
      redirects[cur_list].push(data)
    when 'ad'
      say("Delete chunks: #{data}")
      delete_add_chunks(cur_list, expand_ranges(data))
    when 'sd'
      say("Don't report chunks: #{data}")
      delete_sub_chunks(cur_list, expand_ranges(data))
    else
      say("I don't know how to handle this!")
      say(fields.inspect)
    end
  end

  # handle the redirects
  say('Handling redirects...')
  redirects.each do |list, urls|
    say("Handling #{list} redirects...")
    urls.each_with_index do |url, index|
      say("Handling #{list} redirect #{index + 1} of #{urls.length}...")
      handle_redirect(list, url)
    end
  end
end
225
+
226
# Forgets the given add chunks of a list, including the hosts they added.
def delete_add_chunks(list, chunks)
  delete_chunks(list, 'add', chunks)
end

# Forgets the given sub chunks of a list.
def delete_sub_chunks(list, chunks)
  delete_chunks(list, 'sub', chunks)
end

# Removes chunk numbers from the "<list>:<type>_chunks" set.
#
# list   - list name
# type   - 'add' or 'sub'
# chunks - chunk numbers to remove
#
# For 'add' chunks, every host recorded under the chunk is also removed from
# the list's host index together with its stored prefixes.
def delete_chunks(list, type, chunks)
  chunks.each do |chunk|
    if type == 'add'
      chunk_key = "#{list}:chunk_#{chunk}"

      # delete each of the prefixes
      $redis.smembers(chunk_key).each do |host|
        $redis.del("#{list}:host_#{host}")
        $redis.srem("#{list}:hosts", host)
      end

      # delete the list of prefixes
      $redis.del(chunk_key)
    end

    # delete from our chunk list
    $redis.srem("#{list}:#{type}_chunks", chunk)
  end
end
252
+
253
# Returns the stored add chunk numbers of a list as a range string.
def get_add_chunks(list)
  get_chunks(list, "add")
end

# Returns the stored sub chunk numbers of a list as a range string.
def get_sub_chunks(list)
  get_chunks(list, "sub")
end

# Reads the "<list>:<type>_chunks" set from redis and converts it with
# convert_list_to_ranges.
def get_chunks(list, type)
  convert_list_to_ranges($redis.smembers("#{list}:#{type}_chunks"))
end
265
+
266
# Downloads a redirect url and applies every chunk found in the response.
#
# The response is a sequence of "type:chunk_num:hash_len:chunk_len" header
# lines, each followed by chunk_len bytes of encoded data. 'a' chunks add
# entries to the list, 's' chunks remove them.
def handle_redirect(list, url)
  stream = StringIO.new(http_post_request("http://#{url}"))

  while (header = stream.gets)
    fields = header.split(':')
    type = fields[0]
    chunk_num = fields[1].to_i
    hash_len = fields[2].to_i
    chunk_len = fields[3].to_i

    # the payload follows its header directly; a length of 0 reads nothing
    data = stream.read(chunk_len)

    case type
    when 'a'
      # store the chunk number in the add list, then add all its prefixes
      store_add_chunk(list, chunk_num)
      add_entries(list, chunk_num, read_add_data(hash_len, data))
    when 's'
      # store the chunk number in the sub list, then delete all its prefixes
      store_sub_chunk(list, chunk_num)
      sub_entries(list, chunk_num, read_sub_data(hash_len, data))
    else
      say "I don't know how to handle this!"
      say fields.inspect
    end
  end
end
310
+
311
# Stores entries read from an add chunk.
#
# For every entry the host is recorded under the chunk and in the list's host
# index, and the path prefix is recorded under the host.
def add_entries(list, chunk, entries)
  entries.each do |entry|
    host, path = entry.values_at('host', 'path')

    $redis.sadd("#{list}:chunk_#{chunk}", host)
    $redis.sadd("#{list}:host_#{host}", path)
    $redis.sadd("#{list}:hosts", host)
  end
end
318
+
319
# Removes entries read from a sub chunk.
#
# list    - list name
# chunk   - number of the sub chunk being applied (not used for any key:
#           "<list>:chunk_<n>" sets are keyed by add chunk numbers, and a sub
#           chunk number does not identify one of those)
# entries - hashes with 'host' and 'path'
#
# Each path prefix is removed from its host. The host itself is dropped from
# the list's host index only once it has no prefixes left, so the remaining
# prefixes of that host keep matching.
def sub_entries(list, chunk, entries)
  entries.each do |entry|
    host = entry['host']
    host_key = "#{list}:host_#{host}"

    $redis.srem(host_key, entry['path'])
    next unless $redis.scard(host_key) == 0

    $redis.srem("#{list}:hosts", host)
  end
end
326
+
327
# Remembers that an add chunk of the list has been received.
def store_add_chunk(list, chunk)
  store_chunk(list, 'add', chunk)
end

# Remembers that a sub chunk of the list has been received.
def store_sub_chunk(list, chunk)
  store_chunk(list, 'sub', chunk)
end

# Adds the chunk number to the "<list>:<type>_chunks" set in redis.
def store_chunk(list, type, chunk)
  chunk_set = "#{list}:#{type}_chunks"
  $redis.sadd(chunk_set, chunk)
end
338
+
339
# Decodes the data of an add chunk into a list of entries.
def read_add_data(hash_len, data)
  read_data(hash_len, data, false)
end

# Decodes the data of a sub chunk into a list of entries.
def read_sub_data(hash_len, data)
  read_data(hash_len, data, true)
end

# Reads a chunk of encoded data and converts it into a list of entries.
#
# hash_len - length in bytes of each prefix
# data     - binary chunk data
# sub      - true when the data belongs to a sub chunk
#
# The data is a sequence of records: a 4 byte host key, a 1 byte prefix count
# and then the prefixes. In sub chunks every prefix is preceded by a 4 byte
# add chunk number. A record with a count of 0 carries no prefix (sub chunks
# still carry one add chunk number); it is reported with the host key as its
# path, which is the value a lookup computes for the bare host.
#
# Returns an array of hashes of the form { 'host', 'path', 'chunk' }, all hex
# strings; 'chunk' is '' for add chunks.
# Raises ArgumentError when the data ends in the middle of a record.
def read_data(hash_len, data, sub)
  entry_list = []
  stream = StringIO.new(data.to_s)

  until stream.eof?
    host = read_hex_bytes(stream, 4)
    count = read_hex_bytes(stream, 1).hex

    if count == 0
      chunk = sub ? read_hex_bytes(stream, 4) : ''
      entry_list.push('host' => host, 'path' => host, 'chunk' => chunk)
      next
    end

    count.times do
      # the add chunk number comes before the prefix it applies to
      chunk = sub ? read_hex_bytes(stream, 4) : ''
      prefix = read_hex_bytes(stream, hash_len)
      entry_list.push('host' => host, 'path' => prefix, 'chunk' => chunk)
    end
  end

  entry_list
end

# Reads exactly +length+ bytes from +stream+ and returns them as a hex string.
# Raises ArgumentError when fewer bytes are available.
def read_hex_bytes(stream, length)
  bytes = stream.read(length)
  if bytes.nil? || bytes.bytesize < length
    raise ArgumentError, 'truncated chunk data'
  end

  bytes.unpack('H*')[0]
end
381
+
382
# Transforms "1-2,4-6,8" into [1,2,4,5,6,8].
#
# ranges - comma separated chunk numbers and "first-last" ranges
#
# Returns an array of integers; empty for nil or an empty string.
def expand_ranges(ranges)
  ranges.to_s.split(',').flat_map do |range|
    first, last = range.split('-', 2)

    if first.nil? || first.strip.empty?
      # stray comma: nothing to add
      []
    elsif last
      (first.to_i..last.to_i).to_a
    else
      [first.to_i]
    end
  end
end
401
+
402
# Transforms [1,2,4,5,6,8] into "1-2,4-6,8".
#
# list - chunk numbers as integers or numeric strings, in any order;
#        duplicates are ignored
#
# Returns a comma separated string in which runs of consecutive numbers are
# written as "first-last" and isolated numbers on their own; '' for an empty
# list.
def convert_list_to_ranges(list)
  spans = []
  list.map { |item| item.to_i }.sort.uniq.each do |number|
    if spans.empty? || spans.last[1] != number - 1
      spans.push([number, number])
    else
      # extend the current run
      spans.last[1] = number
    end
  end

  spans.map { |first, last| first == last ? first.to_s : "#{first}-#{last}" }.join(',')
end
414
+
415
# Makes a request to the google safe browsing api v2.
#
# function - name of the api function, appended to the base url
# body     - optional request body
#
# Returns whatever http_post_request returns for the assembled url.
def api_request(function, body = nil)
  query = "?client=api&apikey=#{$api_key}&appver=#{$appver}&pver=#{$pver}"
  http_post_request("http://safebrowsing.clients.google.com/safebrowsing/#{function}#{query}", body)
end
421
+
422
# Makes an http post request and returns the response body.
#
# url  - absolute http or https url
# body - request body; an empty body is sent when nil
def http_post_request(url, body = nil)
  uri = URI.parse(url)

  http = Net::HTTP.new(uri.host, uri.port)
  # without this an https url would be spoken to in plain text
  http.use_ssl = true if uri.scheme == 'https'

  request = Net::HTTP::Post.new(uri.request_uri)
  request.body = body || ''

  http.request(request).body
end
431
+
432
# Prints msg prefixed with the current UTC time, but only when $debug is set.
def say(msg)
  puts "#{Time.now.utc}: #{msg}" if $debug
end
437
+ end
metadata ADDED
@@ -0,0 +1,46 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: google_safe_browsing_redis
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Brad Jewell
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-06-20 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: A ruby implementation of the Google Safe Browsing API v2 that uses Redis
15
+ email: brad811@gmail.com
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/canonicalize.rb
21
+ - lib/google_safe_browsing.rb
22
+ homepage: https://github.com/brad811/GoogleSafeBrowsing
23
+ licenses: []
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - ! '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ none: false
36
+ requirements:
37
+ - - ! '>='
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ requirements: []
41
+ rubyforge_project:
42
+ rubygems_version: 1.8.23
43
+ signing_key:
44
+ specification_version: 3
45
+ summary: A ruby implementation of the Google Safe Browsing API v2 that uses Redis
46
+ test_files: []