RubyGems - google_safe_browsing_redis - Versions diffs - 0.0.2 - Mend

google_safe_browsing_redis 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

data/lib/canonicalize.rb +99 -0
data/lib/google_safe_browsing.rb +437 -0
metadata +46 -0

data/lib/canonicalize.rb ADDED Viewed

@@ -0,0 +1,99 @@
+require 'ip'
+require 'uri'
+class Canonicalize
+	# Canonicalizes a URL by applying, in order, the steps described by the
+	# comments inside this method, and splits the result into its components.
+	#
+	# url - the URL string to canonicalize; when it contains no "://" the
+	#       protocol "http://" is assumed. The argument itself is not modified.
+	#
+	# Returns two values: the canonical URL string, and a hash with the string
+	# keys 'protocol', 'host', 'path' and 'query' holding its parts.
+	#
+	# NOTE(review): URI.unescape and URI.escape were removed from Ruby's uri
+	# library in Ruby 3.0 - confirm which Ruby versions this must run on.
+	def self.canonicalize(url)
+		# Work on a stripped copy so the caller's string is left alone (strip! would
+		# modify it in place and raise on a frozen string)
+		url = url.strip
+		# Remove any tab (0x09), CR (0x0d), and LF (0x0a) characters from the URL
+		url = url.gsub("\t",'').gsub("\n",'').gsub("\r",'')
+		# If the URL ends in a fragment, the fragment should be removed
+		url = url.split('#')[0]
+		# Repeatedly URL-unescape the URL until it has no more hex-encodings
+		while(url != URI.unescape(url))
+			url = URI.unescape(url)
+		end
+		# Extract the hostname from the URL
+		protocol = url.split('://')[0]
+		if(protocol == nil || !url.include?('://'))
+			protocol = "http://"
+			host = url.split('/')[0]
+			path = url.sub(host, '')
+		else
+			protocol += "://"
+			host = url.sub(protocol, '').split('/')[0]
+			path = url.sub(protocol, '').sub(host, '')
+		end
+		# Split the query string (from the first "?" to the end) off the path
+		query = ''
+		if(path.include?('?'))
+			query = path[path.index('?')..-1]
+			path = path.sub(query, '')
+		end
+		# Remove all leading and trailing dots
+		host.gsub!(/\A\.+|\.+\Z/, '')
+		# Replace consecutive dots with a single dot
+		host.gsub!(/\.+/, '.')
+		# If the hostname can be parsed as an IP address, it should be normalized to 4 dot-separated decimal values.
+		# The client should handle any legal IP- address encoding, including octal, hex, and fewer than 4 components.
+		# Only a host made up entirely of decimal digits is converted by the check below.
+		if(host.match(/^\d+$/))
+			host = IP::V4.new(host.to_i).to_addr
+		end
+		# Lowercase the whole string
+		protocol.downcase!
+		host.downcase!
+		# The sequences "/../" and "/./" in the path should be resolved,
+		# by replacing "/./" with "/", and removing "/../" along with the preceding path component.
+		path = path.gsub('/./', '/')
+		# split drops a trailing slash, so remember whether one has to be put back
+		trailing = path[-1..-1] == '/'
+		path_parts = path.split('/')
+		path = []
+		path_parts.each do |part|
+			if(part == '..')
+				path.pop
+			else
+				path.push(part)
+			end
+		end
+		path = path.join('/')
+		if(path == '' || trailing)
+			path += '/'
+		end
+		# Runs of consecutive slashes should be replaced with a single slash character
+		path.gsub!(/\/+/, '/')
+		# After performing these steps, percent-escape all characters in the URL which are <= ASCII 32, >= 127, "#", or "%".
+		# The escapes should use uppercase hex characters.
+		# The escaped form of "^" is turned back into a literal "^".
+		protocol = URI.escape(protocol).gsub('%5E', '^')
+		host = URI.escape(host).gsub('%5E', '^')
+		path = URI.escape(path).gsub('%5E', '^')
+		query = URI.escape(query).gsub('%5E', '^')
+		# Credentials and port are stripped last, after the host has been normalized and escaped
+		host = remove_user_password_and_port(host)
+		url = protocol + host + path + query
+		return url, { 'protocol' => protocol, 'host' => host, 'path' => path, 'query' => query }
+	end
+	# Strips "user:password@" credentials and a ":port" suffix from a host string.
+	#
+	# host - the authority part of a URL, e.g. "user:pw@example.com:8080"
+	#
+	# Returns what lies between the last "@" (if any) and the first ":" after it
+	# (if any). A string that is only credentials or only a port yields "".
+	#
+	# NOTE(review): a bracketed IPv6 host such as "[::1]" is cut at its first
+	# colon - confirm whether IPv6 literals have to be supported here.
+	def self.remove_user_password_and_port(host)
+		# everything up to and including the last "@" is credentials
+		at = host.rindex('@')
+		host = host[(at + 1)..-1] if at
+		# everything from the first ":" on is the port
+		colon = host.index(':')
+		host = host[0...colon] if colon
+		return host
+	end
+end

data/lib/google_safe_browsing.rb ADDED Viewed

@@ -0,0 +1,437 @@
+require 'digest/sha2'
+require 'ip'
+require 'net/http'
+require 'redis'
+require 'resolv'
+require 'stringio'
+require 'uri'
+require_relative './canonicalize'
+class GoogleSafeBrowsing
+	# NOTE: the settings below are Ruby global variables, so they are shared by
+	# every GoogleSafeBrowsing instance in the process; initialize overwrites
+	# $api_key, $redis and $debug each time an instance is created
+	$api_key = ''
+	$redis = nil
+	$debug = false
+	# sent as the appver and pver query parameters by api_request
+	$appver = '0.1'
+	$pver = '2.2'
+	# the lists we care about
+	$lists = ["goog-malware-shavar", "googpub-phish-shavar"]
+	# class-level instance variable; no method in this file reads it
+	@delay = Time.now
+	# set the api key and redis object
+	# api_key - stored in $api_key, which api_request puts into every request url
+	# options - optional hash:
+	#           :redis - the client stored in $redis (Redis.new when absent)
+	#           :debug - truthy to make say print its messages (false when absent)
+	def initialize(api_key, options = {})
+		redis, debug = options.values_at(:redis, :debug)
+		$api_key = api_key
+		$redis = redis || Redis.new
+		$debug = debug || false
+	end
+	# request data from google's servers
+	# Logs and returns early while the "delay" key is still present in Redis;
+	# otherwise fetches the available lists and hands the ones that are also in
+	# $lists to get_data.
+	def update()
+		say('Updating...')
+		# a value under "delay" means we were told to wait before polling again
+		delay = $redis.get("delay")
+		unless(delay.nil? || delay == '')
+			say("Error: must wait #{delay.to_i - Time.now.to_i} more seconds before updating! (#{delay})")
+			return
+		end
+		# what the server offers us
+		available_lists = get_lists()
+		say("Available lists: #{available_lists.inspect}")
+		# keep only the lists that are both offered and wanted
+		wanted = available_lists & $lists
+		get_data(wanted)
+	end
+	# perform a lookup on a url
+	# url - the url to check; it is canonicalized first
+	# returns the name of the first list in $lists that the url matches,
+	# or '' when it matches none
+	def lookup(url)
+		say("Checking url: #{url}")
+		url, parts = Canonicalize::canonicalize(url)
+		hosts, paths = get_possible_hosts_paths(parts)
+		# hash prefixes of every host+path combination
+		combinations = hosts.product(paths).collect{|host, path| host + path}
+		hostpaths = get_hash_prefixes(combinations)
+		# hash prefixes of every host with a trailing slash appended
+		hosts = get_hash_prefixes(hosts.collect{|host| host + '/'})
+		$lists.each do |list|
+			hosts.each do |host|
+				next unless $redis.sismember("#{list}:hosts", host)
+				suffixes = $redis.smembers("#{list}:host_#{host}")
+				# a listed host with no stored suffixes matches outright; otherwise
+				# one of its suffixes has to equal one of our host+path prefixes
+				next unless suffixes.empty? || !(suffixes & hostpaths).empty?
+				say("URL matches a list: #{list} (#{url})")
+				return list
+			end
+		end
+		say("URL does not match any lists (#{url})")
+		return ''
+	end
+	# convert an array of strings into an array of 32 bit hash prefixes
+	# items - array of strings
+	# returns an array, in the same order, holding the first 8 hex characters
+	# (32 bits) of each string's Digest::SHA2 digest
+	def get_hash_prefixes(items)
+		return items.map { |item| Digest::SHA2.new.update(item).hexdigest[0..7] }
+	end
+	# expand a url into its possible host-path combinations according to the Google API
+	# parts - hash with the string keys 'host', 'path' and 'query', as returned
+	#         in the second value of Canonicalize.canonicalize
+	# returns two arrays: the candidate hosts (exact host first, then the longer
+	# suffixes before the shorter ones) and the candidate paths (exact path with
+	# query, exact path, then the paths built up from the root)
+	def get_possible_hosts_paths(parts)
+		case parts['host']
+		when Resolv::IPv4::Regex, Resolv::IPv6::Regex
+			ip = true
+		else
+			ip = false
+		end
+		# For the hostname, the client will try at most 5 different strings. They are:
+		# - the exact hostname in the url
+		# - up to 4 hostnames formed by starting with the last 5 components and successively removing the leading component.
+		#   The top-level domain can be skipped. These additional hostnames should not be checked if the host is an IP address.
+		possible_hosts = []
+		if(!ip)
+			host = parts['host'].split('.')
+			# collected shortest first (last 2 components, then 3, ...); reversed below
+			[host.length - 2, 4].min.times do |i|
+				possible_hosts.push(host[host.length-2-i..-1].join('.'))
+			end
+		end
+		possible_hosts.push(parts['host'])
+		possible_hosts.reverse!
+		# For the path, the client will also try at most 6 different strings. They are:
+		# - the exact path of the url, including query parameters
+		# - the exact path of the url, without query parameters
+		# - the 4 paths formed by starting at the root (/) and successively appending path components, including a trailing slash.
+		possible_paths = []
+		if(parts['query'] != '')
+			possible_paths.push(parts['path'] + parts['query'])
+		end
+		possible_paths.push(parts['path'])
+		path = parts['path'].split('/')
+		# the cap is 4 so that no more than the 4 root-anchored paths described above are produced;
+		# the last component is left out because the exact path was already added
+		[path.length - 1, 4].min.times do |i|
+			possible_path = path[0..i].join('/')
+			if(possible_path == '' || i < path.length - 1)
+				possible_path += '/'
+			end
+			possible_paths.push(possible_path)
+		end
+		return possible_hosts, possible_paths
+	end
+	# returns available lists as an array
+	# asks the api for its "list" function and splits the reply into one
+	# array element per line
+	def get_lists()
+		return api_request("list").split("\n")
+	end
+	# performs a request for data from Google, and parses the response
+	# lists - names of the lists to request data for
+	#
+	# Builds one request line per list from the add and sub chunk ranges already
+	# stored, posts it to the "downloads" function, then walks the reply line by
+	# line. Each line is "type:value":
+	#   n  - seconds to wait before polling again (stored in Redis under "delay")
+	#   i  - name of the list the following lines belong to
+	#   u  - a redirect url holding chunk data for the current list
+	#   ad - add chunks to delete
+	#   sd - sub chunks to delete
+	# Any other line is only logged. The collected redirects are fetched once the
+	# whole reply has been read.
+	def get_data(lists)
+		say('Getting data...')
+		# build the request
+		request_body = ''
+		lists.each do |list|
+			request_body += "#{list};"
+			# append a:1,2,3,4,5,8
+			add = get_add_chunks(list)
+			have_add = (add != '' && add != nil)
+			if(have_add)
+				request_body += "a:#{add}"
+			end
+			# append [:]s:6,7,9,11
+			sub = get_sub_chunks(list)
+			if(sub != '' && sub != nil)
+				if(have_add)
+					request_body += ":"
+				end
+				request_body += "s:#{sub}"
+			end
+			request_body += "\n"
+		end
+		# to_s keeps a missing (nil) response body from raising
+		response = api_request("downloads", request_body).to_s.split("\n")
+		# parse the response
+		say('Handling response...')
+		cur_list = ''
+		redirects = {}
+		response.each do |line|
+			# split on the first colon only, so a value that itself contains a
+			# colon (for example a redirect url with a port) is kept whole
+			fields = line.split(':', 2)
+			type = fields[0]
+			data = fields[1]
+			if(type == 'n')
+				# set the next allowed time to poll
+				delay = Time.now + data.to_i
+				say("Time until next request: #{data}")
+				$redis.setex("delay", data.to_i, delay.to_i)
+			elsif(type == 'i')
+				# set the current list
+				cur_list = data
+				redirects[cur_list] = []
+				say("Current list: #{cur_list}")
+			elsif(type == 'u')
+				# store the redirect; ||= covers a redirect that arrives before any list line
+				say("Redirect: #{data}")
+				(redirects[cur_list] ||= []).push(data)
+			elsif(type == 'ad')
+				say("Delete chunks: #{data}")
+				chunks = expand_ranges(data)
+				delete_add_chunks(cur_list, chunks)
+			elsif(type == 'sd')
+				say("Don't report chunks: #{data}")
+				chunks = expand_ranges(data)
+				delete_sub_chunks(cur_list, chunks)
+			else
+				say("I don't know how to handle this!")
+				say(fields.inspect)
+			end
+		end
+		# handle the redirects
+		say('Handling redirects...')
+		redirects.each do |list, urls|
+			say("Handling #{list} redirects...")
+			urls.each_with_index do |url, index|
+				say("Handling #{list} redirect #{index + 1} of #{urls.length}...")
+				handle_redirect(list, url)
+			end
+		end
+	end
+	# deletes the given add chunks of a list (delete_chunks with type 'add')
+	def delete_add_chunks(list, chunks)
+		delete_chunks(list, 'add', chunks)
+	end
+	# deletes the given sub chunks of a list (delete_chunks with type 'sub')
+	def delete_sub_chunks(list, chunks)
+		delete_chunks(list, 'sub', chunks)
+	end
+	# Forgets chunks of one type for a list.
+	#
+	# list   - list name, used as the prefix of every Redis key
+	# type   - 'add' or 'sub'
+	# chunks - chunk numbers to delete
+	#
+	# For an 'add' chunk, every host key recorded under "<list>:chunk_<n>" has
+	# its "<list>:host_<key>" set deleted and is removed from "<list>:hosts",
+	# and the chunk's own set is deleted. For both types the chunk number is
+	# removed from "<list>:<type>_chunks".
+	#
+	# NOTE(review): the whole "<list>:host_<key>" set is dropped, including any
+	# entries a different chunk may have added for the same host key - confirm
+	# that this is intended.
+	def delete_chunks(list, type, chunks)
+		chunks.each do |chunk|
+			if(type == 'add')
+				# delete each of the prefixes
+				hosts = $redis.smembers("#{list}:chunk_#{chunk}")
+				hosts.each do |host|
+					$redis.del("#{list}:host_#{host}")
+					$redis.srem("#{list}:hosts", host)
+				end
+				# delete the list of prefixes
+				$redis.del("#{list}:chunk_#{chunk}")
+			end
+			# delete from our chunk list
+			$redis.srem("#{list}:#{type}_chunks", chunk)
+		end
+	end
+	# the stored add chunk numbers of a list as a range string (get_chunks with type "add")
+	def get_add_chunks(list)
+		get_chunks(list, "add")
+	end
+	# the stored sub chunk numbers of a list as a range string (get_chunks with type "sub")
+	def get_sub_chunks(list)
+		get_chunks(list, "sub")
+	end
+	# reads the members of the Redis set "<list>:<type>_chunks" and returns
+	# them in the range notation produced by convert_list_to_ranges
+	def get_chunks(list, type)
+		stored = $redis.smembers("#{list}:#{type}_chunks")
+		convert_list_to_ranges(stored)
+	end
+	# reads and parses the encoded data from a redirect url
+	# list - name of the list the redirect belongs to
+	# url  - redirect location without a scheme; "http://" is put in front of it
+	#
+	# The response is a sequence of chunks. Each chunk starts with a header line
+	# "type:chunk_num:hash_len:chunk_len" and is followed by chunk_len bytes of
+	# encoded data. Type 'a' chunks are recorded as add chunks and their entries
+	# stored; type 's' chunks are recorded as sub chunks and their entries
+	# removed. Anything else is only logged.
+	#
+	# TODO: chunks with chunk_len == 0 get no special treatment; they are
+	# recorded and yield no entries.
+	def handle_redirect(list, url)
+		response = http_post_request("http://#{url}")
+		# to_s keeps a missing (nil) response body from raising
+		response = StringIO.new(response.to_s)
+		while(line = response.gets)
+			header = line.split(':')
+			type = header[0]
+			chunk_num = header[1].to_i
+			hash_len = header[2].to_i
+			chunk_len = header[3].to_i
+			# read returns nil when the input ends before the payload does;
+			# treat that like an empty payload
+			data = response.read(chunk_len) || ''
+			if(type == 'a')
+				# store the chunk number in the add list
+				store_add_chunk(list, chunk_num)
+				entry_list = read_add_data(hash_len, data)
+				# add all these prefixes
+				add_entries(list, chunk_num, entry_list)
+			elsif(type == 's')
+				# store the chunk number in the sub list
+				store_sub_chunk(list, chunk_num)
+				entry_list = read_sub_data(hash_len, data)
+				# delete all these prefixes
+				sub_entries(list, chunk_num, entry_list)
+			else
+				say "I don't know how to handle this!"
+				say header.inspect
+			end
+		end
+	end
+	# list    - list name, used as the prefix of every Redis key
+	# chunk   - number of the add chunk the entries came from
+	# entries - hashes with the keys 'host' and 'path'
+	#
+	# For each entry: remembers the host key under the chunk, adds the path
+	# prefix to the host key's set, and adds the host key to the list's hosts.
+	def add_entries(list, chunk, entries)
+		chunk_key = "#{list}:chunk_#{chunk}"
+		hosts_key = "#{list}:hosts"
+		entries.each do |entry|
+			host = entry['host']
+			$redis.sadd(chunk_key, host)
+			$redis.sadd("#{list}:host_#{host}", entry['path'])
+			$redis.sadd(hosts_key, host)
+		end
+	end
+	# list    - list name, used as the prefix of every Redis key
+	# chunk   - number of the sub chunk the entries came from
+	# entries - hashes with the keys 'host' and 'path'
+	#
+	# For each entry: removes the host key from the set of the given chunk,
+	# removes the path prefix from the host key's set, and removes the host key
+	# from the list's hosts.
+	#
+	# NOTE(review): the host key leaves "<list>:hosts" even when other path
+	# prefixes may remain stored for it, and the chunk set addressed uses the
+	# sub chunk number rather than entry['chunk'] - confirm both are intended.
+	def sub_entries(list, chunk, entries)
+		chunk_key = "#{list}:chunk_#{chunk}"
+		hosts_key = "#{list}:hosts"
+		entries.each do |entry|
+			host = entry['host']
+			$redis.srem(chunk_key, host)
+			$redis.srem("#{list}:host_#{host}", entry['path'])
+			$redis.srem(hosts_key, host)
+		end
+	end
+	# records an add chunk number for a list (store_chunk with type 'add')
+	def store_add_chunk(list, chunk)
+		store_chunk(list, 'add', chunk)
+	end
+	# records a sub chunk number for a list (store_chunk with type 'sub')
+	def store_sub_chunk(list, chunk)
+		store_chunk(list, 'sub', chunk)
+	end
+	# adds a chunk number to the Redis set "<list>:<type>_chunks"
+	def store_chunk(list, type, chunk)
+		key = "#{list}:#{type}_chunks"
+		$redis.sadd(key, chunk)
+	end
+	# decodes the data of an add chunk (read_data with sub = false)
+	def read_add_data(hash_len, data)
+		read_data(hash_len, data, false)
+	end
+	# decodes the data of a sub chunk (read_data with sub = true)
+	def read_sub_data(hash_len, data)
+		read_data(hash_len, data, true)
+	end
+	# reads a chunk of encoded data and converts it into a list of entries
+	# hash_len - number of bytes in each path prefix
+	# data     - the raw bytes of one chunk
+	# sub      - true when the data is a sub chunk, which carries add chunk numbers
+	#
+	# The data is read as repeated records: a 4 byte host key, a 1 byte count,
+	# (sub only) a 4 byte add chunk number, then count path prefixes of hash_len
+	# bytes each. All values are returned as lowercase hex strings.
+	#
+	# NOTE(review): a record whose count is 0 produces no entry at all, so its
+	# host key is never stored - the protocol presumably means "the whole host"
+	# in that case; confirm against the spec.
+	# NOTE(review): data that ends in the middle of a record makes read return
+	# nil and the following unpack raise NoMethodError.
+	def read_data(hash_len, data, sub)
+		# returns an array of hashes of the form: { host, path, chunk }
+		entry_list = []
+		# stays "" for add data, where no add chunk number is read
+		addchunknum = ""
+		data = StringIO.new(data)
+		# read returns nil once the data is used up, which ends the loop
+		while(hostkey = data.read(4))
+			hostkey = hostkey.unpack("H*")[0]
+			count = data.read(1).unpack("H*")[0].hex # or .to_i(16)
+			if(sub)
+				addchunknum = data.read(4).unpack("H*")[0]
+			end
+			# If count > 1, it will be prefix-chunk until the last one, which will be just prefix
+			count.times do |i|
+				entry = {}
+				entry['host'] = hostkey
+				path_prefix = data.read(hash_len).unpack("H*")[0]
+				entry['path'] = path_prefix
+				# NOTE(review): the 4 bytes read here follow this entry's prefix, and
+				# the last entry gets the number read before the first prefix - verify
+				# that each entry ends up with the chunk number that belongs to it
+				if(sub && count > 1 && i != count-1)
+					entry['chunk'] = data.read(4).unpack("H*")[0]
+				else
+					entry['chunk'] = addchunknum
+				end
+				entry_list.push(entry)
+			end
+		end
+		return entry_list
+	end
+	# transforms "1-2,4-6,8" into [1,2,4,5,6,8]
+	# ranges - comma separated chunk numbers and inclusive "a-b" ranges;
+	#          nil or an empty string gives an empty array
+	# returns an array of integers, in the order given
+	def expand_ranges(ranges)
+		result = []
+		ranges.to_s.split(',').each do |range|
+			if(range.include? '-')
+				first, last = range.split('-')
+				# (first..last) walks every number in the range; [first..last] would
+				# be a one element array holding the Range object itself
+				result.concat((first.to_i..last.to_i).to_a)
+			else
+				# to_i so that single numbers are integers like the expanded ones
+				result.push(range.to_i)
+			end
+		end
+		return result
+	end
+	# transforms [1,2,4,5,6,8] into "1-2,4-6,8"
+	# list - chunk numbers as strings or integers, in any order; duplicates are ignored
+	# returns the numbers in ascending order as a comma separated string in which
+	# runs of consecutive numbers are written "first-last" and a number standing
+	# alone is written by itself; an empty list gives ""
+	def convert_list_to_ranges(list)
+		numbers = list.collect{|s| s.to_i}.sort.uniq
+		# build [first, last] pairs, extending the last pair while numbers are consecutive
+		spans = numbers.inject([]) do |acc, n|
+			if acc.empty? || acc.last[1] != n - 1
+				acc << [n, n]
+			else
+				acc.last[1] = n
+				acc
+			end
+		end
+		return spans.collect{|first, last| first == last ? first.to_s : "#{first}-#{last}"}.join(',')
+	end
+	# makes a request to the google safe browsing api v2
+	# function - name of the api function, appended to the base url
+	# body     - optional request body, handed on to http_post_request
+	# returns whatever http_post_request returns
+	def api_request(function, body = nil)
+		url = "http://safebrowsing.clients.google.com/safebrowsing/#{function}" +
+			"?client=api&apikey=#{$api_key}&appver=#{$appver}&pver=#{$pver}"
+		http_post_request(url, body)
+	end
+	# makes an http post request with the given body (empty when none is given) and returns the response body
+	# url  - full url to post to; "https" urls are sent over TLS
+	# body - request body; nil is sent as an empty body
+	# returns the body of the response
+	def http_post_request(url, body = nil)
+		uri = URI.parse(url)
+		http = Net::HTTP.new(uri.host, uri.port)
+		# without this an https url would be spoken to in plain http
+		http.use_ssl = (uri.scheme == 'https')
+		request = Net::HTTP::Post.new(uri.request_uri)
+		request.body = body || ''
+		return http.request(request).body
+	end
+	# prints msg with a utc timestamp in front, but only while $debug is set
+	def say(msg)
+		puts "#{Time.now.utc}: #{msg}" if $debug
+	end
+end

metadata ADDED Viewed

@@ -0,0 +1,46 @@
+--- !ruby/object:Gem::Specification
+name: google_safe_browsing_redis
+version: !ruby/object:Gem::Version
+  version: 0.0.2
+  prerelease:
+platform: ruby
+authors:
+- Brad Jewell
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-06-20 00:00:00.000000000 Z
+dependencies: []
+description: A ruby implementation of the Google Safe Browsing API v2 that uses Redis
+email: brad811@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/canonicalize.rb
+- lib/google_safe_browsing.rb
+homepage: https://github.com/brad811/GoogleSafeBrowsing
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.23
+signing_key:
+specification_version: 3
+summary: A ruby implementation of the Google Safe Browsing API v2 that uses Redis
+test_files: []