webrisk_hash 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +159 -0
- data/Rakefile +8 -0
- data/lib/webrisk_hash/canonicalizer.rb +235 -0
- data/lib/webrisk_hash/hash.rb +33 -0
- data/lib/webrisk_hash/prefixes.rb +58 -0
- data/lib/webrisk_hash/suffixes.rb +134 -0
- data/lib/webrisk_hash/version.rb +5 -0
- data/lib/webrisk_hash.rb +19 -0
- data/sig/webrisk_hash.rbs +4 -0
- metadata +68 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 8631f253b88cecc677f4bc8123206df7a64a965525427c2222dbfecebf24343c
|
|
4
|
+
data.tar.gz: fb2c962b83d841558efe64c520c128d028e8e2f647069c73739ccb4c2a4c77cb
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: cc355aa0482853a46d86d08e5d6b6495b7ef9cbe12aaa343c24baf2cb9a18ce0bb81be30aeb98f1cbfdd9d1202764d39a8de42fdb3d4d9c659ff118bcc5f928a
|
|
7
|
+
data.tar.gz: dbe039e198022dffabcc59b2e854cd42050464a8c0c2941d96806eb201b1f0e4055efe363ac345d8094e6fd33959f46303f613fc379954d779c9c24972ca2272
|
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 AADS
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
# WebriskHash
|
|
2
|
+
|
|
3
|
+
This Ruby gem implements the URL hashing and canonicalization algorithm described in
|
|
4
|
+
the Google Web Risk documentation: https://cloud.google.com/web-risk/docs/urls-hashing
|
|
5
|
+
|
|
6
|
+
## Overview
|
|
7
|
+
|
|
8
|
+
The Web Risk API uses URL hashing to check URLs against threat lists. This gem implements
|
|
9
|
+
the complete hashing process:
|
|
10
|
+
|
|
11
|
+
1. **Canonicalization** - Normalize URLs by removing fragments, resolving percent-encoding,
|
|
12
|
+
normalizing IP addresses, and more
|
|
13
|
+
2. **Suffix/Prefix Generation** - Create up to 30 host/path combinations from each URL
|
|
14
|
+
3. **Hash Computation** - Generate SHA256 hashes for each combination
|
|
15
|
+
4. **Prefix Extraction** - Extract hash prefixes (4-32 bytes) for efficient lookup
|
|
16
|
+
|
|
17
|
+
## Features
|
|
18
|
+
|
|
19
|
+
- **URL Canonicalization** (`WebriskHash.canonicalize`)
|
|
20
|
+
- Removes tab (0x09), CR (0x0d), LF (0x0a) characters
|
|
21
|
+
- Removes URL fragments
|
|
22
|
+
- Repeatedly percent-unescapes URLs until no more escapes remain
|
|
23
|
+
- Normalizes IP addresses (decimal, hex, octal, and dotted variants)
|
|
24
|
+
- Converts Internationalized Domain Names (IDN) to ASCII (Punycode)
|
|
25
|
+
- Resolves path segments (/../ and /./)
|
|
26
|
+
- Collapses consecutive slashes in paths
|
|
27
|
+
- Removes default ports (80 for HTTP, 443 for HTTPS)
|
|
28
|
+
- Percent-escapes characters <= ASCII 32, >= 127, #, and %
|
|
29
|
+
- Preserves query parameters without path canonicalization
|
|
30
|
+
|
|
31
|
+
- **Suffix/Prefix Expression Generation** (`WebriskHash.suffix_postfix_expressions`)
|
|
32
|
+
- Up to 5 host suffix variations (exact hostname + up to 4 from last 5 components)
|
|
33
|
+
- Up to 6 path prefix variations (with/without query + progressive paths)
|
|
34
|
+
- Combines to create up to 30 expressions per URL
|
|
35
|
+
- IP addresses use only exact hostname (no suffix variations)
|
|
36
|
+
- Handles query parameters correctly
|
|
37
|
+
|
|
38
|
+
- **SHA256 Hash Prefixes** (`WebriskHash.get_prefixes`, `WebriskHash.get_prefix_map`)
|
|
39
|
+
- FIPS-180-2 compliant SHA256 hashing
|
|
40
|
+
- Configurable prefix lengths (4-32 bytes = 32-256 bits)
|
|
41
|
+
- Hash prefix extraction from most significant bytes
|
|
42
|
+
|
|
43
|
+
## Installation
|
|
44
|
+
|
|
45
|
+
Add to your `Gemfile`:
|
|
46
|
+
|
|
47
|
+
```ruby
|
|
48
|
+
gem "webrisk_hash"
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Then install it with Bundler:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
bundle install
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Usage Examples
|
|
58
|
+
|
|
59
|
+
### Basic Usage
|
|
60
|
+
|
|
61
|
+
```ruby
|
|
62
|
+
require "webrisk_hash"
|
|
63
|
+
|
|
64
|
+
url = "http://example.com/path/to/page?query=1"
|
|
65
|
+
|
|
66
|
+
# Step 1: Canonicalize the URL
|
|
67
|
+
canonical = WebriskHash.canonicalize(url)
|
|
68
|
+
# => "http://example.com/path/to/page?query=1"
|
|
69
|
+
|
|
70
|
+
# Step 2: Get hash prefixes (default 256 bits = 32 bytes)
|
|
71
|
+
prefixes = WebriskHash.get_prefixes(url)
|
|
72
|
+
# => #<Set: {"\xAB\xCD...(32 bytes)", ...}>
|
|
73
|
+
|
|
74
|
+
# Display as hex
|
|
75
|
+
puts prefixes.to_a.map { |p| p.unpack1("H*") }
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Working with Suffix/Prefix Expressions
|
|
79
|
+
|
|
80
|
+
```ruby
|
|
81
|
+
# Generate all suffix/prefix expressions for a URL
|
|
82
|
+
canonical = WebriskHash.canonicalize("http://a.b.c/1/2.html?param=1")
|
|
83
|
+
expressions = WebriskHash.suffix_postfix_expressions(canonical)
|
|
84
|
+
|
|
85
|
+
puts "Total expressions: #{expressions.size}"
|
|
86
|
+
expressions.each { |expr| puts " #{expr}" }
|
|
87
|
+
|
|
88
|
+
# Output:
|
|
89
|
+
# Total expressions: 8
|
|
90
|
+
# a.b.c/1/2.html?param=1
|
|
91
|
+
# a.b.c/1/2.html
|
|
92
|
+
# a.b.c/1/
|
|
93
|
+
# a.b.c/
|
|
94
|
+
# b.c/1/2.html?param=1
|
|
95
|
+
# b.c/1/2.html
|
|
96
|
+
# b.c/1/
|
|
97
|
+
# b.c/
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Getting Hash Prefixes with Custom Length
|
|
101
|
+
|
|
102
|
+
```ruby
|
|
103
|
+
# Get 32-bit (4 byte) hash prefixes instead of full 256-bit hashes
|
|
104
|
+
prefixes_32bit = WebriskHash.get_prefixes(url, 32)
|
|
105
|
+
|
|
106
|
+
# Get detailed mapping of expressions to hashes
|
|
107
|
+
map = WebriskHash.get_prefix_map(url, 32)
|
|
108
|
+
map.each do |expression, prefix_bin|
|
|
109
|
+
hex_prefix = prefix_bin.unpack1('H*')
|
|
110
|
+
puts "#{expression} -> #{hex_prefix}"
|
|
111
|
+
end
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### Canonicalization Examples
|
|
115
|
+
|
|
116
|
+
```ruby
|
|
117
|
+
# Remove control characters
|
|
118
|
+
WebriskHash.canonicalize("http://google.com/foo\tbar\rbaz\n2")
|
|
119
|
+
# => "http://google.com/foobarbaz2"
|
|
120
|
+
|
|
121
|
+
# Percent-unescape repeatedly
|
|
122
|
+
WebriskHash.canonicalize("http://host/%25%32%35")
|
|
123
|
+
# => "http://host/%25"
|
|
124
|
+
|
|
125
|
+
# Normalize IP addresses
|
|
126
|
+
WebriskHash.canonicalize("http://3279880203/blah")
|
|
127
|
+
# => "http://195.127.0.11/blah"
|
|
128
|
+
|
|
129
|
+
# Resolve path segments
|
|
130
|
+
WebriskHash.canonicalize("http://google.com/blah/..")
|
|
131
|
+
# => "http://google.com/"
|
|
132
|
+
|
|
133
|
+
# Remove fragments
|
|
134
|
+
WebriskHash.canonicalize("http://evil.com/foo#bar")
|
|
135
|
+
# => "http://evil.com/foo"
|
|
136
|
+
|
|
137
|
+
# Lowercase hostname
|
|
138
|
+
WebriskHash.canonicalize("http://www.GOOgle.com/")
|
|
139
|
+
# => "http://www.google.com/"
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Development
|
|
143
|
+
|
|
144
|
+
Install dependencies and run tests:
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
bundle install
|
|
148
|
+
rake spec
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
For interactive debugging use:
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
bin/console
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## License
|
|
158
|
+
|
|
159
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module WebriskHash
|
|
4
|
+
# URL Canonicalization for Web Risk API
|
|
5
|
+
#
|
|
6
|
+
# Implements the canonicalization process described in the Web Risk API documentation:
|
|
7
|
+
# https://cloud.google.com/web-risk/docs/urls-hashing#canonicalization
|
|
8
|
+
#
|
|
9
|
+
# The canonicalization process includes:
|
|
10
|
+
# 1. Parse URL according to RFC 2396 (convert IDN to ASCII Punycode if needed)
|
|
11
|
+
# 2. Remove tab (0x09), CR (0x0d), and LF (0x0a) characters
|
|
12
|
+
# 3. Remove URL fragments
|
|
13
|
+
# 4. Repeatedly percent-unescape until no more percent-escapes
|
|
14
|
+
# 5. Normalize hostname:
|
|
15
|
+
# - Remove leading/trailing dots
|
|
16
|
+
# - Replace consecutive dots with single dot
|
|
17
|
+
# - Parse and normalize IP addresses (handle octal, hex, decimal)
|
|
18
|
+
# - Lowercase the entire hostname
|
|
19
|
+
# 6. Canonicalize path:
|
|
20
|
+
# - Resolve /../ and /./ sequences
|
|
21
|
+
# - Replace consecutive slashes with single slash
|
|
22
|
+
# 7. Percent-escape characters <= ASCII 32, >= 127, #, and %
|
|
23
|
+
# 8. Do NOT apply path canonicalization to query parameters
|
|
24
|
+
# 9. Remove default ports (80 for HTTP, 443 for HTTPS)
|
|
25
|
+
#
|
|
26
|
+
# @see https://cloud.google.com/web-risk/docs/urls-hashing#canonicalization
|
|
27
|
+
module Canonicalizer
|
|
28
|
+
extend self
|
|
29
|
+
|
|
30
|
+
# Canonicalize a URL according to the Web Risk API specification.
#
# Tab/CR/LF characters and surrounding spaces are stripped, a missing scheme
# defaults to "http", the fragment is dropped, and the host and path are
# normalized. The query string is copied through verbatim. Only scheme, host,
# path and query appear in the result.
#
# @param url [String, nil] The URL to canonicalize
# @return [String, nil] The canonicalized URL, or nil when +url+ is nil, cannot
#   be parsed, has no scheme/host, or has a host longer than 255 characters
#
# @example
#   canonicalize("http://host/%25%32%35")
#   # => "http://host/%25"
#
#   canonicalize("http://www.GOOgle.com/")
#   # => "http://www.google.com/"
#
#   canonicalize("http://3279880203/blah")
#   # => "http://195.127.0.11/blah"
#
def canonicalize(url)
  return nil if url.nil?

  # Work on raw bytes so that arbitrary input cannot trip encoding checks here.
  raw = url.dup.force_encoding(Encoding::BINARY)
  raw = raw.gsub(/[\t\r\n]/, '')
  raw = raw.sub(/\A +/n, '').sub(/ +\z/n, '')
  url_with_scheme = raw.include?('://') ? raw : "http://#{raw}"
  # Spaces inside the authority part are escaped before handing it to the parser.
  url_with_scheme = url_with_scheme.gsub(%r{(?<=://)([^/]*)}) { |auth| auth.gsub(' ', '%20') }

  begin
    parsed = Addressable::URI.parse(url_with_scheme)
  rescue StandardError
    return nil
  end

  schema = parsed.scheme
  host = parsed.host
  return nil if schema.nil? || host.nil? || host.length > 255

  # Path and query are cut from the raw text (not taken from the parser) so
  # that the original bytes are what gets unescaped and re-escaped below.
  after_scheme = url_with_scheme.sub(%r{\A[^:]+://}, '')
  slash_idx = after_scheme.index('/')
  path_and_query = slash_idx ? after_scheme[slash_idx..] : '/'
  path_and_query = path_and_query.split('#', 2)[0]

  if path_and_query.include?('?')
    raw_path, raw_query = path_and_query.split('?', 2)
  else
    raw_path = path_and_query
    raw_query = nil
  end
  path = raw_path && !raw_path.empty? ? raw_path : '/'
  query = raw_query

  host_decoded = custom_decode_uri_component(normalize_ip_address(host)).gsub(/[\t\x0a\x0d]/, '')

  begin
    host_ascii = if /[^\x00-\x7F]/.match?(host_decoded)
                   Addressable::IDNA.to_ascii(host_decoded)
                 else
                   host_decoded
                 end
  rescue StandardError
    # Best effort: keep the decoded host when the IDNA conversion fails.
    host_ascii = host_decoded
  end

  normalized_host = webrisk_uri_escape(host_ascii)
  normalized_host = normalized_host.squeeze('.').gsub(/\A\.+|\.+\z/, '').downcase

  normalized_path = normalize_component_encoding(normalize_dots_in_paths(path))
  # normalize_dots_in_paths drops a trailing slash; restore it when the input had one.
  normalized_path = "#{normalized_path}/" if path.end_with?('/') && !normalized_path.end_with?('/')

  normalized_query = query ? "?#{query}" : ''

  "#{schema}://#{normalized_host}#{normalized_path}#{normalized_query}"
end
|
|
104
|
+
|
|
105
|
+
# Render an integer as a dotted-quad IPv4 string.
#
# Only the low 32 bits of the value are used.
#
# @param ip_int [#to_i] Integer (or integer-like value) holding the address
# @return [String] Four decimal octets joined by dots, most significant first
def int2ip(ip_int)
  value = ip_int.to_i
  octets = [24, 16, 8, 0].map { |shift| (value >> shift) & 255 }
  octets.join('.')
end
|
|
109
|
+
|
|
110
|
+
# Normalize a host that is a legal IPv4 literal to dotted-decimal form.
#
# Accepts one to four dot-separated components, each written in decimal,
# octal (leading "0") or hexadecimal (leading "0x"/"0X"). As with inet_aton,
# every component but the last must fit in one byte and the last component
# fills the remaining bytes. Anything else is returned unchanged, so ordinary
# hostnames pass straight through.
#
# Every component is validated against an explicit digit pattern before it is
# converted. Kernel#Integer on its own would also accept "0b11", "1_000",
# signs and surrounding whitespace, which are hostnames, not IP addresses.
#
# @param c [String] Host component of a URL
# @return [String] Dotted-quad IPv4 string, or +c+ itself when it is not an
#   IPv4 literal
def normalize_ip_address(c)
  return c unless c.is_a?(String)

  parts = c.split('.')
  return c unless parts.length.between?(1, 4)

  nums = parts.map do |part|
    case part
    when /\A0x[0-9a-f]+\z/i then Integer(part[2..], 16)
    # "08"/"09" look octal but are not; Integer returns nil and the host is kept as-is.
    when /\A0[0-9]+\z/ then Integer(part, 8, exception: false)
    when /\A[0-9]+\z/ then Integer(part, 10)
    end
  end
  return c if nums.include?(nil)

  *leading, last = nums
  return c if leading.any? { |n| n > 0xFF }
  # The last component covers all bytes not claimed by the leading ones.
  return c if last >= (1 << (8 * (4 - leading.length)))

  int2ip(leading.each_with_index.sum { |n, idx| n << (8 * (3 - idx)) } + last)
end
|
|
145
|
+
|
|
146
|
+
# Fully percent-unescape a URL component, then re-escape it canonically.
#
# The component is decoded repeatedly (tab/CR/LF removed after each pass)
# until a pass changes nothing, with a hard cap of 1000 passes. The stable
# value is then passed through webrisk_uri_escape.
#
# @param c [String] Raw path (or other component) text
# @return [String] The canonically escaped component
def normalize_component_encoding(c)
  current = c
  passes_left = 1000
  while passes_left.positive?
    passes_left -= 1
    unescaped = custom_decode_uri_component(current).gsub(/[\t\x0a\x0d]/, '')
    settled = unescaped == current
    current = unescaped
    break if settled
  end
  webrisk_uri_escape(current)
end
|
|
156
|
+
|
|
157
|
+
# Resolve "." and ".." segments and collapse repeated slashes in a path.
#
# Empty and "." segments are dropped; ".." removes the previous kept segment
# (and is ignored at the root). The result always starts with "/" and never
# ends with one unless it is the root itself, so callers that need a trailing
# slash must restore it.
#
# @param path [String] Path portion of a URL
# @return [String] The resolved path
def normalize_dots_in_paths(path)
  kept = path.split('/').each_with_object([]) do |segment, stack|
    case segment
    when '', '.' then next
    when '..' then stack.pop
    else stack << segment
    end
  end
  "/#{kept.join('/')}"
end
|
|
173
|
+
|
|
174
|
+
# Percent-decode every run of "%XX" escapes found in +input+ (one pass).
#
# Each maximal run of escapes is decoded as a unit and recorded in a
# replacement table, which is then applied to the input case-insensitively in
# insertion order. The table is pre-seeded with "%FE%FF" and "%FF%FE" (mapped
# to two U+FFFD characters), and a lone "%C2" is always mapped to U+FFFD.
#
# @param input [String] Text that may contain percent-escapes
# @return [String] +input+ with the recorded escapes replaced
def custom_decode_uri_component(input)
  replace_map = { '%FE%FF' => "\uFFFD\uFFFD", '%FF%FE' => "\uFFFD\uFFFD" }
  multi_matcher = /((?:%[a-f0-9]{2})+)/i
  input.scan(multi_matcher).flatten.each do |match|
    replace_map[match] = URI.decode_www_form_component(match)
  rescue StandardError
    decoded = safe_decode(match)
    replace_map[match] = decoded if decoded != match
  end
  replace_map['%C2'] = "\uFFFD"
  replace_map.each do |k, v|
    # Block form on purpose: with a String replacement, gsub would interpret
    # backslash sequences in the decoded text (e.g. "%5C%31" => "\1") as
    # back-references and silently drop or alter them.
    input = input.gsub(Regexp.new(Regexp.escape(k), Regexp::IGNORECASE)) { v }
  end
  input
end
|
|
189
|
+
|
|
190
|
+
# Fallback decoder: decode the "%XX" tokens found in +input+, joined together.
#
# Text between the tokens is not part of the result. When +input+ contains no
# tokens, or decoding raises, +input+ is returned unchanged.
#
# The previous implementation retried the decode once per token, but every
# attempt used the same joined string, so a single attempt is equivalent.
#
# @param input [String] Text containing percent-escapes
# @return [String] The decoded tokens, or +input+ itself as a fallback
def safe_decode(input)
  tokens = input.scan(/%[a-f0-9]{2}/i)
  return input if tokens.empty?

  begin
    URI.decode_www_form_component(tokens.join)
  rescue StandardError
    input
  end
end
|
|
203
|
+
|
|
204
|
+
# Percent-escape a single character code when canonical form requires it.
#
# Codes <= 32, codes >= 127 (DEL included), "%" and "#" are written as "%XX"
# with uppercase hex. Other codes below 256 are returned as the literal
# character. Codes of 256 and above are written as the percent-escaped UTF-8
# bytes of that code point, so no part of the character is emitted raw.
#
# @param code [Integer] Byte value or Unicode code point
# @return [String] The literal character or its percent-escaped form
# @raise [RangeError] if +code+ is not a valid code point
def escape_character(code)
  chr = code.chr(Encoding::UTF_8)
  return chr.bytes.map { |byte| format('%%%02X', byte) }.join if code >= 256
  return format('%%%02X', code) if code <= 32 || code >= 127 || chr == '%' || chr == '#'

  chr
end
|
|
218
|
+
|
|
219
|
+
# Percent-escape a string for the canonical URL form.
#
# Existing "%XX" triplets are kept and their hex digits uppercased; every
# other byte goes through escape_character. The string is processed byte by
# byte, so a multi-byte character is escaped as its individual bytes whatever
# encoding the string is tagged with.
#
# @param s [String] Host or path text
# @return [String] The escaped text
def webrisk_uri_escape(s)
  bytes = s.b
  out = +''
  i = 0
  while i < bytes.bytesize
    pair = bytes[i + 1, 2]
    if bytes.getbyte(i) == 0x25 && pair && pair.match?(/\A[0-9a-fA-F]{2}\z/)
      out << '%' << pair.upcase
      i += 3
    else
      out << escape_character(bytes.getbyte(i))
      i += 1
    end
  end
  out
end
|
|
234
|
+
end
|
|
235
|
+
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module WebriskHash
|
|
4
|
+
module Hash
|
|
5
|
+
extend self
|
|
6
|
+
|
|
7
|
+
# Hash a string with SHA256 and keep only the leading bytes.
#
# A Web Risk hash prefix is the most significant 4 to 32 bytes (32 to 256
# bits) of the SHA256 digest of a suffix/prefix expression.
#
# @param str [String] The string to hash (suffix/prefix expression)
# @param bits [Integer] Prefix size in bits; divided by 8 (integer division)
#   to get the number of bytes kept
# @return [String] Binary string containing the hash prefix
#
# @example FIPS-180-2 Example B1 (32 bits)
#   truncated_sha256_prefix("abc", 32)
#   # => "\xBA\x78\x16\xBF" (4 bytes)
#   # Full SHA256: ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad
#
# @example FIPS-180-2 Example B2 (48 bits)
#   input = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
#   truncated_sha256_prefix(input, 48)
#   # => "\x24\x8D\x6A\x61\xD2\x06" (6 bytes)
#   # Full SHA256: 248d6a61d20638b8e5c026930c3e6039a33ce45964ff2167f6ecedd419db06c1
#
def truncated_sha256_prefix(str, bits)
  byte_count = bits / 8
  ::Digest::SHA256.digest(str).byteslice(0, byte_count)
end
|
|
32
|
+
end
|
|
33
|
+
end
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module WebriskHash
|
|
4
|
+
module Prefixes
|
|
5
|
+
extend self
|
|
6
|
+
|
|
7
|
+
# Pair every suffix/prefix expression of a URL with its hash prefix.
#
# The URL is canonicalized first; when canonicalization yields nil an empty
# array is returned.
#
# @param url [String] The URL to process
# @param size [Integer] Hash prefix size in bits (default: 256 bits = 32 bytes)
# @return [Array<Array>] Array of [expression, hash_prefix] pairs
#
# @example
#   get_prefix_map("http://a.b.c/1/2.html?param=1")
#   # => [
#   #   ["a.b.c/1/2.html?param=1", "\xAB\xCD..."],
#   #   ["a.b.c/1/2.html", "\x12\x34..."],
#   #   ...
#   # ]
#
def get_prefix_map(url, size = 32 * 8)
  canonical_url = WebriskHash.canonicalize(url)
  pairs = []
  return pairs if canonical_url.nil?

  WebriskHash.suffix_postfix_expressions(canonical_url).to_a.each do |expression|
    pairs << [expression, WebriskHash::Hash.truncated_sha256_prefix(expression, size)]
  end
  pairs
end
|
|
30
|
+
|
|
31
|
+
# Collect the hash prefixes of every suffix/prefix expression of a URL.
#
# This is the main entry point for checking a URL against Web Risk lists.
# The URL is canonicalized first; when canonicalization yields nil an empty
# set is returned.
#
# @param url [String] The URL to process
# @param size [Integer] Hash prefix size in bits (default: 256 bits = 32 bytes)
# @return [Set<String>] Set of binary hash prefixes
#
# @example Get 32-bit hash prefixes
#   get_prefixes("http://evil.com/malware", 32)
#   # => #<Set: {"\xAB\xCD\xEF\x12", "\x34\x56\x78\x90", ...}>
#
# @example Get full 256-bit hashes (default)
#   get_prefixes("http://example.com/")
#   # => #<Set: {"\xAB\xCD...(32 bytes)", ...}>
#
def get_prefixes(url, size = 32 * 8)
  canonical_url = WebriskHash.canonicalize(url)
  hash_prefixes = Set.new
  return hash_prefixes if canonical_url.nil?

  WebriskHash.suffix_postfix_expressions(canonical_url).to_a.each do |expression|
    hash_prefixes << WebriskHash::Hash.truncated_sha256_prefix(expression, size)
  end
  hash_prefixes
end
|
|
57
|
+
end
|
|
58
|
+
end
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module WebriskHash
|
|
4
|
+
# Suffix/Prefix Expression Generation for Web Risk API
|
|
5
|
+
#
|
|
6
|
+
# Generates all possible host suffix and path prefix combinations for a canonicalized URL.
|
|
7
|
+
# These combinations are used to create the expressions that will be hashed and checked
|
|
8
|
+
# against Web Risk lists.
|
|
9
|
+
#
|
|
10
|
+
# The process follows the Web Risk API specification:
|
|
11
|
+
# https://cloud.google.com/web-risk/docs/urls-hashing#suffixprefix-expressions
|
|
12
|
+
#
|
|
13
|
+
# @see https://cloud.google.com/web-risk/docs/urls-hashing#suffixprefix-expressions
|
|
14
|
+
module Suffixes
|
|
15
|
+
extend self
|
|
16
|
+
|
|
17
|
+
# Build the host-suffix/path-prefix expressions for a canonicalized URL.
#
# The scheme is stripped, the remainder is split into host, path and query,
# and every host suffix is combined with every path prefix.
#
# Host suffixes (at most 5):
#   1. The exact hostname in the URL
#   2. Up to 4 hostnames formed by starting with the last 5 components
#      and successively removing the leading component
#   Additional hostnames are not generated when the host is an IP address.
#
# Path prefixes (at most 6):
#   1. The exact path of the URL, including query parameters
#   2. The exact path of the URL, without query parameters
#   3-6. Paths formed by starting at root (/) and successively appending
#        path components, including a trailing slash
#
# @param canonical_url [String, nil] The canonicalized URL
# @return [Set<String>] Set of host + path expressions; empty when
#   +canonical_url+ is nil
#
# @example For http://a.b.c/1/2.html?param=1
#   suffix_postfix_expressions("http://a.b.c/1/2.html?param=1")
#   # => Set with 8 expressions:
#   #    ["a.b.c/1/2.html?param=1", "a.b.c/1/2.html", "a.b.c/", "a.b.c/1/",
#   #     "b.c/1/2.html?param=1", "b.c/1/2.html", "b.c/", "b.c/1/"]
#
# @example For http://a.b.c.d.e.f.g/1.html
#   suffix_postfix_expressions("http://a.b.c.d.e.f.g/1.html")
#   # => Set with 10 expressions (5 host suffixes × 2 path prefixes)
#   #    Note: b.c.d.e.f.g is skipped (only last 5 components used)
#
# @example For http://1.2.3.4/1/
#   suffix_postfix_expressions("http://1.2.3.4/1/")
#   # => Set with 2 expressions:
#   #    ["1.2.3.4/1/", "1.2.3.4/"]
#   #    (IP addresses only use exact hostname)
#
def suffix_postfix_expressions(canonical_url)
  return Set.new unless canonical_url

  host, remainder = canonical_url.sub(%r{^[^:]+://}, '').split('/', 2)
  # With limit 2 the split yields [path] or [path, query]; query stays nil
  # when the URL has no "?".
  path, query = "/#{remainder}".split('?', 2)

  hosts = generate_host_suffixes(host)
  paths = generate_path_prefixes(path, query)

  Set.new(hosts.product(paths).map { |h, p| h + p })
end
|
|
81
|
+
|
|
82
|
+
private
|
|
83
|
+
|
|
84
|
+
# List the host strings to combine with path prefixes (at most 5).
#
# An IP address yields only itself. Otherwise the exact host comes first,
# followed by the suffixes built from its last five dot-separated components,
# shortest first, skipping any suffix equal to the host itself.
#
# @param host [String] Canonical host name
# @return [Array<String>] Host followed by its suffixes
def generate_host_suffixes(host)
  return [host] if ip_address?(host)

  labels = host.split('.').last(5)
  candidates = (2..labels.length).map { |count| labels.last(count).join('.') }
  [host].concat(candidates.reject { |candidate| candidate == host }).first(5)
end
|
|
104
|
+
|
|
105
|
+
# List the path strings to combine with host suffixes (at most 6).
#
# The result contains the exact path with its query (when a query is given),
# the exact path without it, and the directory prefixes built from the root:
# "/" plus up to three leading directories, each with a trailing slash. The
# last path segment counts as a directory only when the path ends with "/".
#
# The root is always part of the result. Deep paths previously used up the
# six slots with directories beyond the third level and left "/" out.
#
# @param path [String] Canonical path, starting with "/"
# @param query [String, nil] Query string without the leading "?", or nil
# @return [Array<String>] Unique path prefixes
def generate_path_prefixes(path, query)
  exact = []
  exact << "#{path}?#{query}" if query
  exact << path

  directories = path.split('/').reject(&:empty?)
  directories.pop unless path.end_with?('/') # trailing segment is a file name

  nested = []
  prefix = '/'
  directories.first(3).each do |directory|
    prefix = "#{prefix}#{directory}/"
    nested << prefix
  end

  (exact + nested + ['/']).uniq
end
|
|
129
|
+
|
|
130
|
+
# Tell whether +host+ is written as four dot-separated decimal numbers.
#
# The whole string must match (\A/\z anchors), so a multi-line value with an
# address on one of its lines is not treated as an IP address.
#
# @param host [String, nil] Canonical host name
# @return [Boolean] true for dotted-quad hosts, false otherwise (nil included)
def ip_address?(host)
  /\A(\d+\.){3}\d+\z/.match?(host)
end
|
|
133
|
+
end
|
|
134
|
+
end
|
data/lib/webrisk_hash.rb
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# frozen_string_literal: true

require_relative 'webrisk_hash/version'

require 'addressable/uri'
require 'digest'
# Set is used by the Suffixes and Prefixes modules; it is only loaded
# automatically from Ruby 3.2 on, so it is required explicitly here.
require 'set'
require 'uri'

# Top-level namespace of the gem.
#
# The Canonicalizer, Suffixes, Hash and Prefixes modules are loaded and then
# extended onto this module, so their public methods can be called directly on
# it (e.g. WebriskHash.canonicalize, WebriskHash.get_prefixes).
module WebriskHash
  require_relative 'webrisk_hash/canonicalizer'
  require_relative 'webrisk_hash/suffixes'
  require_relative 'webrisk_hash/hash'
  require_relative 'webrisk_hash/prefixes'

  extend WebriskHash::Canonicalizer
  extend WebriskHash::Suffixes
  extend WebriskHash::Hash
  extend WebriskHash::Prefixes
end
|
metadata
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: webrisk_hash
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- AADS team
|
|
8
|
+
bindir: exe
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: addressable
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - "~>"
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '2.8'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - "~>"
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '2.8'
|
|
26
|
+
description: 'This Ruby gem implements the URL hashing and canonicalization algorithm
|
|
27
|
+
described in the Google Web Risk documentation: https://cloud.google.com/web-risk/docs/urls-hashing'
|
|
28
|
+
email:
|
|
29
|
+
- support@aads.com
|
|
30
|
+
executables: []
|
|
31
|
+
extensions: []
|
|
32
|
+
extra_rdoc_files: []
|
|
33
|
+
files:
|
|
34
|
+
- LICENSE.txt
|
|
35
|
+
- README.md
|
|
36
|
+
- Rakefile
|
|
37
|
+
- lib/webrisk_hash.rb
|
|
38
|
+
- lib/webrisk_hash/canonicalizer.rb
|
|
39
|
+
- lib/webrisk_hash/hash.rb
|
|
40
|
+
- lib/webrisk_hash/prefixes.rb
|
|
41
|
+
- lib/webrisk_hash/suffixes.rb
|
|
42
|
+
- lib/webrisk_hash/version.rb
|
|
43
|
+
- sig/webrisk_hash.rbs
|
|
44
|
+
homepage: https://github.com/a-ads/webrisk_hash
|
|
45
|
+
licenses:
|
|
46
|
+
- MIT
|
|
47
|
+
metadata:
|
|
48
|
+
homepage_uri: https://github.com/a-ads/webrisk_hash
|
|
49
|
+
rubygems_mfa_required: 'true'
|
|
50
|
+
rdoc_options: []
|
|
51
|
+
require_paths:
|
|
52
|
+
- lib
|
|
53
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
54
|
+
requirements:
|
|
55
|
+
- - ">="
|
|
56
|
+
- !ruby/object:Gem::Version
|
|
57
|
+
version: 3.0.0
|
|
58
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
59
|
+
requirements:
|
|
60
|
+
- - ">="
|
|
61
|
+
- !ruby/object:Gem::Version
|
|
62
|
+
version: '0'
|
|
63
|
+
requirements: []
|
|
64
|
+
rubygems_version: 3.7.2
|
|
65
|
+
specification_version: 4
|
|
66
|
+
summary: Ruby gem implementation of the algorithm described in the Google Web Risk
|
|
67
|
+
documentation
|
|
68
|
+
test_files: []
|