google_safe_browsing 0.3.9 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/google_safe_browsing/canonicalize.rb +219 -173
- data/lib/google_safe_browsing/version.rb +1 -1
- metadata +5 -5
@@ -33,6 +33,7 @@ module GoogleSafeBrowsing
|
|
33
33
|
|
34
34
|
#split into host and path components
|
35
35
|
splits = split_host_path(cann)
|
36
|
+
|
36
37
|
cann = fix_host( splits[:host] ) + '/' + fix_path( splits[:path] )
|
37
38
|
|
38
39
|
# add leading protocol
|
@@ -48,6 +49,8 @@ module GoogleSafeBrowsing
|
|
48
49
|
# @return (Array) array of cannonicalized url permutation strings
|
49
50
|
def self.urls_for_lookup(lookup_url)
|
50
51
|
lookup_url = url(lookup_url)
|
52
|
+
#return empty array if url returns nil; for invalid url
|
53
|
+
return [] if lookup_url.blank?
|
51
54
|
|
52
55
|
lookup_url = remove_protocol(lookup_url)
|
53
56
|
|
@@ -71,219 +74,262 @@ module GoogleSafeBrowsing
|
|
71
74
|
cart_prod(host_strings, path_strings)
|
72
75
|
end
|
73
76
|
|
74
|
-
|
77
|
+
# private
|
75
78
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
79
|
+
# Generates the path permutations from the raw path string
|
80
|
+
#
|
81
|
+
# @param (String) raw_path path split from the full url string
|
82
|
+
# @return (Array) array of path permutation strings
|
83
|
+
def self.generate_path_strings(raw_path)
|
84
|
+
return [ '/', '' ] if raw_path == ''
|
82
85
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
+
path_split = raw_path.split('?')
|
87
|
+
path = path_split[0] || ''
|
88
|
+
params = path_split[1] || ''
|
86
89
|
|
87
90
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
91
|
+
path_components = path.split('/').first(3)
|
92
|
+
path_strings = [ '/' ]
|
93
|
+
path_components.length.times do
|
94
|
+
path_strings << '/' + path_components.join('/')
|
95
|
+
path_components.pop
|
96
|
+
end
|
94
97
|
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
else
|
99
|
-
p
|
100
|
-
end
|
101
|
-
end
|
102
|
-
path_strings.map!{ |p| p.to_s.gsub!(/\/+/, '/') }
|
103
|
-
path_strings.compact!
|
104
|
-
path_strings.uniq!
|
105
|
-
|
106
|
-
unless params.blank?
|
107
|
-
path_strings | path_strings.map do |p|
|
108
|
-
if p[-1] == '/'
|
109
|
-
p
|
110
|
-
else
|
111
|
-
"#{p}?#{params}"
|
112
|
-
end
|
113
|
-
end
|
98
|
+
path_strings.map! do |p|
|
99
|
+
unless p.index('.')
|
100
|
+
p + '/'
|
114
101
|
else
|
115
|
-
|
102
|
+
p
|
116
103
|
end
|
117
104
|
end
|
105
|
+
path_strings.map!{ |p| p.to_s.gsub!(/\/+/, '/') }
|
106
|
+
path_strings.compact!
|
107
|
+
path_strings.uniq!
|
118
108
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
result = []
|
126
|
-
a_one.each do |i|
|
127
|
-
a_two.each do |j|
|
128
|
-
result << "#{i}#{j}"
|
109
|
+
unless params.blank?
|
110
|
+
path_strings | path_strings.map do |p|
|
111
|
+
if p[-1] == '/'
|
112
|
+
p
|
113
|
+
else
|
114
|
+
"#{p}?#{params}"
|
129
115
|
end
|
130
116
|
end
|
131
|
-
|
117
|
+
else
|
118
|
+
return path_strings
|
132
119
|
end
|
120
|
+
end
|
133
121
|
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
122
|
+
# Returns the cartesian product of two arrays by concatination of the string representation of the elements
|
123
|
+
#
|
124
|
+
# @param (Array) a_one array of strings
|
125
|
+
# @param (Array) a_two array of strings
|
126
|
+
# @return (Array) cartesian product of arrays with elements concatinated
|
127
|
+
def self.cart_prod(a_one, a_two)
|
128
|
+
result = []
|
129
|
+
a_one.each do |i|
|
130
|
+
a_two.each do |j|
|
131
|
+
result << "#{i}#{j}"
|
144
132
|
end
|
145
|
-
|
146
|
-
ret
|
147
133
|
end
|
134
|
+
result
|
135
|
+
end
|
148
136
|
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
137
|
+
# Takes the canonicalized url and splits the host and the path apart
|
138
|
+
#
|
139
|
+
# @param (String) cann canonicalized url string
|
140
|
+
# @return (Hash) !{ :host => host_part, :path => path_part }
|
141
|
+
def self.split_host_path(cann)
|
142
|
+
ret= { :host => cann, :path => '' }
|
143
|
+
split_point = cann.index('/')
|
144
|
+
if split_point
|
145
|
+
ret[:host] = cann[0..split_point-1]
|
146
|
+
ret[:path] = cann[split_point+1..-1]
|
156
147
|
end
|
157
148
|
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
149
|
+
ret
|
150
|
+
end
|
151
|
+
|
152
|
+
# Strips the fragment portion of the url string (the last '#' and everything after)
|
153
|
+
#
|
154
|
+
# @param (String) string url
|
155
|
+
# @return (String) parameter with the fragment removed
|
156
|
+
def self.remove_fragment(string)
|
157
|
+
string = string[0..string.index('#')-1] if string.index('#')
|
158
|
+
string
|
159
|
+
end
|
160
|
+
|
161
|
+
# Continues to unescape the url until unescaping has no effect
|
162
|
+
#
|
163
|
+
# @param (String) url url string
|
164
|
+
# @return (String) fully unescaped url string
|
165
|
+
def self.recursively_unescape(url)
|
166
|
+
compare_url = url.clone
|
167
|
+
url = URI.unescape(url)
|
168
|
+
while(compare_url != url)
|
169
|
+
compare_url = url.clone
|
164
170
|
url = URI.unescape(url)
|
165
|
-
while(compare_url != url)
|
166
|
-
compare_url = url.clone
|
167
|
-
url = URI.unescape(url)
|
168
|
-
end
|
169
|
-
url
|
170
171
|
end
|
172
|
+
url
|
173
|
+
end
|
171
174
|
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
175
|
+
# Apply initial fixes to host string
|
176
|
+
#
|
177
|
+
# @param (String) host host string
|
178
|
+
# @return (String) standardized host string
|
179
|
+
def self.fix_host(host)
|
180
|
+
#puts "In Host: #{host}"
|
181
|
+
# remove leading and trailing dots, multiple dots to one
|
182
|
+
host.gsub!(/\A\.+|\.+\Z/, '')
|
183
|
+
host.gsub!(/\.+/, '.')
|
184
|
+
|
185
|
+
host.downcase!
|
186
|
+
|
187
|
+
host_splits = self.split_username_password_and_port(host)
|
188
|
+
|
189
|
+
if host_splits[:host] =~ /^\d+$/
|
190
|
+
host_splits[:host] = IP::V4.new(host.to_i).to_addr
|
191
|
+
elsif host_splits[:host] =~ /\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/
|
192
|
+
begin
|
193
|
+
host_splits[:host] = IP.new(host).to_addr
|
194
|
+
rescue ArgumentError
|
188
195
|
end
|
189
|
-
|
190
|
-
host
|
191
196
|
end
|
192
197
|
|
193
|
-
|
194
|
-
#
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
#puts "In Path: #{path}"
|
199
|
-
|
200
|
-
#remove leading slash
|
201
|
-
path = path[1..-1] if path[0..0] == '/'
|
198
|
+
result = host_splits[:host]
|
199
|
+
result = "#{host_splits[:creds]}@#{result}" unless host_splits[:creds].blank?
|
200
|
+
result = "#{result}:#{host_splits[:port]}" unless host_splits[:port].blank?
|
201
|
+
result
|
202
|
+
end
|
202
203
|
|
203
|
-
|
204
|
+
# Apply initial fixes to path string
|
205
|
+
#
|
206
|
+
# @param (String) path path string
|
207
|
+
# @return (String) standardized path string
|
208
|
+
def self.fix_path(path)
|
209
|
+
#puts "In Path: #{path}"
|
204
210
|
|
205
|
-
|
206
|
-
|
207
|
-
params = path[first_ques..-1]
|
208
|
-
path = path[0..first_ques-1]
|
209
|
-
end
|
211
|
+
#remove leading slash
|
212
|
+
path = path[1..-1] if path[0..0] == '/'
|
210
213
|
|
211
|
-
|
212
|
-
path.gsub!(/\/+/, '/')
|
214
|
+
preserve_trailing_slash = ( path[-1..-1] == '/' )
|
213
215
|
|
214
|
-
|
215
|
-
path.
|
216
|
-
|
217
|
-
|
218
|
-
|
216
|
+
if path.index('?')
|
217
|
+
first_ques = path.index('?')
|
218
|
+
params = path[first_ques..-1]
|
219
|
+
path = path[0..first_ques-1]
|
220
|
+
end
|
219
221
|
|
220
|
-
|
221
|
-
|
222
|
-
path += params if params
|
222
|
+
# remove multiple '/'
|
223
|
+
path.gsub!(/\/+/, '/')
|
223
224
|
|
224
|
-
|
225
|
+
new_path_array = []
|
226
|
+
path.split('/').each do |p|
|
227
|
+
new_path_array << p unless p == '.' || p == '..'
|
228
|
+
new_path_array.pop if p == '..'
|
225
229
|
end
|
226
230
|
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
# @return (String) escaped url string
|
231
|
-
def self.strict_escape(url)
|
232
|
-
url = URI.escape url
|
231
|
+
path = new_path_array.join('/')
|
232
|
+
path += '/' if preserve_trailing_slash
|
233
|
+
path += params if params
|
233
234
|
|
234
|
-
|
235
|
-
|
235
|
+
path
|
236
|
+
end
|
236
237
|
|
237
|
-
|
238
|
-
|
238
|
+
# Escape the url, but do not escape certain characters; such as the carat
|
239
|
+
#
|
240
|
+
# @param (String) url url string
|
241
|
+
# @return (String) escaped url string
|
242
|
+
def self.strict_escape(url)
|
243
|
+
url = URI.escape url
|
239
244
|
|
240
|
-
#
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
if cann.index(PROTOCOL_DELIMITER)
|
246
|
-
delimiting_index = cann.index(PROTOCOL_DELIMITER)
|
247
|
-
@protocol = cann[0..delimiting_index-1]
|
248
|
-
protocol_end_index = delimiting_index + PROTOCOL_DELIMITER.length
|
249
|
-
cann = cann[protocol_end_index..-1]
|
250
|
-
end
|
251
|
-
cann
|
252
|
-
end
|
245
|
+
# unescape carat, may need other optionally escapeable chars
|
246
|
+
url.gsub!('%5E','^')
|
247
|
+
|
248
|
+
url
|
249
|
+
end
|
253
250
|
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
251
|
+
# Strip the leading protocol from the url string
|
252
|
+
#
|
253
|
+
# @param (String) cann url string
|
254
|
+
# @return (String) url string without the protocol
|
255
|
+
def self.remove_protocol(cann)
|
256
|
+
if cann.index(PROTOCOL_DELIMITER)
|
257
|
+
delimiting_index = cann.index(PROTOCOL_DELIMITER)
|
258
|
+
@protocol = cann[0..delimiting_index-1]
|
259
|
+
protocol_end_index = delimiting_index + PROTOCOL_DELIMITER.length
|
260
|
+
cann = cann[protocol_end_index..-1]
|
261
261
|
end
|
262
|
+
cann
|
263
|
+
end
|
262
264
|
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
265
|
+
# Strip the user name, password and port number from the url
|
266
|
+
#
|
267
|
+
# @param (String) host_string host portion of the url
|
268
|
+
# @return (String) host portion of the url without the username, password and port
|
269
|
+
def self.strip_username_password_and_port_from_host(host_string)
|
270
|
+
host_string = remove_port(host_string)
|
271
|
+
remove_username_and_password(host_string)
|
272
|
+
end
|
273
|
+
|
274
|
+
# Strip port number from host string
|
275
|
+
#
|
276
|
+
# @param (see strip_username_password_and_port_from_host)
|
277
|
+
# @return (String) host part without the port number
|
278
|
+
def self.remove_port(host_string)
|
279
|
+
self.split_port(host_string)[:host]
|
280
|
+
end
|
281
|
+
|
282
|
+
# Strip user name and password from host part of url
|
283
|
+
#
|
284
|
+
# @param (see remove_port)
|
285
|
+
# @return (String) host part of url without user name or password
|
286
|
+
def self.remove_username_and_password(host_string)
|
287
|
+
self.split_username_and_password(host_string)[:host]
|
288
|
+
end
|
289
|
+
|
290
|
+
# Split user name, passowrd from the host
|
291
|
+
#
|
292
|
+
# @param (see remove_port)_
|
293
|
+
# @return (Hash) :host has the host string, :creds holds the username and password string
|
294
|
+
def self.split_username_and_password(host_string)
|
295
|
+
un_sep = host_string.index('@')
|
296
|
+
result = {}
|
297
|
+
if un_sep
|
298
|
+
splits = host_string.split('@')
|
299
|
+
result[:host] = splits[1]
|
300
|
+
result[:creds] = splits[0]
|
301
|
+
else
|
302
|
+
result[:host] = host_string
|
303
|
+
result[:creds] = nil
|
274
304
|
end
|
305
|
+
result
|
306
|
+
end
|
275
307
|
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
308
|
+
# Split post number and host string into a hash
|
309
|
+
#
|
310
|
+
# @param (See remove_port)
|
311
|
+
# @return (Hash) :host has the host string, :port holds the port number
|
312
|
+
def self.split_port(host_string)
|
313
|
+
port_sep = host_string.rindex(':')
|
314
|
+
result = {}
|
315
|
+
if port_sep
|
316
|
+
splits = host_string.split(':')
|
317
|
+
result[:host] = splits[0]
|
318
|
+
result[:port] = splits[1]
|
319
|
+
else
|
320
|
+
result[:host] = host_string
|
321
|
+
result[:port] = nil
|
287
322
|
end
|
323
|
+
result
|
324
|
+
end
|
325
|
+
|
326
|
+
# Split the user name, password and port from the host string
|
327
|
+
#
|
328
|
+
# @param (see remove_port)
|
329
|
+
# @return (Hash) :host as the host string; :creds has the username and password; :port holds the port number
|
330
|
+
def self.split_username_password_and_port(host_string)
|
331
|
+
result = self.split_username_and_password(host_string)
|
332
|
+
result.merge(self.split_port(result[:host]))
|
333
|
+
end
|
288
334
|
end
|
289
335
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: google_safe_browsing
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-07-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rails
|
@@ -151,7 +151,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
151
151
|
version: '0'
|
152
152
|
segments:
|
153
153
|
- 0
|
154
|
-
hash: -
|
154
|
+
hash: -4373468862577111822
|
155
155
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
156
156
|
none: false
|
157
157
|
requirements:
|
@@ -160,10 +160,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
160
160
|
version: '0'
|
161
161
|
segments:
|
162
162
|
- 0
|
163
|
-
hash: -
|
163
|
+
hash: -4373468862577111822
|
164
164
|
requirements: []
|
165
165
|
rubyforge_project:
|
166
|
-
rubygems_version: 1.8.
|
166
|
+
rubygems_version: 1.8.24
|
167
167
|
signing_key:
|
168
168
|
specification_version: 3
|
169
169
|
summary: Rails 3 plugin for Google's Safe Browsing API v2
|