UrlCategorise 0.0.2 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +13 -0
- data/.github/workflows/ci.yml +57 -0
- data/CLAUDE.md +135 -0
- data/Gemfile.lock +83 -55
- data/README.md +516 -26
- data/Rakefile +2 -0
- data/docs/.keep +2 -0
- data/docs/v0.1-context.md +93 -0
- data/lib/url_categorise/active_record_client.rb +118 -0
- data/lib/url_categorise/client.rb +189 -19
- data/lib/url_categorise/constants.rb +65 -3
- data/lib/url_categorise/models.rb +105 -0
- data/lib/url_categorise/version.rb +1 -1
- data/lib/url_categorise.rb +11 -0
- data/url_categorise.gemspec +17 -9
- metadata +171 -27
@@ -0,0 +1,118 @@
|
|
1
|
+
require_relative 'models'
|
2
|
+
|
3
|
+
module UrlCategorise
  # Client variant that mirrors the fetched block lists into ActiveRecord
  # tables (see UrlCategorise::Models) and answers lookups from the database
  # first, falling back to the in-memory lists inherited from Client.
  class ActiveRecordClient < Client
    # Accepts all Client keyword arguments plus :use_database (default true).
    # Raises when the optional ActiveRecord integration is not loaded.
    def initialize(**kwargs)
      raise "ActiveRecord not available" unless UrlCategorise::Models.available?

      # Pull our own option out before delegating the rest to Client.
      @use_database = kwargs.delete(:use_database) { true }
      super(**kwargs)

      populate_database if @use_database
    end

    # Categorise a URL or bare host. The database lookup wins; an empty
    # result falls back to the in-memory categorisation from Client.
    def categorise(url)
      return super(url) unless @use_database && UrlCategorise::Models.available?

      # NOTE(review): gsub removes every "www." occurrence, not just a
      # leading one — matches the parent Client's behaviour.
      host = (URI.parse(url).host || url).downcase.gsub("www.", "")

      # Try database first
      categories = UrlCategorise::Models::Domain.categorise(host)
      return categories unless categories.empty?

      # Fallback to memory-based categorization
      super(url)
    end

    # Categorise an IP address string: database first, then memory.
    def categorise_ip(ip_address)
      return super(ip_address) unless @use_database && UrlCategorise::Models.available?

      # Try database first
      categories = UrlCategorise::Models::IpAddress.categorise(ip_address)
      return categories unless categories.empty?

      # Fallback to memory-based categorization
      super(ip_address)
    end

    # Re-sync the database with the currently loaded lists. No-op when the
    # database integration is disabled or unavailable.
    def update_database
      return unless @use_database && UrlCategorise::Models.available?

      populate_database
    end

    # Row counts per table plus the number of distinct stored categories.
    # Returns {} when the database integration is disabled or unavailable.
    def database_stats
      return {} unless @use_database && UrlCategorise::Models.available?

      {
        domains: UrlCategorise::Models::Domain.count,
        ip_addresses: UrlCategorise::Models::IpAddress.count,
        list_metadata: UrlCategorise::Models::ListMetadata.count,
        categories: UrlCategorise::Models::Domain.distinct.pluck(:categories).flatten.uniq.size
      }
    end

    private

    # Writes list metadata, domains and IPs into the database, merging new
    # categories into existing rows. NOTE(review): issues one query per
    # domain/IP (no bulk insert) and runs outside a transaction — slow for
    # large lists; consider insert_all/upsert_all inside a transaction.
    def populate_database
      return unless UrlCategorise::Models.available?

      # Store list metadata
      @host_urls.each do |category, urls|
        urls.each do |url|
          # Category values may be symbols referencing other categories.
          next unless url.is_a?(String)

          metadata = @metadata[url] || {}
          # NOTE(review): find_or_create_by runs this block only for NEW
          # records, so hash/timestamp of existing rows is never refreshed
          # here — presumably intentional; verify.
          UrlCategorise::Models::ListMetadata.find_or_create_by(url: url) do |record|
            record.name = category.to_s
            record.categories = [category.to_s]
            record.file_hash = metadata[:content_hash]
            record.fetched_at = metadata[:last_updated]
          end
        end
      end

      # Store domain data
      @hosts.each do |category, domains|
        domains.each do |domain|
          next if domain.nil? || domain.empty?

          existing = UrlCategorise::Models::Domain.find_by(domain: domain)
          if existing
            # Add category if not already present
            categories = existing.categories | [category.to_s]
            existing.update(categories: categories) if categories != existing.categories
          else
            UrlCategorise::Models::Domain.create!(
              domain: domain,
              categories: [category.to_s]
            )
          end
        end
      end

      # Store IP data (for IP-based lists)
      ip_categories = [:sanctions_ips, :compromised_ips, :tor_exit_nodes, :open_proxy_ips,
                       :banking_trojans, :malicious_ssl_certificates, :top_attack_sources]

      ip_categories.each do |category|
        next unless @hosts[category]

        @hosts[category].each do |ip|
          # NOTE(review): ^ and $ are per-line anchors in Ruby regexes;
          # \A and \z would be stricter for whole-string IPv4 validation.
          next if ip.nil? || ip.empty? || !ip.match(/^\d+\.\d+\.\d+\.\d+$/)

          existing = UrlCategorise::Models::IpAddress.find_by(ip_address: ip)
          if existing
            categories = existing.categories | [category.to_s]
            existing.update(categories: categories) if categories != existing.categories
          else
            UrlCategorise::Models::IpAddress.create!(
              ip_address: ip,
              categories: [category.to_s]
            )
          end
        end
      end
    end
  end
end
|
@@ -2,15 +2,23 @@ module UrlCategorise
|
|
2
2
|
class Client < ApiPattern::Client
|
3
3
|
include ::UrlCategorise::Constants
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
5
|
+
def self.compatible_api_version
|
6
|
+
'v2'
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.api_version
|
10
|
+
'v2 2023-04-12'
|
11
|
+
end
|
12
|
+
|
13
|
+
attr_reader :host_urls, :hosts, :cache_dir, :force_download, :dns_servers, :metadata, :request_timeout
|
14
|
+
|
15
|
+
# Build a client and eagerly fetch every configured list.
#
# host_urls:       category => [list URL, ...] mapping; values may also be
#                  symbols referencing other categories (see DEFAULT_HOST_URLS).
# cache_dir:       directory for on-disk caching of downloaded lists;
#                  nil disables caching.
# force_download:  when true, skip cache reads and always re-download.
# dns_servers:     nameservers used by resolve_and_categorise.
# request_timeout: per-request HTTP timeout in seconds.
def initialize(host_urls: DEFAULT_HOST_URLS, cache_dir: nil, force_download: false, dns_servers: ['1.1.1.1', '1.0.0.1'], request_timeout: 10)
  @host_urls = host_urls
  @cache_dir = cache_dir
  @force_download = force_download
  @dns_servers = dns_servers
  @request_timeout = request_timeout
  @metadata = {}
  # Downloads (or reads from cache) every list at construction time.
  @hosts = fetch_and_build_host_lists
end
|
16
24
|
|
@@ -19,10 +27,35 @@ module UrlCategorise
|
|
19
27
|
host = host.gsub("www.", "")
|
20
28
|
|
21
29
|
@hosts.keys.select do |category|
|
22
|
-
@hosts[category].
|
30
|
+
@hosts[category].any? do |blocked_host|
|
31
|
+
host == blocked_host || host.end_with?(".#{blocked_host}")
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
# Returns every category whose host list contains the given IP address
# (exact string match against the stored list entries).
def categorise_ip(ip_address)
  @hosts.select { |_category, entries| entries.include?(ip_address) }.keys
end
|
25
41
|
|
42
|
+
# Categorise a domain both by name and by the IP addresses it currently
# resolves to (using the configured @dns_servers). Returns a de-duplicated
# category array; any DNS failure silently degrades to name-based
# categories only.
def resolve_and_categorise(domain)
  categories = categorise(domain)

  begin
    resolver = Resolv::DNS.new(nameserver: @dns_servers)
    ip_addresses = resolver.getaddresses(domain).map(&:to_s)

    ip_addresses.each do |ip|
      categories.concat(categorise_ip(ip))
    end
  rescue
    # DNS resolution failed, return domain categories only
  end

  categories.uniq
end
|
58
|
+
|
26
59
|
def count_of_hosts
|
27
60
|
@hosts.keys.map do |category|
|
28
61
|
@hosts[category].size
|
@@ -41,10 +74,12 @@ module UrlCategorise
|
|
41
74
|
|
42
75
|
# Approximate in-memory size of a category => hosts-array hash, in
# megabytes rounded to two decimal places. Counts only the string bytes
# of the joined host entries.
def hash_size_in_mb(hash)
  size = 0

  hash.each do |key, value|
    size += value.join.length
  end

  # Float division: integer division truncated every sub-megabyte total
  # to 0, which made the .round(2) useless.
  (size / ONE_MEGABYTE.to_f).round(2)
end
|
49
84
|
|
50
85
|
def fetch_and_build_host_lists
|
@@ -63,25 +98,160 @@ module UrlCategorise
|
|
63
98
|
end
|
64
99
|
|
65
100
|
original_value << extra_category_values
|
66
|
-
@hosts[category] = original_value
|
101
|
+
@hosts[category] = original_value.uniq.compact
|
67
102
|
end
|
68
103
|
|
69
104
|
@hosts
|
70
105
|
end
|
71
106
|
|
72
107
|
# Download (or load from cache) every list URL belonging to one category
# and merge them into a single sorted, de-duplicated array of host strings.
def build_host_data(urls)
  all_hosts = []

  urls.each do |url|
    next unless url_valid?(url)

    hosts_data = nil

    # The cache is consulted only when configured and not force-refreshing.
    if @cache_dir && !@force_download
      hosts_data = read_from_cache(url)
    end

    # Cache miss (or stale): download, then write back to the cache.
    if hosts_data.nil?
      hosts_data = download_and_parse_list(url)
      save_to_cache(url, hosts_data) if @cache_dir
    end

    all_hosts.concat(hosts_data) if hosts_data
  end

  all_hosts.compact.sort.uniq
end
|
129
|
+
|
130
|
+
# Fetch one block list over HTTP and parse it into an array of host
# strings. Records per-URL fetch metadata (hash, ETag, timestamps) in
# @metadata; on any error records a 'failed' entry and returns [] so the
# remaining lists still load.
def download_and_parse_list(url)
  begin
    raw_data = HTTParty.get(url, timeout: @request_timeout)
    return [] if raw_data.body.nil? || raw_data.body.empty?

    # Store metadata
    etag = raw_data.headers['etag']
    last_modified = raw_data.headers['last-modified']
    @metadata[url] = {
      last_updated: Time.now,
      etag: etag,
      last_modified: last_modified,
      content_hash: Digest::SHA256.hexdigest(raw_data.body),
      status: 'success'
    }

    parse_list_content(raw_data.body, detect_list_format(raw_data.body))
  # NOTE(review): the trailing StandardError already covers every class
  # listed before it, so the explicit list is redundant.
  rescue HTTParty::Error, Net::HTTPError, SocketError, Timeout::Error, URI::InvalidURIError, StandardError => e
    # Log the error but continue with other lists
    @metadata[url] = {
      last_updated: Time.now,
      error: e.message,
      status: 'failed'
    }
    return []
  end
end
|
157
|
+
|
158
|
+
# Parse raw block-list text into an array of host strings according to the
# detected format (:hosts, :plain, :dnsmasq, :ublock; anything else is
# treated as :plain).
def parse_list_content(content, format)
  # Strip first so whitespace-only lines are rejected too — previously
  # they slipped past `line.empty?` and produced "" entries in the
  # :plain/default output.
  lines = content.split("\n").map(&:strip).reject { |line| line.empty? || line.start_with?('#') }

  case format
  when :hosts
    # "0.0.0.0 example.com" -> "example.com"; lines without a second
    # field are dropped by compact.
    lines.map { |line| line.split(' ')[1] }.compact
  when :plain
    lines
  when :dnsmasq
    # "address=/ads.example/0.0.0.0" -> "ads.example"
    lines.map { |line|
      match = line.match(/address=\/(.+?)\//)
      match ? match[1] : nil
    }.compact
  when :ublock
    # "||ads.example^$third-party" -> "ads.example"
    lines.map { |line| line.gsub(/^\|\|/, '').gsub(/[\$\^].*$/, '').strip }.reject(&:empty?)
  else
    lines
  end
end
|
177
|
+
|
178
|
+
# Sniff the list format from the first handful of lines: hosts-file style,
# dnsmasq address directives, uBlock filter syntax, or plain domains.
def detect_list_format(content)
  sample = content.split("\n").first(10)

  if sample.any? { |line| line.match(/^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\s+/) }
    :hosts
  elsif sample.any? { |line| line.include?('address=/') }
    :dnsmasq
  elsif sample.any? { |line| line.match(/^\|\|/) }
    :ublock
  else
    :plain
  end
end
|
187
|
+
|
188
|
+
# Path of the cache file for a given list URL, or nil when no cache
# directory is configured. Creates the cache directory on demand; the
# filename is the MD5 of the URL with a .cache suffix.
def cache_file_path(url)
  return nil unless @cache_dir

  # First use: make sure the cache directory exists.
  FileUtils.mkdir_p(@cache_dir) unless Dir.exist?(@cache_dir)

  File.join(@cache_dir, "#{Digest::MD5.hexdigest(url)}.cache")
end
|
195
|
+
|
196
|
+
# Load previously cached hosts for a URL. Returns nil (forcing a fresh
# download) when there is no cache file, the cache is stale per
# should_update_cache?, or anything goes wrong while reading.
def read_from_cache(url)
  cache_file = cache_file_path(url)
  return nil unless cache_file && File.exist?(cache_file)

  # NOTE(review): Marshal.load can construct arbitrary objects; this is
  # safe only as long as the cache directory is local and trusted.
  cache_data = Marshal.load(File.read(cache_file))

  # Check if we should update based on hash or time
  if should_update_cache?(url, cache_data)
    return nil
  end

  cache_data[:hosts]
rescue
  # Corrupt/unreadable cache entry — treat as a cache miss.
  nil
end
|
211
|
+
|
212
|
+
# Persist the parsed hosts plus this URL's fetch metadata to the cache
# directory in Marshal format. Failures are swallowed — caching is
# strictly best-effort.
def save_to_cache(url, hosts_data)
  cache_file = cache_file_path(url)
  return unless cache_file

  cache_data = {
    hosts: hosts_data,
    metadata: @metadata[url],
    cached_at: Time.now
  }

  File.write(cache_file, Marshal.dump(cache_data))
rescue
  # Cache save failed, continue without caching
end
|
226
|
+
|
227
|
+
# Decide whether a cached list must be re-downloaded: always when forcing
# or the cached metadata is missing, when the cache is older than 24 hours,
# or when the remote ETag / Last-Modified headers (checked via a HEAD
# request) differ from the cached values. A failed HEAD request counts as
# "update".
def should_update_cache?(url, cache_data)
  return true if @force_download
  return true unless cache_data[:metadata]

  # Update if cache is older than 24 hours
  cache_age = Time.now - cache_data[:cached_at]
  return true if cache_age > 24 * 60 * 60

  # Check if remote content has changed
  begin
    head_response = HTTParty.head(url, timeout: @request_timeout)
    remote_etag = head_response.headers['etag']
    remote_last_modified = head_response.headers['last-modified']

    cached_metadata = cache_data[:metadata]

    # Only compare when both sides have a value; missing headers on either
    # side fall through to the freshness verdict below.
    return true if remote_etag && cached_metadata[:etag] && remote_etag != cached_metadata[:etag]
    return true if remote_last_modified && cached_metadata[:last_modified] && remote_last_modified != cached_metadata[:last_modified]
  rescue HTTParty::Error, Net::HTTPError, SocketError, Timeout::Error, URI::InvalidURIError, StandardError
    # If HEAD request fails, assume we should update
    return true
  end

  false
end
|
84
252
|
|
253
|
+
private
|
254
|
+
|
85
255
|
def categories_with_keys
|
86
256
|
keyed_categories = {}
|
87
257
|
|
@@ -1,5 +1,6 @@
|
|
1
1
|
module UrlCategorise
|
2
2
|
module Constants
|
3
|
+
ONE_MEGABYTE = 1048576
|
3
4
|
DEFAULT_HOST_URLS = {
|
4
5
|
abuse: ["https://github.com/blocklistproject/Lists/raw/master/abuse.txt"],
|
5
6
|
adobe: ["https://github.com/blocklistproject/Lists/raw/master/adobe.txt"],
|
@@ -13,7 +14,7 @@ module UrlCategorise
|
|
13
14
|
drugs: ["https://github.com/blocklistproject/Lists/raw/master/drugs.txt"],
|
14
15
|
facebook: ["https://github.com/blocklistproject/Lists/raw/master/facebook.txt", "https://www.github.developerdan.com/hosts/lists/facebook-extended.txt", "https://raw.githubusercontent.com/blocklistproject/Lists/master/facebook.txt", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/all", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/facebook.com"],
|
15
16
|
fraud: ["https://blocklistproject.github.io/Lists/fraud.txt"],
|
16
|
-
gambling: ["https://blocklistproject.github.io/Lists/gambling.txt"],
|
17
|
+
gambling: ["https://blocklistproject.github.io/Lists/gambling.txt", "https://cdn.jsdelivr.net/gh/hagezi/dns-blocklists@release/adblock/gambling.txt"],
|
17
18
|
gaming: ["https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-ubisoft.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-steam.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-activision.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-blizzard.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-ea.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-epicgames.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-nintendo.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-rockstargames.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-roblox.txt"],
|
18
19
|
google: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/google/all"],
|
19
20
|
hate_and_junk: ["https://www.github.developerdan.com/hosts/lists/hate-and-junk-extended.txt"],
|
@@ -25,13 +26,13 @@ module UrlCategorise
|
|
25
26
|
nsa: ["https://raw.githubusercontent.com/tigthor/NSA-CIA-Blocklist/main/HOSTS/HOSTS"],
|
26
27
|
phishing: ["https://blocklistproject.github.io/Lists/phishing.txt"],
|
27
28
|
pinterest: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/pinterest/all"],
|
28
|
-
piracy: ["https://github.com/blocklistproject/Lists/raw/master/piracy.txt"],
|
29
|
+
piracy: ["https://github.com/blocklistproject/Lists/raw/master/piracy.txt", "https://cdn.jsdelivr.net/gh/hagezi/dns-blocklists@release/adblock/anti.piracy.txt"],
|
29
30
|
pornography: ["https://blocklistproject.github.io/Lists/porn.txt"],
|
30
31
|
reddit: ["https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-reddit.txt"],
|
31
32
|
redirect: ["https://github.com/blocklistproject/Lists/raw/master/redirect.txt"],
|
32
33
|
scam: ["https://blocklistproject.github.io/Lists/scam.txt"],
|
33
34
|
smart_tv: ["https://github.com/blocklistproject/Lists/raw/master/smart-tv.txt"],
|
34
|
-
social_media: [:facebook, :instagram, :linkedin, :pinterest, :reddit
|
35
|
+
social_media: [:facebook, :instagram, :linkedin, :pinterest, :reddit, :tiktok, :twitter, :whatsapp, :youtube],
|
35
36
|
tiktok: ["https://blocklistproject.github.io/Lists/tiktok.txt"],
|
36
37
|
torrent: ["https://github.com/blocklistproject/Lists/raw/master/torrent.txt"],
|
37
38
|
tracking: ["https://blocklistproject.github.io/Lists/tracking.txt"],
|
@@ -39,6 +40,67 @@ module UrlCategorise
|
|
39
40
|
vaping: ["https://github.com/blocklistproject/Lists/raw/master/vaping.txt"],
|
40
41
|
whatsapp: ["https://github.com/blocklistproject/Lists/raw/master/whatsapp.txt", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/whatsapp"],
|
41
42
|
youtube: ["https://github.com/blocklistproject/Lists/raw/master/youtube.txt", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/google/youtube"],
|
43
|
+
|
44
|
+
# Hagezi DNS Blocklists - specialized categories only
|
45
|
+
threat_intelligence: ["https://cdn.jsdelivr.net/gh/hagezi/dns-blocklists@release/adblock/tif.txt"],
|
46
|
+
dyndns: ["https://cdn.jsdelivr.net/gh/hagezi/dns-blocklists@release/adblock/dyndns.txt"],
|
47
|
+
badware_hoster: ["https://cdn.jsdelivr.net/gh/hagezi/dns-blocklists@release/adblock/hoster.txt"],
|
48
|
+
most_abused_tlds: ["https://cdn.jsdelivr.net/gh/hagezi/dns-blocklists@release/adblock/tlds.txt"],
|
49
|
+
newly_registered_domains: ["https://cdn.jsdelivr.net/gh/hagezi/dns-blocklists@release/adblock/nrd.txt"],
|
50
|
+
dns_over_https_bypass: ["https://cdn.jsdelivr.net/gh/hagezi/dns-blocklists@release/adblock/doh-vpn-proxy-bypass.txt"],
|
51
|
+
|
52
|
+
# StevenBlack hosts lists - specific categories only
|
53
|
+
fakenews: ["https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews/hosts"],
|
54
|
+
|
55
|
+
# Security threat lists
|
56
|
+
banking_trojans: ["https://feodotracker.abuse.ch/downloads/ipblocklist.txt"],
|
57
|
+
malware_domains: ["https://bazaar.abuse.ch/downloads/domain_blocklist.txt"],
|
58
|
+
malicious_ssl_certificates: ["https://sslbl.abuse.ch/blacklist/sslipblacklist.txt"],
|
59
|
+
threat_indicators: ["https://threatfox.abuse.ch/downloads/hostfile.txt"],
|
60
|
+
|
61
|
+
# Additional IP-based sanctions and abuse lists
|
62
|
+
sanctions_ips: ["https://lists.blocklist.de/lists/all.txt"],
|
63
|
+
compromised_ips: ["https://rules.emergingthreats.net/fwrules/emerging-Block-IPs.txt"],
|
64
|
+
tor_exit_nodes: ["https://www.dan.me.uk/torlist/"],
|
65
|
+
open_proxy_ips: ["https://raw.githubusercontent.com/stamparm/ipsum/master/ipsum.txt"],
|
66
|
+
|
67
|
+
# Network security feeds
|
68
|
+
top_attack_sources: ["https://www.dshield.org/feeds/suspiciousdomains_High.txt"],
|
69
|
+
suspicious_domains: ["https://www.dshield.org/feeds/suspiciousdomains_Medium.txt"],
|
70
|
+
|
71
|
+
# Extended categories for better organization
|
72
|
+
cryptojacking: ["https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt"],
|
73
|
+
ransomware: ["https://ransomwaretracker.abuse.ch/downloads/RW_DOMBL.txt"],
|
74
|
+
botnet_command_control: ["https://osint.bambenekconsulting.com/feeds/c2-dommasterlist.txt"],
|
75
|
+
phishing_extended: ["https://openphish.com/feed.txt"],
|
76
|
+
|
77
|
+
# Regional and specialized lists
|
78
|
+
chinese_ad_hosts: ["https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts"],
|
79
|
+
korean_ad_hosts: ["https://raw.githubusercontent.com/yous/YousList/master/hosts.txt"],
|
80
|
+
|
81
|
+
# Mobile and app-specific
|
82
|
+
mobile_ads: ["https://raw.githubusercontent.com/AdguardTeam/AdguardFilters/master/MobileFilter/sections/adservers.txt"],
|
83
|
+
smart_tv_ads: ["https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt"],
|
84
|
+
|
85
|
+
# Content and informational categories
|
86
|
+
news: ["https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-only/hosts"],
|
87
|
+
legitimate_news: ["https://raw.githubusercontent.com/mitchellkrogza/The-Big-List-of-Hacked-Malware-Web-Sites/master/.dev-tools/_domains_fake_news/domains.txt"],
|
88
|
+
blogs: ["https://raw.githubusercontent.com/blocklistproject/Lists/master/alt-version/blogs-nl.txt"],
|
89
|
+
forums: ["https://raw.githubusercontent.com/blocklistproject/Lists/master/alt-version/forums-nl.txt"],
|
90
|
+
educational: ["https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/education/hosts"],
|
91
|
+
government: ["https://raw.githubusercontent.com/mitchellkrogza/The-Big-List-of-Hacked-Malware-Web-Sites/master/.dev-tools/_domains_government/domains.txt"],
|
92
|
+
health: ["https://raw.githubusercontent.com/blocklistproject/Lists/master/alt-version/health-nl.txt"],
|
93
|
+
finance: ["https://raw.githubusercontent.com/blocklistproject/Lists/master/alt-version/finance-nl.txt"],
|
94
|
+
streaming: ["https://raw.githubusercontent.com/blocklistproject/Lists/master/alt-version/streaming-nl.txt"],
|
95
|
+
shopping: ["https://raw.githubusercontent.com/blocklistproject/Lists/master/alt-version/shopping-nl.txt"],
|
96
|
+
|
97
|
+
# Professional and business
|
98
|
+
business: ["https://raw.githubusercontent.com/blocklistproject/Lists/master/alt-version/business-nl.txt"],
|
99
|
+
technology: ["https://raw.githubusercontent.com/blocklistproject/Lists/master/alt-version/tech-nl.txt"],
|
100
|
+
|
101
|
+
# Regional content
|
102
|
+
local_news: ["https://raw.githubusercontent.com/blocklistproject/Lists/master/alt-version/local-news-nl.txt"],
|
103
|
+
international_news: ["https://raw.githubusercontent.com/blocklistproject/Lists/master/alt-version/international-news-nl.txt"],
|
42
104
|
}
|
43
105
|
end
|
44
106
|
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
begin
  require 'active_record'
rescue LoadError
  # ActiveRecord not available, skip model definitions
  module UrlCategorise
    # Stub namespace so callers can feature-test with Models.available?
    # without ActiveRecord being installed.
    module Models
      def self.available?
        false
      end
    end
  end
else
  module UrlCategorise
    # ActiveRecord-backed persistence for categorised domains, IP addresses
    # and list metadata. The :categories column on every model holds a
    # JSON-serialized array of category name strings.
    module Models
      def self.available?
        true
      end

      # One row per configured block-list source URL.
      class ListMetadata < ActiveRecord::Base
        self.table_name = 'url_categorise_list_metadata'

        validates :name, presence: true, uniqueness: true
        validates :url, presence: true
        validates :categories, presence: true

        serialize :categories, coder: JSON

        # Match the quoted JSON element so a category such as "news" does
        # not also match rows containing "fakenews" (the previous plain
        # %category% substring match did).
        scope :by_category, ->(category) { where('categories LIKE ?', "%\"#{category}\"%") }
        scope :updated_since, ->(time) { where('updated_at > ?', time) }
      end

      # Domain -> categories lookup table.
      class Domain < ActiveRecord::Base
        self.table_name = 'url_categorise_domains'

        validates :domain, presence: true, uniqueness: true
        validates :categories, presence: true

        serialize :categories, coder: JSON

        # Quoted-element match avoids substring false positives; see
        # the equivalent scope on ListMetadata.
        scope :by_category, ->(category) { where('categories LIKE ?', "%\"#{category}\"%") }
        scope :search, ->(term) { where('domain LIKE ?', "%#{term}%") }

        # Returns the stored categories for a domain (normalised to
        # lowercase with "www." removed), or [] when unknown.
        def self.categorise(domain_name)
          record = find_by(domain: domain_name.downcase.gsub('www.', ''))
          record ? record.categories : []
        end
      end

      # IP address -> categories lookup table.
      class IpAddress < ActiveRecord::Base
        self.table_name = 'url_categorise_ip_addresses'

        validates :ip_address, presence: true, uniqueness: true
        validates :categories, presence: true

        serialize :categories, coder: JSON

        scope :by_category, ->(category) { where('categories LIKE ?', "%\"#{category}\"%") }
        scope :in_subnet, ->(subnet) { where('ip_address LIKE ?', "#{subnet}%") }

        # Returns the stored categories for an exact IP string, or [].
        def self.categorise(ip)
          record = find_by(ip_address: ip)
          record ? record.categories : []
        end
      end

      # Generator for Rails integration: returns the migration source that
      # creates the three tables used above.
      def self.generate_migration
        <<~MIGRATION
          class CreateUrlCategoriseTables < ActiveRecord::Migration[8.0]
            def change
              create_table :url_categorise_list_metadata do |t|
                t.string :name, null: false, index: { unique: true }
                t.string :url, null: false
                t.text :categories, null: false
                t.string :file_path
                t.datetime :fetched_at
                t.string :file_hash
                t.datetime :file_updated_at
                t.timestamps
              end

              create_table :url_categorise_domains do |t|
                t.string :domain, null: false, index: { unique: true }
                t.text :categories, null: false
                t.timestamps
              end

              add_index :url_categorise_domains, :domain
              add_index :url_categorise_domains, :categories

              create_table :url_categorise_ip_addresses do |t|
                t.string :ip_address, null: false, index: { unique: true }
                t.text :categories, null: false
                t.timestamps
              end

              add_index :url_categorise_ip_addresses, :ip_address
              add_index :url_categorise_ip_addresses, :categories
            end
          end
        MIGRATION
      end
    end
  end
end
|
data/lib/url_categorise.rb
CHANGED
@@ -1,5 +1,8 @@
|
|
1
1
|
require 'httparty'
|
2
2
|
require 'nokogiri'
|
3
|
+
require 'digest'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'resolv'
|
3
6
|
|
4
7
|
require 'api-pattern'
|
5
8
|
|
@@ -8,6 +11,14 @@ require 'url_categorise/constants'
|
|
8
11
|
|
9
12
|
require 'url_categorise/client'
|
10
13
|
|
14
|
+
# Optional ActiveRecord integration
|
15
|
+
begin
|
16
|
+
require 'url_categorise/models'
|
17
|
+
require 'url_categorise/active_record_client'
|
18
|
+
rescue LoadError
|
19
|
+
# ActiveRecord not available, skip
|
20
|
+
end
|
21
|
+
|
11
22
|
module UrlCategorise
|
12
23
|
class Error < StandardError; end
|
13
24
|
end
|
data/url_categorise.gemspec
CHANGED
@@ -21,16 +21,24 @@ Gem::Specification.new do |spec|
|
|
21
21
|
spec.bindir = "exe"
|
22
22
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
23
23
|
spec.require_paths = ["lib"]
|
24
|
+
spec.required_ruby_version = ">= 3.0.0"
|
24
25
|
|
25
|
-
spec.add_dependency "api_pattern", "
|
26
|
+
spec.add_dependency "api_pattern", ">= 0.0.5", "< 1.0"
|
27
|
+
spec.add_dependency "httparty", ">= 0.22.0", "< 1.0"
|
28
|
+
spec.add_dependency "nokogiri", ">= 1.16.0", "< 2.0"
|
29
|
+
spec.add_dependency "csv", ">= 3.3.0", "< 4.0"
|
30
|
+
spec.add_dependency "digest", ">= 3.1.0", "< 4.0"
|
31
|
+
spec.add_dependency "fileutils", ">= 1.7.0", "< 2.0"
|
32
|
+
spec.add_dependency "resolv", ">= 0.4.0", "< 1.0"
|
26
33
|
|
27
34
|
# Development dependencies
|
28
|
-
spec.add_development_dependency "rake", "~> 13.0
|
29
|
-
spec.add_development_dependency "minitest", "~> 5.
|
30
|
-
spec.add_development_dependency "minitest-focus", "~> 1.
|
31
|
-
spec.add_development_dependency "minitest-reporters", "~> 1.
|
32
|
-
spec.add_development_dependency "timecop", "~> 0.9.
|
33
|
-
spec.add_development_dependency "mocha", "~> 2.
|
34
|
-
spec.add_development_dependency "pry", "~> 0.
|
35
|
-
spec.add_development_dependency "webmock", "~> 3.
|
35
|
+
spec.add_development_dependency "rake", "~> 13.3.0"
|
36
|
+
spec.add_development_dependency "minitest", "~> 5.25.5"
|
37
|
+
spec.add_development_dependency "minitest-focus", "~> 1.4.0"
|
38
|
+
spec.add_development_dependency "minitest-reporters", "~> 1.7.1"
|
39
|
+
spec.add_development_dependency "timecop", "~> 0.9.10"
|
40
|
+
spec.add_development_dependency "mocha", "~> 2.4.5"
|
41
|
+
spec.add_development_dependency "pry", "~> 0.15.2"
|
42
|
+
spec.add_development_dependency "webmock", "~> 3.24.0"
|
43
|
+
spec.add_development_dependency "simplecov", "~> 0.22.0"
|
36
44
|
end
|