UrlCategorise 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +5 -1
- data/.github/workflows/ci.yml +2 -2
- data/CLAUDE.md +12 -2
- data/Gemfile +2 -2
- data/Gemfile.lock +8 -9
- data/README.md +189 -1
- data/Rakefile +8 -8
- data/bin/check_lists +12 -13
- data/bin/console +3 -3
- data/lib/url_categorise/active_record_client.rb +97 -20
- data/lib/url_categorise/client.rb +220 -111
- data/lib/url_categorise/constants.rb +86 -71
- data/lib/url_categorise/dataset_processor.rb +471 -0
- data/lib/url_categorise/models.rb +53 -14
- data/lib/url_categorise/version.rb +1 -1
- data/lib/url_categorise.rb +1 -0
- data/url_categorise.gemspec +34 -32
- metadata +90 -49
@@ -1,89 +1,104 @@
|
|
1
1
|
module UrlCategorise
|
2
2
|
module Constants
|
3
|
-
ONE_MEGABYTE =
|
3
|
+
ONE_MEGABYTE = 1_048_576
|
4
|
+
|
5
|
+
# crawler data
|
6
|
+
# https://commoncrawl.org/
|
7
|
+
|
8
|
+
# These datasets are usually used to train deep models; here they are used directly.
|
9
|
+
CATEGORIY_DATABASES = [
|
10
|
+
{ type: :kaggle, path: 'shaurov/website-classification-using-url' },
|
11
|
+
{ type: :kaggle, path: 'hetulmehta/website-classification' },
|
12
|
+
{ type: :kaggle, path: 'shawon10/url-classification-dataset-dmoz' },
|
13
|
+
{ type: :csv, path: 'https://query.data.world/s/zackomeddpgotrp3yel66aphvvlcuq?dws=00000' }
|
14
|
+
]
|
15
|
+
|
4
16
|
DEFAULT_HOST_URLS = {
|
5
|
-
abuse: [
|
6
|
-
adobe: [
|
7
|
-
advertising: [
|
8
|
-
amazon: [
|
9
|
-
amp_hosts: [
|
10
|
-
apple: [
|
11
|
-
cloudflare: [
|
12
|
-
crypto: [
|
13
|
-
dating_services: [
|
14
|
-
drugs: [
|
15
|
-
facebook: [
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
17
|
+
abuse: ['https://github.com/blocklistproject/Lists/raw/master/abuse.txt'],
|
18
|
+
adobe: ['https://github.com/blocklistproject/Lists/raw/master/adobe.txt'],
|
19
|
+
advertising: ['https://blocklistproject.github.io/Lists/ads.txt', 'https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-advert_01.txt'],
|
20
|
+
amazon: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/amazon/all'],
|
21
|
+
amp_hosts: ['https://www.github.developerdan.com/hosts/lists/amp-hosts-extended.txt'],
|
22
|
+
apple: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/apple/all'],
|
23
|
+
cloudflare: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/cloudflare/all'],
|
24
|
+
crypto: ['https://github.com/blocklistproject/Lists/raw/master/crypto.txt'],
|
25
|
+
dating_services: ['https://www.github.developerdan.com/hosts/lists/dating-services-extended.txt'],
|
26
|
+
drugs: ['https://github.com/blocklistproject/Lists/raw/master/drugs.txt'],
|
27
|
+
facebook: ['https://github.com/blocklistproject/Lists/raw/master/facebook.txt',
|
28
|
+
'https://www.github.developerdan.com/hosts/lists/facebook-extended.txt', 'https://raw.githubusercontent.com/blocklistproject/Lists/master/facebook.txt', 'https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/all', 'https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/facebook.com'],
|
29
|
+
fraud: ['https://blocklistproject.github.io/Lists/fraud.txt'],
|
30
|
+
gambling: ['https://blocklistproject.github.io/Lists/gambling.txt', 'https://raw.githubusercontent.com/hagezi/dns-blocklists/main/adblock/gambling.txt'],
|
31
|
+
gaming: ['https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-ubisoft.txt',
|
32
|
+
'https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-steam.txt', 'https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-activision.txt', 'https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-blizzard.txt', 'https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-ea.txt', 'https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-epicgames.txt', 'https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-nintendo.txt', 'https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-rockstargames.txt', 'https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-roblox.txt'],
|
33
|
+
google: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/google/all'],
|
34
|
+
hate_and_junk: ['https://www.github.developerdan.com/hosts/lists/hate-and-junk-extended.txt'],
|
35
|
+
instagram: ['https://github.com/jmdugan/blocklists/raw/master/corporations/facebook/instagram'],
|
36
|
+
linkedin: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/microsoft/linkedin'],
|
37
|
+
malware: ['https://blocklistproject.github.io/Lists/malware.txt',
|
38
|
+
'https://feodotracker.abuse.ch/downloads/ipblocklist.txt', 'https://sslbl.abuse.ch/blacklist/sslipblacklist.txt'],
|
39
|
+
microsoft: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/microsoft/all'],
|
40
|
+
mozilla: ['https://github.com/jmdugan/blocklists/raw/master/corporations/mozilla/all'],
|
41
|
+
nsa: ['https://raw.githubusercontent.com/tigthor/NSA-CIA-Blocklist/main/HOSTS/HOSTS'],
|
42
|
+
phishing: ['https://blocklistproject.github.io/Lists/phishing.txt'],
|
43
|
+
pinterest: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/pinterest/all'],
|
44
|
+
piracy: ['https://github.com/blocklistproject/Lists/raw/master/piracy.txt', 'https://github.com/hagezi/dns-blocklists/raw/refs/heads/main/adblock/anti.piracy.txt'],
|
45
|
+
pornography: ['https://blocklistproject.github.io/Lists/porn.txt'],
|
46
|
+
reddit: ['https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-reddit.txt'],
|
47
|
+
redirect: ['https://github.com/blocklistproject/Lists/raw/master/redirect.txt'],
|
48
|
+
scam: ['https://blocklistproject.github.io/Lists/scam.txt'],
|
49
|
+
smart_tv: ['https://github.com/blocklistproject/Lists/raw/master/smart-tv.txt'],
|
50
|
+
social_media: %i[facebook instagram linkedin pinterest reddit tiktok twitter whatsapp youtube],
|
51
|
+
tiktok: ['https://blocklistproject.github.io/Lists/tiktok.txt'],
|
52
|
+
torrent: ['https://github.com/blocklistproject/Lists/raw/master/torrent.txt'],
|
53
|
+
tracking: ['https://blocklistproject.github.io/Lists/tracking.txt'],
|
54
|
+
twitter: ['https://github.com/blocklistproject/Lists/raw/master/twitter.txt', 'https://github.com/jmdugan/blocklists/raw/master/corporations/twitter/all'],
|
55
|
+
vaping: ['https://github.com/blocklistproject/Lists/raw/master/vaping.txt'],
|
56
|
+
whatsapp: ['https://github.com/blocklistproject/Lists/raw/master/whatsapp.txt', 'https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/whatsapp'],
|
57
|
+
youtube: ['https://github.com/blocklistproject/Lists/raw/master/youtube.txt', 'https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/google/youtube'],
|
58
|
+
|
44
59
|
# Hagezi DNS Blocklists - specialized categories only
|
45
|
-
threat_intelligence: [
|
46
|
-
dyndns: [
|
47
|
-
badware_hoster: [
|
48
|
-
most_abused_tlds: [
|
49
|
-
newly_registered_domains: [
|
50
|
-
dns_over_https_bypass: [
|
51
|
-
|
60
|
+
threat_intelligence: ['https://github.com/hagezi/dns-blocklists/raw/refs/heads/main/ips/tif.txt'],
|
61
|
+
dyndns: ['https://github.com/hagezi/dns-blocklists/raw/refs/heads/main/adblock/dyndns.txt'],
|
62
|
+
badware_hoster: ['https://github.com/hagezi/dns-blocklists/raw/refs/heads/main/adblock/hoster.txt'],
|
63
|
+
most_abused_tlds: ['https://github.com/hagezi/dns-blocklists/raw/refs/heads/main/adblock/spam-tlds.txt'],
|
64
|
+
newly_registered_domains: ['https://github.com/xRuffKez/NRD/raw/refs/heads/main/lists/14-day/adblock/nrd-14day_adblock.txt'],
|
65
|
+
dns_over_https_bypass: ['https://github.com/hagezi/dns-blocklists/raw/refs/heads/main/adblock/doh-vpn-proxy-bypass.txt'],
|
66
|
+
|
52
67
|
# StevenBlack hosts lists - specific categories only
|
53
|
-
fakenews: [
|
54
|
-
|
68
|
+
fakenews: ['https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews/hosts'],
|
69
|
+
|
55
70
|
# Security threat lists
|
56
|
-
threat_indicators: [
|
57
|
-
|
71
|
+
threat_indicators: ['https://threatfox.abuse.ch/downloads/hostfile.txt'],
|
72
|
+
|
58
73
|
# Additional IP-based sanctions and abuse lists
|
59
|
-
sanctions_ips: [
|
60
|
-
compromised_ips: [
|
61
|
-
tor_exit_nodes: [
|
62
|
-
open_proxy_ips: [
|
63
|
-
|
74
|
+
sanctions_ips: ['https://lists.blocklist.de/lists/all.txt'],
|
75
|
+
compromised_ips: ['https://rules.emergingthreats.net/fwrules/emerging-Block-IPs.txt'],
|
76
|
+
tor_exit_nodes: ['https://www.dan.me.uk/torlist/'],
|
77
|
+
open_proxy_ips: ['https://raw.githubusercontent.com/stamparm/ipsum/master/ipsum.txt'],
|
78
|
+
|
64
79
|
# Network security feeds
|
65
|
-
top_attack_sources: [
|
66
|
-
suspicious_domains: [
|
67
|
-
|
80
|
+
top_attack_sources: ['https://www.dshield.org/feeds/suspiciousdomains_High.txt'],
|
81
|
+
suspicious_domains: ['https://www.dshield.org/feeds/suspiciousdomains_Medium.txt'],
|
82
|
+
|
68
83
|
# Extended categories for better organization
|
69
|
-
cryptojacking: [
|
84
|
+
cryptojacking: ['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt'],
|
70
85
|
# ransomware: ["https://ransomwaretracker.abuse.ch/downloads/RW_DOMBL.txt"],
|
71
86
|
# botnet_command_control: ["https://osint.bambenekconsulting.com/feeds/c2-dommasterlist.txt"], # URL returns 403 Forbidden
|
72
|
-
phishing_extended: [
|
73
|
-
|
87
|
+
phishing_extended: ['https://openphish.com/feed.txt'],
|
88
|
+
|
74
89
|
# Regional and specialized lists
|
75
|
-
chinese_ad_hosts: [
|
76
|
-
korean_ad_hosts: [
|
77
|
-
|
90
|
+
chinese_ad_hosts: ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts'],
|
91
|
+
korean_ad_hosts: ['https://raw.githubusercontent.com/yous/YousList/master/hosts.txt'],
|
92
|
+
|
78
93
|
# Mobile and app-specific
|
79
|
-
mobile_ads: [
|
80
|
-
smart_tv_ads: [
|
81
|
-
|
94
|
+
mobile_ads: ['https://raw.githubusercontent.com/AdguardTeam/AdguardFilters/master/MobileFilter/sections/adservers.txt'],
|
95
|
+
smart_tv_ads: ['https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt'],
|
96
|
+
|
82
97
|
# Content and informational categories
|
83
|
-
news: [
|
84
|
-
#
|
98
|
+
news: ['https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-only/hosts']
|
99
|
+
# NOTE: The following categories had broken URLs and have been commented out:
|
85
100
|
# legitimate_news: URLs from mitchellkrogza repository return 404
|
86
|
-
# blogs, forums, health, finance, streaming, shopping: blocklistproject alt-version URLs return 404
|
101
|
+
# blogs, forums, health, finance, streaming, shopping: blocklistproject alt-version URLs return 404
|
87
102
|
# educational: StevenBlack educational hosts URL returns 404
|
88
103
|
# government: mitchellkrogza government domains URL returns 404
|
89
104
|
# business, technology: blocklistproject alt-version URLs return 404
|