UrlCategorise 0.1.2 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,89 +1,104 @@
1
1
  module UrlCategorise
2
2
  module Constants
3
- ONE_MEGABYTE = 1048576
3
+ ONE_MEGABYTE = 1_048_576
4
+
5
+ # crawler data
6
+ # https://commoncrawl.org/
7
+
8
+ # These datasets are usually used to train deep models; here they are used directly.
9
+ CATEGORIY_DATABASES = [
10
+ { type: :kaggle, path: 'shaurov/website-classification-using-url' },
11
+ { type: :kaggle, path: 'hetulmehta/website-classification' },
12
+ { type: :kaggle, path: 'shawon10/url-classification-dataset-dmoz' },
13
+ { type: :csv, path: 'https://query.data.world/s/zackomeddpgotrp3yel66aphvvlcuq?dws=00000' }
14
+ ]
15
+
4
16
  DEFAULT_HOST_URLS = {
5
- abuse: ["https://github.com/blocklistproject/Lists/raw/master/abuse.txt"],
6
- adobe: ["https://github.com/blocklistproject/Lists/raw/master/adobe.txt"],
7
- advertising: ["https://blocklistproject.github.io/Lists/ads.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-advert_01.txt"],
8
- amazon: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/amazon/all"],
9
- amp_hosts: ["https://www.github.developerdan.com/hosts/lists/amp-hosts-extended.txt"],
10
- apple: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/apple/all"],
11
- cloudflare: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/cloudflare/all"],
12
- crypto: ["https://github.com/blocklistproject/Lists/raw/master/crypto.txt"],
13
- dating_services: ["https://www.github.developerdan.com/hosts/lists/dating-services-extended.txt"],
14
- drugs: ["https://github.com/blocklistproject/Lists/raw/master/drugs.txt"],
15
- facebook: ["https://github.com/blocklistproject/Lists/raw/master/facebook.txt", "https://www.github.developerdan.com/hosts/lists/facebook-extended.txt", "https://raw.githubusercontent.com/blocklistproject/Lists/master/facebook.txt", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/all", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/facebook.com"],
16
- fraud: ["https://blocklistproject.github.io/Lists/fraud.txt"],
17
- gambling: ["https://blocklistproject.github.io/Lists/gambling.txt", "https://raw.githubusercontent.com/hagezi/dns-blocklists/main/adblock/gambling.txt"],
18
- gaming: ["https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-ubisoft.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-steam.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-activision.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-blizzard.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-ea.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-epicgames.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-nintendo.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-rockstargames.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-roblox.txt"],
19
- google: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/google/all"],
20
- hate_and_junk: ["https://www.github.developerdan.com/hosts/lists/hate-and-junk-extended.txt"],
21
- instagram: ["https://github.com/jmdugan/blocklists/raw/master/corporations/facebook/instagram"],
22
- linkedin: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/microsoft/linkedin"],
23
- malware: ["https://blocklistproject.github.io/Lists/malware.txt", "https://feodotracker.abuse.ch/downloads/ipblocklist.txt", "https://sslbl.abuse.ch/blacklist/sslipblacklist.txt"],
24
- microsoft: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/microsoft/all"],
25
- mozilla: ["https://github.com/jmdugan/blocklists/raw/master/corporations/mozilla/all"],
26
- nsa: ["https://raw.githubusercontent.com/tigthor/NSA-CIA-Blocklist/main/HOSTS/HOSTS"],
27
- phishing: ["https://blocklistproject.github.io/Lists/phishing.txt"],
28
- pinterest: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/pinterest/all"],
29
- piracy: ["https://github.com/blocklistproject/Lists/raw/master/piracy.txt", "https://github.com/hagezi/dns-blocklists/raw/refs/heads/main/adblock/anti.piracy.txt"],
30
- pornography: ["https://blocklistproject.github.io/Lists/porn.txt"],
31
- reddit: ["https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-reddit.txt"],
32
- redirect: ["https://github.com/blocklistproject/Lists/raw/master/redirect.txt"],
33
- scam: ["https://blocklistproject.github.io/Lists/scam.txt"],
34
- smart_tv: ["https://github.com/blocklistproject/Lists/raw/master/smart-tv.txt"],
35
- social_media: [:facebook, :instagram, :linkedin, :pinterest, :reddit, :tiktok, :twitter, :whatsapp, :youtube],
36
- tiktok: ["https://blocklistproject.github.io/Lists/tiktok.txt"],
37
- torrent: ["https://github.com/blocklistproject/Lists/raw/master/torrent.txt"],
38
- tracking: ["https://blocklistproject.github.io/Lists/tracking.txt"],
39
- twitter: ["https://github.com/blocklistproject/Lists/raw/master/twitter.txt", "https://github.com/jmdugan/blocklists/raw/master/corporations/twitter/all"],
40
- vaping: ["https://github.com/blocklistproject/Lists/raw/master/vaping.txt"],
41
- whatsapp: ["https://github.com/blocklistproject/Lists/raw/master/whatsapp.txt", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/whatsapp"],
42
- youtube: ["https://github.com/blocklistproject/Lists/raw/master/youtube.txt", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/google/youtube"],
43
-
17
+ abuse: ['https://github.com/blocklistproject/Lists/raw/master/abuse.txt'],
18
+ adobe: ['https://github.com/blocklistproject/Lists/raw/master/adobe.txt'],
19
+ advertising: ['https://blocklistproject.github.io/Lists/ads.txt', 'https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-advert_01.txt'],
20
+ amazon: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/amazon/all'],
21
+ amp_hosts: ['https://www.github.developerdan.com/hosts/lists/amp-hosts-extended.txt'],
22
+ apple: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/apple/all'],
23
+ cloudflare: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/cloudflare/all'],
24
+ crypto: ['https://github.com/blocklistproject/Lists/raw/master/crypto.txt'],
25
+ dating_services: ['https://www.github.developerdan.com/hosts/lists/dating-services-extended.txt'],
26
+ drugs: ['https://github.com/blocklistproject/Lists/raw/master/drugs.txt'],
27
+ facebook: ['https://github.com/blocklistproject/Lists/raw/master/facebook.txt',
28
+ 'https://www.github.developerdan.com/hosts/lists/facebook-extended.txt', 'https://raw.githubusercontent.com/blocklistproject/Lists/master/facebook.txt', 'https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/all', 'https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/facebook.com'],
29
+ fraud: ['https://blocklistproject.github.io/Lists/fraud.txt'],
30
+ gambling: ['https://blocklistproject.github.io/Lists/gambling.txt', 'https://raw.githubusercontent.com/hagezi/dns-blocklists/main/adblock/gambling.txt'],
31
+ gaming: ['https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-ubisoft.txt',
32
+ 'https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-steam.txt', 'https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-activision.txt', 'https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-blizzard.txt', 'https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-ea.txt', 'https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-epicgames.txt', 'https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-nintendo.txt', 'https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-rockstargames.txt', 'https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-roblox.txt'],
33
+ google: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/google/all'],
34
+ hate_and_junk: ['https://www.github.developerdan.com/hosts/lists/hate-and-junk-extended.txt'],
35
+ instagram: ['https://github.com/jmdugan/blocklists/raw/master/corporations/facebook/instagram'],
36
+ linkedin: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/microsoft/linkedin'],
37
+ malware: ['https://blocklistproject.github.io/Lists/malware.txt',
38
+ 'https://feodotracker.abuse.ch/downloads/ipblocklist.txt', 'https://sslbl.abuse.ch/blacklist/sslipblacklist.txt'],
39
+ microsoft: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/microsoft/all'],
40
+ mozilla: ['https://github.com/jmdugan/blocklists/raw/master/corporations/mozilla/all'],
41
+ nsa: ['https://raw.githubusercontent.com/tigthor/NSA-CIA-Blocklist/main/HOSTS/HOSTS'],
42
+ phishing: ['https://blocklistproject.github.io/Lists/phishing.txt'],
43
+ pinterest: ['https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/pinterest/all'],
44
+ piracy: ['https://github.com/blocklistproject/Lists/raw/master/piracy.txt', 'https://github.com/hagezi/dns-blocklists/raw/refs/heads/main/adblock/anti.piracy.txt'],
45
+ pornography: ['https://blocklistproject.github.io/Lists/porn.txt'],
46
+ reddit: ['https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-reddit.txt'],
47
+ redirect: ['https://github.com/blocklistproject/Lists/raw/master/redirect.txt'],
48
+ scam: ['https://blocklistproject.github.io/Lists/scam.txt'],
49
+ smart_tv: ['https://github.com/blocklistproject/Lists/raw/master/smart-tv.txt'],
50
+ social_media: %i[facebook instagram linkedin pinterest reddit tiktok twitter whatsapp youtube],
51
+ tiktok: ['https://blocklistproject.github.io/Lists/tiktok.txt'],
52
+ torrent: ['https://github.com/blocklistproject/Lists/raw/master/torrent.txt'],
53
+ tracking: ['https://blocklistproject.github.io/Lists/tracking.txt'],
54
+ twitter: ['https://github.com/blocklistproject/Lists/raw/master/twitter.txt', 'https://github.com/jmdugan/blocklists/raw/master/corporations/twitter/all'],
55
+ vaping: ['https://github.com/blocklistproject/Lists/raw/master/vaping.txt'],
56
+ whatsapp: ['https://github.com/blocklistproject/Lists/raw/master/whatsapp.txt', 'https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/whatsapp'],
57
+ youtube: ['https://github.com/blocklistproject/Lists/raw/master/youtube.txt', 'https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/google/youtube'],
58
+
44
59
  # Hagezi DNS Blocklists - specialized categories only
45
- threat_intelligence: ["https://github.com/hagezi/dns-blocklists/raw/refs/heads/main/ips/tif.txt"],
46
- dyndns: ["https://github.com/hagezi/dns-blocklists/raw/refs/heads/main/adblock/dyndns.txt"],
47
- badware_hoster: ["https://github.com/hagezi/dns-blocklists/raw/refs/heads/main/adblock/hoster.txt"],
48
- most_abused_tlds: ["https://github.com/hagezi/dns-blocklists/raw/refs/heads/main/adblock/spam-tlds.txt"],
49
- newly_registered_domains: ["https://github.com/xRuffKez/NRD/raw/refs/heads/main/lists/14-day/adblock/nrd-14day_adblock.txt"],
50
- dns_over_https_bypass: ["https://github.com/hagezi/dns-blocklists/raw/refs/heads/main/adblock/doh-vpn-proxy-bypass.txt"],
51
-
60
+ threat_intelligence: ['https://github.com/hagezi/dns-blocklists/raw/refs/heads/main/ips/tif.txt'],
61
+ dyndns: ['https://github.com/hagezi/dns-blocklists/raw/refs/heads/main/adblock/dyndns.txt'],
62
+ badware_hoster: ['https://github.com/hagezi/dns-blocklists/raw/refs/heads/main/adblock/hoster.txt'],
63
+ most_abused_tlds: ['https://github.com/hagezi/dns-blocklists/raw/refs/heads/main/adblock/spam-tlds.txt'],
64
+ newly_registered_domains: ['https://github.com/xRuffKez/NRD/raw/refs/heads/main/lists/14-day/adblock/nrd-14day_adblock.txt'],
65
+ dns_over_https_bypass: ['https://github.com/hagezi/dns-blocklists/raw/refs/heads/main/adblock/doh-vpn-proxy-bypass.txt'],
66
+
52
67
  # StevenBlack hosts lists - specific categories only
53
- fakenews: ["https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews/hosts"],
54
-
68
+ fakenews: ['https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews/hosts'],
69
+
55
70
  # Security threat lists
56
- threat_indicators: ["https://threatfox.abuse.ch/downloads/hostfile.txt"],
57
-
71
+ threat_indicators: ['https://threatfox.abuse.ch/downloads/hostfile.txt'],
72
+
58
73
  # Additional IP-based sanctions and abuse lists
59
- sanctions_ips: ["https://lists.blocklist.de/lists/all.txt"],
60
- compromised_ips: ["https://rules.emergingthreats.net/fwrules/emerging-Block-IPs.txt"],
61
- tor_exit_nodes: ["https://www.dan.me.uk/torlist/"],
62
- open_proxy_ips: ["https://raw.githubusercontent.com/stamparm/ipsum/master/ipsum.txt"],
63
-
74
+ sanctions_ips: ['https://lists.blocklist.de/lists/all.txt'],
75
+ compromised_ips: ['https://rules.emergingthreats.net/fwrules/emerging-Block-IPs.txt'],
76
+ tor_exit_nodes: ['https://www.dan.me.uk/torlist/'],
77
+ open_proxy_ips: ['https://raw.githubusercontent.com/stamparm/ipsum/master/ipsum.txt'],
78
+
64
79
  # Network security feeds
65
- top_attack_sources: ["https://www.dshield.org/feeds/suspiciousdomains_High.txt"],
66
- suspicious_domains: ["https://www.dshield.org/feeds/suspiciousdomains_Medium.txt"],
67
-
80
+ top_attack_sources: ['https://www.dshield.org/feeds/suspiciousdomains_High.txt'],
81
+ suspicious_domains: ['https://www.dshield.org/feeds/suspiciousdomains_Medium.txt'],
82
+
68
83
  # Extended categories for better organization
69
- cryptojacking: ["https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt"],
84
+ cryptojacking: ['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt'],
70
85
  # ransomware: ["https://ransomwaretracker.abuse.ch/downloads/RW_DOMBL.txt"],
71
86
  # botnet_command_control: ["https://osint.bambenekconsulting.com/feeds/c2-dommasterlist.txt"], # URL returns 403 Forbidden
72
- phishing_extended: ["https://openphish.com/feed.txt"],
73
-
87
+ phishing_extended: ['https://openphish.com/feed.txt'],
88
+
74
89
  # Regional and specialized lists
75
- chinese_ad_hosts: ["https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts"],
76
- korean_ad_hosts: ["https://raw.githubusercontent.com/yous/YousList/master/hosts.txt"],
77
-
90
+ chinese_ad_hosts: ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts'],
91
+ korean_ad_hosts: ['https://raw.githubusercontent.com/yous/YousList/master/hosts.txt'],
92
+
78
93
  # Mobile and app-specific
79
- mobile_ads: ["https://raw.githubusercontent.com/AdguardTeam/AdguardFilters/master/MobileFilter/sections/adservers.txt"],
80
- smart_tv_ads: ["https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt"],
81
-
94
+ mobile_ads: ['https://raw.githubusercontent.com/AdguardTeam/AdguardFilters/master/MobileFilter/sections/adservers.txt'],
95
+ smart_tv_ads: ['https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt'],
96
+
82
97
  # Content and informational categories
83
- news: ["https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-only/hosts"],
84
- # Note: The following categories had broken URLs and have been commented out:
98
+ news: ['https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-only/hosts']
99
+ # NOTE: The following categories had broken URLs and have been commented out:
85
100
  # legitimate_news: URLs from mitchellkrogza repository return 404
86
- # blogs, forums, health, finance, streaming, shopping: blocklistproject alt-version URLs return 404
101
+ # blogs, forums, health, finance, streaming, shopping: blocklistproject alt-version URLs return 404
87
102
  # educational: StevenBlack educational hosts URL returns 404
88
103
  # government: mitchellkrogza government domains URL returns 404
89
104
  # business, technology: blocklistproject alt-version URLs return 404