UrlCategorise 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +25 -1
- data/lib/url_categorise/client.rb +55 -14
- data/lib/url_categorise/constants.rb +28 -20
- data/lib/url_categorise/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6fd17f1e01f20ae4e0efa2ce243c6d32fd24f4a184a52530b158a4edf80ffdfd
|
4
|
+
data.tar.gz: ab9356f712aaf3f7087814cfb3e56cca522a59321bbc5216f141b4b81f1eab71
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7f0cadcb9a7254e3d964708ab7b073f2df1f320062f2b7310070381c79bbca0e3d4e6886ecfb29f2a881eb19c779a2a41b6583d2c5787b432eca5d5d63090d50
|
7
|
+
data.tar.gz: 590195a7abfcb38110f30f83ec0eb4baece4f18d42d4eed1864f468736948887ced5f6ffbe2a78714d928372566793d6dd453653e0b2c2970aa1a628e254c322
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -15,15 +15,39 @@ And then execute:
|
|
15
15
|
|
16
16
|
Or install it yourself as:
|
17
17
|
|
18
|
-
$ gem install
|
18
|
+
$ gem install UrlCategorise
|
19
19
|
|
20
20
|
## Usage
|
21
|
+
I picked the default host lists because they are split into separate categories.
|
22
|
+
I did not select them for the quality of their data.
|
23
|
+
Use at your own risk!
|
21
24
|
|
22
25
|
```ruby
|
23
26
|
require 'url_categorise'
|
24
27
|
client = UrlCategorise::Client.new
|
25
28
|
|
29
|
+
client.count_of_hosts
|
30
|
+
client.count_of_categories
|
31
|
+
client.size_of_data
|
26
32
|
|
33
|
+
client.categorise(url)
|
34
|
+
|
35
|
+
# You can also initialise the client using a custom dataset
|
36
|
+
host_urls = {
|
37
|
+
abuse: ["https://github.com/blocklistproject/Lists/raw/master/abuse.txt"]
|
38
|
+
}
|
39
|
+
|
40
|
+
require 'url_categorise'
|
41
|
+
client = UrlCategorise::Client.new(host_urls: host_urls)
|
42
|
+
|
43
|
+
# You can also define symbols to combine other categories
|
44
|
+
host_urls = {
|
45
|
+
abuse: ["https://github.com/blocklistproject/Lists/raw/master/abuse.txt"],
|
46
|
+
bad_links: [:abuse]
|
47
|
+
}
|
48
|
+
|
49
|
+
require 'url_categorise'
|
50
|
+
client = UrlCategorise::Client.new(host_urls: host_urls)
|
27
51
|
```
|
28
52
|
|
29
53
|
## Development
|
@@ -4,27 +4,19 @@ module UrlCategorise
|
|
4
4
|
|
5
5
|
attr_reader :host_urls, :hosts
|
6
6
|
|
7
|
+
# TODO: Save to folder
|
8
|
+
# TODO: Read from disk the database
|
7
9
|
# TODO: Sanctioned IPs
|
8
|
-
# TODO: More default lists
|
9
10
|
# TODO: ActiveRecord support
|
10
11
|
# TODO: List of abuse IPs
|
11
|
-
# TODO: https://github.com/blocklistproject/Lists
|
12
|
-
# TODO: https://github.com/nickoppen/pihole-blocklists
|
13
12
|
def initialize(host_urls: DEFAULT_HOST_URLS)
|
14
13
|
@host_urls = host_urls
|
15
|
-
|
16
|
-
end
|
17
|
-
|
18
|
-
def self.compatible_api_version
|
19
|
-
'v1'
|
20
|
-
end
|
21
|
-
|
22
|
-
def self.api_version
|
23
|
-
'v2 2023-05-19'
|
14
|
+
@hosts = fetch_and_build_host_lists
|
24
15
|
end
|
25
16
|
|
26
17
|
def categorise(url)
|
27
18
|
host = (URI.parse(url).host || url).downcase
|
19
|
+
host = host.gsub("www.", "")
|
28
20
|
|
29
21
|
@hosts.keys.select do |category|
|
30
22
|
@hosts[category].include?(host)
|
@@ -42,7 +34,17 @@ module UrlCategorise
|
|
42
34
|
end
|
43
35
|
|
44
36
|
def size_of_data
|
37
|
+
hash_size_in_mb(@hosts)
|
38
|
+
end
|
39
|
+
|
40
|
+
private
|
45
41
|
|
42
|
+
def hash_size_in_mb(hash)
|
43
|
+
size = 0
|
44
|
+
hash.each do |key, value|
|
45
|
+
size += value.join.length
|
46
|
+
end
|
47
|
+
(size / 1.megabyte).round(2)
|
46
48
|
end
|
47
49
|
|
48
50
|
def fetch_and_build_host_lists
|
@@ -52,18 +54,57 @@ module UrlCategorise
|
|
52
54
|
@hosts[category] = build_host_data(host_urls[category])
|
53
55
|
end
|
54
56
|
|
57
|
+
sub_category_values = categories_with_keys
|
58
|
+
sub_category_values.keys.each do |category|
|
59
|
+
original_value = @hosts[category] || []
|
60
|
+
|
61
|
+
extra_category_values = sub_category_values[category].each do |sub_category|
|
62
|
+
@hosts[sub_category]
|
63
|
+
end
|
64
|
+
|
65
|
+
original_value << extra_category_values
|
66
|
+
@hosts[category] = original_value
|
67
|
+
end
|
68
|
+
|
55
69
|
@hosts
|
56
70
|
end
|
57
71
|
|
58
72
|
def build_host_data(urls)
|
59
73
|
urls.map do |url|
|
74
|
+
next unless url_valid?(url)
|
75
|
+
|
60
76
|
raw_data = HTTParty.get(url)
|
61
77
|
raw_data.split("\n").reject do |line|
|
62
|
-
line
|
78
|
+
line[0] == "#"
|
63
79
|
end.map do |line|
|
64
|
-
line.gsub("0.0.0.0 ", "")
|
80
|
+
line.split(' ')[1] # Select the domain name # gsub("0.0.0.0 ", "")
|
65
81
|
end
|
66
82
|
end.flatten.compact.sort
|
67
83
|
end
|
84
|
+
|
85
|
+
def categories_with_keys
|
86
|
+
keyed_categories = {}
|
87
|
+
|
88
|
+
host_urls.keys.each do |category|
|
89
|
+
category_values = host_urls[category].select do |url|
|
90
|
+
url_not_valid?(url) && url.is_a?(Symbol)
|
91
|
+
end
|
92
|
+
|
93
|
+
keyed_categories[category] = category_values
|
94
|
+
end
|
95
|
+
|
96
|
+
keyed_categories
|
97
|
+
end
|
98
|
+
|
99
|
+
def url_not_valid?(url)
|
100
|
+
url_valid?(url)
|
101
|
+
end
|
102
|
+
|
103
|
+
def url_valid?(url)
|
104
|
+
uri = URI.parse(url)
|
105
|
+
uri.is_a?(URI::HTTP) && !uri.host.nil?
|
106
|
+
rescue URI::InvalidURIError
|
107
|
+
false
|
108
|
+
end
|
68
109
|
end
|
69
110
|
end
|
@@ -1,36 +1,44 @@
|
|
1
1
|
module UrlCategorise
|
2
2
|
module Constants
|
3
|
-
# Resources used:
|
4
|
-
# https://blocklist.site/#
|
5
|
-
# https://github.com/lightswitch05/hosts
|
6
|
-
#
|
7
3
|
DEFAULT_HOST_URLS = {
|
8
|
-
|
4
|
+
abuse: ["https://github.com/blocklistproject/Lists/raw/master/abuse.txt"],
|
5
|
+
adobe: ["https://github.com/blocklistproject/Lists/raw/master/adobe.txt"],
|
6
|
+
advertising: ["https://blocklistproject.github.io/Lists/ads.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-advert_01.txt"],
|
7
|
+
amazon: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/amazon/all"],
|
9
8
|
amp_hosts: ["https://www.github.developerdan.com/hosts/lists/amp-hosts-extended.txt"],
|
9
|
+
apple: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/apple/all"],
|
10
|
+
cloudflare: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/cloudflare/all"],
|
11
|
+
crypto: ["https://github.com/blocklistproject/Lists/raw/master/crypto.txt"],
|
10
12
|
dating_services: ["https://www.github.developerdan.com/hosts/lists/dating-services-extended.txt"],
|
11
|
-
|
13
|
+
drugs: ["https://github.com/blocklistproject/Lists/raw/master/drugs.txt"],
|
14
|
+
facebook: ["https://github.com/blocklistproject/Lists/raw/master/facebook.txt", "https://www.github.developerdan.com/hosts/lists/facebook-extended.txt", "https://raw.githubusercontent.com/blocklistproject/Lists/master/facebook.txt", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/all", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/facebook.com"],
|
12
15
|
fraud: ["https://blocklistproject.github.io/Lists/fraud.txt"],
|
13
16
|
gambling: ["https://blocklistproject.github.io/Lists/gambling.txt"],
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
+
gaming: ["https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-ubisoft.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-steam.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-activision.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-blizzard.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-ea.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-epicgames.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-nintendo.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-rockstargames.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-roblox.txt"],
|
18
|
+
google: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/google/all"],
|
19
|
+
hate_and_junk: ["https://www.github.developerdan.com/hosts/lists/hate-and-junk-extended.txt"],
|
20
|
+
instagram: ["https://github.com/jmdugan/blocklists/raw/master/corporations/facebook/instagram"],
|
21
|
+
linkedin: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/microsoft/linkedin"],
|
22
|
+
malware: ["https://blocklistproject.github.io/Lists/malware.txt", "http://www.malwaredomainlist.com/hostslist/hosts.txt"],
|
23
|
+
microsoft: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/microsoft/all"],
|
24
|
+
mozilla: ["https://github.com/jmdugan/blocklists/raw/master/corporations/mozilla/all"],
|
25
|
+
nsa: ["https://raw.githubusercontent.com/tigthor/NSA-CIA-Blocklist/main/HOSTS/HOSTS"],
|
17
26
|
phishing: ["https://blocklistproject.github.io/Lists/phishing.txt"],
|
27
|
+
pinterest: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/pinterest/all"],
|
28
|
+
piracy: ["https://github.com/blocklistproject/Lists/raw/master/piracy.txt"],
|
18
29
|
pornography: ["https://blocklistproject.github.io/Lists/porn.txt"],
|
30
|
+
reddit: ["https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-reddit.txt"],
|
31
|
+
redirect: ["https://github.com/blocklistproject/Lists/raw/master/redirect.txt"],
|
19
32
|
scam: ["https://blocklistproject.github.io/Lists/scam.txt"],
|
33
|
+
smart_tv: ["https://github.com/blocklistproject/Lists/raw/master/smart-tv.txt"],
|
34
|
+
social_media: [:facebook, :instagram, :linkedin, :pinterest, :reddit,:tiktok, :twitter, :whatsapp, :youtube],
|
20
35
|
tiktok: ["https://blocklistproject.github.io/Lists/tiktok.txt"],
|
36
|
+
torrent: ["https://github.com/blocklistproject/Lists/raw/master/torrent.txt"],
|
21
37
|
tracking: ["https://blocklistproject.github.io/Lists/tracking.txt"],
|
22
|
-
twitter: ["https://github.com/blocklistproject/Lists/raw/master/twitter.txt"],
|
38
|
+
twitter: ["https://github.com/blocklistproject/Lists/raw/master/twitter.txt", "https://github.com/jmdugan/blocklists/raw/master/corporations/twitter/all"],
|
23
39
|
vaping: ["https://github.com/blocklistproject/Lists/raw/master/vaping.txt"],
|
24
|
-
whatsapp: ["https://github.com/blocklistproject/Lists/raw/master/whatsapp.txt"],
|
25
|
-
youtube: ["https://github.com/blocklistproject/Lists/raw/master/youtube.txt"],
|
26
|
-
torrent: ["https://github.com/blocklistproject/Lists/raw/master/torrent.txt"],
|
27
|
-
smart_tv: ["https://github.com/blocklistproject/Lists/raw/master/smart-tv.txt"],
|
28
|
-
redirect: ["https://github.com/blocklistproject/Lists/raw/master/redirect.txt"],
|
29
|
-
piracy: ["https://github.com/blocklistproject/Lists/raw/master/piracy.txt"],
|
30
|
-
drugs: ["https://github.com/blocklistproject/Lists/raw/master/drugs.txt"],
|
31
|
-
crypto: ["https://github.com/blocklistproject/Lists/raw/master/crypto.txt"],
|
32
|
-
adobe: ["https://github.com/blocklistproject/Lists/raw/master/adobe.txt"],
|
33
|
-
abuse: ["https://github.com/blocklistproject/Lists/raw/master/abuse.txt"],
|
40
|
+
whatsapp: ["https://github.com/blocklistproject/Lists/raw/master/whatsapp.txt", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/whatsapp"],
|
41
|
+
youtube: ["https://github.com/blocklistproject/Lists/raw/master/youtube.txt", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/google/youtube"],
|
34
42
|
}
|
35
43
|
end
|
36
44
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: UrlCategorise
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- trex22
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-05-
|
11
|
+
date: 2023-05-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: api_pattern
|