UrlCategorise 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9ed8fbd044a6405977b3e720454c6eb3bfebdc6ee38e6b00f860ac2e3d49d3da
4
- data.tar.gz: c795f83ee5ce691af751f19681eaec865c2a01aadc27ef120c87ed7bf9ce339f
3
+ metadata.gz: 6fd17f1e01f20ae4e0efa2ce243c6d32fd24f4a184a52530b158a4edf80ffdfd
4
+ data.tar.gz: ab9356f712aaf3f7087814cfb3e56cca522a59321bbc5216f141b4b81f1eab71
5
5
  SHA512:
6
- metadata.gz: e4aa69b7edecd520db0a2df0ea48498f730f6ddd49d6f0d3d116954aaa7b486ae87c395b655aa81523e43a4189536f7b3d5033443e5f5ecae649c445ff2b178f
7
- data.tar.gz: c00b7d76d298538dcb55314cbda83dde5daa5bc78c2661bf44d818a62d663050957a6ff2d9063ec46c4cece3b92a18c14a471fdf67534b073f8a20fa560ad01d
6
+ metadata.gz: 7f0cadcb9a7254e3d964708ab7b073f2df1f320062f2b7310070381c79bbca0e3d4e6886ecfb29f2a881eb19c779a2a41b6583d2c5787b432eca5d5d63090d50
7
+ data.tar.gz: 590195a7abfcb38110f30f83ec0eb4baece4f18d42d4eed1864f468736948887ced5f6ffbe2a78714d928372566793d6dd453653e0b2c2970aa1a628e254c322
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- UrlCategorise (0.0.1)
4
+ UrlCategorise (0.0.2)
5
5
  api_pattern (~> 0.0.4)
6
6
 
7
7
  GEM
data/README.md CHANGED
@@ -15,15 +15,39 @@ And then execute:
15
15
 
16
16
  Or install it yourself as:
17
17
 
18
- $ gem install url_categorise
18
+ $ gem install UrlCategorise
19
19
 
20
20
  ## Usage
21
+ The default host lists were picked because they are split into separate categories,
22
+ not for the quality of their data.
23
+ Use at your own risk!
21
24
 
22
25
  ```ruby
23
26
  require 'url_categorise'
24
27
  client = UrlCategorise::Client.new
25
28
 
29
+ client.count_of_hosts
30
+ client.count_of_categories
31
+ client.size_of_data
26
32
 
33
+ client.categorise(url)
34
+
35
+ # Can also initialise the client using a custom dataset
36
+ host_urls = {
37
+ abuse: ["https://github.com/blocklistproject/Lists/raw/master/abuse.txt"]
38
+ }
39
+
40
+ require 'url_categorise'
41
+ client = UrlCategorise::Client.new(host_urls: host_urls)
42
+
43
+ # You can also define symbols to combine other categories
44
+ host_urls = {
45
+ abuse: ["https://github.com/blocklistproject/Lists/raw/master/abuse.txt"],
46
+ bad_links: [:abuse]
47
+ }
48
+
49
+ require 'url_categorise'
50
+ client = UrlCategorise::Client.new(host_urls: host_urls)
27
51
  ```
28
52
 
29
53
  ## Development
@@ -4,27 +4,19 @@ module UrlCategorise
4
4
 
5
5
  attr_reader :host_urls, :hosts
6
6
 
7
+ # TODO: Save to folder
8
+ # TODO: Read from disk the database
7
9
  # TODO: Sanctioned IPs
8
- # TODO: More default lists
9
10
  # TODO: ActiveRecord support
10
11
  # TODO: List of abuse IPs
11
- # TODO: https://github.com/blocklistproject/Lists
12
- # TODO: https://github.com/nickoppen/pihole-blocklists
13
12
  def initialize(host_urls: DEFAULT_HOST_URLS)
14
13
  @host_urls = host_urls
15
- # @hosts = fetch_and_build_host_lists
16
- end
17
-
18
- def self.compatible_api_version
19
- 'v1'
20
- end
21
-
22
- def self.api_version
23
- 'v2 2023-05-19'
14
+ @hosts = fetch_and_build_host_lists
24
15
  end
25
16
 
26
17
  def categorise(url)
27
18
  host = (URI.parse(url).host || url).downcase
19
+ host = host.gsub("www.", "")
28
20
 
29
21
  @hosts.keys.select do |category|
30
22
  @hosts[category].include?(host)
@@ -42,7 +34,17 @@ module UrlCategorise
42
34
  end
43
35
 
44
36
  def size_of_data
37
+ hash_size_in_mb(@hosts)
38
+ end
39
+
40
+ private
45
41
 
42
# Estimates the size of the stored host lists in megabytes.
#
# The estimate is the number of characters in all host entries joined
# together; hash keys are not counted.
#
# @param hash [Hash] category => list of host strings
# @return [Float] total characters divided by 1 MiB, rounded to 2 decimals
def hash_size_in_mb(hash)
  # Array#join flattens nested arrays, so nested host lists are counted too.
  total_chars = hash.each_value.sum { |hosts| hosts.join.length }

  # Divide as Float: integer division would truncate every total below
  # 1 MiB to 0 and make round(2) meaningless. The literal also avoids
  # relying on Integer#megabyte, which is not part of core Ruby.
  (total_chars / 1_048_576.0).round(2)
end
47
49
 
48
50
  def fetch_and_build_host_lists
@@ -52,18 +54,57 @@ module UrlCategorise
52
54
  @hosts[category] = build_host_data(host_urls[category])
53
55
  end
54
56
 
57
+ sub_category_values = categories_with_keys
58
+ sub_category_values.keys.each do |category|
59
+ original_value = @hosts[category] || []
60
+
61
+ extra_category_values = sub_category_values[category].each do |sub_category|
62
+ @hosts[sub_category]
63
+ end
64
+
65
+ original_value << extra_category_values
66
+ @hosts[category] = original_value
67
+ end
68
+
55
69
  @hosts
56
70
  end
57
71
 
58
72
  def build_host_data(urls)
59
73
  urls.map do |url|
74
+ next unless url_valid?(url)
75
+
60
76
  raw_data = HTTParty.get(url)
61
77
  raw_data.split("\n").reject do |line|
62
- line.include?("#")
78
+ line[0] == "#"
63
79
  end.map do |line|
64
- line.gsub("0.0.0.0 ", "")
80
+ line.split(' ')[1] # Select the domain name # gsub("0.0.0.0 ", "")
65
81
  end
66
82
  end.flatten.compact.sort
67
83
  end
84
+
85
# Collects, for every category in +host_urls+, the entries that refer to
# other categories rather than to a downloadable list.
#
# Entries are either URL strings or Symbols naming another category; only
# the Symbols are kept. Categories without any Symbol entry map to [].
#
# @return [Hash] category => Array of Symbols referenced by that category
def categories_with_keys
  # A Symbol is never a fetchable URL, so the type check alone identifies
  # category references; no URL parsing is needed here.
  host_urls.transform_values { |sources| sources.grep(Symbol) }
end
98
+
99
# Inverse of #url_valid?.
#
# @param url [Object] candidate entry from a host list definition
# @return [Boolean] true when +url+ is rejected by #url_valid?
def url_not_valid?(url)
  !url_valid?(url)
end
102
+
103
# Checks whether +url+ is an absolute HTTP(S) URL with a host.
#
# @param url [Object] candidate entry; anything that is not a String
#   (for example a Symbol naming another category) is rejected
# @return [Boolean] true only for parseable http/https URLs with a host
def url_valid?(url)
  # Reject non-strings up front instead of relying on URI.parse raising.
  return false unless url.is_a?(String)

  uri = URI.parse(url)
  # URI::HTTPS is a subclass of URI::HTTP, so both schemes pass here.
  # to_s covers a nil host as well as an empty-string host.
  uri.is_a?(URI::HTTP) && !uri.host.to_s.empty?
rescue URI::InvalidURIError
  false
end
68
109
  end
69
110
  end
@@ -1,36 +1,44 @@
1
1
  module UrlCategorise
2
2
  module Constants
3
- # Resources used:
4
- # https://blocklist.site/#
5
- # https://github.com/lightswitch05/hosts
6
- #
7
3
  DEFAULT_HOST_URLS = {
8
- advertising: ["https://blocklistproject.github.io/Lists/ads.txt"],
4
+ abuse: ["https://github.com/blocklistproject/Lists/raw/master/abuse.txt"],
5
+ adobe: ["https://github.com/blocklistproject/Lists/raw/master/adobe.txt"],
6
+ advertising: ["https://blocklistproject.github.io/Lists/ads.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-advert_01.txt"],
7
+ amazon: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/amazon/all"],
9
8
  amp_hosts: ["https://www.github.developerdan.com/hosts/lists/amp-hosts-extended.txt"],
9
+ apple: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/apple/all"],
10
+ cloudflare: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/cloudflare/all"],
11
+ crypto: ["https://github.com/blocklistproject/Lists/raw/master/crypto.txt"],
10
12
  dating_services: ["https://www.github.developerdan.com/hosts/lists/dating-services-extended.txt"],
11
- facebook: ["https://github.com/blocklistproject/Lists/raw/master/facebook.txt", "https://www.github.developerdan.com/hosts/lists/facebook-extended.txt", "https://raw.githubusercontent.com/blocklistproject/Lists/master/facebook.txt"],
13
+ drugs: ["https://github.com/blocklistproject/Lists/raw/master/drugs.txt"],
14
+ facebook: ["https://github.com/blocklistproject/Lists/raw/master/facebook.txt", "https://www.github.developerdan.com/hosts/lists/facebook-extended.txt", "https://raw.githubusercontent.com/blocklistproject/Lists/master/facebook.txt", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/all", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/facebook.com"],
12
15
  fraud: ["https://blocklistproject.github.io/Lists/fraud.txt"],
13
16
  gambling: ["https://blocklistproject.github.io/Lists/gambling.txt"],
14
- hate: [],
15
- junk: [],
16
- malware: ["https://blocklistproject.github.io/Lists/malware.txt"],
17
+ gaming: ["https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-ubisoft.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-steam.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-activision.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-blizzard.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-ea.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-epicgames.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-nintendo.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-rockstargames.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-roblox.txt"],
18
+ google: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/google/all"],
19
+ hate_and_junk: ["https://www.github.developerdan.com/hosts/lists/hate-and-junk-extended.txt"],
20
+ instagram: ["https://github.com/jmdugan/blocklists/raw/master/corporations/facebook/instagram"],
21
+ linkedin: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/microsoft/linkedin"],
22
+ malware: ["https://blocklistproject.github.io/Lists/malware.txt", "http://www.malwaredomainlist.com/hostslist/hosts.txt"],
23
+ microsoft: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/microsoft/all"],
24
+ mozilla: ["https://github.com/jmdugan/blocklists/raw/master/corporations/mozilla/all"],
25
+ nsa: ["https://raw.githubusercontent.com/tigthor/NSA-CIA-Blocklist/main/HOSTS/HOSTS"],
17
26
  phishing: ["https://blocklistproject.github.io/Lists/phishing.txt"],
27
+ pinterest: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/pinterest/all"],
28
+ piracy: ["https://github.com/blocklistproject/Lists/raw/master/piracy.txt"],
18
29
  pornography: ["https://blocklistproject.github.io/Lists/porn.txt"],
30
+ reddit: ["https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-reddit.txt"],
31
+ redirect: ["https://github.com/blocklistproject/Lists/raw/master/redirect.txt"],
19
32
  scam: ["https://blocklistproject.github.io/Lists/scam.txt"],
33
+ smart_tv: ["https://github.com/blocklistproject/Lists/raw/master/smart-tv.txt"],
34
+ social_media: [:facebook, :instagram, :linkedin, :pinterest, :reddit,:tiktok, :twitter, :whatsapp, :youtube],
20
35
  tiktok: ["https://blocklistproject.github.io/Lists/tiktok.txt"],
36
+ torrent: ["https://github.com/blocklistproject/Lists/raw/master/torrent.txt"],
21
37
  tracking: ["https://blocklistproject.github.io/Lists/tracking.txt"],
22
- twitter: ["https://github.com/blocklistproject/Lists/raw/master/twitter.txt"],
38
+ twitter: ["https://github.com/blocklistproject/Lists/raw/master/twitter.txt", "https://github.com/jmdugan/blocklists/raw/master/corporations/twitter/all"],
23
39
  vaping: ["https://github.com/blocklistproject/Lists/raw/master/vaping.txt"],
24
- whatsapp: ["https://github.com/blocklistproject/Lists/raw/master/whatsapp.txt"],
25
- youtube: ["https://github.com/blocklistproject/Lists/raw/master/youtube.txt"],
26
- torrent: ["https://github.com/blocklistproject/Lists/raw/master/torrent.txt"],
27
- smart_tv: ["https://github.com/blocklistproject/Lists/raw/master/smart-tv.txt"],
28
- redirect: ["https://github.com/blocklistproject/Lists/raw/master/redirect.txt"],
29
- piracy: ["https://github.com/blocklistproject/Lists/raw/master/piracy.txt"],
30
- drugs: ["https://github.com/blocklistproject/Lists/raw/master/drugs.txt"],
31
- crypto: ["https://github.com/blocklistproject/Lists/raw/master/crypto.txt"],
32
- adobe: ["https://github.com/blocklistproject/Lists/raw/master/adobe.txt"],
33
- abuse: ["https://github.com/blocklistproject/Lists/raw/master/abuse.txt"],
40
+ whatsapp: ["https://github.com/blocklistproject/Lists/raw/master/whatsapp.txt", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/whatsapp"],
41
+ youtube: ["https://github.com/blocklistproject/Lists/raw/master/youtube.txt", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/google/youtube"],
34
42
  }
35
43
  end
36
44
  end
@@ -1,3 +1,3 @@
1
1
  module UrlCategorise
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: UrlCategorise
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - trex22
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-05-19 00:00:00.000000000 Z
11
+ date: 2023-05-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: api_pattern