UrlCategorise 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9ed8fbd044a6405977b3e720454c6eb3bfebdc6ee38e6b00f860ac2e3d49d3da
4
- data.tar.gz: c795f83ee5ce691af751f19681eaec865c2a01aadc27ef120c87ed7bf9ce339f
3
+ metadata.gz: 6fd17f1e01f20ae4e0efa2ce243c6d32fd24f4a184a52530b158a4edf80ffdfd
4
+ data.tar.gz: ab9356f712aaf3f7087814cfb3e56cca522a59321bbc5216f141b4b81f1eab71
5
5
  SHA512:
6
- metadata.gz: e4aa69b7edecd520db0a2df0ea48498f730f6ddd49d6f0d3d116954aaa7b486ae87c395b655aa81523e43a4189536f7b3d5033443e5f5ecae649c445ff2b178f
7
- data.tar.gz: c00b7d76d298538dcb55314cbda83dde5daa5bc78c2661bf44d818a62d663050957a6ff2d9063ec46c4cece3b92a18c14a471fdf67534b073f8a20fa560ad01d
6
+ metadata.gz: 7f0cadcb9a7254e3d964708ab7b073f2df1f320062f2b7310070381c79bbca0e3d4e6886ecfb29f2a881eb19c779a2a41b6583d2c5787b432eca5d5d63090d50
7
+ data.tar.gz: 590195a7abfcb38110f30f83ec0eb4baece4f18d42d4eed1864f468736948887ced5f6ffbe2a78714d928372566793d6dd453653e0b2c2970aa1a628e254c322
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- UrlCategorise (0.0.1)
4
+ UrlCategorise (0.0.2)
5
5
  api_pattern (~> 0.0.4)
6
6
 
7
7
  GEM
data/README.md CHANGED
@@ -15,15 +15,39 @@ And then execute:
15
15
 
16
16
  Or install it yourself as:
17
17
 
18
- $ gem install url_categorise
18
+ $ gem install UrlCategorise
19
19
 
20
20
  ## Usage
21
+ I picked the default host lists because they are separated into categories.
22
+ I didn't select them for the quality of their data.
23
+ Use at your own risk!
21
24
 
22
25
  ```ruby
23
26
  require 'url_categorise'
24
27
  client = UrlCategorise::Client.new
25
28
 
29
+ client.count_of_hosts
30
+ client.count_of_categories
31
+ client.size_of_data
26
32
 
33
+ client.categorise(url)
34
+
35
+ # You can also initialise the client using a custom dataset
36
+ host_urls = {
37
+ abuse: ["https://github.com/blocklistproject/Lists/raw/master/abuse.txt"]
38
+ }
39
+
40
+ require 'url_categorise'
41
+ client = UrlCategorise::Client.new(host_urls: host_urls)
42
+
43
+ # You can also define symbols to combine other categories
44
+ host_urls = {
45
+ abuse: ["https://github.com/blocklistproject/Lists/raw/master/abuse.txt"],
46
+ bad_links: [:abuse]
47
+ }
48
+
49
+ require 'url_categorise'
50
+ client = UrlCategorise::Client.new(host_urls: host_urls)
27
51
  ```
28
52
 
29
53
  ## Development
@@ -4,27 +4,19 @@ module UrlCategorise
4
4
 
5
5
  attr_reader :host_urls, :hosts
6
6
 
7
+ # TODO: Save to folder
8
+ # TODO: Read from disk the database
7
9
  # TODO: Sanctioned IPs
8
- # TODO: More default lists
9
10
  # TODO: ActiveRecord support
10
11
  # TODO: List of abuse IPs
11
- # TODO: https://github.com/blocklistproject/Lists
12
- # TODO: https://github.com/nickoppen/pihole-blocklists
13
12
  def initialize(host_urls: DEFAULT_HOST_URLS)
14
13
  @host_urls = host_urls
15
- # @hosts = fetch_and_build_host_lists
16
- end
17
-
18
- def self.compatible_api_version
19
- 'v1'
20
- end
21
-
22
- def self.api_version
23
- 'v2 2023-05-19'
14
+ @hosts = fetch_and_build_host_lists
24
15
  end
25
16
 
26
17
  def categorise(url)
27
18
  host = (URI.parse(url).host || url).downcase
19
+ host = host.gsub("www.", "")
28
20
 
29
21
  @hosts.keys.select do |category|
30
22
  @hosts[category].include?(host)
@@ -42,7 +34,17 @@ module UrlCategorise
42
34
  end
43
35
 
44
36
  def size_of_data
37
+ hash_size_in_mb(@hosts)
38
+ end
39
+
40
+ private
45
41
 
42
+ def hash_size_in_mb(hash)
43
+ size = 0
44
+ hash.each do |key, value|
45
+ size += value.join.length
46
+ end
47
+ (size / 1.megabyte).round(2)
46
48
  end
47
49
 
48
50
  def fetch_and_build_host_lists
@@ -52,18 +54,57 @@ module UrlCategorise
52
54
  @hosts[category] = build_host_data(host_urls[category])
53
55
  end
54
56
 
57
+ sub_category_values = categories_with_keys
58
+ sub_category_values.keys.each do |category|
59
+ original_value = @hosts[category] || []
60
+
61
+ extra_category_values = sub_category_values[category].each do |sub_category|
62
+ @hosts[sub_category]
63
+ end
64
+
65
+ original_value << extra_category_values
66
+ @hosts[category] = original_value
67
+ end
68
+
55
69
  @hosts
56
70
  end
57
71
 
58
72
  def build_host_data(urls)
59
73
  urls.map do |url|
74
+ next unless url_valid?(url)
75
+
60
76
  raw_data = HTTParty.get(url)
61
77
  raw_data.split("\n").reject do |line|
62
- line.include?("#")
78
+ line[0] == "#"
63
79
  end.map do |line|
64
- line.gsub("0.0.0.0 ", "")
80
+ line.split(' ')[1] # Select the domain name # gsub("0.0.0.0 ", "")
65
81
  end
66
82
  end.flatten.compact.sort
67
83
  end
84
+
85
+ def categories_with_keys
86
+ keyed_categories = {}
87
+
88
+ host_urls.keys.each do |category|
89
+ category_values = host_urls[category].select do |url|
90
+ url_not_valid?(url) && url.is_a?(Symbol)
91
+ end
92
+
93
+ keyed_categories[category] = category_values
94
+ end
95
+
96
+ keyed_categories
97
+ end
98
+
99
+ def url_not_valid?(url)
100
+ url_valid?(url)
101
+ end
102
+
103
+ def url_valid?(url)
104
+ uri = URI.parse(url)
105
+ uri.is_a?(URI::HTTP) && !uri.host.nil?
106
+ rescue URI::InvalidURIError
107
+ false
108
+ end
68
109
  end
69
110
  end
@@ -1,36 +1,44 @@
1
1
  module UrlCategorise
2
2
  module Constants
3
- # Resources used:
4
- # https://blocklist.site/#
5
- # https://github.com/lightswitch05/hosts
6
- #
7
3
  DEFAULT_HOST_URLS = {
8
- advertising: ["https://blocklistproject.github.io/Lists/ads.txt"],
4
+ abuse: ["https://github.com/blocklistproject/Lists/raw/master/abuse.txt"],
5
+ adobe: ["https://github.com/blocklistproject/Lists/raw/master/adobe.txt"],
6
+ advertising: ["https://blocklistproject.github.io/Lists/ads.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-advert_01.txt"],
7
+ amazon: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/amazon/all"],
9
8
  amp_hosts: ["https://www.github.developerdan.com/hosts/lists/amp-hosts-extended.txt"],
9
+ apple: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/apple/all"],
10
+ cloudflare: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/cloudflare/all"],
11
+ crypto: ["https://github.com/blocklistproject/Lists/raw/master/crypto.txt"],
10
12
  dating_services: ["https://www.github.developerdan.com/hosts/lists/dating-services-extended.txt"],
11
- facebook: ["https://github.com/blocklistproject/Lists/raw/master/facebook.txt", "https://www.github.developerdan.com/hosts/lists/facebook-extended.txt", "https://raw.githubusercontent.com/blocklistproject/Lists/master/facebook.txt"],
13
+ drugs: ["https://github.com/blocklistproject/Lists/raw/master/drugs.txt"],
14
+ facebook: ["https://github.com/blocklistproject/Lists/raw/master/facebook.txt", "https://www.github.developerdan.com/hosts/lists/facebook-extended.txt", "https://raw.githubusercontent.com/blocklistproject/Lists/master/facebook.txt", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/all", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/facebook.com"],
12
15
  fraud: ["https://blocklistproject.github.io/Lists/fraud.txt"],
13
16
  gambling: ["https://blocklistproject.github.io/Lists/gambling.txt"],
14
- hate: [],
15
- junk: [],
16
- malware: ["https://blocklistproject.github.io/Lists/malware.txt"],
17
+ gaming: ["https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-ubisoft.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-steam.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-activision.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-blizzard.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-ea.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-epicgames.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-nintendo.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-rockstargames.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-roblox.txt"],
18
+ google: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/google/all"],
19
+ hate_and_junk: ["https://www.github.developerdan.com/hosts/lists/hate-and-junk-extended.txt"],
20
+ instagram: ["https://github.com/jmdugan/blocklists/raw/master/corporations/facebook/instagram"],
21
+ linkedin: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/microsoft/linkedin"],
22
+ malware: ["https://blocklistproject.github.io/Lists/malware.txt", "http://www.malwaredomainlist.com/hostslist/hosts.txt"],
23
+ microsoft: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/microsoft/all"],
24
+ mozilla: ["https://github.com/jmdugan/blocklists/raw/master/corporations/mozilla/all"],
25
+ nsa: ["https://raw.githubusercontent.com/tigthor/NSA-CIA-Blocklist/main/HOSTS/HOSTS"],
17
26
  phishing: ["https://blocklistproject.github.io/Lists/phishing.txt"],
27
+ pinterest: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/pinterest/all"],
28
+ piracy: ["https://github.com/blocklistproject/Lists/raw/master/piracy.txt"],
18
29
  pornography: ["https://blocklistproject.github.io/Lists/porn.txt"],
30
+ reddit: ["https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-reddit.txt"],
31
+ redirect: ["https://github.com/blocklistproject/Lists/raw/master/redirect.txt"],
19
32
  scam: ["https://blocklistproject.github.io/Lists/scam.txt"],
33
+ smart_tv: ["https://github.com/blocklistproject/Lists/raw/master/smart-tv.txt"],
34
+ social_media: [:facebook, :instagram, :linkedin, :pinterest, :reddit,:tiktok, :twitter, :whatsapp, :youtube],
20
35
  tiktok: ["https://blocklistproject.github.io/Lists/tiktok.txt"],
36
+ torrent: ["https://github.com/blocklistproject/Lists/raw/master/torrent.txt"],
21
37
  tracking: ["https://blocklistproject.github.io/Lists/tracking.txt"],
22
- twitter: ["https://github.com/blocklistproject/Lists/raw/master/twitter.txt"],
38
+ twitter: ["https://github.com/blocklistproject/Lists/raw/master/twitter.txt", "https://github.com/jmdugan/blocklists/raw/master/corporations/twitter/all"],
23
39
  vaping: ["https://github.com/blocklistproject/Lists/raw/master/vaping.txt"],
24
- whatsapp: ["https://github.com/blocklistproject/Lists/raw/master/whatsapp.txt"],
25
- youtube: ["https://github.com/blocklistproject/Lists/raw/master/youtube.txt"],
26
- torrent: ["https://github.com/blocklistproject/Lists/raw/master/torrent.txt"],
27
- smart_tv: ["https://github.com/blocklistproject/Lists/raw/master/smart-tv.txt"],
28
- redirect: ["https://github.com/blocklistproject/Lists/raw/master/redirect.txt"],
29
- piracy: ["https://github.com/blocklistproject/Lists/raw/master/piracy.txt"],
30
- drugs: ["https://github.com/blocklistproject/Lists/raw/master/drugs.txt"],
31
- crypto: ["https://github.com/blocklistproject/Lists/raw/master/crypto.txt"],
32
- adobe: ["https://github.com/blocklistproject/Lists/raw/master/adobe.txt"],
33
- abuse: ["https://github.com/blocklistproject/Lists/raw/master/abuse.txt"],
40
+ whatsapp: ["https://github.com/blocklistproject/Lists/raw/master/whatsapp.txt", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/whatsapp"],
41
+ youtube: ["https://github.com/blocklistproject/Lists/raw/master/youtube.txt", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/google/youtube"],
34
42
  }
35
43
  end
36
44
  end
@@ -1,3 +1,3 @@
1
1
  module UrlCategorise
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: UrlCategorise
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - trex22
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-05-19 00:00:00.000000000 Z
11
+ date: 2023-05-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: api_pattern