UrlCategorise 0.0.1 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9ed8fbd044a6405977b3e720454c6eb3bfebdc6ee38e6b00f860ac2e3d49d3da
4
- data.tar.gz: c795f83ee5ce691af751f19681eaec865c2a01aadc27ef120c87ed7bf9ce339f
3
+ metadata.gz: 926043e28097f20035b4dbc943534c63a5c3c0a429745e4bb42e7dd0701295c1
4
+ data.tar.gz: 3ef850d4c43266a6ec15a7653c64b97720efc2e46298ed36be07d48d70ada772
5
5
  SHA512:
6
- metadata.gz: e4aa69b7edecd520db0a2df0ea48498f730f6ddd49d6f0d3d116954aaa7b486ae87c395b655aa81523e43a4189536f7b3d5033443e5f5ecae649c445ff2b178f
7
- data.tar.gz: c00b7d76d298538dcb55314cbda83dde5daa5bc78c2661bf44d818a62d663050957a6ff2d9063ec46c4cece3b92a18c14a471fdf67534b073f8a20fa560ad01d
6
+ metadata.gz: c6504dd8ec0a5f284bc78dfcbb7e45b9e1752c50f6f32a380c52128c19364b976232bd4af93c943d73f0b7cbb6e2ac3e44574e90a0fde38da87f5585b92fc3c0
7
+ data.tar.gz: c2de16323a1cfa085ac15590f7601b5307cc7c660243f48b6a5db10df25809041b119eb3f48da382effbe120150c2967ae4224213508fca896c45889bad6bce1
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- UrlCategorise (0.0.1)
4
+ UrlCategorise (0.0.3)
5
5
  api_pattern (~> 0.0.4)
6
6
 
7
7
  GEM
data/README.md CHANGED
@@ -15,15 +15,40 @@ And then execute:
15
15
 
16
16
  Or install it yourself as:
17
17
 
18
- $ gem install url_categorise
18
+ $ gem install UrlCategorise
19
19
 
20
20
  ## Usage
21
+ I picked the default host lists because they are separated into categories.
22
+ I didn't select them for the quality of their data.
23
+ Use at your own risk!
21
24
 
22
25
  ```ruby
23
26
  require 'url_categorise'
24
27
  client = UrlCategorise::Client.new
25
28
 
29
+ client.count_of_hosts
30
+ client.count_of_categories
31
+ client.size_of_data # In megabytes
26
32
 
33
+ url = "www.google.com"
34
+ client.categorise(url)
35
+
36
+ # You can also initialise the client using a custom dataset
37
+ host_urls = {
38
+ abuse: ["https://github.com/blocklistproject/Lists/raw/master/abuse.txt"]
39
+ }
40
+
41
+ require 'url_categorise'
42
+ client = UrlCategorise::Client.new(host_urls: host_urls)
43
+
44
+ # You can also define symbols to combine other categories
45
+ host_urls = {
46
+ abuse: ["https://github.com/blocklistproject/Lists/raw/master/abuse.txt"],
47
+ bad_links: [:abuse]
48
+ }
49
+
50
+ require 'url_categorise'
51
+ client = UrlCategorise::Client.new(host_urls: host_urls)
27
52
  ```
28
53
 
29
54
  ## Development
@@ -4,27 +4,19 @@ module UrlCategorise
4
4
 
5
5
  attr_reader :host_urls, :hosts
6
6
 
7
+ # TODO: Save to folder
8
+ # TODO: Read from disk the database
7
9
  # TODO: Sanctioned IPs
8
- # TODO: More default lists
9
10
  # TODO: ActiveRecord support
10
11
  # TODO: List of abuse IPs
11
- # TODO: https://github.com/blocklistproject/Lists
12
- # TODO: https://github.com/nickoppen/pihole-blocklists
13
12
  def initialize(host_urls: DEFAULT_HOST_URLS)
14
13
  @host_urls = host_urls
15
- # @hosts = fetch_and_build_host_lists
16
- end
17
-
18
- def self.compatible_api_version
19
- 'v1'
20
- end
21
-
22
- def self.api_version
23
- 'v2 2023-05-19'
14
+ @hosts = fetch_and_build_host_lists
24
15
  end
25
16
 
26
17
  def categorise(url)
27
18
  host = (URI.parse(url).host || url).downcase
19
+ host = host.gsub("www.", "")
28
20
 
29
21
  @hosts.keys.select do |category|
30
22
  @hosts[category].include?(host)
@@ -42,7 +34,19 @@ module UrlCategorise
42
34
  end
43
35
 
44
36
  def size_of_data
37
+ hash_size_in_mb(@hosts)
38
+ end
39
+
40
+ private
41
+
42
+ def hash_size_in_mb(hash)
43
+ size = 0
45
44
 
45
+ hash.each do |key, value|
46
+ size += value.join.length
47
+ end
48
+
49
+ (size / ONE_MEGABYTE).round(2)
46
50
  end
47
51
 
48
52
  def fetch_and_build_host_lists
@@ -52,18 +56,57 @@ module UrlCategorise
52
56
  @hosts[category] = build_host_data(host_urls[category])
53
57
  end
54
58
 
59
+ sub_category_values = categories_with_keys
60
+ sub_category_values.keys.each do |category|
61
+ original_value = @hosts[category] || []
62
+
63
+ extra_category_values = sub_category_values[category].each do |sub_category|
64
+ @hosts[sub_category]
65
+ end
66
+
67
+ original_value << extra_category_values
68
+ @hosts[category] = original_value.uniq.compact
69
+ end
70
+
55
71
  @hosts
56
72
  end
57
73
 
58
74
  def build_host_data(urls)
59
75
  urls.map do |url|
76
+ next unless url_valid?(url)
77
+
60
78
  raw_data = HTTParty.get(url)
61
79
  raw_data.split("\n").reject do |line|
62
- line.include?("#")
80
+ line[0] == "#"
63
81
  end.map do |line|
64
- line.gsub("0.0.0.0 ", "")
82
+ line.split(' ')[1] # Select the domain name # gsub("0.0.0.0 ", "")
65
83
  end
66
84
  end.flatten.compact.sort
67
85
  end
86
+
87
+ def categories_with_keys
88
+ keyed_categories = {}
89
+
90
+ host_urls.keys.each do |category|
91
+ category_values = host_urls[category].select do |url|
92
+ url_not_valid?(url) && url.is_a?(Symbol)
93
+ end
94
+
95
+ keyed_categories[category] = category_values
96
+ end
97
+
98
+ keyed_categories
99
+ end
100
+
101
+ def url_not_valid?(url)
102
+ url_valid?(url)
103
+ end
104
+
105
+ def url_valid?(url)
106
+ uri = URI.parse(url)
107
+ uri.is_a?(URI::HTTP) && !uri.host.nil?
108
+ rescue URI::InvalidURIError
109
+ false
110
+ end
68
111
  end
69
112
  end
@@ -1,36 +1,45 @@
1
1
  module UrlCategorise
2
2
  module Constants
3
- # Resources used:
4
- # https://blocklist.site/#
5
- # https://github.com/lightswitch05/hosts
6
- #
3
+ ONE_MEGABYTE = 1048576
7
4
  DEFAULT_HOST_URLS = {
8
- advertising: ["https://blocklistproject.github.io/Lists/ads.txt"],
5
+ abuse: ["https://github.com/blocklistproject/Lists/raw/master/abuse.txt"],
6
+ adobe: ["https://github.com/blocklistproject/Lists/raw/master/adobe.txt"],
7
+ advertising: ["https://blocklistproject.github.io/Lists/ads.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-advert_01.txt"],
8
+ amazon: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/amazon/all"],
9
9
  amp_hosts: ["https://www.github.developerdan.com/hosts/lists/amp-hosts-extended.txt"],
10
+ apple: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/apple/all"],
11
+ cloudflare: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/cloudflare/all"],
12
+ crypto: ["https://github.com/blocklistproject/Lists/raw/master/crypto.txt"],
10
13
  dating_services: ["https://www.github.developerdan.com/hosts/lists/dating-services-extended.txt"],
11
- facebook: ["https://github.com/blocklistproject/Lists/raw/master/facebook.txt", "https://www.github.developerdan.com/hosts/lists/facebook-extended.txt", "https://raw.githubusercontent.com/blocklistproject/Lists/master/facebook.txt"],
14
+ drugs: ["https://github.com/blocklistproject/Lists/raw/master/drugs.txt"],
15
+ facebook: ["https://github.com/blocklistproject/Lists/raw/master/facebook.txt", "https://www.github.developerdan.com/hosts/lists/facebook-extended.txt", "https://raw.githubusercontent.com/blocklistproject/Lists/master/facebook.txt", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/all", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/facebook.com"],
12
16
  fraud: ["https://blocklistproject.github.io/Lists/fraud.txt"],
13
17
  gambling: ["https://blocklistproject.github.io/Lists/gambling.txt"],
14
- hate: [],
15
- junk: [],
16
- malware: ["https://blocklistproject.github.io/Lists/malware.txt"],
18
+ gaming: ["https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-ubisoft.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-steam.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-activision.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-blizzard.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-ea.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-epicgames.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-nintendo.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-rockstargames.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-roblox.txt"],
19
+ google: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/google/all"],
20
+ hate_and_junk: ["https://www.github.developerdan.com/hosts/lists/hate-and-junk-extended.txt"],
21
+ instagram: ["https://github.com/jmdugan/blocklists/raw/master/corporations/facebook/instagram"],
22
+ linkedin: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/microsoft/linkedin"],
23
+ malware: ["https://blocklistproject.github.io/Lists/malware.txt", "http://www.malwaredomainlist.com/hostslist/hosts.txt"],
24
+ microsoft: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/microsoft/all"],
25
+ mozilla: ["https://github.com/jmdugan/blocklists/raw/master/corporations/mozilla/all"],
26
+ nsa: ["https://raw.githubusercontent.com/tigthor/NSA-CIA-Blocklist/main/HOSTS/HOSTS"],
17
27
  phishing: ["https://blocklistproject.github.io/Lists/phishing.txt"],
28
+ pinterest: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/pinterest/all"],
29
+ piracy: ["https://github.com/blocklistproject/Lists/raw/master/piracy.txt"],
18
30
  pornography: ["https://blocklistproject.github.io/Lists/porn.txt"],
31
+ reddit: ["https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-reddit.txt"],
32
+ redirect: ["https://github.com/blocklistproject/Lists/raw/master/redirect.txt"],
19
33
  scam: ["https://blocklistproject.github.io/Lists/scam.txt"],
34
+ smart_tv: ["https://github.com/blocklistproject/Lists/raw/master/smart-tv.txt"],
35
+ social_media: [:facebook, :instagram, :linkedin, :pinterest, :reddit,:tiktok, :twitter, :whatsapp, :youtube],
20
36
  tiktok: ["https://blocklistproject.github.io/Lists/tiktok.txt"],
37
+ torrent: ["https://github.com/blocklistproject/Lists/raw/master/torrent.txt"],
21
38
  tracking: ["https://blocklistproject.github.io/Lists/tracking.txt"],
22
- twitter: ["https://github.com/blocklistproject/Lists/raw/master/twitter.txt"],
39
+ twitter: ["https://github.com/blocklistproject/Lists/raw/master/twitter.txt", "https://github.com/jmdugan/blocklists/raw/master/corporations/twitter/all"],
23
40
  vaping: ["https://github.com/blocklistproject/Lists/raw/master/vaping.txt"],
24
- whatsapp: ["https://github.com/blocklistproject/Lists/raw/master/whatsapp.txt"],
25
- youtube: ["https://github.com/blocklistproject/Lists/raw/master/youtube.txt"],
26
- torrent: ["https://github.com/blocklistproject/Lists/raw/master/torrent.txt"],
27
- smart_tv: ["https://github.com/blocklistproject/Lists/raw/master/smart-tv.txt"],
28
- redirect: ["https://github.com/blocklistproject/Lists/raw/master/redirect.txt"],
29
- piracy: ["https://github.com/blocklistproject/Lists/raw/master/piracy.txt"],
30
- drugs: ["https://github.com/blocklistproject/Lists/raw/master/drugs.txt"],
31
- crypto: ["https://github.com/blocklistproject/Lists/raw/master/crypto.txt"],
32
- adobe: ["https://github.com/blocklistproject/Lists/raw/master/adobe.txt"],
33
- abuse: ["https://github.com/blocklistproject/Lists/raw/master/abuse.txt"],
41
+ whatsapp: ["https://github.com/blocklistproject/Lists/raw/master/whatsapp.txt", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/whatsapp"],
42
+ youtube: ["https://github.com/blocklistproject/Lists/raw/master/youtube.txt", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/google/youtube"],
34
43
  }
35
44
  end
36
45
  end
@@ -1,3 +1,3 @@
1
1
  module UrlCategorise
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: UrlCategorise
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - trex22
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-05-19 00:00:00.000000000 Z
11
+ date: 2023-05-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: api_pattern