UrlCategorise 0.0.3 → 0.1.0

This diff shows the differences between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,118 @@
1
+ require_relative 'models'
2
+
3
+ module UrlCategorise
4
+ class ActiveRecordClient < Client
5
+ def initialize(**kwargs)
6
+ raise "ActiveRecord not available" unless UrlCategorise::Models.available?
7
+
8
+ @use_database = kwargs.delete(:use_database) { true }
9
+ super(**kwargs)
10
+
11
+ populate_database if @use_database
12
+ end
13
+
14
+ def categorise(url)
15
+ return super(url) unless @use_database && UrlCategorise::Models.available?
16
+
17
+ host = (URI.parse(url).host || url).downcase.gsub("www.", "")
18
+
19
+ # Try database first
20
+ categories = UrlCategorise::Models::Domain.categorise(host)
21
+ return categories unless categories.empty?
22
+
23
+ # Fallback to memory-based categorization
24
+ super(url)
25
+ end
26
+
27
+ def categorise_ip(ip_address)
28
+ return super(ip_address) unless @use_database && UrlCategorise::Models.available?
29
+
30
+ # Try database first
31
+ categories = UrlCategorise::Models::IpAddress.categorise(ip_address)
32
+ return categories unless categories.empty?
33
+
34
+ # Fallback to memory-based categorization
35
+ super(ip_address)
36
+ end
37
+
38
+ def update_database
39
+ return unless @use_database && UrlCategorise::Models.available?
40
+
41
+ populate_database
42
+ end
43
+
44
+ def database_stats
45
+ return {} unless @use_database && UrlCategorise::Models.available?
46
+
47
+ {
48
+ domains: UrlCategorise::Models::Domain.count,
49
+ ip_addresses: UrlCategorise::Models::IpAddress.count,
50
+ list_metadata: UrlCategorise::Models::ListMetadata.count,
51
+ categories: UrlCategorise::Models::Domain.distinct.pluck(:categories).flatten.uniq.size
52
+ }
53
+ end
54
+
55
+ private
56
+
57
+ def populate_database
58
+ return unless UrlCategorise::Models.available?
59
+
60
+ # Store list metadata
61
+ @host_urls.each do |category, urls|
62
+ urls.each do |url|
63
+ next unless url.is_a?(String)
64
+
65
+ metadata = @metadata[url] || {}
66
+ UrlCategorise::Models::ListMetadata.find_or_create_by(url: url) do |record|
67
+ record.name = category.to_s
68
+ record.categories = [category.to_s]
69
+ record.file_hash = metadata[:content_hash]
70
+ record.fetched_at = metadata[:last_updated]
71
+ end
72
+ end
73
+ end
74
+
75
+ # Store domain data
76
+ @hosts.each do |category, domains|
77
+ domains.each do |domain|
78
+ next if domain.nil? || domain.empty?
79
+
80
+ existing = UrlCategorise::Models::Domain.find_by(domain: domain)
81
+ if existing
82
+ # Add category if not already present
83
+ categories = existing.categories | [category.to_s]
84
+ existing.update(categories: categories) if categories != existing.categories
85
+ else
86
+ UrlCategorise::Models::Domain.create!(
87
+ domain: domain,
88
+ categories: [category.to_s]
89
+ )
90
+ end
91
+ end
92
+ end
93
+
94
+ # Store IP data (for IP-based lists)
95
+ ip_categories = [:sanctions_ips, :compromised_ips, :tor_exit_nodes, :open_proxy_ips,
96
+ :banking_trojans, :malicious_ssl_certificates, :top_attack_sources]
97
+
98
+ ip_categories.each do |category|
99
+ next unless @hosts[category]
100
+
101
+ @hosts[category].each do |ip|
102
+ next if ip.nil? || ip.empty? || !ip.match(/^\d+\.\d+\.\d+\.\d+$/)
103
+
104
+ existing = UrlCategorise::Models::IpAddress.find_by(ip_address: ip)
105
+ if existing
106
+ categories = existing.categories | [category.to_s]
107
+ existing.update(categories: categories) if categories != existing.categories
108
+ else
109
+ UrlCategorise::Models::IpAddress.create!(
110
+ ip_address: ip,
111
+ categories: [category.to_s]
112
+ )
113
+ end
114
+ end
115
+ end
116
+ end
117
+ end
118
+ end
@@ -2,15 +2,23 @@ module UrlCategorise
2
2
  class Client < ApiPattern::Client
3
3
  include ::UrlCategorise::Constants
4
4
 
5
- attr_reader :host_urls, :hosts
6
-
7
- # TODO: Save to folder
8
- # TODO: Read from disk the database
9
- # TODO: Sanctioned IPs
10
- # TODO: ActiveRecord support
11
- # TODO: List of abuse IPs
12
- def initialize(host_urls: DEFAULT_HOST_URLS)
5
+ def self.compatible_api_version
6
+ 'v2'
7
+ end
8
+
9
+ def self.api_version
10
+ 'v2 2023-04-12'
11
+ end
12
+
13
+ attr_reader :host_urls, :hosts, :cache_dir, :force_download, :dns_servers, :metadata, :request_timeout
14
+
15
+ def initialize(host_urls: DEFAULT_HOST_URLS, cache_dir: nil, force_download: false, dns_servers: ['1.1.1.1', '1.0.0.1'], request_timeout: 10)
13
16
  @host_urls = host_urls
17
+ @cache_dir = cache_dir
18
+ @force_download = force_download
19
+ @dns_servers = dns_servers
20
+ @request_timeout = request_timeout
21
+ @metadata = {}
14
22
  @hosts = fetch_and_build_host_lists
15
23
  end
16
24
 
@@ -19,10 +27,35 @@ module UrlCategorise
19
27
  host = host.gsub("www.", "")
20
28
 
21
29
  @hosts.keys.select do |category|
22
- @hosts[category].include?(host)
30
+ @hosts[category].any? do |blocked_host|
31
+ host == blocked_host || host.end_with?(".#{blocked_host}")
32
+ end
33
+ end
34
+ end
35
+
36
+ def categorise_ip(ip_address)
37
+ @hosts.keys.select do |category|
38
+ @hosts[category].include?(ip_address)
23
39
  end
24
40
  end
25
41
 
42
+ def resolve_and_categorise(domain)
43
+ categories = categorise(domain)
44
+
45
+ begin
46
+ resolver = Resolv::DNS.new(nameserver: @dns_servers)
47
+ ip_addresses = resolver.getaddresses(domain).map(&:to_s)
48
+
49
+ ip_addresses.each do |ip|
50
+ categories.concat(categorise_ip(ip))
51
+ end
52
+ rescue
53
+ # DNS resolution failed, return domain categories only
54
+ end
55
+
56
+ categories.uniq
57
+ end
58
+
26
59
  def count_of_hosts
27
60
  @hosts.keys.map do |category|
28
61
  @hosts[category].size
@@ -72,18 +105,153 @@ module UrlCategorise
72
105
  end
73
106
 
74
107
  def build_host_data(urls)
75
- urls.map do |url|
108
+ all_hosts = []
109
+
110
+ urls.each do |url|
76
111
  next unless url_valid?(url)
77
-
78
- raw_data = HTTParty.get(url)
79
- raw_data.split("\n").reject do |line|
80
- line[0] == "#"
81
- end.map do |line|
82
- line.split(' ')[1] # Select the domain name # gsub("0.0.0.0 ", "")
112
+
113
+ hosts_data = nil
114
+
115
+ if @cache_dir && !@force_download
116
+ hosts_data = read_from_cache(url)
83
117
  end
84
- end.flatten.compact.sort
118
+
119
+ if hosts_data.nil?
120
+ hosts_data = download_and_parse_list(url)
121
+ save_to_cache(url, hosts_data) if @cache_dir
122
+ end
123
+
124
+ all_hosts.concat(hosts_data) if hosts_data
125
+ end
126
+
127
+ all_hosts.compact.sort.uniq
128
+ end
129
+
130
+ def download_and_parse_list(url)
131
+ begin
132
+ raw_data = HTTParty.get(url, timeout: @request_timeout)
133
+ return [] if raw_data.body.nil? || raw_data.body.empty?
134
+
135
+ # Store metadata
136
+ etag = raw_data.headers['etag']
137
+ last_modified = raw_data.headers['last-modified']
138
+ @metadata[url] = {
139
+ last_updated: Time.now,
140
+ etag: etag,
141
+ last_modified: last_modified,
142
+ content_hash: Digest::SHA256.hexdigest(raw_data.body),
143
+ status: 'success'
144
+ }
145
+
146
+ parse_list_content(raw_data.body, detect_list_format(raw_data.body))
147
+ rescue HTTParty::Error, Net::HTTPError, SocketError, Timeout::Error, URI::InvalidURIError, StandardError => e
148
+ # Log the error but continue with other lists
149
+ @metadata[url] = {
150
+ last_updated: Time.now,
151
+ error: e.message,
152
+ status: 'failed'
153
+ }
154
+ return []
155
+ end
156
+ end
157
+
158
+ def parse_list_content(content, format)
159
+ lines = content.split("\n").reject { |line| line.empty? || line.strip.start_with?('#') }
160
+
161
+ case format
162
+ when :hosts
163
+ lines.map { |line| line.split(' ')[1] }.compact
164
+ when :plain
165
+ lines.map(&:strip)
166
+ when :dnsmasq
167
+ lines.map { |line|
168
+ match = line.match(/address=\/(.+?)\//)
169
+ match ? match[1] : nil
170
+ }.compact
171
+ when :ublock
172
+ lines.map { |line| line.gsub(/^\|\|/, '').gsub(/[\$\^].*$/, '').strip }.reject(&:empty?)
173
+ else
174
+ lines.map(&:strip)
175
+ end
176
+ end
177
+
178
+ def detect_list_format(content)
179
+ sample_lines = content.split("\n").first(10)
180
+
181
+ return :hosts if sample_lines.any? { |line| line.match(/^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\s+/) }
182
+ return :dnsmasq if sample_lines.any? { |line| line.include?('address=/') }
183
+ return :ublock if sample_lines.any? { |line| line.match(/^\|\|/) }
184
+
185
+ :plain
186
+ end
187
+
188
+ def cache_file_path(url)
189
+ return nil unless @cache_dir
190
+
191
+ FileUtils.mkdir_p(@cache_dir) unless Dir.exist?(@cache_dir)
192
+ filename = Digest::MD5.hexdigest(url) + '.cache'
193
+ File.join(@cache_dir, filename)
85
194
  end
86
195
 
196
+ def read_from_cache(url)
197
+ cache_file = cache_file_path(url)
198
+ return nil unless cache_file && File.exist?(cache_file)
199
+
200
+ cache_data = Marshal.load(File.read(cache_file))
201
+
202
+ # Check if we should update based on hash or time
203
+ if should_update_cache?(url, cache_data)
204
+ return nil
205
+ end
206
+
207
+ cache_data[:hosts]
208
+ rescue
209
+ nil
210
+ end
211
+
212
+ def save_to_cache(url, hosts_data)
213
+ cache_file = cache_file_path(url)
214
+ return unless cache_file
215
+
216
+ cache_data = {
217
+ hosts: hosts_data,
218
+ metadata: @metadata[url],
219
+ cached_at: Time.now
220
+ }
221
+
222
+ File.write(cache_file, Marshal.dump(cache_data))
223
+ rescue
224
+ # Cache save failed, continue without caching
225
+ end
226
+
227
+ def should_update_cache?(url, cache_data)
228
+ return true if @force_download
229
+ return true unless cache_data[:metadata]
230
+
231
+ # Update if cache is older than 24 hours
232
+ cache_age = Time.now - cache_data[:cached_at]
233
+ return true if cache_age > 24 * 60 * 60
234
+
235
+ # Check if remote content has changed
236
+ begin
237
+ head_response = HTTParty.head(url, timeout: @request_timeout)
238
+ remote_etag = head_response.headers['etag']
239
+ remote_last_modified = head_response.headers['last-modified']
240
+
241
+ cached_metadata = cache_data[:metadata]
242
+
243
+ return true if remote_etag && cached_metadata[:etag] && remote_etag != cached_metadata[:etag]
244
+ return true if remote_last_modified && cached_metadata[:last_modified] && remote_last_modified != cached_metadata[:last_modified]
245
+ rescue HTTParty::Error, Net::HTTPError, SocketError, Timeout::Error, URI::InvalidURIError, StandardError
246
+ # If HEAD request fails, assume we should update
247
+ return true
248
+ end
249
+
250
+ false
251
+ end
252
+
253
+ private
254
+
87
255
  def categories_with_keys
88
256
  keyed_categories = {}
89
257
 
@@ -14,7 +14,7 @@ module UrlCategorise
14
14
  drugs: ["https://github.com/blocklistproject/Lists/raw/master/drugs.txt"],
15
15
  facebook: ["https://github.com/blocklistproject/Lists/raw/master/facebook.txt", "https://www.github.developerdan.com/hosts/lists/facebook-extended.txt", "https://raw.githubusercontent.com/blocklistproject/Lists/master/facebook.txt", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/all", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/facebook.com"],
16
16
  fraud: ["https://blocklistproject.github.io/Lists/fraud.txt"],
17
- gambling: ["https://blocklistproject.github.io/Lists/gambling.txt"],
17
+ gambling: ["https://blocklistproject.github.io/Lists/gambling.txt", "https://cdn.jsdelivr.net/gh/hagezi/dns-blocklists@release/adblock/gambling.txt"],
18
18
  gaming: ["https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-ubisoft.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-steam.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-activision.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-blizzard.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-ea.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-epicgames.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-nintendo.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-rockstargames.txt", "https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-roblox.txt"],
19
19
  google: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/google/all"],
20
20
  hate_and_junk: ["https://www.github.developerdan.com/hosts/lists/hate-and-junk-extended.txt"],
@@ -26,13 +26,13 @@ module UrlCategorise
26
26
  nsa: ["https://raw.githubusercontent.com/tigthor/NSA-CIA-Blocklist/main/HOSTS/HOSTS"],
27
27
  phishing: ["https://blocklistproject.github.io/Lists/phishing.txt"],
28
28
  pinterest: ["https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/pinterest/all"],
29
- piracy: ["https://github.com/blocklistproject/Lists/raw/master/piracy.txt"],
29
+ piracy: ["https://github.com/blocklistproject/Lists/raw/master/piracy.txt", "https://cdn.jsdelivr.net/gh/hagezi/dns-blocklists@release/adblock/anti.piracy.txt"],
30
30
  pornography: ["https://blocklistproject.github.io/Lists/porn.txt"],
31
31
  reddit: ["https://raw.githubusercontent.com/nickoppen/pihole-blocklists/master/blocklist-reddit.txt"],
32
32
  redirect: ["https://github.com/blocklistproject/Lists/raw/master/redirect.txt"],
33
33
  scam: ["https://blocklistproject.github.io/Lists/scam.txt"],
34
34
  smart_tv: ["https://github.com/blocklistproject/Lists/raw/master/smart-tv.txt"],
35
- social_media: [:facebook, :instagram, :linkedin, :pinterest, :reddit,:tiktok, :twitter, :whatsapp, :youtube],
35
+ social_media: [:facebook, :instagram, :linkedin, :pinterest, :reddit, :tiktok, :twitter, :whatsapp, :youtube],
36
36
  tiktok: ["https://blocklistproject.github.io/Lists/tiktok.txt"],
37
37
  torrent: ["https://github.com/blocklistproject/Lists/raw/master/torrent.txt"],
38
38
  tracking: ["https://blocklistproject.github.io/Lists/tracking.txt"],
@@ -40,6 +40,67 @@ module UrlCategorise
40
40
  vaping: ["https://github.com/blocklistproject/Lists/raw/master/vaping.txt"],
41
41
  whatsapp: ["https://github.com/blocklistproject/Lists/raw/master/whatsapp.txt", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/facebook/whatsapp"],
42
42
  youtube: ["https://github.com/blocklistproject/Lists/raw/master/youtube.txt", "https://raw.githubusercontent.com/jmdugan/blocklists/master/corporations/google/youtube"],
43
+
44
+ # Hagezi DNS Blocklists - specialized categories only
45
+ threat_intelligence: ["https://cdn.jsdelivr.net/gh/hagezi/dns-blocklists@release/adblock/tif.txt"],
46
+ dyndns: ["https://cdn.jsdelivr.net/gh/hagezi/dns-blocklists@release/adblock/dyndns.txt"],
47
+ badware_hoster: ["https://cdn.jsdelivr.net/gh/hagezi/dns-blocklists@release/adblock/hoster.txt"],
48
+ most_abused_tlds: ["https://cdn.jsdelivr.net/gh/hagezi/dns-blocklists@release/adblock/tlds.txt"],
49
+ newly_registered_domains: ["https://cdn.jsdelivr.net/gh/hagezi/dns-blocklists@release/adblock/nrd.txt"],
50
+ dns_over_https_bypass: ["https://cdn.jsdelivr.net/gh/hagezi/dns-blocklists@release/adblock/doh-vpn-proxy-bypass.txt"],
51
+
52
+ # StevenBlack hosts lists - specific categories only
53
+ fakenews: ["https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews/hosts"],
54
+
55
+ # Security threat lists
56
+ banking_trojans: ["https://feodotracker.abuse.ch/downloads/ipblocklist.txt"],
57
+ malware_domains: ["https://bazaar.abuse.ch/downloads/domain_blocklist.txt"],
58
+ malicious_ssl_certificates: ["https://sslbl.abuse.ch/blacklist/sslipblacklist.txt"],
59
+ threat_indicators: ["https://threatfox.abuse.ch/downloads/hostfile.txt"],
60
+
61
+ # Additional IP-based sanctions and abuse lists
62
+ sanctions_ips: ["https://lists.blocklist.de/lists/all.txt"],
63
+ compromised_ips: ["https://rules.emergingthreats.net/fwrules/emerging-Block-IPs.txt"],
64
+ tor_exit_nodes: ["https://www.dan.me.uk/torlist/"],
65
+ open_proxy_ips: ["https://raw.githubusercontent.com/stamparm/ipsum/master/ipsum.txt"],
66
+
67
+ # Network security feeds
68
+ top_attack_sources: ["https://www.dshield.org/feeds/suspiciousdomains_High.txt"],
69
+ suspicious_domains: ["https://www.dshield.org/feeds/suspiciousdomains_Medium.txt"],
70
+
71
+ # Extended categories for better organization
72
+ cryptojacking: ["https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt"],
73
+ ransomware: ["https://ransomwaretracker.abuse.ch/downloads/RW_DOMBL.txt"],
74
+ botnet_command_control: ["https://osint.bambenekconsulting.com/feeds/c2-dommasterlist.txt"],
75
+ phishing_extended: ["https://openphish.com/feed.txt"],
76
+
77
+ # Regional and specialized lists
78
+ chinese_ad_hosts: ["https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts"],
79
+ korean_ad_hosts: ["https://raw.githubusercontent.com/yous/YousList/master/hosts.txt"],
80
+
81
+ # Mobile and app-specific
82
+ mobile_ads: ["https://raw.githubusercontent.com/AdguardTeam/AdguardFilters/master/MobileFilter/sections/adservers.txt"],
83
+ smart_tv_ads: ["https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt"],
84
+
85
+ # Content and informational categories
86
+ news: ["https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-only/hosts"],
87
+ legitimate_news: ["https://raw.githubusercontent.com/mitchellkrogza/The-Big-List-of-Hacked-Malware-Web-Sites/master/.dev-tools/_domains_fake_news/domains.txt"],
88
+ blogs: ["https://raw.githubusercontent.com/blocklistproject/Lists/master/alt-version/blogs-nl.txt"],
89
+ forums: ["https://raw.githubusercontent.com/blocklistproject/Lists/master/alt-version/forums-nl.txt"],
90
+ educational: ["https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/education/hosts"],
91
+ government: ["https://raw.githubusercontent.com/mitchellkrogza/The-Big-List-of-Hacked-Malware-Web-Sites/master/.dev-tools/_domains_government/domains.txt"],
92
+ health: ["https://raw.githubusercontent.com/blocklistproject/Lists/master/alt-version/health-nl.txt"],
93
+ finance: ["https://raw.githubusercontent.com/blocklistproject/Lists/master/alt-version/finance-nl.txt"],
94
+ streaming: ["https://raw.githubusercontent.com/blocklistproject/Lists/master/alt-version/streaming-nl.txt"],
95
+ shopping: ["https://raw.githubusercontent.com/blocklistproject/Lists/master/alt-version/shopping-nl.txt"],
96
+
97
+ # Professional and business
98
+ business: ["https://raw.githubusercontent.com/blocklistproject/Lists/master/alt-version/business-nl.txt"],
99
+ technology: ["https://raw.githubusercontent.com/blocklistproject/Lists/master/alt-version/tech-nl.txt"],
100
+
101
+ # Regional content
102
+ local_news: ["https://raw.githubusercontent.com/blocklistproject/Lists/master/alt-version/local-news-nl.txt"],
103
+ international_news: ["https://raw.githubusercontent.com/blocklistproject/Lists/master/alt-version/international-news-nl.txt"],
43
104
  }
44
105
  end
45
106
  end
@@ -0,0 +1,105 @@
1
+ begin
2
+ require 'active_record'
3
+ rescue LoadError
4
+ # ActiveRecord not available, skip model definitions
5
+ module UrlCategorise
6
+ module Models
7
+ def self.available?
8
+ false
9
+ end
10
+ end
11
+ end
12
+ else
13
+ module UrlCategorise
14
+ module Models
15
+ def self.available?
16
+ true
17
+ end
18
+
19
+ class ListMetadata < ActiveRecord::Base
20
+ self.table_name = 'url_categorise_list_metadata'
21
+
22
+ validates :name, presence: true, uniqueness: true
23
+ validates :url, presence: true
24
+ validates :categories, presence: true
25
+
26
+ serialize :categories, coder: JSON
27
+
28
+ scope :by_category, ->(category) { where('categories LIKE ?', "%#{category}%") }
29
+ scope :updated_since, ->(time) { where('updated_at > ?', time) }
30
+ end
31
+
32
+ class Domain < ActiveRecord::Base
33
+ self.table_name = 'url_categorise_domains'
34
+
35
+ validates :domain, presence: true, uniqueness: true
36
+ validates :categories, presence: true
37
+
38
+ serialize :categories, coder: JSON
39
+
40
+ scope :by_category, ->(category) { where('categories LIKE ?', "%#{category}%") }
41
+ scope :search, ->(term) { where('domain LIKE ?', "%#{term}%") }
42
+
43
+ def self.categorise(domain_name)
44
+ record = find_by(domain: domain_name.downcase.gsub('www.', ''))
45
+ record ? record.categories : []
46
+ end
47
+ end
48
+
49
+ class IpAddress < ActiveRecord::Base
50
+ self.table_name = 'url_categorise_ip_addresses'
51
+
52
+ validates :ip_address, presence: true, uniqueness: true
53
+ validates :categories, presence: true
54
+
55
+ serialize :categories, coder: JSON
56
+
57
+ scope :by_category, ->(category) { where('categories LIKE ?', "%#{category}%") }
58
+ scope :in_subnet, ->(subnet) { where('ip_address LIKE ?', "#{subnet}%") }
59
+
60
+ def self.categorise(ip)
61
+ record = find_by(ip_address: ip)
62
+ record ? record.categories : []
63
+ end
64
+ end
65
+
66
+ # Generator for Rails integration
67
+ def self.generate_migration
68
+ <<~MIGRATION
69
+ class CreateUrlCategoriseTables < ActiveRecord::Migration[8.0]
70
+ def change
71
+ create_table :url_categorise_list_metadata do |t|
72
+ t.string :name, null: false, index: { unique: true }
73
+ t.string :url, null: false
74
+ t.text :categories, null: false
75
+ t.string :file_path
76
+ t.datetime :fetched_at
77
+ t.string :file_hash
78
+ t.datetime :file_updated_at
79
+ t.timestamps
80
+ end
81
+
82
+ create_table :url_categorise_domains do |t|
83
+ t.string :domain, null: false, index: { unique: true }
84
+ t.text :categories, null: false
85
+ t.timestamps
86
+ end
87
+
88
+ add_index :url_categorise_domains, :domain
89
+ add_index :url_categorise_domains, :categories
90
+
91
+ create_table :url_categorise_ip_addresses do |t|
92
+ t.string :ip_address, null: false, index: { unique: true }
93
+ t.text :categories, null: false
94
+ t.timestamps
95
+ end
96
+
97
+ add_index :url_categorise_ip_addresses, :ip_address
98
+ add_index :url_categorise_ip_addresses, :categories
99
+ end
100
+ end
101
+ MIGRATION
102
+ end
103
+ end
104
+ end
105
+ end
@@ -1,3 +1,3 @@
1
1
  module UrlCategorise
2
- VERSION = "0.0.3"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -1,5 +1,8 @@
1
1
  require 'httparty'
2
2
  require 'nokogiri'
3
+ require 'digest'
4
+ require 'fileutils'
5
+ require 'resolv'
3
6
 
4
7
  require 'api-pattern'
5
8
 
@@ -8,6 +11,14 @@ require 'url_categorise/constants'
8
11
 
9
12
  require 'url_categorise/client'
10
13
 
14
+ # Optional ActiveRecord integration
15
+ begin
16
+ require 'url_categorise/models'
17
+ require 'url_categorise/active_record_client'
18
+ rescue LoadError
19
+ # ActiveRecord not available, skip
20
+ end
21
+
11
22
  module UrlCategorise
12
23
  class Error < StandardError; end
13
24
  end
@@ -21,16 +21,24 @@ Gem::Specification.new do |spec|
21
21
  spec.bindir = "exe"
22
22
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
23
23
  spec.require_paths = ["lib"]
24
+ spec.required_ruby_version = ">= 3.0.0"
24
25
 
25
- spec.add_dependency "api_pattern", "~> 0.0.4"
26
+ spec.add_dependency "api_pattern", ">= 0.0.5", "< 1.0"
27
+ spec.add_dependency "httparty", ">= 0.22.0", "< 1.0"
28
+ spec.add_dependency "nokogiri", ">= 1.16.0", "< 2.0"
29
+ spec.add_dependency "csv", ">= 3.3.0", "< 4.0"
30
+ spec.add_dependency "digest", ">= 3.1.0", "< 4.0"
31
+ spec.add_dependency "fileutils", ">= 1.7.0", "< 2.0"
32
+ spec.add_dependency "resolv", ">= 0.4.0", "< 1.0"
26
33
 
27
34
  # Development dependancies
28
- spec.add_development_dependency "rake", "~> 13.0.6"
29
- spec.add_development_dependency "minitest", "~> 5.18.0"
30
- spec.add_development_dependency "minitest-focus", "~> 1.3.1"
31
- spec.add_development_dependency "minitest-reporters", "~> 1.6.0"
32
- spec.add_development_dependency "timecop", "~> 0.9.6"
33
- spec.add_development_dependency "mocha", "~> 2.0.2"
34
- spec.add_development_dependency "pry", "~> 0.14.2"
35
- spec.add_development_dependency "webmock", "~> 3.18.1"
35
+ spec.add_development_dependency "rake", "~> 13.3.0"
36
+ spec.add_development_dependency "minitest", "~> 5.25.5"
37
+ spec.add_development_dependency "minitest-focus", "~> 1.4.0"
38
+ spec.add_development_dependency "minitest-reporters", "~> 1.7.1"
39
+ spec.add_development_dependency "timecop", "~> 0.9.10"
40
+ spec.add_development_dependency "mocha", "~> 2.4.5"
41
+ spec.add_development_dependency "pry", "~> 0.15.2"
42
+ spec.add_development_dependency "webmock", "~> 3.24.0"
43
+ spec.add_development_dependency "simplecov", "~> 0.22.0"
36
44
  end