UrlCategorise 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,471 @@
+ require 'httparty'
+ require 'csv'
+ require 'digest'
+ require 'fileutils'
+ require 'net/http'
+ require 'timeout'
+ require 'zip'
+ require 'json'
+
+ module UrlCategorise
+   class DatasetProcessor
+     include HTTParty
+
+     KAGGLE_BASE_URL = 'https://www.kaggle.com/api/v1'
+     DEFAULT_DOWNLOAD_PATH = './downloads'
+     DEFAULT_CACHE_PATH = './cache'
+     DEFAULT_TIMEOUT = 30
+     DEFAULT_CREDENTIALS_FILE = File.expand_path('~/.kaggle/kaggle.json')
+
+     attr_reader :username, :api_key, :download_path, :cache_path, :timeout, :kaggle_enabled
+
+     def initialize(username: nil, api_key: nil, credentials_file: nil, download_path: nil, cache_path: nil,
+                    timeout: nil, enable_kaggle: true)
+       @kaggle_enabled = enable_kaggle
+
+       if @kaggle_enabled
+         load_credentials(username, api_key, credentials_file)
+         warn_if_kaggle_credentials_missing
+       else
+         @username = nil
+         @api_key = nil
+       end
+
+       @download_path = download_path || DEFAULT_DOWNLOAD_PATH
+       @cache_path = cache_path || DEFAULT_CACHE_PATH
+       @timeout = timeout || DEFAULT_TIMEOUT
+
+       ensure_directories_exist
+       setup_httparty_options if kaggle_credentials_available?
+     end
+
+     def process_kaggle_dataset(dataset_owner, dataset_name, options = {})
+       unless @kaggle_enabled
+         raise Error, 'Kaggle functionality is disabled. Set enable_kaggle: true to use Kaggle datasets.'
+       end
+
+       dataset_path = "#{dataset_owner}/#{dataset_name}"
+
+       # Check cache first if requested - no credentials needed for cached data
+       if options[:use_cache]
+         cached_data = load_from_cache(generate_cache_key(dataset_path, :kaggle))
+         return cached_data if cached_data
+       end
+
+       # Check if we already have extracted files - no credentials needed
+       extracted_dir = get_extracted_dir(dataset_path)
+       if options[:use_cache] && Dir.exist?(extracted_dir) && !Dir.empty?(extracted_dir)
+         return handle_existing_dataset(extracted_dir, options)
+       end
+
+       # Only require credentials if we need to download fresh data
+       unless kaggle_credentials_available?
+         raise Error, 'Kaggle credentials required for downloading new datasets. ' \
+                      'Set KAGGLE_USERNAME/KAGGLE_KEY environment variables, provide credentials explicitly, ' \
+                      'or place kaggle.json file in ~/.kaggle/ directory.'
+       end
+
+       # Download from Kaggle API
+       response = authenticated_request(:get, "/datasets/download/#{dataset_path}")
+
+       raise Error, "Failed to download Kaggle dataset: #{response.message}" unless response.success?
+
+       # Process the downloaded data
+       result = process_dataset_response(response.body, dataset_path, :kaggle, options)
+
+       # Cache if requested
+       cache_processed_data(generate_cache_key(dataset_path, :kaggle), result) if options[:use_cache] && result
+
+       result
+     end
+
+     def process_csv_dataset(url, options = {})
+       cache_key = generate_cache_key(url, :csv)
+
+       # Check cache first if requested
+       if options[:use_cache]
+         cached_data = load_from_cache(cache_key)
+         return cached_data if cached_data
+       end
+
+       # Download CSV directly
+       response = HTTParty.get(url, timeout: @timeout, follow_redirects: true)
+
+       raise Error, "Failed to download CSV dataset: #{response.message}" unless response.success?
+
+       # Parse CSV content
+       result = parse_csv_content(response.body, options)
+
+       # Cache if requested
+       cache_processed_data(cache_key, result) if options[:use_cache] && result
+
+       result
+     end
+
+     def generate_dataset_hash(data)
+       content = case data
+                 when Hash
+                   data.to_json
+                 when Array
+                   data.to_json
+                 when String
+                   data
+                 else
+                   data.to_s
+                 end
+
+       Digest::SHA256.hexdigest(content)
+     end
+
+     def integrate_dataset_into_categorization(dataset, category_mappings = {})
+       categorized_data = {}
+
+       case dataset
+       when Hash
+         # Single dataset with multiple files
+         dataset.each do |file_name, data|
+           process_dataset_file(data, file_name, category_mappings, categorized_data)
+         end
+       when Array
+         # Single file dataset
+         process_dataset_file(dataset, 'default', category_mappings, categorized_data)
+       else
+         raise Error, "Unsupported dataset format: #{dataset.class}"
+       end
+
+       # Add metadata
+       categorized_data[:_metadata] = {
+         processed_at: Time.now,
+         data_hash: generate_dataset_hash(dataset),
+         total_entries: count_total_entries(dataset)
+       }
+
+       categorized_data
+     end
+
+     private
+
+     def kaggle_credentials_available?
+       valid_credential?(@username) && valid_credential?(@api_key)
+     end
+
+     def warn_if_kaggle_credentials_missing
+       return if kaggle_credentials_available?
+
+       warn 'Warning: Kaggle credentials not found. Kaggle datasets will only work if they are already cached. ' \
+            'To download new Kaggle datasets, set KAGGLE_USERNAME/KAGGLE_KEY environment variables, ' \
+            'provide credentials explicitly, or place kaggle.json file in ~/.kaggle/ directory.'
+     end
+
+     def valid_credential?(credential)
+       credential && !credential.to_s.strip.empty?
+     end
+
+     def load_credentials(username, api_key, credentials_file)
+       # Try provided credentials file first
+       if credentials_file && File.exist?(credentials_file)
+         credentials = load_credentials_from_file(credentials_file)
+         @username = username || credentials['username']
+         @api_key = api_key || credentials['key']
+       # Try default kaggle.json file if no explicit credentials
+       elsif !username && !api_key && File.exist?(DEFAULT_CREDENTIALS_FILE)
+         credentials = load_credentials_from_file(DEFAULT_CREDENTIALS_FILE)
+         @username = credentials['username']
+         @api_key = credentials['key']
+       else
+         # Fall back to environment variables
+         @username = username || ENV['KAGGLE_USERNAME']
+         @api_key = api_key || ENV['KAGGLE_KEY']
+       end
+     end
+
+     def load_credentials_from_file(file_path)
+       content = File.read(file_path)
+       JSON.parse(content)
+     rescue JSON::ParserError => e
+       raise Error, "Invalid credentials file format: #{e.message}"
+     rescue StandardError => e
+       raise Error, "Failed to read credentials file: #{e.message}"
+     end
+
+     def ensure_directories_exist
+       FileUtils.mkdir_p(@download_path) unless Dir.exist?(@download_path)
+       FileUtils.mkdir_p(@cache_path) unless Dir.exist?(@cache_path)
+     end
+
+     def setup_httparty_options
+       self.class.base_uri KAGGLE_BASE_URL
+       self.class.default_options.merge!({
+                                           headers: {
+                                             'User-Agent' => 'url_categorise-ruby-client'
+                                           },
+                                           timeout: @timeout,
+                                           basic_auth: {
+                                             username: @username,
+                                             password: @api_key
+                                           }
+                                         })
+     end
+
+     def authenticated_request(method, endpoint, options = {})
+       self.class.send(method, endpoint, options)
+     rescue Timeout::Error, Net::ReadTimeout, Net::OpenTimeout
+       raise Error, 'Request timed out'
+     rescue StandardError => e
+       raise Error, "Request failed: #{e.message}"
+     end
+
+     def process_dataset_response(content, dataset_path, source_type, options)
+       if source_type == :kaggle
+         # Kaggle returns ZIP files
+         zip_file = save_zip_file(dataset_path, content)
+         extracted_dir = get_extracted_dir(dataset_path)
+         extract_zip_file(zip_file, extracted_dir)
+         File.delete(zip_file) if File.exist?(zip_file)
+         handle_extracted_dataset(extracted_dir, options)
+       else
+         # Direct content processing
+         parse_csv_content(content, options)
+       end
+     end
+
+     def get_extracted_dir(dataset_path)
+       dir_name = dataset_path.gsub('/', '_').gsub(/[^a-zA-Z0-9_-]/, '_')
+       File.join(@download_path, dir_name)
+     end
+
+     def save_zip_file(dataset_path, content)
+       filename = "#{dataset_path.gsub('/', '_')}_#{Time.now.to_i}.zip"
+       file_path = File.join(@download_path, filename)
+
+       File.open(file_path, 'wb') do |file|
+         file.write(content)
+       end
+
+       file_path
+     end
+
+     def extract_zip_file(zip_file_path, extract_to_dir)
+       FileUtils.mkdir_p(extract_to_dir)
+
+       Zip::File.open(zip_file_path) do |zip_file|
+         zip_file.each do |entry|
+           extract_path = File.join(extract_to_dir, entry.name)
+
+           if entry.directory?
+             FileUtils.mkdir_p(extract_path)
+           else
+             parent_dir = File.dirname(extract_path)
+             FileUtils.mkdir_p(parent_dir) unless Dir.exist?(parent_dir)
+
+             File.open(extract_path, 'wb') do |f|
+               f.write entry.get_input_stream.read
+             end
+           end
+         end
+       end
+     rescue Zip::Error => e
+       raise Error, "Failed to extract zip file: #{e.message}"
+     end
+
+     def handle_existing_dataset(extracted_dir, _options)
+       csv_files = find_csv_files(extracted_dir)
+       return parse_csv_files_to_hash(csv_files) unless csv_files.empty?
+
+       extracted_dir
+     end
+
+     def handle_extracted_dataset(extracted_dir, _options)
+       csv_files = find_csv_files(extracted_dir)
+       return parse_csv_files_to_hash(csv_files) unless csv_files.empty?
+
+       extracted_dir
+     end
+
+     def find_csv_files(directory)
+       Dir.glob(File.join(directory, '**', '*.csv'))
+     end
+
+     def parse_csv_files_to_hash(csv_files)
+       result = {}
+
+       csv_files.each do |csv_file|
+         file_name = File.basename(csv_file, '.csv')
+         result[file_name] = parse_csv_file(csv_file)
+       end
+
+       # If there's only one CSV file, return its data directly
+       result.length == 1 ? result.values.first : result
+     end
+
+     def parse_csv_file(file_path)
+       raise Error, "File does not exist: #{file_path}" unless File.exist?(file_path)
+
+       data = []
+       CSV.foreach(file_path, headers: true, liberal_parsing: true) do |row|
+         data << row.to_hash
+       end
+
+       data
+     rescue CSV::MalformedCSVError => e
+       raise Error, "Failed to parse CSV file: #{e.message}"
+     end
+
+     def parse_csv_content(content, _options = {})
+       data = []
+       CSV.parse(content, headers: true, liberal_parsing: true) do |row|
+         data << row.to_hash
+       end
+
+       data
+     rescue CSV::MalformedCSVError => e
+       raise Error, "Failed to parse CSV content: #{e.message}"
+     end
+
+     def generate_cache_key(identifier, source_type)
+       sanitized = identifier.gsub(/[^a-zA-Z0-9_-]/, '_')
+       "#{source_type}_#{sanitized}_processed.json"
+     end
+
+     def load_from_cache(cache_key)
+       cache_file_path = File.join(@cache_path, cache_key)
+       return nil unless File.exist?(cache_file_path)
+
+       content = File.read(cache_file_path)
+       JSON.parse(content)
+     rescue JSON::ParserError
+       nil # Invalid cache, will re-process
+     rescue StandardError
+       nil # Cache read error, will re-process
+     end
+
+     def cache_processed_data(cache_key, data)
+       cache_file_path = File.join(@cache_path, cache_key)
+       File.write(cache_file_path, JSON.pretty_generate(data))
+     rescue StandardError
+       # Cache write failed, continue without caching
+     end
+
+     def process_dataset_file(data, file_name, category_mappings, categorized_data)
+       return unless data.is_a?(Array) && !data.empty?
+
+       # If explicit column mappings are provided, use them for all rows
+       if category_mappings[:url_column] && category_mappings[:category_column]
+         url_col = category_mappings[:url_column]
+         category_col = category_mappings[:category_column]
+
+         data.each do |row|
+           url = row[url_col]&.strip
+           next unless url && !url.empty?
+
+           # Extract domain from URL
+           domain = extract_domain(url)
+           next unless domain
+
+           # Determine category
+           category = determine_category(row, category_col, category_mappings, file_name)
+
+           # Add to categorized data
+           categorized_data[category] ||= []
+           categorized_data[category] << domain unless categorized_data[category].include?(domain)
+         end
+       else
+         # Auto-detect columns for each row (handles mixed column structures)
+         data.each do |row|
+           url_columns = detect_url_columns(row)
+           category_columns = detect_category_columns(row)
+
+           # Use detected columns for this specific row
+           url_col = url_columns.first
+           category_col = category_columns.first
+
+           next unless url_col # Must have URL column
+
+           url = row[url_col]&.strip
+           next unless url && !url.empty?
+
+           # Extract domain from URL
+           domain = extract_domain(url)
+           next unless domain
+
+           # Determine category
+           category = determine_category(row, category_col, category_mappings, file_name)
+
+           # Add to categorized data
+           categorized_data[category] ||= []
+           categorized_data[category] << domain unless categorized_data[category].include?(domain)
+         end
+       end
+     end
+
+     def detect_url_columns(sample_row)
+       url_indicators = %w[url domain website site link address]
+       sample_row.keys.select do |key|
+         key_lower = key.to_s.downcase
+         url_indicators.any? { |indicator| key_lower.include?(indicator) }
+       end
+     end
+
+     def detect_category_columns(sample_row)
+       category_indicators = %w[category class type classification label]
+       sample_row.keys.select do |key|
+         key_lower = key.to_s.downcase
+         category_indicators.any? { |indicator| key_lower.include?(indicator) }
+       end
+     end
+
+     def extract_domain(url)
+       # Handle both full URLs and domain-only entries
+       return nil if url.nil? || url.empty?
+
+       # Add protocol if missing
+       url = "http://#{url}" unless url.match?(%r{\A\w+://})
+
+       uri = URI.parse(url)
+       domain = uri.host&.downcase
+       domain = domain.gsub(/\Awww\./, '') if domain # Remove www prefix
+       domain
+     rescue URI::InvalidURIError
+       # If URI parsing fails, try to extract domain manually
+       cleaned = url.gsub(%r{\A\w+://}, '').gsub(%r{/.*\z}, '').downcase
+       cleaned = cleaned.gsub(/\Awww\./, '')
+       cleaned.empty? ? nil : cleaned
+     end
+
+     def determine_category(row, category_col, category_mappings, file_name)
+       # Use explicit category column if available
+       if category_col && row[category_col]
+         category = row[category_col].to_s.strip.downcase
+         return map_category_name(category, category_mappings)
+       end
+
+       # Use file name as category if no category column
+       map_category_name(file_name, category_mappings)
+     end
+
+     def map_category_name(original_name, category_mappings)
+       # Use provided mapping or sanitize the name
+       mapped = category_mappings[:category_map]&.[](original_name)
+       return mapped if mapped
+
+       # Sanitize and format category name
+       sanitized = original_name.to_s.downcase
+                                .gsub(/[^a-z0-9_]/, '_')
+                                .gsub(/_+/, '_')
+                                .gsub(/\A_|_\z/, '')
+
+       sanitized.empty? ? 'dataset_category' : sanitized
+     end
+
+     def count_total_entries(dataset)
+       case dataset
+       when Hash
+         dataset.values.map { |v| v.is_a?(Array) ? v.length : 1 }.sum
+       when Array
+         dataset.length
+       else
+         1
+       end
+     end
+   end
+ end
@@ -18,28 +18,28 @@ else
 
        class ListMetadata < ActiveRecord::Base
          self.table_name = 'url_categorise_list_metadata'
-
+
          validates :name, presence: true, uniqueness: true
          validates :url, presence: true
          validates :categories, presence: true
-
+
          serialize :categories, coder: JSON
-
+
          scope :by_category, ->(category) { where('categories LIKE ?', "%#{category}%") }
          scope :updated_since, ->(time) { where('updated_at > ?', time) }
        end
 
        class Domain < ActiveRecord::Base
          self.table_name = 'url_categorise_domains'
-
+
          validates :domain, presence: true, uniqueness: true
          validates :categories, presence: true
-
+
          serialize :categories, coder: JSON
-
+
          scope :by_category, ->(category) { where('categories LIKE ?', "%#{category}%") }
          scope :search, ->(term) { where('domain LIKE ?', "%#{term}%") }
-
+
          def self.categorise(domain_name)
            record = find_by(domain: domain_name.downcase.gsub('www.', ''))
            record ? record.categories : []
@@ -48,21 +48,45 @@ else
 
        class IpAddress < ActiveRecord::Base
          self.table_name = 'url_categorise_ip_addresses'
-
+
          validates :ip_address, presence: true, uniqueness: true
          validates :categories, presence: true
-
+
          serialize :categories, coder: JSON
-
+
          scope :by_category, ->(category) { where('categories LIKE ?', "%#{category}%") }
          scope :in_subnet, ->(subnet) { where('ip_address LIKE ?', "#{subnet}%") }
-
+
          def self.categorise(ip)
            record = find_by(ip_address: ip)
            record ? record.categories : []
          end
        end
 
+       class DatasetMetadata < ActiveRecord::Base
+         self.table_name = 'url_categorise_dataset_metadata'
+
+         validates :source_type, presence: true, inclusion: { in: %w[kaggle csv] }
+         validates :identifier, presence: true
+         validates :data_hash, presence: true, uniqueness: true
+         validates :total_entries, presence: true, numericality: { greater_than: 0 }
+
+         serialize :category_mappings, coder: JSON
+         serialize :processing_options, coder: JSON
+
+         scope :by_source, ->(source) { where(source_type: source) }
+         scope :by_identifier, ->(identifier) { where(identifier: identifier) }
+         scope :processed_since, ->(time) { where('processed_at > ?', time) }
+
+         def kaggle_dataset?
+           source_type == 'kaggle'
+         end
+
+         def csv_dataset?
+           source_type == 'csv'
+         end
+       end
+
        # Generator for Rails integration
        def self.generate_migration
          <<~MIGRATION
@@ -84,7 +108,7 @@ else
                  t.text :categories, null: false
                  t.timestamps
                end
-
+               #{' '}
                add_index :url_categorise_domains, :domain
                add_index :url_categorise_domains, :categories
 
@@ -93,13 +117,28 @@ else
                  t.text :categories, null: false
                  t.timestamps
                end
-
+               #{' '}
                add_index :url_categorise_ip_addresses, :ip_address
                add_index :url_categorise_ip_addresses, :categories
+
+               create_table :url_categorise_dataset_metadata do |t|
+                 t.string :source_type, null: false, index: true
+                 t.string :identifier, null: false
+                 t.string :data_hash, null: false, index: { unique: true }
+                 t.integer :total_entries, null: false
+                 t.text :category_mappings
+                 t.text :processing_options
+                 t.datetime :processed_at
+                 t.timestamps
+               end
+               #{' '}
+               add_index :url_categorise_dataset_metadata, :source_type
+               add_index :url_categorise_dataset_metadata, :identifier
+               add_index :url_categorise_dataset_metadata, :processed_at
              end
            end
          MIGRATION
        end
      end
    end
- end
+ end
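
Alongside the DatasetProcessor, this release adds a DatasetMetadata ActiveRecord model and a url_categorise_dataset_metadata table to the generated migration, so processed datasets can be tracked by source, identifier, and content hash. A rough sketch of querying it follows, assuming the generated migration has been run in a Rails app and that the model is reachable under UrlCategorise::Models (the enclosing module is not visible in this hunk):

    # List Kaggle-sourced datasets processed in the last week (illustrative only).
    recent = UrlCategorise::Models::DatasetMetadata
               .by_source('kaggle')
               .processed_since(1.week.ago)

    recent.each do |meta|
      puts "#{meta.identifier}: #{meta.total_entries} entries (hash #{meta.data_hash[0, 12]})"
    end
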
@@ -1,3 +1,3 @@
  module UrlCategorise
-   VERSION = "0.1.2"
+   VERSION = '0.1.3'
  end
@@ -8,6 +8,7 @@ require 'api-pattern'
 
  require 'url_categorise/version'
  require 'url_categorise/constants'
+ require 'url_categorise/dataset_processor'
 
  require 'url_categorise/client'