UrlCategorise 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +2 -1
- data/.gitignore +1 -0
- data/CLAUDE.md +71 -8
- data/Gemfile.lock +5 -1
- data/README.md +129 -11
- data/bin/export_csv +44 -7
- data/bin/generate_video_lists +373 -0
- data/docs/video-url-detection.md +353 -0
- data/lib/url_categorise/client.rb +320 -58
- data/lib/url_categorise/constants.rb +9 -6
- data/lib/url_categorise/dataset_processor.rb +18 -6
- data/lib/url_categorise/iab_compliance.rb +2 -0
- data/lib/url_categorise/version.rb +1 -1
- data/lists/video_hosting_domains.hosts +7057 -0
- data/lists/video_url_patterns.txt +297 -0
- data/url_categorise.gemspec +1 -0
- metadata +19 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e4725733a26240e3bc23bfe9292d3ed0222db4e0308c5228da12a9ede1347e4b
|
4
|
+
data.tar.gz: a315a03357b48260c543459fab91ac8ec0601a0d7001b120bdc6f72e543e3671
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 74c8be653a2ae97ad74300105a95c326685d9669defd5c980cf16636e96965080a587453c21d5af6014874145f8f93d66c444574142f1568588f82b9577a7440
|
7
|
+
data.tar.gz: 450ae86237aa6a743a289e9b1165b9bae49f78068af8d727114763629b863567dc37de1b2ba38522e93a0ae08dd1683b79a062df2e2d18e76884fd4f8415c1db
|
data/.claude/settings.local.json
CHANGED
data/.gitignore
CHANGED
data/CLAUDE.md
CHANGED
@@ -103,6 +103,34 @@ The gem includes automatic monitoring and cleanup of broken URLs:
|
|
103
103
|
|
104
104
|
### New Features (Latest Version)
|
105
105
|
|
106
|
+
#### Video Hosting Detection and Regex Categorization
|
107
|
+
Advanced video content detection system with:
|
108
|
+
|
109
|
+
- **Comprehensive Video Hosting Lists**: Generate PiHole-compatible hosts files from yt-dlp extractors
|
110
|
+
- **Regex-Based Content Detection**: Distinguish between video content URLs vs homepages/profiles/playlists
|
111
|
+
- **Direct Video URL Detection**: `video_url?` method to check if URLs are direct video content links
|
112
|
+
- **Automatic List Generation**: `bin/generate_video_lists` script fetches and processes yt-dlp data
|
113
|
+
- **Video Hosting Category**: Separate `video_hosting` category with 3,500+ domains
|
114
|
+
- **Smart Content Categorization**: URLs matching video patterns get `*_content` suffix categories
|
115
|
+
- **Remote Pattern Files**: Automatically downloads video patterns from GitHub repository
|
116
|
+
|
117
|
+
```ruby
|
118
|
+
# Enable regex categorization for video content detection (uses remote patterns by default)
|
119
|
+
client = UrlCategorise::Client.new(regex_categorization: true)
|
120
|
+
|
121
|
+
# Basic domain categorization
|
122
|
+
client.categorise('https://youtube.com') # => [:video_hosting]
|
123
|
+
|
124
|
+
# Enhanced content detection
|
125
|
+
client.categorise('https://youtube.com/watch?v=abc123') # => [:video_hosting, :video_hosting_content]
|
126
|
+
|
127
|
+
# Direct video URL detection
|
128
|
+
client.video_url?('https://youtube.com/watch?v=abc123') # => true
|
129
|
+
client.video_url?('https://youtube.com') # => false
|
130
|
+
client.video_url?('https://vimeo.com/123456789') # => true
|
131
|
+
client.video_url?('https://tiktok.com/@user/video/123') # => true
|
132
|
+
```
|
133
|
+
|
106
134
|
#### Dynamic Settings with ActiveAttr
|
107
135
|
The Client class now uses ActiveAttr to provide dynamic attribute modification:
|
108
136
|
|
@@ -141,7 +169,7 @@ result = client.export_hosts_files('/custom/export/path')
|
|
141
169
|
```
|
142
170
|
|
143
171
|
##### CSV Data Export
|
144
|
-
Export all data as a single CSV file for AI training and analysis:
|
172
|
+
Export all data as a single comprehensive CSV file for AI training and analysis:
|
145
173
|
|
146
174
|
```ruby
|
147
175
|
# Export to default location (cache_dir/exports/csv or ./exports/csv)
|
@@ -150,33 +178,68 @@ result = client.export_csv_data
|
|
150
178
|
# Export to custom location
|
151
179
|
result = client.export_csv_data('/custom/export/path')
|
152
180
|
|
153
|
-
#
|
154
|
-
#
|
181
|
+
# Returns:
|
182
|
+
# {
|
183
|
+
# csv_file: 'url_categorise_comprehensive_export_TIMESTAMP.csv',
|
184
|
+
# summary_file: 'export_summary_TIMESTAMP.json',
|
185
|
+
# total_entries: 75000,
|
186
|
+
# summary: { domain_categorization_entries: 50000, dataset_content_entries: 25000 },
|
187
|
+
# export_directory: '/export/path'
|
188
|
+
# }
|
155
189
|
```
|
156
190
|
|
191
|
+
**Comprehensive Export Features:**
|
192
|
+
- **Everything in One File**: Combined domains, categories, and raw dataset content
|
193
|
+
- **Rich Dataset Content**: Original titles, descriptions, summaries, and text from datasets
|
194
|
+
- **Dynamic Headers**: Automatically includes all available fields from any dataset
|
195
|
+
- **Data Type Tracking**: Distinguishes between processed domains and raw dataset entries
|
196
|
+
- **Perfect for AI/ML**: Single file with both structured categorization and rich textual features
|
197
|
+
|
157
198
|
#### CLI Commands
|
158
|
-
|
199
|
+
Command-line utilities for comprehensive data export:
|
159
200
|
|
160
201
|
```bash
|
161
|
-
# Export hosts files
|
202
|
+
# Export hosts files per category
|
162
203
|
$ bundle exec export_hosts --output /tmp/hosts --verbose
|
163
204
|
|
164
|
-
#
|
165
|
-
$ bundle exec export_csv --
|
205
|
+
# Full CSV export with datasets and all features
|
206
|
+
$ bundle exec export_csv --auto-load-datasets --iab-compliance --smart-categorization --verbose
|
207
|
+
|
208
|
+
# Custom configuration export
|
209
|
+
$ bundle exec export_csv --cache-dir ./custom_cache --kaggle-credentials ~/kaggle.json --output /tmp/export
|
166
210
|
|
167
|
-
#
|
211
|
+
# Basic domain categorization only
|
212
|
+
$ bundle exec export_csv --output /tmp/basic
|
213
|
+
|
214
|
+
# Health check for all blocklist URLs
|
168
215
|
$ bundle exec check_lists
|
216
|
+
|
217
|
+
# Generate updated video hosting lists
|
218
|
+
$ ruby bin/generate_video_lists
|
169
219
|
```
|
170
220
|
|
221
|
+
**Enhanced CLI Features:**
|
222
|
+
- `--auto-load-datasets`: Automatically load datasets from constants for rich content export
|
223
|
+
- `--kaggle-credentials FILE`: Custom Kaggle API credentials file path
|
224
|
+
- Full integration with all client features (IAB compliance, smart categorization, etc.)
|
225
|
+
- Verbose output shows dataset statistics and loading progress
|
226
|
+
- Video list generation from yt-dlp extractors with manual curation
|
227
|
+
|
171
228
|
### List Sources
|
172
229
|
Primary sources include:
|
173
230
|
- The Block List Project
|
174
231
|
- hagezi/dns-blocklists
|
175
232
|
- StevenBlack/hosts
|
176
233
|
- Various specialized security lists
|
234
|
+
- **yt-dlp video extractors**: Comprehensive video hosting domain detection (3,500+ domains)
|
235
|
+
- **GitHub-hosted video patterns**: Remote video URL detection patterns with manual curation
|
177
236
|
- **Kaggle datasets**: Public URL classification datasets
|
178
237
|
- **Custom CSV files**: Direct CSV dataset URLs with flexible column mapping
|
179
238
|
|
239
|
+
**Video hosting lists are now automatically fetched from:**
|
240
|
+
- Video domains: `https://raw.githubusercontent.com/TRex22/url_categorise/refs/heads/main/lists/video_hosting_domains.hosts`
|
241
|
+
- URL patterns: `https://raw.githubusercontent.com/TRex22/url_categorise/refs/heads/main/lists/video_url_patterns.txt`
|
242
|
+
|
180
243
|
### Testing Guidelines
|
181
244
|
- Mock all HTTP requests using WebMock
|
182
245
|
- Test both success and failure scenarios
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
UrlCategorise (0.1.6)
|
4
|
+
UrlCategorise (0.1.7)
|
5
5
|
active_attr (>= 0.17.1, < 1.0)
|
6
6
|
api_pattern (>= 0.0.6, < 1.0)
|
7
7
|
csv (>= 3.3.0, < 4.0)
|
@@ -11,6 +11,7 @@ PATH
|
|
11
11
|
json (>= 2.7.0, < 3.0)
|
12
12
|
kaggle (>= 0.0.3, < 1.0)
|
13
13
|
nokogiri (>= 1.18.9, < 2.0)
|
14
|
+
reline (>= 0.6.2)
|
14
15
|
resolv (>= 0.4.0, < 1.0)
|
15
16
|
rubyzip (>= 2.3.0, < 3.0)
|
16
17
|
|
@@ -88,6 +89,7 @@ GEM
|
|
88
89
|
multi_xml (>= 0.5.2)
|
89
90
|
i18n (1.14.7)
|
90
91
|
concurrent-ruby (~> 1.0)
|
92
|
+
io-console (0.8.1)
|
91
93
|
json (2.13.2)
|
92
94
|
kaggle (0.0.3)
|
93
95
|
csv (>= 3.3)
|
@@ -137,6 +139,8 @@ GEM
|
|
137
139
|
loofah (~> 2.21)
|
138
140
|
nokogiri (>= 1.15.7, != 1.16.7, != 1.16.6, != 1.16.5, != 1.16.4, != 1.16.3, != 1.16.2, != 1.16.1, != 1.16.0.rc1, != 1.16.0)
|
139
141
|
rake (13.3.0)
|
142
|
+
reline (0.6.2)
|
143
|
+
io-console (~> 0.5)
|
140
144
|
resolv (0.6.2)
|
141
145
|
rexml (3.4.1)
|
142
146
|
ruby-progressbar (1.13.0)
|
data/README.md
CHANGED
@@ -5,6 +5,8 @@ A comprehensive Ruby gem for categorizing URLs and domains based on various secu
|
|
5
5
|
## Features
|
6
6
|
|
7
7
|
- **Comprehensive Coverage**: 60+ high-quality categories including security, content, and specialized lists
|
8
|
+
- **Video Content Detection**: Advanced regex-based categorization with `video_url?` method to distinguish video content from other website resources
|
9
|
+
- **Custom Video Lists**: Generate and maintain comprehensive video hosting domain lists using yt-dlp extractors
|
8
10
|
- **Kaggle Dataset Integration**: Automatic loading and processing of machine learning datasets from Kaggle
|
9
11
|
- **Multiple Data Sources**: Supports blocklists, CSV datasets, and Kaggle ML datasets
|
10
12
|
- **Multiple List Formats**: Supports hosts files, pfSense, AdSense, uBlock Origin, dnsmasq, and plain text formats
|
@@ -115,7 +117,7 @@ Each category gets its own hosts file (e.g., `malware.hosts`, `advertising.hosts
|
|
115
117
|
|
116
118
|
#### CSV Data Export
|
117
119
|
|
118
|
-
Export all data as a single CSV file for AI training and analysis:
|
120
|
+
Export all data as a single comprehensive CSV file for AI training and analysis:
|
119
121
|
|
120
122
|
```ruby
|
121
123
|
# Export to default location
|
@@ -125,30 +127,63 @@ result = client.export_csv_data
|
|
125
127
|
client.iab_compliance_enabled = true
|
126
128
|
result = client.export_csv_data('/custom/export/path')
|
127
129
|
|
128
|
-
#
|
129
|
-
#
|
130
|
-
#
|
131
|
-
#
|
132
|
-
|
133
|
-
#
|
134
|
-
#
|
130
|
+
# Returns information about created files:
|
131
|
+
# {
|
132
|
+
# csv_file: '/path/url_categorise_comprehensive_export_20231201_143022.csv',
|
133
|
+
# summary_file: '/path/export_summary_20231201_143022.json',
|
134
|
+
# total_entries: 50000,
|
135
|
+
# summary: { ... },
|
136
|
+
# export_directory: '/path'
|
137
|
+
# }
|
135
138
|
```
|
136
139
|
|
140
|
+
**Single comprehensive CSV file contains:**
|
141
|
+
|
142
|
+
- **Domain Categorization Data**: All processed domains with categories, source types, IAB mappings
|
143
|
+
- **Raw Dataset Content**: Original dataset entries with titles, descriptions, text, summaries, and all available fields
|
144
|
+
- **Dynamic Headers**: Automatically adapts to include all available data fields
|
145
|
+
- **Data Type Column**: Distinguishes between 'domain_categorization', 'raw_dataset_content', etc.
|
146
|
+
|
147
|
+
**Key Features:**
|
148
|
+
- Everything in one file for easy analysis and AI/ML training
|
149
|
+
- Rich textual content from original datasets
|
150
|
+
- IAB Content Taxonomy compliance mapping
|
151
|
+
- Smart categorization metadata
|
152
|
+
- Source type tracking (dataset vs blocklist)
|
153
|
+
|
137
154
|
#### CLI Commands
|
138
155
|
|
139
|
-
|
156
|
+
Command-line utilities for data export:
|
140
157
|
|
141
158
|
```bash
|
142
159
|
# Export hosts files
|
143
160
|
$ bundle exec export_hosts --output /tmp/hosts --verbose
|
144
161
|
|
145
|
-
# Export CSV data with
|
146
|
-
$ bundle exec export_csv --output /tmp/csv --iab-compliance --verbose
|
162
|
+
# Export CSV data with all features enabled
|
163
|
+
$ bundle exec export_csv --output /tmp/csv --iab-compliance --smart-categorization --auto-load-datasets --verbose
|
164
|
+
|
165
|
+
# Generate updated video hosting lists
|
166
|
+
$ ruby bin/generate_video_lists
|
167
|
+
|
168
|
+
# Check health of all blocklist URLs
|
169
|
+
$ bundle exec check_lists
|
170
|
+
|
171
|
+
# Export with custom Kaggle credentials
|
172
|
+
$ bundle exec export_csv --auto-load-datasets --kaggle-credentials ~/my-kaggle.json --verbose
|
173
|
+
|
174
|
+
# Basic export (domains only)
|
175
|
+
$ bundle exec export_csv --output /tmp/csv
|
147
176
|
|
148
177
|
# Check URL health (existing command)
|
149
178
|
$ bundle exec check_lists
|
150
179
|
```
|
151
180
|
|
181
|
+
**Key CLI Options:**
|
182
|
+
- `--auto-load-datasets`: Load datasets from constants to include rich text content
|
183
|
+
- `--kaggle-credentials FILE`: Specify custom Kaggle credentials file
|
184
|
+
- `--iab-compliance`: Enable IAB Content Taxonomy mapping
|
185
|
+
- `--smart-categorization`: Enable intelligent category filtering
|
186
|
+
|
152
187
|
## Advanced Configuration
|
153
188
|
|
154
189
|
### File Caching
|
@@ -228,6 +263,89 @@ host_urls = {
|
|
228
263
|
client = UrlCategorise::Client.new(host_urls: host_urls)
|
229
264
|
```
|
230
265
|
|
266
|
+
### Video Content Detection
|
267
|
+
|
268
|
+
The gem includes advanced regex-based categorization specifically for video hosting platforms. This helps distinguish between actual video content URLs and other resources like homepages, user profiles, playlists, or community content.
|
269
|
+
|
270
|
+
#### Video Hosting Domains
|
271
|
+
|
272
|
+
The gem maintains a comprehensive list of video hosting domains extracted from yt-dlp (YouTube-dl fork) extractors:
|
273
|
+
|
274
|
+
```ruby
|
275
|
+
# Generate/update video hosting lists
|
276
|
+
system("ruby bin/generate_video_lists")
|
277
|
+
|
278
|
+
# Use video hosting categorization
|
279
|
+
client = UrlCategorise::Client.new
|
280
|
+
categories = client.categorise("youtube.com")
|
281
|
+
# => [:video_hosting]
|
282
|
+
```
|
283
|
+
|
284
|
+
#### Video Content vs Other Resources
|
285
|
+
|
286
|
+
Enable regex categorization to distinguish video content from other resources:
|
287
|
+
|
288
|
+
```ruby
|
289
|
+
client = UrlCategorise::Client.new(
|
290
|
+
regex_categorization: true # Uses remote video patterns by default
|
291
|
+
)
|
292
|
+
|
293
|
+
# Regular homepage gets basic category
|
294
|
+
client.categorise("https://youtube.com")
|
295
|
+
# => [:video_hosting]
|
296
|
+
|
297
|
+
# Actual video URL gets enhanced categorization
|
298
|
+
client.categorise("https://youtube.com/watch?v=dQw4w9WgXcQ")
|
299
|
+
# => [:video_hosting, :video_hosting_content]
|
300
|
+
|
301
|
+
# User profile page - no content enhancement
|
302
|
+
client.categorise("https://youtube.com/@username")
|
303
|
+
# => [:video_hosting]
|
304
|
+
```
|
305
|
+
|
306
|
+
#### Direct Video URL Detection
|
307
|
+
|
308
|
+
Use the `video_url?` method to check if a URL is a direct link to video content:
|
309
|
+
|
310
|
+
```ruby
|
311
|
+
client = UrlCategorise::Client.new(regex_categorization: true)
|
312
|
+
|
313
|
+
# Check if URLs are direct video content links
|
314
|
+
client.video_url?("https://youtube.com/watch?v=dQw4w9WgXcQ") # => true
|
315
|
+
client.video_url?("https://youtube.com") # => false
|
316
|
+
client.video_url?("https://youtube.com/@channel") # => false
|
317
|
+
client.video_url?("https://vimeo.com/123456789") # => true
|
318
|
+
client.video_url?("https://tiktok.com/@user/video/123") # => true
|
319
|
+
|
320
|
+
# Works with various video hosting platforms
|
321
|
+
client.video_url?("https://dailymotion.com/video/x7abc123") # => true
|
322
|
+
client.video_url?("https://twitch.tv/videos/1234567890") # => true
|
323
|
+
|
324
|
+
# Returns false for non-video domains
|
325
|
+
client.video_url?("https://google.com/search?q=cats") # => false
|
326
|
+
```
|
327
|
+
|
328
|
+
**How it works:**
|
329
|
+
1. First checks if the URL is from a known video hosting domain
|
330
|
+
2. Then uses regex patterns to determine if it's a direct video content URL
|
331
|
+
3. Returns `true` only if both conditions are met
|
332
|
+
4. Handles invalid URLs gracefully (returns `false`)
|
333
|
+
|
334
|
+
#### Maintaining Video Lists
|
335
|
+
|
336
|
+
The gem includes a script to generate and maintain comprehensive video hosting lists:
|
337
|
+
|
338
|
+
```bash
|
339
|
+
# Generate updated video hosting lists
|
340
|
+
ruby bin/generate_video_lists
|
341
|
+
|
342
|
+
# This creates:
|
343
|
+
# - lists/video_hosting_domains.hosts (PiHole compatible)
|
344
|
+
# - lists/video_url_patterns.txt (Regex patterns for content detection)
|
345
|
+
```
|
346
|
+
|
347
|
+
The script fetches data from yt-dlp extractors and combines it with manually curated major platforms to ensure comprehensive coverage.
|
348
|
+
|
231
349
|
### Smart Categorization (Post-Processing)
|
232
350
|
|
233
351
|
Smart categorization solves the problem of overly broad domain-level categorization. For example, `reddit.com` might appear in health & fitness blocklists, but not all Reddit content is health-related.
|
data/bin/export_csv
CHANGED
@@ -9,13 +9,15 @@ options = {
|
|
9
9
|
cache_dir: nil,
|
10
10
|
verbose: false,
|
11
11
|
iab_compliance: false,
|
12
|
-
smart_categorization: false
|
12
|
+
smart_categorization: false,
|
13
|
+
auto_load_datasets: false,
|
14
|
+
kaggle_credentials_file: nil
|
13
15
|
}
|
14
16
|
|
15
17
|
OptionParser.new do |opts|
|
16
18
|
opts.banner = "Usage: #{$0} [options]"
|
17
19
|
opts.separator ""
|
18
|
-
opts.separator "Export all categorized domains and
|
20
|
+
opts.separator "Export all categorized domains and dataset content as a single CSV file for AI training"
|
19
21
|
opts.separator ""
|
20
22
|
|
21
23
|
opts.on("-o", "--output PATH", "Output directory path (default: cache_dir/exports/csv or ./exports/csv)") do |path|
|
@@ -34,6 +36,14 @@ OptionParser.new do |opts|
|
|
34
36
|
options[:smart_categorization] = true
|
35
37
|
end
|
36
38
|
|
39
|
+
opts.on("--auto-load-datasets", "Auto-load datasets from constants for rich content export") do
|
40
|
+
options[:auto_load_datasets] = true
|
41
|
+
end
|
42
|
+
|
43
|
+
opts.on("--kaggle-credentials FILE", "Path to Kaggle credentials file (default: ~/.kaggle/kaggle.json)") do |file|
|
44
|
+
options[:kaggle_credentials_file] = file
|
45
|
+
end
|
46
|
+
|
37
47
|
opts.on("-v", "--verbose", "Verbose output") do
|
38
48
|
options[:verbose] = true
|
39
49
|
end
|
@@ -48,12 +58,38 @@ puts "=== UrlCategorise CSV Data Export ===" if options[:verbose]
|
|
48
58
|
puts "Initializing client..." if options[:verbose]
|
49
59
|
|
50
60
|
begin
|
61
|
+
# Build dataset config if datasets should be loaded
|
62
|
+
dataset_config = {}
|
63
|
+
if options[:auto_load_datasets]
|
64
|
+
dataset_config = {
|
65
|
+
cache_path: options[:cache_dir] ? File.join(options[:cache_dir], 'datasets') : './url_cache/datasets',
|
66
|
+
download_path: options[:cache_dir] ? File.join(options[:cache_dir], 'downloads') : './url_cache/downloads'
|
67
|
+
}
|
68
|
+
|
69
|
+
# Add Kaggle credentials if provided
|
70
|
+
if options[:kaggle_credentials_file]
|
71
|
+
dataset_config[:kaggle] = { credentials_file: options[:kaggle_credentials_file] }
|
72
|
+
elsif File.exist?(File.expand_path('~/.kaggle/kaggle.json'))
|
73
|
+
dataset_config[:kaggle] = { credentials_file: '~/.kaggle/kaggle.json' }
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
51
77
|
client = UrlCategorise::Client.new(
|
52
78
|
cache_dir: options[:cache_dir],
|
53
79
|
iab_compliance: options[:iab_compliance],
|
54
|
-
smart_categorization: options[:smart_categorization]
|
80
|
+
smart_categorization: options[:smart_categorization],
|
81
|
+
auto_load_datasets: options[:auto_load_datasets],
|
82
|
+
dataset_config: dataset_config
|
55
83
|
)
|
56
84
|
|
85
|
+
if options[:verbose] && options[:auto_load_datasets]
|
86
|
+
puts "Client initialized with dataset loading enabled"
|
87
|
+
puts "Dataset statistics:"
|
88
|
+
puts " Dataset categories: #{client.count_of_dataset_categories}"
|
89
|
+
puts " Dataset hosts: #{client.count_of_dataset_hosts.to_s.reverse.gsub(/(\\d{3})(?=\\d)/, '\\1,').reverse}"
|
90
|
+
puts " Dataset data size: #{client.size_of_dataset_data.round(2)} MB" if client.respond_to?(:size_of_dataset_data)
|
91
|
+
end
|
92
|
+
|
57
93
|
puts "Exporting CSV data..." if options[:verbose]
|
58
94
|
|
59
95
|
result = client.export_csv_data(options[:output_path])
|
@@ -61,13 +97,14 @@ begin
|
|
61
97
|
puts "\n✅ Export completed successfully!"
|
62
98
|
puts "š Export directory: #{result[:export_directory]}"
|
63
99
|
puts "š CSV file: #{result[:csv_file]}"
|
64
|
-
puts "š
|
100
|
+
puts "š Summary file: #{result[:summary_file]}"
|
65
101
|
|
66
102
|
puts "\nš Data Summary:"
|
67
|
-
puts " Total
|
103
|
+
puts " Total entries: #{result[:total_entries]}"
|
104
|
+
puts " Domain categorizations: #{result[:summary][:domain_categorization_entries]}"
|
105
|
+
puts " Dataset content entries: #{result[:summary][:dataset_content_entries]}"
|
68
106
|
puts " Total categories: #{result[:summary][:total_categories]}"
|
69
|
-
puts "
|
70
|
-
puts " Blocklist categories: #{result[:summary][:blocklist_categories_count]}"
|
107
|
+
puts " Has dataset content: #{result[:summary][:has_dataset_content]}"
|
71
108
|
|
72
109
|
if options[:verbose]
|
73
110
|
puts "\n🏷️ Categories included:"
|