UrlCategorise 0.1.3 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +7 -2
- data/.gitignore +1 -0
- data/CLAUDE.md +77 -2
- data/Gemfile.lock +13 -1
- data/README.md +332 -7
- data/bin/export_csv +83 -0
- data/bin/export_hosts +68 -0
- data/bin/rake +2 -0
- data/correct_usage_example.rb +64 -0
- data/docs/v0.1.4-features.md +215 -0
- data/lib/url_categorise/active_record_client.rb +1 -1
- data/lib/url_categorise/client.rb +431 -33
- data/lib/url_categorise/dataset_processor.rb +9 -4
- data/lib/url_categorise/iab_compliance.rb +147 -0
- data/lib/url_categorise/version.rb +1 -1
- data/lib/url_categorise.rb +2 -0
- data/url_categorise.gemspec +4 -2
- metadata +52 -3
@@ -0,0 +1,147 @@
|
|
1
|
+
module UrlCategorise
  # Translates UrlCategorise category names into IAB content-category codes.
  #
  # Two lookup tables are provided:
  #
  # * IAB_V2_MAPPINGS - "IABn" / "IABn-m" style codes (the category list used
  #   by OpenRTB 2.x, section 5.1 "Content Categories").
  # * IAB_V3_MAPPINGS - bare numeric codes, selected by default.
  #
  # Categories with no entry in the selected table map to the string 'Unknown'.
  module IabCompliance
    IAB_V2_MAPPINGS = {
      # Content Categories
      advertising: 'IAB3',              # Business (tier-1 parent of Advertising)
      automotive: 'IAB2',               # Automotive
      # Was 'IAB20', which is Travel and collided with :travel below.
      books_literature: 'IAB1-1',       # Books & Literature
      business: 'IAB3',                 # Business
      careers: 'IAB4',                  # Careers
      education: 'IAB5',                # Education
      entertainment: 'IAB1',            # Arts & Entertainment
      finance: 'IAB13',                 # Personal Finance
      food_drink: 'IAB8',               # Food & Drink
      health: 'IAB7',                   # Health & Fitness
      hobbies_interests: 'IAB9',        # Hobbies & Interests
      home_garden: 'IAB10',             # Home & Garden
      law_government: 'IAB11',          # Law, Government & Politics
      news: 'IAB12',                    # News
      parenting: 'IAB6',                # Family & Parenting
      pets: 'IAB16',                    # Pets
      # Was 'IAB21', which is Real Estate and collided with :real_estate below.
      philosophy: 'IAB23',              # Religion & Spirituality
      real_estate: 'IAB21',             # Real Estate
      science: 'IAB15',                 # Science
      shopping: 'IAB22',                # Shopping
      sports: 'IAB17',                  # Sports
      style_fashion: 'IAB18',           # Style & Fashion
      technology: 'IAB19',              # Technology & Computing
      travel: 'IAB20',                  # Travel

      # Security & Malware Categories
      malware: 'IAB25',                 # Non-Standard Content (custom extension)
      phishing: 'IAB25',                # Non-Standard Content (custom extension)
      # NOTE(review): 'IAB7-*' codes are Health & Fitness subcategories (see
      # :health above) - confirm this is really the intended code for gambling.
      gambling: 'IAB7-39',              # Gambling
      pornography: 'IAB25-3',           # Pornography
      violence: 'IAB25',                # Non-Standard Content (custom extension)
      illegal: 'IAB25',                 # Non-Standard Content (custom extension)

      # Network & Security
      botnet_command_control: 'IAB25',  # Non-Standard Content (custom extension)
      threat_intelligence: 'IAB25',     # Non-Standard Content (custom extension)
      suspicious_domains: 'IAB25',      # Non-Standard Content (custom extension)
      compromised_ips: 'IAB25',         # Non-Standard Content (custom extension)
      tor_exit_nodes: 'IAB25',          # Non-Standard Content (custom extension)

      # Social & Media
      social_media: 'IAB14',            # Society
      # Was 'IAB1-2' (Celebrity Fan/Gossip); Music is IAB1-6.
      streaming: 'IAB1-6',              # Music
      blogs: 'IAB14',                   # Society
      forums: 'IAB19',                  # Technology & Computing

      # Geographic/Language Specific
      chinese_ad_hosts: 'IAB3',         # Business (tier-1 parent of Advertising)
      korean_ad_hosts: 'IAB3',          # Business (tier-1 parent of Advertising)
      mobile_ads: 'IAB3',               # Business (tier-1 parent of Advertising)
      smart_tv_ads: 'IAB3',             # Business (tier-1 parent of Advertising)

      # Specialized
      newly_registered_domains: 'IAB25', # Non-Standard Content (custom extension)
      dns_over_https_bypass: 'IAB25',    # Non-Standard Content (custom extension)
      sanctions_ips: 'IAB25',            # Non-Standard Content (custom extension)
      cryptojacking: 'IAB25',            # Non-Standard Content (custom extension)
      phishing_extended: 'IAB25'         # Non-Standard Content (custom extension)
    }.freeze

    # NOTE(review): apart from '626', these values are the original v2 codes
    # with the 'IAB' prefix removed, so :books_literature/:travel and
    # :philosophy/:real_estate still share a code here. Verify every entry
    # against the official IAB Content Taxonomy 3.0 unique IDs before relying
    # on it; the values are intentionally left untouched in this table.
    IAB_V3_MAPPINGS = {
      # Tier-1 Categories (IAB Content Taxonomy 3.0)
      advertising: '3',                 # Advertising
      automotive: '2',                  # Automotive
      books_literature: '20',           # Books & Literature
      business: '3',                    # Business
      careers: '4',                     # Careers
      education: '5',                   # Education
      entertainment: '1',               # Arts & Entertainment
      finance: '13',                    # Personal Finance
      food_drink: '8',                  # Food & Drink
      health: '7',                      # Health & Fitness & Wellness
      hobbies_interests: '9',           # Hobbies & Interests
      home_garden: '10',                # Home & Garden
      law_government: '11',             # Law, Government & Politics
      news: '12',                       # News & Politics
      parenting: '6',                   # Family & Parenting
      pets: '16',                       # Pets
      philosophy: '21',                 # Philosophy/Religion & Spirituality
      real_estate: '21',                # Real Estate
      science: '15',                    # Science
      shopping: '22',                   # Shopping
      sports: '17',                     # Sports
      style_fashion: '18',              # Style & Fashion
      technology: '19',                 # Technology & Computing
      travel: '20',                     # Travel

      # Security & Malware Categories (Custom extensions)
      malware: '626',                   # Illegal Content (custom mapping)
      phishing: '626',                  # Illegal Content (custom mapping)
      gambling: '7-39',                 # Gambling (subcategory)
      pornography: '626',               # Adult Content
      violence: '626',                  # Illegal Content (custom mapping)
      illegal: '626',                   # Illegal Content

      # Network & Security (Custom extensions)
      botnet_command_control: '626',    # Illegal Content (custom mapping)
      threat_intelligence: '626',       # Illegal Content (custom mapping)
      suspicious_domains: '626',        # Illegal Content (custom mapping)
      compromised_ips: '626',           # Illegal Content (custom mapping)
      tor_exit_nodes: '626',            # Illegal Content (custom mapping)

      # Social & Media
      social_media: '14',               # Society
      streaming: '1-2',                 # Music & Audio
      blogs: '14',                      # Society
      forums: '19',                     # Technology & Computing

      # Geographic/Language Specific
      chinese_ad_hosts: '3',            # Advertising
      korean_ad_hosts: '3',             # Advertising
      mobile_ads: '3',                  # Advertising
      smart_tv_ads: '3',                # Advertising

      # Specialized
      newly_registered_domains: '626',  # Illegal Content (custom mapping)
      dns_over_https_bypass: '626',     # Illegal Content (custom mapping)
      sanctions_ips: '626',             # Illegal Content (custom mapping)
      cryptojacking: '626',             # Illegal Content (custom mapping)
      phishing_extended: '626'          # Illegal Content (custom mapping)
    }.freeze

    # Looks up the IAB code for a single category.
    #
    # @param category [Symbol, String, nil] category name, e.g. :malware
    # @param version [Symbol, String] :v2 (or 'v2') selects IAB_V2_MAPPINGS;
    #   any other value selects IAB_V3_MAPPINGS
    # @return [String] the mapped code, or 'Unknown' when the category has no
    #   entry or cannot be converted to a Symbol (nil, numbers, ...)
    def self.map_category_to_iab(category, version = :v3)
      key = category_key(category)
      return 'Unknown' if key.nil?

      mappings_for(version).fetch(key, 'Unknown')
    end

    # Maps several categories at once and removes duplicate codes.
    #
    # @param categories [Enumerable, Symbol, String, nil] categories to map;
    #   nil is treated as an empty list and a single value as a one-item list
    # @param version [Symbol, String] see .map_category_to_iab
    # @return [Array<String>] unique codes in first-seen order
    def self.get_iab_categories(categories, version = :v3)
      Array(categories).map { |cat| map_category_to_iab(cat, version) }.uniq
    end

    # @return [Array<Symbol>] the version identifiers that have a mapping table
    def self.supported_versions
      %i[v2 v3]
    end

    # Reports whether a category has an entry in the selected mapping table.
    #
    # @param category [Symbol, String, nil] category name
    # @param version [Symbol, String] see .map_category_to_iab
    # @return [Boolean] false for unmapped or non-symbolizable categories
    def self.category_exists?(category, version = :v3)
      key = category_key(category)
      !key.nil? && mappings_for(version).key?(key)
    end

    # Picks the mapping table for a version. Strings are accepted so that
    # 'v2' is not silently treated as v3; anything that is not v2 keeps the
    # original fallback to the v3 table.
    def self.mappings_for(version)
      v2_requested = version.respond_to?(:to_sym) && version.to_sym == :v2
      v2_requested ? IAB_V2_MAPPINGS : IAB_V3_MAPPINGS
    end
    private_class_method :mappings_for

    # Converts a category to its Symbol key, or nil when it cannot be
    # symbolized (avoids NoMethodError on nil and other non-string input).
    def self.category_key(category)
      category.respond_to?(:to_sym) ? category.to_sym : nil
    end
    private_class_method :category_key
  end
end
|
data/lib/url_categorise.rb
CHANGED
@@ -3,12 +3,14 @@ require 'nokogiri'
|
|
3
3
|
require 'digest'
|
4
4
|
require 'fileutils'
|
5
5
|
require 'resolv'
|
6
|
+
require 'active_attr'
|
6
7
|
|
7
8
|
require 'api-pattern'
|
8
9
|
|
9
10
|
require 'url_categorise/version'
|
10
11
|
require 'url_categorise/constants'
|
11
12
|
require 'url_categorise/dataset_processor'
|
13
|
+
require 'url_categorise/iab_compliance'
|
12
14
|
|
13
15
|
require 'url_categorise/client'
|
14
16
|
|
data/url_categorise.gemspec
CHANGED
@@ -18,17 +18,19 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.files = Dir.chdir(File.expand_path(__dir__)) do
|
19
19
|
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
20
20
|
end
|
21
|
-
spec.bindir = '
|
22
|
-
spec.executables =
|
21
|
+
spec.bindir = 'bin'
|
22
|
+
spec.executables = ['export_hosts', 'export_csv', 'check_lists']
|
23
23
|
spec.require_paths = ['lib']
|
24
24
|
spec.required_ruby_version = '>= 3.0.0'
|
25
25
|
|
26
|
+
spec.add_dependency 'active_attr', '>= 0.17.1', '< 1.0'
|
26
27
|
spec.add_dependency 'api_pattern', '>= 0.0.6', '< 1.0'
|
27
28
|
spec.add_dependency 'csv', '>= 3.3.0', '< 4.0'
|
28
29
|
spec.add_dependency 'digest', '>= 3.1.0', '< 4.0'
|
29
30
|
spec.add_dependency 'fileutils', '>= 1.7.0', '< 2.0'
|
30
31
|
spec.add_dependency 'httparty', '>= 0.22.0', '< 1.0'
|
31
32
|
spec.add_dependency 'json', '>= 2.7.0', '< 3.0'
|
33
|
+
spec.add_dependency 'kaggle', '>= 0.0.3', '< 1.0'
|
32
34
|
spec.add_dependency 'nokogiri', '>= 1.18.9', '< 2.0'
|
33
35
|
spec.add_dependency 'resolv', '>= 0.4.0', '< 1.0'
|
34
36
|
spec.add_dependency 'rubyzip', '>= 2.3.0', '< 3.0'
|
metadata
CHANGED
@@ -1,14 +1,34 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: UrlCategorise
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- trex22
|
8
|
-
bindir:
|
8
|
+
bindir: bin
|
9
9
|
cert_chain: []
|
10
10
|
date: 1980-01-02 00:00:00.000000000 Z
|
11
11
|
dependencies:
|
12
|
+
- !ruby/object:Gem::Dependency
|
13
|
+
name: active_attr
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
15
|
+
requirements:
|
16
|
+
- - ">="
|
17
|
+
- !ruby/object:Gem::Version
|
18
|
+
version: 0.17.1
|
19
|
+
- - "<"
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '1.0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
version: 0.17.1
|
29
|
+
- - "<"
|
30
|
+
- !ruby/object:Gem::Version
|
31
|
+
version: '1.0'
|
12
32
|
- !ruby/object:Gem::Dependency
|
13
33
|
name: api_pattern
|
14
34
|
requirement: !ruby/object:Gem::Requirement
|
@@ -129,6 +149,26 @@ dependencies:
|
|
129
149
|
- - "<"
|
130
150
|
- !ruby/object:Gem::Version
|
131
151
|
version: '3.0'
|
152
|
+
- !ruby/object:Gem::Dependency
|
153
|
+
name: kaggle
|
154
|
+
requirement: !ruby/object:Gem::Requirement
|
155
|
+
requirements:
|
156
|
+
- - ">="
|
157
|
+
- !ruby/object:Gem::Version
|
158
|
+
version: 0.0.3
|
159
|
+
- - "<"
|
160
|
+
- !ruby/object:Gem::Version
|
161
|
+
version: '1.0'
|
162
|
+
type: :runtime
|
163
|
+
prerelease: false
|
164
|
+
version_requirements: !ruby/object:Gem::Requirement
|
165
|
+
requirements:
|
166
|
+
- - ">="
|
167
|
+
- !ruby/object:Gem::Version
|
168
|
+
version: 0.0.3
|
169
|
+
- - "<"
|
170
|
+
- !ruby/object:Gem::Version
|
171
|
+
version: '1.0'
|
132
172
|
- !ruby/object:Gem::Dependency
|
133
173
|
name: nokogiri
|
134
174
|
requirement: !ruby/object:Gem::Requirement
|
@@ -361,7 +401,10 @@ description: A client for using the UrlCategorise API in Ruby. Built from their
|
|
361
401
|
documentation. This is an unofficial project.
|
362
402
|
email:
|
363
403
|
- contact@jasonchalom.com
|
364
|
-
executables:
|
404
|
+
executables:
|
405
|
+
- check_lists
|
406
|
+
- export_csv
|
407
|
+
- export_hosts
|
365
408
|
extensions: []
|
366
409
|
extra_rdoc_files: []
|
367
410
|
files:
|
@@ -378,14 +421,20 @@ files:
|
|
378
421
|
- Rakefile
|
379
422
|
- bin/check_lists
|
380
423
|
- bin/console
|
424
|
+
- bin/export_csv
|
425
|
+
- bin/export_hosts
|
426
|
+
- bin/rake
|
381
427
|
- bin/setup
|
428
|
+
- correct_usage_example.rb
|
382
429
|
- docs/.keep
|
383
430
|
- docs/v0.1-context.md
|
431
|
+
- docs/v0.1.4-features.md
|
384
432
|
- lib/url_categorise.rb
|
385
433
|
- lib/url_categorise/active_record_client.rb
|
386
434
|
- lib/url_categorise/client.rb
|
387
435
|
- lib/url_categorise/constants.rb
|
388
436
|
- lib/url_categorise/dataset_processor.rb
|
437
|
+
- lib/url_categorise/iab_compliance.rb
|
389
438
|
- lib/url_categorise/models.rb
|
390
439
|
- lib/url_categorise/version.rb
|
391
440
|
- url_categorise.gemspec
|