UrlCategorise 0.0.3 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.claude/settings.local.json +13 -0
- data/.github/workflows/ci.yml +57 -0
- data/CLAUDE.md +135 -0
- data/Gemfile.lock +83 -55
- data/README.md +516 -27
- data/Rakefile +2 -0
- data/docs/.keep +2 -0
- data/docs/v0.1-context.md +93 -0
- data/lib/url_categorise/active_record_client.rb +118 -0
- data/lib/url_categorise/client.rb +185 -17
- data/lib/url_categorise/constants.rb +64 -3
- data/lib/url_categorise/models.rb +105 -0
- data/lib/url_categorise/version.rb +1 -1
- data/lib/url_categorise.rb +11 -0
- data/url_categorise.gemspec +17 -9
- metadata +171 -27
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c6aea0bce1ffdcf4a3b8e951add06bbc936a44442f50773d84c1e4032cc5b208
|
4
|
+
data.tar.gz: '090e4d0fded41f907e9047b21b5787c647189f2ffeae1b2b04025038788ee800'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 389698a8563c2d65a19826ecb2ce63e898c490724581e0352974a7775ff5de4a0d062eba8b4c22a3a6205624788901f90aea7c10644517be354fcd70e2539606
|
7
|
+
data.tar.gz: 88afbabfd44b51f341466cd334410f01014b5dc3c26ad3b9cc0ec71a407b2c3020b64095907a593ae4213055f5da727c994b08a4a800979369b73855541bd9fc
|
data/.github/workflows/ci.yml
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
name: CI
|
2
|
+
|
3
|
+
on:
|
4
|
+
push:
|
5
|
+
branches: [ main, develop ]
|
6
|
+
pull_request:
|
7
|
+
branches: [ main, develop ]
|
8
|
+
|
9
|
+
jobs:
|
10
|
+
test:
|
11
|
+
runs-on: ubuntu-latest
|
12
|
+
strategy:
|
13
|
+
matrix:
|
14
|
+
ruby-version: ['3.0', '3.1', '3.2', '3.3', '3.4']
|
15
|
+
|
16
|
+
steps:
|
17
|
+
- uses: actions/checkout@v4
|
18
|
+
|
19
|
+
- name: Set up Ruby ${{ matrix.ruby-version }}
|
20
|
+
uses: ruby/setup-ruby@v1
|
21
|
+
with:
|
22
|
+
ruby-version: ${{ matrix.ruby-version }}
|
23
|
+
bundler-cache: true
|
24
|
+
|
25
|
+
- name: Install dependencies
|
26
|
+
run: bundle install
|
27
|
+
|
28
|
+
- name: Run tests
|
29
|
+
run: bundle exec rake test
|
30
|
+
|
31
|
+
- name: Run linter (if available)
|
32
|
+
run: bundle exec rubocop || true
|
33
|
+
continue-on-error: true
|
34
|
+
|
35
|
+
coverage:
|
36
|
+
runs-on: ubuntu-latest
|
37
|
+
steps:
|
38
|
+
- uses: actions/checkout@v4
|
39
|
+
|
40
|
+
- name: Set up Ruby
|
41
|
+
uses: ruby/setup-ruby@v1
|
42
|
+
with:
|
43
|
+
ruby-version: '3.4'
|
44
|
+
bundler-cache: true
|
45
|
+
|
46
|
+
- name: Install dependencies
|
47
|
+
run: bundle install
|
48
|
+
|
49
|
+
- name: Run tests with coverage
|
50
|
+
run: bundle exec rake test
|
51
|
+
env:
|
52
|
+
COVERAGE: true
|
53
|
+
|
54
|
+
- name: Upload coverage to Codecov
|
55
|
+
uses: codecov/codecov-action@v4
|
56
|
+
with:
|
57
|
+
fail_ci_if_error: false
|
data/CLAUDE.md
ADDED
@@ -0,0 +1,135 @@
|
|
1
|
+
# UrlCategorise Development Guidelines
|
2
|
+
|
3
|
+
## Overview
|
4
|
+
UrlCategorise is a Ruby gem for categorizing URLs and domains based on various security and content blocklists. It downloads and processes multiple types of lists to provide comprehensive domain categorization.
|
5
|
+
|
6
|
+
## Development Requirements
|
7
|
+
|
8
|
+
### Testing Standards
|
9
|
+
- **ALL new changes MUST include new tests**
|
10
|
+
- **Test coverage MUST be 90% or higher**
|
11
|
+
- **NEVER delete, skip, or environment-check tests to make them pass**
|
12
|
+
- **Tests MUST pass because the underlying code works correctly**
|
13
|
+
- Use minitest for all testing
|
14
|
+
- Use WebMock for HTTP request stubbing in tests
|
15
|
+
- Run tests with: `bundle exec rake test`
|
16
|
+
- SimpleCov integration is mandatory for coverage tracking
|
17
|
+
|
18
|
+
### Dependencies and Rails Support
|
19
|
+
- **MUST use the latest stable versions of gems**
|
20
|
+
- Ruby >= 3.0.0 (currently using 3.4+)
|
21
|
+
- **MUST use minitest and rake** for testing and build automation
|
22
|
+
- **Rails compatibility MUST support Rails 8.x** and current stable versions
|
23
|
+
- Dependencies are managed via Gemfile and gemspec
|
24
|
+
- ActiveRecord integration must be optional and backward compatible
|
25
|
+
|
26
|
+
#### Rails 8 Integration
|
27
|
+
- ActiveRecord models use `coder: JSON` for serialization (Rails 8 compatible)
|
28
|
+
- Migration version set to `ActiveRecord::Migration[8.0]`
|
29
|
+
- Optional database integration with automatic fallback to memory-based categorization
|
30
|
+
- Installation: Generate migration with `UrlCategorise::Models.generate_migration`
|
31
|
+
- Usage: Use `UrlCategorise::ActiveRecordClient` instead of `UrlCategorise::Client`
|
32
|
+
|
33
|
+
### Code Quality
|
34
|
+
- Follow Ruby best practices and conventions
|
35
|
+
- Use meaningful variable and method names
|
36
|
+
- Add appropriate error handling
|
37
|
+
- Ensure thread safety where applicable
|
38
|
+
|
39
|
+
### Supported List Formats
|
40
|
+
The gem supports multiple blocklist formats:
|
41
|
+
- Standard hosts files (0.0.0.0 domain.com)
|
42
|
+
- pfSense format
|
43
|
+
- AdSense lists
|
44
|
+
- uBlock Origin files
|
45
|
+
- dnsmasq format
|
46
|
+
- Plain text domain lists
|
47
|
+
|
48
|
+
### Category Management Guidelines
|
49
|
+
- **Category names MUST be human-readable and intuitive**
|
50
|
+
- **NEVER add combined/meta lists as categories** (e.g., hagezi_light, stevenblack_all)
|
51
|
+
- **First try to add new lists to existing categories** before creating new ones
|
52
|
+
- **Use descriptive names instead of provider prefixes**:
|
53
|
+
- ❌ Bad: `abuse_ch_feodo`, `dshield_block_list`, `botnet_c2`, `doh_vpn_proxy_bypass`
|
54
|
+
- ✅ Good: `banking_trojans`, `suspicious_domains`, `botnet_command_control`, `dns_over_https_bypass`
|
55
|
+
- **Logical category organization**:
|
56
|
+
- Security threats: `malware`, `phishing`, `ransomware`, `botnet_command_control`, `banking_trojans`
|
57
|
+
- Content filtering: `advertising`, `gambling`, `pornography`, `social_media`
|
58
|
+
- Network security: `suspicious_domains`, `threat_intelligence`, `dns_over_https_bypass`
|
59
|
+
- Geographic/specialized: `sanctions`, `newly_registered_domains`
|
60
|
+
- Content categories: `news`, `blogs`, `forums`, `educational`, `health`, `finance`
|
61
|
+
- Business categories: `business`, `technology`, `government`, `streaming`, `shopping`
|
62
|
+
|
63
|
+
### Required Category Name Fixes
|
64
|
+
The following categories need to be renamed for human readability:
|
65
|
+
- `abuse_ch_feodo` → `banking_trojans`
|
66
|
+
- `abuse_ch_malware_bazaar` → `malware_domains`
|
67
|
+
- `abuse_ch_ssl_blacklist` → `malicious_ssl_certificates`
|
68
|
+
- `abuse_ch_threat_fox` → `threat_indicators`
|
69
|
+
- `dshield_top_attackers` → `top_attack_sources`
|
70
|
+
- `dshield_block_list` → `suspicious_domains`
|
71
|
+
- `botnet_c2` → `botnet_command_control`
|
72
|
+
- `doh_vpn_proxy_bypass` → `dns_over_https_bypass`
|
73
|
+
|
74
|
+
### Core Features
|
75
|
+
- Domain/URL categorization
|
76
|
+
- Multiple list format parsing
|
77
|
+
- Hash-based file update detection
|
78
|
+
- Optional local file caching
|
79
|
+
- IP sanctions list checking
|
80
|
+
- DNS resolution for domain-to-IP mapping
|
81
|
+
- ActiveRecord/Rails integration (optional)
|
82
|
+
|
83
|
+
### Architecture
|
84
|
+
- `Client` class: Main interface for categorization
|
85
|
+
- `Constants` module: Contains default list URLs and categories
|
86
|
+
- Modular design allows extending with new list sources
|
87
|
+
- Support for custom list directories and caching
|
88
|
+
|
89
|
+
### List Sources
|
90
|
+
Primary sources include:
|
91
|
+
- The Block List Project
|
92
|
+
- hagezi/dns-blocklists
|
93
|
+
- StevenBlack/hosts
|
94
|
+
- Various specialized security lists
|
95
|
+
|
96
|
+
### Testing Guidelines
|
97
|
+
- Mock all HTTP requests using WebMock
|
98
|
+
- Test both success and failure scenarios
|
99
|
+
- Verify proper parsing of different list formats
|
100
|
+
- Test edge cases (empty responses, malformed data)
|
101
|
+
- Include integration tests for the full categorization flow
|
102
|
+
|
103
|
+
### Performance Considerations
|
104
|
+
- Implement efficient parsing for large lists
|
105
|
+
- Use appropriate data structures for fast lookups
|
106
|
+
- Consider memory usage with large datasets
|
107
|
+
- Provide options for selective list loading
|
108
|
+
|
109
|
+
### Configuration
|
110
|
+
- Allow custom list URLs
|
111
|
+
- Support for local file directories
|
112
|
+
- Configurable DNS servers for IP resolution
|
113
|
+
- Optional caching parameters
|
114
|
+
|
115
|
+
## Build and Release Process
|
116
|
+
1. Update version number in `lib/url_categorise/version.rb`
|
117
|
+
2. Update CHANGELOG.md with new features
|
118
|
+
3. Run full test suite: `bundle exec rake test`
|
119
|
+
4. Update documentation as needed
|
120
|
+
5. Build gem: `gem build url_categorise.gemspec`
|
121
|
+
6. Release: `gem push url_categorise-x.x.x.gem`
|
122
|
+
|
123
|
+
## Contributing
|
124
|
+
- Fork the repository
|
125
|
+
- Create a feature branch
|
126
|
+
- Add comprehensive tests for new functionality
|
127
|
+
- Ensure all tests pass
|
128
|
+
- Update documentation
|
129
|
+
- Submit a pull request
|
130
|
+
|
131
|
+
## CI/CD
|
132
|
+
- GitHub Actions workflow runs tests on multiple Ruby versions
|
133
|
+
- All tests must pass before merging
|
134
|
+
- Coverage reporting with Codecov integration
|
135
|
+
- Automated dependency updates where appropriate
|
data/Gemfile.lock
CHANGED
@@ -1,21 +1,27 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
UrlCategorise (0.0
|
5
|
-
api_pattern (
|
4
|
+
UrlCategorise (0.1.0)
|
5
|
+
api_pattern (>= 0.0.5, < 1.0)
|
6
|
+
csv (>= 3.3.0, < 4.0)
|
7
|
+
digest (>= 3.1.0, < 4.0)
|
8
|
+
fileutils (>= 1.7.0, < 2.0)
|
9
|
+
httparty (>= 0.22.0, < 1.0)
|
10
|
+
nokogiri (>= 1.16.0, < 2.0)
|
11
|
+
resolv (>= 0.4.0, < 1.0)
|
6
12
|
|
7
13
|
GEM
|
8
14
|
remote: https://rubygems.org/
|
9
15
|
specs:
|
10
|
-
actionpack (7.0.
|
11
|
-
actionview (= 7.0.
|
12
|
-
activesupport (= 7.0.
|
13
|
-
rack (~> 2.0, >= 2.2.
|
16
|
+
actionpack (7.0.8.7)
|
17
|
+
actionview (= 7.0.8.7)
|
18
|
+
activesupport (= 7.0.8.7)
|
19
|
+
rack (~> 2.0, >= 2.2.4)
|
14
20
|
rack-test (>= 0.6.3)
|
15
21
|
rails-dom-testing (~> 2.0)
|
16
22
|
rails-html-sanitizer (~> 1.0, >= 1.2.0)
|
17
|
-
actionview (7.0.
|
18
|
-
activesupport (= 7.0.
|
23
|
+
actionview (7.0.8.7)
|
24
|
+
activesupport (= 7.0.8.7)
|
19
25
|
builder (~> 3.1)
|
20
26
|
erubi (~> 1.4)
|
21
27
|
rails-dom-testing (~> 2.0)
|
@@ -24,89 +30,111 @@ GEM
|
|
24
30
|
actionpack (>= 3.0.2, < 7.1)
|
25
31
|
activemodel (>= 3.0.2, < 7.1)
|
26
32
|
activesupport (>= 3.0.2, < 7.1)
|
27
|
-
activemodel (7.0.
|
28
|
-
activesupport (= 7.0.
|
29
|
-
activesupport (7.0.
|
33
|
+
activemodel (7.0.8.7)
|
34
|
+
activesupport (= 7.0.8.7)
|
35
|
+
activesupport (7.0.8.7)
|
30
36
|
concurrent-ruby (~> 1.0, >= 1.0.2)
|
31
37
|
i18n (>= 1.6, < 2)
|
32
38
|
minitest (>= 5.1)
|
33
39
|
tzinfo (~> 2.0)
|
34
|
-
addressable (2.8.
|
35
|
-
public_suffix (>= 2.0.2, <
|
40
|
+
addressable (2.8.7)
|
41
|
+
public_suffix (>= 2.0.2, < 7.0)
|
36
42
|
ansi (1.5.0)
|
37
|
-
api_pattern (0.0.
|
43
|
+
api_pattern (0.0.5)
|
38
44
|
active_attr (~> 0.15.4)
|
39
|
-
|
40
|
-
|
41
|
-
|
45
|
+
csv (~> 3.3.0)
|
46
|
+
httparty (~> 0.22.0)
|
47
|
+
nokogiri (~> 1.16.0)
|
48
|
+
bigdecimal (3.2.2)
|
49
|
+
builder (3.3.0)
|
42
50
|
coderay (1.1.3)
|
43
|
-
concurrent-ruby (1.
|
44
|
-
crack (0.
|
51
|
+
concurrent-ruby (1.3.5)
|
52
|
+
crack (1.0.0)
|
53
|
+
bigdecimal
|
45
54
|
rexml
|
46
55
|
crass (1.0.6)
|
47
|
-
|
48
|
-
|
49
|
-
|
56
|
+
csv (3.3.5)
|
57
|
+
digest (3.2.0)
|
58
|
+
docile (1.4.1)
|
59
|
+
erubi (1.13.1)
|
60
|
+
fileutils (1.7.3)
|
61
|
+
hashdiff (1.2.0)
|
62
|
+
httparty (0.22.0)
|
63
|
+
csv
|
50
64
|
mini_mime (>= 1.0.0)
|
51
65
|
multi_xml (>= 0.5.2)
|
52
|
-
i18n (1.
|
66
|
+
i18n (1.14.7)
|
53
67
|
concurrent-ruby (~> 1.0)
|
54
|
-
loofah (2.
|
68
|
+
loofah (2.24.1)
|
55
69
|
crass (~> 1.0.2)
|
56
70
|
nokogiri (>= 1.12.0)
|
57
|
-
method_source (1.
|
58
|
-
mini_mime (1.1.
|
59
|
-
|
60
|
-
minitest
|
71
|
+
method_source (1.1.0)
|
72
|
+
mini_mime (1.1.5)
|
73
|
+
mini_portile2 (2.8.9)
|
74
|
+
minitest (5.25.5)
|
75
|
+
minitest-focus (1.4.0)
|
61
76
|
minitest (>= 4, < 6)
|
62
|
-
minitest-reporters (1.
|
77
|
+
minitest-reporters (1.7.1)
|
63
78
|
ansi
|
64
79
|
builder
|
65
80
|
minitest (>= 5.0)
|
66
81
|
ruby-progressbar
|
67
|
-
mocha (2.
|
82
|
+
mocha (2.4.5)
|
68
83
|
ruby2_keywords (>= 0.0.5)
|
69
|
-
multi_xml (0.
|
70
|
-
|
84
|
+
multi_xml (0.7.2)
|
85
|
+
bigdecimal (~> 3.1)
|
86
|
+
nokogiri (1.16.8)
|
87
|
+
mini_portile2 (~> 2.8.2)
|
71
88
|
racc (~> 1.4)
|
72
|
-
pry (0.
|
89
|
+
pry (0.15.2)
|
73
90
|
coderay (~> 1.1)
|
74
91
|
method_source (~> 1.0)
|
75
|
-
public_suffix (
|
76
|
-
racc (1.
|
77
|
-
rack (2.2.
|
78
|
-
rack-test (2.
|
92
|
+
public_suffix (6.0.2)
|
93
|
+
racc (1.8.1)
|
94
|
+
rack (2.2.17)
|
95
|
+
rack-test (2.2.0)
|
79
96
|
rack (>= 1.3)
|
80
|
-
rails-dom-testing (2.0
|
81
|
-
activesupport (>=
|
97
|
+
rails-dom-testing (2.3.0)
|
98
|
+
activesupport (>= 5.0.0)
|
99
|
+
minitest
|
82
100
|
nokogiri (>= 1.6)
|
83
|
-
rails-html-sanitizer (1.
|
84
|
-
loofah (~> 2.
|
85
|
-
|
86
|
-
|
101
|
+
rails-html-sanitizer (1.6.2)
|
102
|
+
loofah (~> 2.21)
|
103
|
+
nokogiri (>= 1.15.7, != 1.16.7, != 1.16.6, != 1.16.5, != 1.16.4, != 1.16.3, != 1.16.2, != 1.16.1, != 1.16.0.rc1, != 1.16.0)
|
104
|
+
rake (13.3.0)
|
105
|
+
resolv (0.6.2)
|
106
|
+
rexml (3.4.1)
|
87
107
|
ruby-progressbar (1.13.0)
|
88
108
|
ruby2_keywords (0.0.5)
|
89
|
-
|
109
|
+
simplecov (0.22.0)
|
110
|
+
docile (~> 1.1)
|
111
|
+
simplecov-html (~> 0.11)
|
112
|
+
simplecov_json_formatter (~> 0.1)
|
113
|
+
simplecov-html (0.13.2)
|
114
|
+
simplecov_json_formatter (0.1.4)
|
115
|
+
timecop (0.9.10)
|
90
116
|
tzinfo (2.0.6)
|
91
117
|
concurrent-ruby (~> 1.0)
|
92
|
-
webmock (3.
|
118
|
+
webmock (3.24.0)
|
93
119
|
addressable (>= 2.8.0)
|
94
120
|
crack (>= 0.3.2)
|
95
121
|
hashdiff (>= 0.4.0, < 2.0.0)
|
96
122
|
|
97
123
|
PLATFORMS
|
98
|
-
arm64-darwin-
|
124
|
+
arm64-darwin-24
|
125
|
+
ruby
|
99
126
|
|
100
127
|
DEPENDENCIES
|
101
128
|
UrlCategorise!
|
102
|
-
minitest (~> 5.
|
103
|
-
minitest-focus (~> 1.
|
104
|
-
minitest-reporters (~> 1.
|
105
|
-
mocha (~> 2.
|
106
|
-
pry (~> 0.
|
107
|
-
rake (~> 13.0
|
108
|
-
|
109
|
-
|
129
|
+
minitest (~> 5.25.5)
|
130
|
+
minitest-focus (~> 1.4.0)
|
131
|
+
minitest-reporters (~> 1.7.1)
|
132
|
+
mocha (~> 2.4.5)
|
133
|
+
pry (~> 0.15.2)
|
134
|
+
rake (~> 13.3.0)
|
135
|
+
simplecov (~> 0.22.0)
|
136
|
+
timecop (~> 0.9.10)
|
137
|
+
webmock (~> 3.24.0)
|
110
138
|
|
111
139
|
BUNDLED WITH
|
112
|
-
2.
|
140
|
+
2.6.7
|