domain_extractor 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +145 -0
- data/README.md +237 -0
- data/lib/domain_extractor/formatter.rb +105 -0
- data/lib/domain_extractor/version.rb +1 -1
- data/lib/domain_extractor.rb +27 -0
- data/spec/formatter_spec.rb +299 -0
- metadata +3 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: bb9ff9b765f3037fb6a2f0af330ecf415c76dde8b59aea7c362e353460d20049
|
|
4
|
+
data.tar.gz: 3349872d55a4a6252a1886b69eacc33743af7b6a8d74be1fb142842884cb41e7
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 54d64c3c9b3cf04ac2405c86f563cea1d22d0e37491caee3de5e0a6ab569686ce57b4a0efa9b23729bb6efe5a2eda25b5421c1a0188dd241348f4dd2f0663540
|
|
7
|
+
data.tar.gz: ee15d47829741ac24ebcc13621a62e83f421f6dd2572f0abafeaca2fbac12da02b72bec9350aeb2f375da71de13518a82856dc5af049fb813952509ed9b5e94f
|
data/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,151 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.2.7] - 2025-11-09
|
|
9
|
+
|
|
10
|
+
### Added - URL Formatting API
|
|
11
|
+
|
|
12
|
+
Added a comprehensive `format` method for programmatic URL normalization and transformation. The formatter provides precise control over URL structure, protocol, and formatting while maintaining the same validation modes as the Rails validator.
|
|
13
|
+
|
|
14
|
+
#### Features
|
|
15
|
+
|
|
16
|
+
**Core Method:**
|
|
17
|
+
- `DomainExtractor.format(url, **options)` - Format and normalize URLs based on specified options
|
|
18
|
+
- Returns formatted URL string or `nil` for invalid input
|
|
19
|
+
- Strips paths and query parameters from URLs
|
|
20
|
+
- Supports all validation modes from the Rails validator
|
|
21
|
+
|
|
22
|
+
**Validation Modes:**
|
|
23
|
+
- `:standard` (default) - Preserves full host as-is while normalizing protocol/slashes
|
|
24
|
+
- `:root_domain` - Strips all subdomains, returns only root domain
|
|
25
|
+
- `:root_or_custom_subdomain` - Preserves custom subdomains but removes 'www'
|
|
26
|
+
|
|
27
|
+
**Formatting Options:**
|
|
28
|
+
- `use_protocol` (default: `true`) - Include/exclude protocol in output
|
|
29
|
+
- `use_https` (default: `true`) - Use HTTPS vs HTTP (only when `use_protocol` is true)
|
|
30
|
+
- `use_trailing_slash` (default: `false`) - Add/remove trailing slash from output
|
|
31
|
+
|
|
32
|
+
#### Usage Examples
|
|
33
|
+
|
|
34
|
+
**Basic Formatting:**
|
|
35
|
+
```ruby
|
|
36
|
+
# Remove trailing slash (default)
|
|
37
|
+
DomainExtractor.format('https://example.com/')
|
|
38
|
+
# => 'https://example.com'
|
|
39
|
+
|
|
40
|
+
# Strip paths and query parameters
|
|
41
|
+
DomainExtractor.format('https://example.com/path?query=value')
|
|
42
|
+
# => 'https://example.com'
|
|
43
|
+
|
|
44
|
+
# Normalize to HTTPS
|
|
45
|
+
DomainExtractor.format('http://example.com')
|
|
46
|
+
# => 'https://example.com'
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
**Validation Modes:**
|
|
50
|
+
```ruby
|
|
51
|
+
# Root domain only (strips subdomains)
|
|
52
|
+
DomainExtractor.format('https://shop.example.com', validation: :root_domain)
|
|
53
|
+
# => 'https://example.com'
|
|
54
|
+
|
|
55
|
+
# Strip www but keep custom subdomains
|
|
56
|
+
DomainExtractor.format('https://www.example.com', validation: :root_or_custom_subdomain)
|
|
57
|
+
# => 'https://example.com'
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
**Protocol Control:**
|
|
61
|
+
```ruby
|
|
62
|
+
# Without protocol
|
|
63
|
+
DomainExtractor.format('https://example.com', use_protocol: false)
|
|
64
|
+
# => 'example.com'
|
|
65
|
+
|
|
66
|
+
# Force HTTP instead of HTTPS
|
|
67
|
+
DomainExtractor.format('https://example.com', use_https: false)
|
|
68
|
+
# => 'http://example.com'
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
**Trailing Slash Control:**
|
|
72
|
+
```ruby
|
|
73
|
+
# Add trailing slash
|
|
74
|
+
DomainExtractor.format('https://example.com', use_trailing_slash: true)
|
|
75
|
+
# => 'https://example.com/'
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
**Combined Options:**
|
|
79
|
+
```ruby
|
|
80
|
+
# Root domain, no protocol, with trailing slash
|
|
81
|
+
DomainExtractor.format('https://shop.example.com/path',
|
|
82
|
+
validation: :root_domain,
|
|
83
|
+
use_protocol: false,
|
|
84
|
+
use_trailing_slash: true)
|
|
85
|
+
# => 'example.com/'
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
#### Real-World Use Cases
|
|
89
|
+
|
|
90
|
+
**Canonical URL Generation:**
|
|
91
|
+
```ruby
|
|
92
|
+
def canonical_url(url)
|
|
93
|
+
DomainExtractor.format(url,
|
|
94
|
+
validation: :root_or_custom_subdomain,
|
|
95
|
+
use_https: true,
|
|
96
|
+
use_trailing_slash: false)
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
canonical_url('http://www.example.com/') # => 'https://example.com'
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
**Domain Normalization for Allowlists:**
|
|
103
|
+
```ruby
|
|
104
|
+
def normalize_domain(url)
|
|
105
|
+
DomainExtractor.format(url, validation: :root_domain, use_protocol: false)
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
normalize_domain('https://shop.example.com/path') # => 'example.com'
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
**Multi-Tenant URL Standardization:**
|
|
112
|
+
```ruby
|
|
113
|
+
class Tenant < ApplicationRecord
|
|
114
|
+
before_validation :normalize_custom_domain
|
|
115
|
+
|
|
116
|
+
private
|
|
117
|
+
|
|
118
|
+
def normalize_custom_domain
|
|
119
|
+
return if custom_domain.blank?
|
|
120
|
+
|
|
121
|
+
self.custom_domain = DomainExtractor.format(
|
|
122
|
+
custom_domain,
|
|
123
|
+
validation: :root_or_custom_subdomain,
|
|
124
|
+
use_https: true,
|
|
125
|
+
use_trailing_slash: false
|
|
126
|
+
)
|
|
127
|
+
end
|
|
128
|
+
end
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
#### Implementation Details
|
|
132
|
+
|
|
133
|
+
- **Performance**: Leverages existing DomainExtractor parsing engine with minimal overhead
|
|
134
|
+
- **Nil-safe**: Returns `nil` for invalid URLs instead of raising exceptions
|
|
135
|
+
- **Consistent API**: Uses same option names and validation modes as Rails validator
|
|
136
|
+
- **Path/Query Stripping**: Automatically removes paths and query parameters
|
|
137
|
+
- **Multi-part TLD Support**: Correctly handles complex TLDs like `.co.uk`, `.com.au`
|
|
138
|
+
|
|
139
|
+
#### Code Quality
|
|
140
|
+
|
|
141
|
+
- **49 comprehensive test cases** covering all formatting modes and options
|
|
142
|
+
- **RuboCop clean** with zero offenses
|
|
143
|
+
- **100% test coverage** maintained across entire gem (200 total tests)
|
|
144
|
+
- **Well-documented** with extensive README section and real-world examples
|
|
145
|
+
|
|
146
|
+
#### Documentation
|
|
147
|
+
|
|
148
|
+
- Added comprehensive **URL Formatting** section to README.md
|
|
149
|
+
- Includes examples for all validation modes and options
|
|
150
|
+
- Real-world use cases: canonical URLs, domain normalization, multi-tenant standardization
|
|
151
|
+
- Clear API reference with all available options
|
|
152
|
+
|
|
8
153
|
## [0.2.6] - 2025-11-09
|
|
9
154
|
|
|
10
155
|
### Fixed - Rails Validator Registration
|
data/README.md
CHANGED
|
@@ -13,6 +13,8 @@ Use **DomainExtractor** whenever you need a dependable tld parser for tricky mul
|
|
|
13
13
|
✅ **Accurate Multi-part TLD Parser** - Handles complex multi-part TLDs (co.uk, com.au, gov.br) using the [Public Suffix List](https://publicsuffix.org/)
|
|
14
14
|
✅ **Nested Subdomain Extraction** - Correctly parses multi-level subdomains (api.staging.example.com)
|
|
15
15
|
✅ **Smart URL Normalization** - Automatically handles URLs with or without schemes
|
|
16
|
+
✅ **Powerful URL Formatting** - Transform and standardize URLs with flexible options
|
|
17
|
+
✅ **Rails Integration** - Custom ActiveModel validator for declarative URL validation
|
|
16
18
|
✅ **Query Parameter Parsing** - Parse query strings into structured hashes
|
|
17
19
|
✅ **Batch Processing** - Parse multiple URLs efficiently
|
|
18
20
|
✅ **IP Address Detection** - Identifies and handles IPv4 and IPv6 addresses
|
|
@@ -355,6 +357,241 @@ DomainExtractor.parse_query_params(query_string)
|
|
|
355
357
|
# Returns: Hash of query parameters
|
|
356
358
|
```
|
|
357
359
|
|
|
360
|
+
```ruby
|
|
361
|
+
DomainExtractor.format(url_string, **options)
|
|
362
|
+
|
|
363
|
+
# => Formats a URL according to the specified options.
|
|
364
|
+
|
|
365
|
+
# Returns: Formatted URL string or nil if invalid
|
|
366
|
+
# Options:
|
|
367
|
+
# :validation (:standard, :root_domain, :root_or_custom_subdomain)
|
|
368
|
+
# :use_protocol (true/false)
|
|
369
|
+
# :use_https (true/false)
|
|
370
|
+
# :use_trailing_slash (true/false)
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
## URL Formatting
|
|
374
|
+
|
|
375
|
+
DomainExtractor provides powerful URL formatting capabilities to normalize, transform, and standardize URLs according to your application's requirements.
|
|
376
|
+
|
|
377
|
+
### Basic Formatting
|
|
378
|
+
|
|
379
|
+
```ruby
|
|
380
|
+
# Remove trailing slash (default)
|
|
381
|
+
DomainExtractor.format('https://example.com/')
|
|
382
|
+
# => 'https://example.com'
|
|
383
|
+
|
|
384
|
+
# Strip paths and query parameters
|
|
385
|
+
DomainExtractor.format('https://example.com/path?query=value')
|
|
386
|
+
# => 'https://example.com'
|
|
387
|
+
|
|
388
|
+
# Normalize to HTTPS
|
|
389
|
+
DomainExtractor.format('http://example.com')
|
|
390
|
+
# => 'https://example.com'
|
|
391
|
+
```
|
|
392
|
+
|
|
393
|
+
### Validation Modes
|
|
394
|
+
|
|
395
|
+
#### Standard Mode (Default)
|
|
396
|
+
|
|
397
|
+
Preserves the full host as-is while normalizing protocol and trailing slashes.
|
|
398
|
+
|
|
399
|
+
```ruby
|
|
400
|
+
DomainExtractor.format('https://shop.example.com')
|
|
401
|
+
# => 'https://shop.example.com'
|
|
402
|
+
|
|
403
|
+
DomainExtractor.format('https://www.example.com/')
|
|
404
|
+
# => 'https://www.example.com'
|
|
405
|
+
|
|
406
|
+
DomainExtractor.format('https://api.staging.example.com')
|
|
407
|
+
# => 'https://api.staging.example.com'
|
|
408
|
+
```
|
|
409
|
+
|
|
410
|
+
#### Root Domain Mode
|
|
411
|
+
|
|
412
|
+
Strips all subdomains and returns only the root domain.
|
|
413
|
+
|
|
414
|
+
```ruby
|
|
415
|
+
DomainExtractor.format('https://shop.example.com', validation: :root_domain)
|
|
416
|
+
# => 'https://example.com'
|
|
417
|
+
|
|
418
|
+
DomainExtractor.format('https://www.example.com/', validation: :root_domain)
|
|
419
|
+
# => 'https://example.com'
|
|
420
|
+
|
|
421
|
+
DomainExtractor.format('https://api.staging.example.com', validation: :root_domain)
|
|
422
|
+
# => 'https://example.com'
|
|
423
|
+
|
|
424
|
+
# Works with multi-part TLDs
|
|
425
|
+
DomainExtractor.format('https://shop.example.co.uk', validation: :root_domain)
|
|
426
|
+
# => 'https://example.co.uk'
|
|
427
|
+
```
|
|
428
|
+
|
|
429
|
+
#### Root or Custom Subdomain Mode
|
|
430
|
+
|
|
431
|
+
Preserves custom subdomains but specifically removes the 'www' subdomain.
|
|
432
|
+
|
|
433
|
+
```ruby
|
|
434
|
+
DomainExtractor.format('https://example.com', validation: :root_or_custom_subdomain)
|
|
435
|
+
# => 'https://example.com'
|
|
436
|
+
|
|
437
|
+
DomainExtractor.format('https://shop.example.com', validation: :root_or_custom_subdomain)
|
|
438
|
+
# => 'https://shop.example.com'
|
|
439
|
+
|
|
440
|
+
# Strips www subdomain
|
|
441
|
+
DomainExtractor.format('https://www.example.com', validation: :root_or_custom_subdomain)
|
|
442
|
+
# => 'https://example.com'
|
|
443
|
+
|
|
444
|
+
DomainExtractor.format('https://api.example.com', validation: :root_or_custom_subdomain)
|
|
445
|
+
# => 'https://api.example.com'
|
|
446
|
+
```
|
|
447
|
+
|
|
448
|
+
### Protocol Options
|
|
449
|
+
|
|
450
|
+
#### Without Protocol
|
|
451
|
+
|
|
452
|
+
Remove the protocol entirely from the output.
|
|
453
|
+
|
|
454
|
+
```ruby
|
|
455
|
+
DomainExtractor.format('https://example.com', use_protocol: false)
|
|
456
|
+
# => 'example.com'
|
|
457
|
+
|
|
458
|
+
DomainExtractor.format('https://shop.example.com', use_protocol: false)
|
|
459
|
+
# => 'shop.example.com'
|
|
460
|
+
|
|
461
|
+
# Combine with root_domain
|
|
462
|
+
DomainExtractor.format('https://shop.example.com',
|
|
463
|
+
validation: :root_domain,
|
|
464
|
+
use_protocol: false)
|
|
465
|
+
# => 'example.com'
|
|
466
|
+
```
|
|
467
|
+
|
|
468
|
+
#### HTTP vs HTTPS
|
|
469
|
+
|
|
470
|
+
Control which protocol to use in the output.
|
|
471
|
+
|
|
472
|
+
```ruby
|
|
473
|
+
# Default: use HTTPS
|
|
474
|
+
DomainExtractor.format('http://example.com')
|
|
475
|
+
# => 'https://example.com'
|
|
476
|
+
|
|
477
|
+
# Allow HTTP
|
|
478
|
+
DomainExtractor.format('https://example.com', use_https: false)
|
|
479
|
+
# => 'http://example.com'
|
|
480
|
+
|
|
481
|
+
DomainExtractor.format('http://example.com', use_https: false)
|
|
482
|
+
# => 'http://example.com'
|
|
483
|
+
```
|
|
484
|
+
|
|
485
|
+
### Trailing Slash Options
|
|
486
|
+
|
|
487
|
+
```ruby
|
|
488
|
+
# Remove trailing slash (default)
|
|
489
|
+
DomainExtractor.format('https://example.com/')
|
|
490
|
+
# => 'https://example.com'
|
|
491
|
+
|
|
492
|
+
# Add trailing slash
|
|
493
|
+
DomainExtractor.format('https://example.com', use_trailing_slash: true)
|
|
494
|
+
# => 'https://example.com/'
|
|
495
|
+
|
|
496
|
+
DomainExtractor.format('https://example.com/', use_trailing_slash: true)
|
|
497
|
+
# => 'https://example.com/'
|
|
498
|
+
|
|
499
|
+
# Works with other options
|
|
500
|
+
DomainExtractor.format('https://shop.example.com',
|
|
501
|
+
validation: :root_domain,
|
|
502
|
+
use_trailing_slash: true)
|
|
503
|
+
# => 'https://example.com/'
|
|
504
|
+
```
|
|
505
|
+
|
|
506
|
+
### Combined Options
|
|
507
|
+
|
|
508
|
+
Mix and match options for precise URL formatting:
|
|
509
|
+
|
|
510
|
+
```ruby
|
|
511
|
+
# Root domain, no protocol, with trailing slash
|
|
512
|
+
DomainExtractor.format('https://shop.example.com/path',
|
|
513
|
+
validation: :root_domain,
|
|
514
|
+
use_protocol: false,
|
|
515
|
+
use_trailing_slash: true)
|
|
516
|
+
# => 'example.com/'
|
|
517
|
+
|
|
518
|
+
# Strip www, use HTTP, with trailing slash
|
|
519
|
+
DomainExtractor.format('https://www.example.com',
|
|
520
|
+
validation: :root_or_custom_subdomain,
|
|
521
|
+
use_https: false,
|
|
522
|
+
use_trailing_slash: true)
|
|
523
|
+
# => 'http://example.com/'
|
|
524
|
+
|
|
525
|
+
# Standard mode, no protocol, with trailing slash
|
|
526
|
+
DomainExtractor.format('https://api.example.com',
|
|
527
|
+
use_protocol: false,
|
|
528
|
+
use_trailing_slash: true)
|
|
529
|
+
# => 'api.example.com/'
|
|
530
|
+
```
|
|
531
|
+
|
|
532
|
+
### Real-World Use Cases
|
|
533
|
+
|
|
534
|
+
#### Canonical URL Generation
|
|
535
|
+
|
|
536
|
+
```ruby
|
|
537
|
+
def canonical_url(url)
|
|
538
|
+
DomainExtractor.format(url,
|
|
539
|
+
validation: :root_or_custom_subdomain,
|
|
540
|
+
use_https: true,
|
|
541
|
+
use_trailing_slash: false)
|
|
542
|
+
end
|
|
543
|
+
|
|
544
|
+
canonical_url('http://www.example.com/') # => 'https://example.com'
|
|
545
|
+
canonical_url('https://shop.example.com/') # => 'https://shop.example.com'
|
|
546
|
+
```
|
|
547
|
+
|
|
548
|
+
#### Domain Normalization for Allowlists
|
|
549
|
+
|
|
550
|
+
```ruby
|
|
551
|
+
def normalize_domain_for_allowlist(url)
|
|
552
|
+
DomainExtractor.format(url,
|
|
553
|
+
validation: :root_domain,
|
|
554
|
+
use_protocol: false)
|
|
555
|
+
end
|
|
556
|
+
|
|
557
|
+
normalize_domain_for_allowlist('https://shop.example.com/path') # => 'example.com'
|
|
558
|
+
normalize_domain_for_allowlist('http://www.example.com') # => 'example.com'
|
|
559
|
+
```
|
|
560
|
+
|
|
561
|
+
#### Multi-Tenant URL Standardization
|
|
562
|
+
|
|
563
|
+
```ruby
|
|
564
|
+
class Tenant < ApplicationRecord
|
|
565
|
+
before_validation :normalize_custom_domain
|
|
566
|
+
|
|
567
|
+
private
|
|
568
|
+
|
|
569
|
+
def normalize_custom_domain
|
|
570
|
+
return if custom_domain.blank?
|
|
571
|
+
|
|
572
|
+
self.custom_domain = DomainExtractor.format(
|
|
573
|
+
custom_domain,
|
|
574
|
+
validation: :root_or_custom_subdomain,
|
|
575
|
+
use_https: true,
|
|
576
|
+
use_trailing_slash: false
|
|
577
|
+
)
|
|
578
|
+
end
|
|
579
|
+
end
|
|
580
|
+
```
|
|
581
|
+
|
|
582
|
+
#### API Endpoint Formatting
|
|
583
|
+
|
|
584
|
+
```ruby
|
|
585
|
+
def format_api_endpoint(url)
|
|
586
|
+
DomainExtractor.format(url,
|
|
587
|
+
validation: :standard,
|
|
588
|
+
use_https: true,
|
|
589
|
+
use_trailing_slash: true)
|
|
590
|
+
end
|
|
591
|
+
|
|
592
|
+
format_api_endpoint('http://api.example.com') # => 'https://api.example.com/'
|
|
593
|
+
```
|
|
594
|
+
|
|
358
595
|
## Rails Integration
|
|
359
596
|
|
|
360
597
|
DomainExtractor provides a custom ActiveModel validator for Rails applications, enabling declarative URL/domain validation with multiple modes and options.
|
|
data/lib/domain_extractor/formatter.rb
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module DomainExtractor
|
|
4
|
+
# Formatter provides URL formatting based on validation modes and protocol requirements.
|
|
5
|
+
#
|
|
6
|
+
# Formats a URL string according to the specified options:
|
|
7
|
+
# - Validation modes: :standard, :root_domain, :root_or_custom_subdomain
|
|
8
|
+
# - Protocol options: use_protocol, use_https
|
|
9
|
+
# - Trailing slash: use_trailing_slash
|
|
10
|
+
#
|
|
11
|
+
# @example Standard formatting
|
|
12
|
+
# DomainExtractor.format('https://www.example.com/')
|
|
13
|
+
# # => 'https://www.example.com'
|
|
14
|
+
#
|
|
15
|
+
# @example Root domain only
|
|
16
|
+
# DomainExtractor.format('https://shop.example.com/path', validation: :root_domain)
|
|
17
|
+
# # => 'https://example.com'
|
|
18
|
+
#
|
|
19
|
+
# @example Without protocol
|
|
20
|
+
# DomainExtractor.format('https://example.com', use_protocol: false)
|
|
21
|
+
# # => 'example.com'
|
|
22
|
+
module Formatter
|
|
23
|
+
VALIDATION_MODES = %i[standard root_domain root_or_custom_subdomain].freeze
|
|
24
|
+
WWW_SUBDOMAIN = 'www'
|
|
25
|
+
|
|
26
|
+
module_function
|
|
27
|
+
|
|
28
|
+
# Format a URL according to the specified options
|
|
29
|
+
#
|
|
30
|
+
# @param url [String] The URL to format
|
|
31
|
+
# @param options [Hash] Formatting options
|
|
32
|
+
# @option options [Symbol] :validation (:standard) Validation mode
|
|
33
|
+
# @option options [Boolean] :use_protocol (true) Include protocol in output
|
|
34
|
+
# @option options [Boolean] :use_https (true) Use https instead of http
|
|
35
|
+
# @option options [Boolean] :use_trailing_slash (false) Include trailing slash
|
|
36
|
+
# @return [String, nil] Formatted URL or nil if invalid
|
|
37
|
+
def call(url, **options)
|
|
38
|
+
validation = options.fetch(:validation, :standard)
|
|
39
|
+
use_protocol = options.fetch(:use_protocol, true)
|
|
40
|
+
use_https = options.fetch(:use_https, true)
|
|
41
|
+
use_trailing_slash = options.fetch(:use_trailing_slash, false)
|
|
42
|
+
|
|
43
|
+
validate_options!(validation)
|
|
44
|
+
|
|
45
|
+
# Parse the URL
|
|
46
|
+
parsed = DomainExtractor.parse(url)
|
|
47
|
+
return nil unless parsed.valid?
|
|
48
|
+
|
|
49
|
+
# Build the formatted URL based on validation mode
|
|
50
|
+
formatted_host = build_host(parsed, validation)
|
|
51
|
+
build_url(formatted_host, use_protocol, use_https, use_trailing_slash)
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
def validate_options!(validation)
|
|
55
|
+
return if VALIDATION_MODES.include?(validation)
|
|
56
|
+
|
|
57
|
+
raise ArgumentError, "Invalid validation mode: #{validation}. " \
|
|
58
|
+
"Must be one of: #{VALIDATION_MODES.join(', ')}"
|
|
59
|
+
end
|
|
60
|
+
private_class_method :validate_options!
|
|
61
|
+
|
|
62
|
+
# Build the host portion based on validation mode
|
|
63
|
+
def build_host(parsed, validation)
|
|
64
|
+
case validation
|
|
65
|
+
when :standard
|
|
66
|
+
# Return the full host as-is
|
|
67
|
+
parsed.host
|
|
68
|
+
when :root_domain
|
|
69
|
+
# Return only the root domain (no subdomains)
|
|
70
|
+
parsed.root_domain
|
|
71
|
+
when :root_or_custom_subdomain
|
|
72
|
+
# Return root domain or custom subdomain (strip www)
|
|
73
|
+
if parsed.subdomain == WWW_SUBDOMAIN
|
|
74
|
+
parsed.root_domain
|
|
75
|
+
else
|
|
76
|
+
parsed.host
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
private_class_method :build_host
|
|
81
|
+
|
|
82
|
+
# Build the final URL string with protocol and trailing slash options
|
|
83
|
+
def build_url(host, use_protocol, use_https, use_trailing_slash)
|
|
84
|
+
url = ''
|
|
85
|
+
|
|
86
|
+
# Add protocol if requested
|
|
87
|
+
if use_protocol
|
|
88
|
+
protocol = use_https ? 'https://' : 'http://'
|
|
89
|
+
url = protocol + host
|
|
90
|
+
else
|
|
91
|
+
url = host
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
# Add or remove trailing slash
|
|
95
|
+
if use_trailing_slash
|
|
96
|
+
url += '/' unless url.end_with?('/')
|
|
97
|
+
else
|
|
98
|
+
url = url.chomp('/')
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
url
|
|
102
|
+
end
|
|
103
|
+
private_class_method :build_url
|
|
104
|
+
end
|
|
105
|
+
end
|
data/lib/domain_extractor.rb
CHANGED
|
@@ -8,6 +8,7 @@ require_relative 'domain_extractor/errors'
|
|
|
8
8
|
require_relative 'domain_extractor/parsed_url'
|
|
9
9
|
require_relative 'domain_extractor/parser'
|
|
10
10
|
require_relative 'domain_extractor/query_params'
|
|
11
|
+
require_relative 'domain_extractor/formatter'
|
|
11
12
|
|
|
12
13
|
# Conditionally load Rails validator if ActiveModel is available
|
|
13
14
|
begin
|
|
@@ -70,6 +71,32 @@ module DomainExtractor
|
|
|
70
71
|
QueryParams.call(query_string)
|
|
71
72
|
end
|
|
72
73
|
|
|
74
|
+
# Format a URL according to the specified options.
|
|
75
|
+
# Returns a formatted URL string or nil if the input is invalid.
|
|
76
|
+
#
|
|
77
|
+
# @param url [String] The URL to format
|
|
78
|
+
# @param options [Hash] Formatting options
|
|
79
|
+
# @option options [Symbol] :validation (:standard) Validation mode
|
|
80
|
+
# @option options [Boolean] :use_protocol (true) Include protocol in output
|
|
81
|
+
# @option options [Boolean] :use_https (true) Use https instead of http
|
|
82
|
+
# @option options [Boolean] :use_trailing_slash (false) Include trailing slash
|
|
83
|
+
# @return [String, nil]
|
|
84
|
+
#
|
|
85
|
+
# @example Standard formatting
|
|
86
|
+
# DomainExtractor.format('https://www.example.com/')
|
|
87
|
+
# # => 'https://www.example.com'
|
|
88
|
+
#
|
|
89
|
+
# @example Root domain only
|
|
90
|
+
# DomainExtractor.format('https://shop.example.com/path', validation: :root_domain)
|
|
91
|
+
# # => 'https://example.com'
|
|
92
|
+
#
|
|
93
|
+
# @example Without protocol
|
|
94
|
+
# DomainExtractor.format('https://example.com', use_protocol: false)
|
|
95
|
+
# # => 'example.com'
|
|
96
|
+
def format(url, **)
|
|
97
|
+
Formatter.call(url, **)
|
|
98
|
+
end
|
|
99
|
+
|
|
73
100
|
alias parse_query parse_query_params
|
|
74
101
|
end
|
|
75
102
|
end
|
|
data/spec/formatter_spec.rb
@@ -0,0 +1,299 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'spec_helper'
|
|
4
|
+
|
|
5
|
+
RSpec.describe DomainExtractor::Formatter do
|
|
6
|
+
describe '.call' do
|
|
7
|
+
context 'with :standard validation mode' do
|
|
8
|
+
it 'formats a simple URL with default options' do
|
|
9
|
+
result = described_class.call('https://example.com')
|
|
10
|
+
expect(result).to eq('https://example.com')
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
it 'removes trailing slash by default' do
|
|
14
|
+
result = described_class.call('https://example.com/')
|
|
15
|
+
expect(result).to eq('https://example.com')
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
it 'preserves subdomains' do
|
|
19
|
+
result = described_class.call('https://shop.example.com')
|
|
20
|
+
expect(result).to eq('https://shop.example.com')
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
it 'preserves www subdomain' do
|
|
24
|
+
result = described_class.call('https://www.example.com')
|
|
25
|
+
expect(result).to eq('https://www.example.com')
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
it 'preserves multi-level subdomains' do
|
|
29
|
+
result = described_class.call('https://api.staging.example.com')
|
|
30
|
+
expect(result).to eq('https://api.staging.example.com')
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
it 'handles URLs without protocol' do
|
|
34
|
+
result = described_class.call('example.com')
|
|
35
|
+
expect(result).to eq('https://example.com')
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
it 'strips path from URL' do
|
|
39
|
+
result = described_class.call('https://example.com/path/to/page')
|
|
40
|
+
expect(result).to eq('https://example.com')
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
it 'strips query parameters from URL' do
|
|
44
|
+
result = described_class.call('https://example.com?foo=bar')
|
|
45
|
+
expect(result).to eq('https://example.com')
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
context 'with :root_domain validation mode' do
|
|
50
|
+
it 'returns root domain for URL with subdomain' do
|
|
51
|
+
result = described_class.call('https://shop.example.com', validation: :root_domain)
|
|
52
|
+
expect(result).to eq('https://example.com')
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
it 'returns root domain for URL with www' do
|
|
56
|
+
result = described_class.call('https://www.example.com', validation: :root_domain)
|
|
57
|
+
expect(result).to eq('https://example.com')
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
it 'returns root domain for URL without subdomain' do
|
|
61
|
+
result = described_class.call('https://example.com', validation: :root_domain)
|
|
62
|
+
expect(result).to eq('https://example.com')
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
it 'returns root domain for multi-level subdomains' do
|
|
66
|
+
result = described_class.call('https://api.staging.example.com', validation: :root_domain)
|
|
67
|
+
expect(result).to eq('https://example.com')
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
it 'handles multi-part TLDs' do
|
|
71
|
+
result = described_class.call('https://shop.example.co.uk', validation: :root_domain)
|
|
72
|
+
expect(result).to eq('https://example.co.uk')
|
|
73
|
+
end
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
context 'with :root_or_custom_subdomain validation mode' do
|
|
77
|
+
it 'preserves root domain' do
|
|
78
|
+
result = described_class.call('https://example.com', validation: :root_or_custom_subdomain)
|
|
79
|
+
expect(result).to eq('https://example.com')
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
it 'preserves custom subdomains' do
|
|
83
|
+
result = described_class.call('https://shop.example.com', validation: :root_or_custom_subdomain)
|
|
84
|
+
expect(result).to eq('https://shop.example.com')
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
it 'strips www subdomain' do
|
|
88
|
+
result = described_class.call('https://www.example.com', validation: :root_or_custom_subdomain)
|
|
89
|
+
expect(result).to eq('https://example.com')
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
it 'preserves api subdomain' do
|
|
93
|
+
result = described_class.call('https://api.example.com', validation: :root_or_custom_subdomain)
|
|
94
|
+
expect(result).to eq('https://api.example.com')
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
it 'preserves multi-level custom subdomains' do
|
|
98
|
+
result = described_class.call('https://api.staging.example.com', validation: :root_or_custom_subdomain)
|
|
99
|
+
expect(result).to eq('https://api.staging.example.com')
|
|
100
|
+
end
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
context 'with use_protocol option' do
|
|
104
|
+
it 'includes protocol by default' do
|
|
105
|
+
result = described_class.call('https://example.com')
|
|
106
|
+
expect(result).to eq('https://example.com')
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
it 'includes protocol when use_protocol is true' do
|
|
110
|
+
result = described_class.call('https://example.com', use_protocol: true)
|
|
111
|
+
expect(result).to eq('https://example.com')
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
it 'excludes protocol when use_protocol is false' do
|
|
115
|
+
result = described_class.call('https://example.com', use_protocol: false)
|
|
116
|
+
expect(result).to eq('example.com')
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
it 'excludes protocol with subdomain' do
|
|
120
|
+
result = described_class.call('https://shop.example.com', use_protocol: false)
|
|
121
|
+
expect(result).to eq('shop.example.com')
|
|
122
|
+
end
|
|
123
|
+
|
|
124
|
+
it 'works with root_domain validation' do
|
|
125
|
+
result = described_class.call('https://shop.example.com',
|
|
126
|
+
validation: :root_domain,
|
|
127
|
+
use_protocol: false)
|
|
128
|
+
expect(result).to eq('example.com')
|
|
129
|
+
end
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
context 'with use_https option' do
|
|
133
|
+
it 'uses https by default' do
|
|
134
|
+
result = described_class.call('http://example.com')
|
|
135
|
+
expect(result).to eq('https://example.com')
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
it 'uses https when use_https is true' do
|
|
139
|
+
result = described_class.call('http://example.com', use_https: true)
|
|
140
|
+
expect(result).to eq('https://example.com')
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
it 'uses http when use_https is false' do
|
|
144
|
+
result = described_class.call('https://example.com', use_https: false)
|
|
145
|
+
expect(result).to eq('http://example.com')
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
it 'preserves http when use_https is false' do
|
|
149
|
+
result = described_class.call('http://example.com', use_https: false)
|
|
150
|
+
expect(result).to eq('http://example.com')
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
it 'ignores use_https when use_protocol is false' do
|
|
154
|
+
result = described_class.call('https://example.com',
|
|
155
|
+
use_protocol: false,
|
|
156
|
+
use_https: false)
|
|
157
|
+
expect(result).to eq('example.com')
|
|
158
|
+
end
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
context 'with use_trailing_slash option' do
|
|
162
|
+
it 'removes trailing slash by default' do
|
|
163
|
+
result = described_class.call('https://example.com/')
|
|
164
|
+
expect(result).to eq('https://example.com')
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
it 'removes trailing slash when use_trailing_slash is false' do
|
|
168
|
+
result = described_class.call('https://example.com/', use_trailing_slash: false)
|
|
169
|
+
expect(result).to eq('https://example.com')
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
it 'adds trailing slash when use_trailing_slash is true' do
|
|
173
|
+
result = described_class.call('https://example.com', use_trailing_slash: true)
|
|
174
|
+
expect(result).to eq('https://example.com/')
|
|
175
|
+
end
|
|
176
|
+
|
|
177
|
+
it 'preserves trailing slash when use_trailing_slash is true' do
|
|
178
|
+
result = described_class.call('https://example.com/', use_trailing_slash: true)
|
|
179
|
+
expect(result).to eq('https://example.com/')
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
it 'works without protocol' do
|
|
183
|
+
result = described_class.call('https://example.com',
|
|
184
|
+
use_protocol: false,
|
|
185
|
+
use_trailing_slash: true)
|
|
186
|
+
expect(result).to eq('example.com/')
|
|
187
|
+
end
|
|
188
|
+
|
|
189
|
+
it 'works with root_domain validation' do
|
|
190
|
+
result = described_class.call('https://shop.example.com',
|
|
191
|
+
validation: :root_domain,
|
|
192
|
+
use_trailing_slash: true)
|
|
193
|
+
expect(result).to eq('https://example.com/')
|
|
194
|
+
end
|
|
195
|
+
end
|
|
196
|
+
|
|
197
|
+
context 'with combined options' do
|
|
198
|
+
it 'formats with all options: root_domain, no protocol, with trailing slash' do
|
|
199
|
+
result = described_class.call('https://shop.example.com/path',
|
|
200
|
+
validation: :root_domain,
|
|
201
|
+
use_protocol: false,
|
|
202
|
+
use_trailing_slash: true)
|
|
203
|
+
expect(result).to eq('example.com/')
|
|
204
|
+
end
|
|
205
|
+
|
|
206
|
+
it 'formats with root_or_custom_subdomain, http protocol, no trailing slash' do
|
|
207
|
+
result = described_class.call('https://www.example.com/',
|
|
208
|
+
validation: :root_or_custom_subdomain,
|
|
209
|
+
use_https: false,
|
|
210
|
+
use_trailing_slash: false)
|
|
211
|
+
expect(result).to eq('http://example.com')
|
|
212
|
+
end
|
|
213
|
+
|
|
214
|
+
it 'formats with standard, no protocol, http, with trailing slash' do
|
|
215
|
+
result = described_class.call('https://api.example.com',
|
|
216
|
+
validation: :standard,
|
|
217
|
+
use_protocol: false,
|
|
218
|
+
use_trailing_slash: true)
|
|
219
|
+
expect(result).to eq('api.example.com/')
|
|
220
|
+
end
|
|
221
|
+
|
|
222
|
+
it 'strips www and adds trailing slash' do
|
|
223
|
+
result = described_class.call('https://www.example.com',
|
|
224
|
+
validation: :root_or_custom_subdomain,
|
|
225
|
+
use_trailing_slash: true)
|
|
226
|
+
expect(result).to eq('https://example.com/')
|
|
227
|
+
end
|
|
228
|
+
end
|
|
229
|
+
|
|
230
|
+
context 'with multi-part TLDs' do
|
|
231
|
+
it 'handles UK domains with standard mode' do
|
|
232
|
+
result = described_class.call('https://shop.example.co.uk')
|
|
233
|
+
expect(result).to eq('https://shop.example.co.uk')
|
|
234
|
+
end
|
|
235
|
+
|
|
236
|
+
it 'handles UK domains with root_domain mode' do
|
|
237
|
+
result = described_class.call('https://shop.example.co.uk', validation: :root_domain)
|
|
238
|
+
expect(result).to eq('https://example.co.uk')
|
|
239
|
+
end
|
|
240
|
+
|
|
241
|
+
it 'handles Australian domains' do
|
|
242
|
+
result = described_class.call('https://www.example.com.au',
|
|
243
|
+
validation: :root_or_custom_subdomain)
|
|
244
|
+
expect(result).to eq('https://example.com.au')
|
|
245
|
+
end
|
|
246
|
+
end
|
|
247
|
+
|
|
248
|
+
context 'with invalid input' do
|
|
249
|
+
it 'returns nil for invalid URLs' do
|
|
250
|
+
result = described_class.call('not-a-url')
|
|
251
|
+
expect(result).to be_nil
|
|
252
|
+
end
|
|
253
|
+
|
|
254
|
+
it 'returns nil for nil input' do
|
|
255
|
+
result = described_class.call(nil)
|
|
256
|
+
expect(result).to be_nil
|
|
257
|
+
end
|
|
258
|
+
|
|
259
|
+
it 'returns nil for empty string' do
|
|
260
|
+
result = described_class.call('')
|
|
261
|
+
expect(result).to be_nil
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
it 'returns nil for IP addresses' do
|
|
265
|
+
result = described_class.call('https://192.168.1.1')
|
|
266
|
+
expect(result).to be_nil
|
|
267
|
+
end
|
|
268
|
+
end
|
|
269
|
+
|
|
270
|
+
context 'with error handling' do
|
|
271
|
+
it 'raises error for invalid validation mode' do
|
|
272
|
+
expect do
|
|
273
|
+
described_class.call('https://example.com', validation: :invalid_mode)
|
|
274
|
+
end.to raise_error(ArgumentError, /Invalid validation mode/)
|
|
275
|
+
end
|
|
276
|
+
end
|
|
277
|
+
end
|
|
278
|
+
end
|
|
279
|
+
|
|
280
|
+
RSpec.describe DomainExtractor do
|
|
281
|
+
describe '.format' do
|
|
282
|
+
it 'delegates to Formatter.call' do
|
|
283
|
+
result = DomainExtractor.format('https://www.example.com/')
|
|
284
|
+
expect(result).to eq('https://www.example.com')
|
|
285
|
+
end
|
|
286
|
+
|
|
287
|
+
it 'passes options correctly' do
|
|
288
|
+
result = DomainExtractor.format('https://shop.example.com',
|
|
289
|
+
validation: :root_domain,
|
|
290
|
+
use_protocol: false)
|
|
291
|
+
expect(result).to eq('example.com')
|
|
292
|
+
end
|
|
293
|
+
|
|
294
|
+
it 'returns nil for invalid URLs' do
|
|
295
|
+
result = DomainExtractor.format('invalid-url')
|
|
296
|
+
expect(result).to be_nil
|
|
297
|
+
end
|
|
298
|
+
end
|
|
299
|
+
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: domain_extractor
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.6
|
|
4
|
+
version: 0.2.7
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- OpenSite AI
|
|
@@ -43,6 +43,7 @@ files:
|
|
|
43
43
|
- lib/domain_extractor.rb
|
|
44
44
|
- lib/domain_extractor/domain_validator.rb
|
|
45
45
|
- lib/domain_extractor/errors.rb
|
|
46
|
+
- lib/domain_extractor/formatter.rb
|
|
46
47
|
- lib/domain_extractor/normalizer.rb
|
|
47
48
|
- lib/domain_extractor/parsed_url.rb
|
|
48
49
|
- lib/domain_extractor/parser.rb
|
|
@@ -52,6 +53,7 @@ files:
|
|
|
52
53
|
- lib/domain_extractor/version.rb
|
|
53
54
|
- spec/domain_extractor_spec.rb
|
|
54
55
|
- spec/domain_validator_spec.rb
|
|
56
|
+
- spec/formatter_spec.rb
|
|
55
57
|
- spec/parsed_url_spec.rb
|
|
56
58
|
- spec/spec_helper.rb
|
|
57
59
|
homepage: https://github.com/opensite-ai/domain_extractor
|