domain_extractor 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/README.md +30 -3
- data/lib/domain_extractor/errors.rb +11 -0
- data/lib/domain_extractor/parser.rb +21 -6
- data/lib/domain_extractor/version.rb +1 -1
- data/lib/domain_extractor.rb +12 -3
- data/spec/domain_extractor_spec.rb +50 -12
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ada4e5f18e79e144f7fc82e53c4f9cffc5b750f32ac579e5533ad50a3c4ee07b
|
|
4
|
+
data.tar.gz: 7213bcce37c9956ff164411433e929ad7ae8f0953e3101a2bd93d7f761ab37dc
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 9eb86f40c167428966581ee0446db8ef03994680904c7712f22b75e4117304bce59928ef16ba29143da6ad2046011f0cc5f428a54405922cfeab21e06ea9a06f
|
|
7
|
+
data.tar.gz: cae58aa85a024e3e10a236041476775e887896d1bb96af35a6e6a14f5cb97f2f2caefff7a2804c02ec9885fbcb4e92e48cdef2bbc2e9bfcfecd7a6b58e47e6ad
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.1.7] - 2025-10-31
|
|
11
|
+
|
|
12
|
+
### Added valid? method and enhanced error handling
|
|
13
|
+
|
|
14
|
+
- Added `DomainExtractor.valid?` helper to allow safe URL pre-checks without raising.
|
|
15
|
+
- `DomainExtractor.parse` now raises `DomainExtractor::InvalidURLError` with a clear `"Invalid URL Value"` message when the input cannot be parsed.
|
|
16
|
+
|
|
10
17
|
## [0.1.6] - 2025-10-31
|
|
11
18
|
|
|
12
19
|
### Integrate Rakefile for Release and Task Workflow Refactors
|
data/README.md
CHANGED
|
@@ -52,6 +52,14 @@ result[:domain] # => 'example'
|
|
|
52
52
|
result[:tld] # => 'co.uk'
|
|
53
53
|
result[:root_domain] # => 'example.co.uk'
|
|
54
54
|
result[:host] # => 'www.example.co.uk'
|
|
55
|
+
|
|
56
|
+
# Guard a parse with the validity helper
|
|
57
|
+
url = 'https://www.example.co.uk/path?query=value'
|
|
58
|
+
if DomainExtractor.valid?(url)
|
|
59
|
+
DomainExtractor.parse(url)
|
|
60
|
+
else
|
|
61
|
+
# handle invalid input
|
|
62
|
+
end
|
|
55
63
|
```
|
|
56
64
|
|
|
57
65
|
## Usage Examples
|
|
@@ -105,13 +113,25 @@ urls = ['https://example.com', 'https://blog.example.org']
|
|
|
105
113
|
results = DomainExtractor.parse_batch(urls)
|
|
106
114
|
```
|
|
107
115
|
|
|
116
|
+
### Validation and Error Handling
|
|
117
|
+
|
|
118
|
+
```ruby
|
|
119
|
+
DomainExtractor.valid?('https://www.example.com') # => true
|
|
120
|
+
|
|
121
|
+
# DomainExtractor.parse raises DomainExtractor::InvalidURLError on invalid input
|
|
122
|
+
DomainExtractor.parse('not-a-url')
|
|
123
|
+
# => raises DomainExtractor::InvalidURLError (message: "Invalid URL Value")
|
|
124
|
+
```
|
|
125
|
+
|
|
108
126
|
## API Reference
|
|
109
127
|
|
|
110
128
|
### `DomainExtractor.parse(url_string)`
|
|
111
129
|
|
|
112
130
|
Parses a URL string and extracts domain components.
|
|
113
131
|
|
|
114
|
-
**Returns:** Hash with keys `:subdomain`, `:domain`, `:tld`, `:root_domain`, `:host`, `:path`
|
|
132
|
+
**Returns:** Hash with keys `:subdomain`, `:domain`, `:tld`, `:root_domain`, `:host`, `:path`
|
|
133
|
+
|
|
134
|
+
**Raises:** `DomainExtractor::InvalidURLError` when the URL fails validation
|
|
115
135
|
|
|
116
136
|
### `DomainExtractor.parse_batch(urls)`
|
|
117
137
|
|
|
@@ -119,6 +139,12 @@ Parses multiple URLs efficiently.
|
|
|
119
139
|
|
|
120
140
|
**Returns:** Array of parsed results
|
|
121
141
|
|
|
142
|
+
### `DomainExtractor.valid?(url_string)`
|
|
143
|
+
|
|
144
|
+
Checks if a URL can be parsed successfully without raising.
|
|
145
|
+
|
|
146
|
+
**Returns:** `true` or `false`
|
|
147
|
+
|
|
122
148
|
### `DomainExtractor.parse_query_params(query_string)`
|
|
123
149
|
|
|
124
150
|
Parses a query string into a hash of parameters.
|
|
@@ -146,8 +172,9 @@ track_event('page_view', source_domain: parsed[:root_domain]) if parsed
|
|
|
146
172
|
|
|
147
173
|
```ruby
|
|
148
174
|
def internal_link?(url, base_domain)
|
|
149
|
-
|
|
150
|
-
|
|
175
|
+
return false unless DomainExtractor.valid?(url)
|
|
176
|
+
|
|
177
|
+
DomainExtractor.parse(url)[:root_domain] == base_domain
|
|
151
178
|
end
|
|
152
179
|
```
|
|
153
180
|
|
|
@@ -13,18 +13,21 @@ module DomainExtractor
|
|
|
13
13
|
module_function
|
|
14
14
|
|
|
15
15
|
def call(raw_url)
|
|
16
|
-
|
|
17
|
-
return unless
|
|
18
|
-
|
|
19
|
-
host = uri.host&.downcase
|
|
20
|
-
return if invalid_host?(host)
|
|
16
|
+
components = extract_components(raw_url)
|
|
17
|
+
return unless components
|
|
21
18
|
|
|
22
|
-
domain =
|
|
19
|
+
uri, domain, host = components
|
|
23
20
|
build_result(domain: domain, host: host, uri: uri)
|
|
24
21
|
rescue ::URI::InvalidURIError, ::PublicSuffix::Error
|
|
25
22
|
nil
|
|
26
23
|
end
|
|
27
24
|
|
|
25
|
+
def valid?(raw_url)
|
|
26
|
+
!!extract_components(raw_url)
|
|
27
|
+
rescue ::URI::InvalidURIError, ::PublicSuffix::Error
|
|
28
|
+
false
|
|
29
|
+
end
|
|
30
|
+
|
|
28
31
|
def build_uri(raw_url)
|
|
29
32
|
normalized = Normalizer.call(raw_url)
|
|
30
33
|
return unless normalized
|
|
@@ -38,6 +41,18 @@ module DomainExtractor
|
|
|
38
41
|
end
|
|
39
42
|
private_class_method :invalid_host?
|
|
40
43
|
|
|
44
|
+
def extract_components(raw_url)
|
|
45
|
+
uri = build_uri(raw_url)
|
|
46
|
+
return unless uri
|
|
47
|
+
|
|
48
|
+
host = uri.host&.downcase
|
|
49
|
+
return if invalid_host?(host)
|
|
50
|
+
|
|
51
|
+
domain = ::PublicSuffix.parse(host)
|
|
52
|
+
[uri, domain, host]
|
|
53
|
+
end
|
|
54
|
+
private_class_method :extract_components
|
|
55
|
+
|
|
41
56
|
def build_result(domain:, host:, uri:)
|
|
42
57
|
Result.build(
|
|
43
58
|
subdomain: domain.trd,
|
data/lib/domain_extractor.rb
CHANGED
|
@@ -4,6 +4,7 @@ require 'uri'
|
|
|
4
4
|
require 'public_suffix'
|
|
5
5
|
|
|
6
6
|
require_relative 'domain_extractor/version'
|
|
7
|
+
require_relative 'domain_extractor/errors'
|
|
7
8
|
require_relative 'domain_extractor/parser'
|
|
8
9
|
require_relative 'domain_extractor/query_params'
|
|
9
10
|
|
|
@@ -12,10 +13,18 @@ require_relative 'domain_extractor/query_params'
|
|
|
12
13
|
module DomainExtractor
|
|
13
14
|
class << self
|
|
14
15
|
# Parse an individual URL and extract domain attributes.
|
|
16
|
+
# Raises DomainExtractor::InvalidURLError when the URL fails validation.
|
|
15
17
|
# @param url [String, #to_s]
|
|
16
|
-
# @return [Hash
|
|
18
|
+
# @return [Hash]
|
|
17
19
|
def parse(url)
|
|
18
|
-
Parser.call(url)
|
|
20
|
+
Parser.call(url) || raise(InvalidURLError)
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
# Determine if a URL is considered valid by the parser.
|
|
24
|
+
# @param url [String, #to_s]
|
|
25
|
+
# @return [Boolean]
|
|
26
|
+
def valid?(url)
|
|
27
|
+
Parser.valid?(url)
|
|
19
28
|
end
|
|
20
29
|
|
|
21
30
|
# Parse many URLs and return their individual parse results.
|
|
@@ -24,7 +33,7 @@ module DomainExtractor
|
|
|
24
33
|
def parse_batch(urls)
|
|
25
34
|
return [] unless urls.respond_to?(:map)
|
|
26
35
|
|
|
27
|
-
urls.map { |url|
|
|
36
|
+
urls.map { |url| Parser.call(url) }
|
|
28
37
|
end
|
|
29
38
|
|
|
30
39
|
# Convert a query string into a Hash representation.
|
|
@@ -142,32 +142,70 @@ RSpec.describe DomainExtractor do
|
|
|
142
142
|
end
|
|
143
143
|
|
|
144
144
|
context 'with invalid URLs' do
|
|
145
|
-
it '
|
|
146
|
-
expect
|
|
145
|
+
it 'raises InvalidURLError for malformed URLs' do
|
|
146
|
+
expect { described_class.parse('http://') }.to raise_error(
|
|
147
|
+
DomainExtractor::InvalidURLError,
|
|
148
|
+
'Invalid URL Value'
|
|
149
|
+
)
|
|
147
150
|
end
|
|
148
151
|
|
|
149
|
-
it '
|
|
150
|
-
expect
|
|
152
|
+
it 'raises InvalidURLError for invalid domains' do
|
|
153
|
+
expect { described_class.parse('not_a_url') }.to raise_error(
|
|
154
|
+
DomainExtractor::InvalidURLError,
|
|
155
|
+
'Invalid URL Value'
|
|
156
|
+
)
|
|
151
157
|
end
|
|
152
158
|
|
|
153
|
-
it '
|
|
154
|
-
expect
|
|
159
|
+
it 'raises InvalidURLError for IP addresses' do
|
|
160
|
+
expect { described_class.parse('192.168.1.1') }.to raise_error(
|
|
161
|
+
DomainExtractor::InvalidURLError,
|
|
162
|
+
'Invalid URL Value'
|
|
163
|
+
)
|
|
155
164
|
end
|
|
156
165
|
|
|
157
|
-
it '
|
|
158
|
-
expect
|
|
166
|
+
it 'raises InvalidURLError for IPv6 addresses' do
|
|
167
|
+
expect { described_class.parse('[2001:db8::1]') }.to raise_error(
|
|
168
|
+
DomainExtractor::InvalidURLError,
|
|
169
|
+
'Invalid URL Value'
|
|
170
|
+
)
|
|
159
171
|
end
|
|
160
172
|
|
|
161
|
-
it '
|
|
162
|
-
expect
|
|
173
|
+
it 'raises InvalidURLError for empty string' do
|
|
174
|
+
expect { described_class.parse('') }.to raise_error(DomainExtractor::InvalidURLError, 'Invalid URL Value')
|
|
163
175
|
end
|
|
164
176
|
|
|
165
|
-
it '
|
|
166
|
-
expect
|
|
177
|
+
it 'raises InvalidURLError for nil' do
|
|
178
|
+
expect { described_class.parse(nil) }.to raise_error(DomainExtractor::InvalidURLError, 'Invalid URL Value')
|
|
167
179
|
end
|
|
168
180
|
end
|
|
169
181
|
end
|
|
170
182
|
|
|
183
|
+
describe '.valid?' do
|
|
184
|
+
it 'returns true for a normalized domain' do
|
|
185
|
+
expect(described_class.valid?('dashtrack.com')).to be(true)
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
it 'returns true for a full URL with subdomain and query' do
|
|
189
|
+
expect(described_class.valid?('https://www.example.co.uk/path?query=value')).to be(true)
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
it 'returns false for malformed URLs' do
|
|
193
|
+
expect(described_class.valid?('http://')).to be(false)
|
|
194
|
+
end
|
|
195
|
+
|
|
196
|
+
it 'returns false for invalid domains' do
|
|
197
|
+
expect(described_class.valid?('not_a_url')).to be(false)
|
|
198
|
+
end
|
|
199
|
+
|
|
200
|
+
it 'returns false for IP addresses' do
|
|
201
|
+
expect(described_class.valid?('192.168.1.1')).to be(false)
|
|
202
|
+
end
|
|
203
|
+
|
|
204
|
+
it 'returns false for nil values' do
|
|
205
|
+
expect(described_class.valid?(nil)).to be(false)
|
|
206
|
+
end
|
|
207
|
+
end
|
|
208
|
+
|
|
171
209
|
describe '.parse_query_params' do
|
|
172
210
|
it 'converts simple query string to hash' do
|
|
173
211
|
result = described_class.parse_query_params('foo=bar')
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: domain_extractor
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.
|
|
4
|
+
version: 0.1.7
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- OpenSite AI
|
|
@@ -41,6 +41,7 @@ files:
|
|
|
41
41
|
- LICENSE.txt
|
|
42
42
|
- README.md
|
|
43
43
|
- lib/domain_extractor.rb
|
|
44
|
+
- lib/domain_extractor/errors.rb
|
|
44
45
|
- lib/domain_extractor/normalizer.rb
|
|
45
46
|
- lib/domain_extractor/parser.rb
|
|
46
47
|
- lib/domain_extractor/query_params.rb
|