twingly-url 1.3.4 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -20
- data/lib/twingly/url.rb +141 -20
- data/lib/twingly/url/error.rb +8 -0
- data/lib/twingly/url/hasher.rb +1 -1
- data/lib/twingly/url/null_url.rb +30 -0
- data/lib/twingly/url/utilities.rb +7 -5
- data/lib/twingly/version.rb +5 -0
- metadata +9 -23
- data/lib/twingly-url-normalizer.rb +0 -2
- data/lib/twingly/url/normalizer.rb +0 -36
- data/lib/version.rb +0 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2332efd0ec1df89e43ac8c6dbfe50e32111ee4e1
|
4
|
+
data.tar.gz: f9242f3818d99ef4b49d997898fd22fba951ea9d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 614aac135b4e9c8fe61dbf9f0e5c13928c5e71f7b552bf980f6ae3331a17eb6cfcbd2d6421d5fdaf49bca9fca723d4a188c1585b8e7b77987a6cd8b27d9abcbc
|
7
|
+
data.tar.gz: 9d33a5f6ebb37eb410779cd40350df8bea3f7ffad72f93e8b8015e0dca086c3a86b66110cde34c28ac05a1da08985aeee2b46eba9fd6e0b4dce6f56e5e01e7f2
|
data/README.md
CHANGED
@@ -5,10 +5,7 @@
|
|
5
5
|
Twingly URL tools.
|
6
6
|
|
7
7
|
* `twingly/url` - Parse and validate URLs
|
8
|
-
* `Twingly::URL.parse` - Returns
|
9
|
-
* `Twingly::URL.validate` - Validates a URL
|
10
|
-
* `twingly/url/normalizer` - Normalize URLs
|
11
|
-
* `Twingly::URL::Normalizer.normalize(string)` - Extracts URLs from string (Array)
|
8
|
+
* `Twingly::URL.parse` - Returns one or more `Twingly::URL` instance
|
12
9
|
* `twingly/url/hasher` - Generate URL hashes suitable for primary keys
|
13
10
|
* `Twingly::URL::Hasher.taskdb_hash(url)` - MD5 hexdigest
|
14
11
|
* `Twingly::URL::Hasher.blogstream_hash(url)` - MD5 hexdigest
|
@@ -16,27 +13,12 @@ Twingly URL tools.
|
|
16
13
|
* `Twingly::URL::Hasher.autopingdb_hash(url)` - SHA256 64-bit signed, native endian digest
|
17
14
|
* `Twingly::URL::Hasher.pingloggerdb_hash(url)` - SHA256 64-bit unsigned, native endian digest
|
18
15
|
* `twingly/url/utilities` - Utilities to work with URLs
|
19
|
-
* `Twingly::URL::Utilities.
|
16
|
+
* `Twingly::URL::Utilities.extract_valid_urls` - Returns Array of valid `Twingly::URL`
|
20
17
|
|
21
18
|
## Installation
|
22
19
|
|
23
20
|
gem install twingly-url
|
24
21
|
|
25
|
-
## Normalization example
|
26
|
-
|
27
|
-
```ruby
|
28
|
-
require 'twingly/url/normalizer'
|
29
|
-
|
30
|
-
Twingly::URL::Normalizer.normalize('http://duh.se')
|
31
|
-
# => ["http://www.duh.se/"]
|
32
|
-
|
33
|
-
Twingly::URL::Normalizer.normalize('http://duh.se http://blog.twingly.com/')
|
34
|
-
# => ["http://www.duh.se/", "http://blog.twingly.com/"]
|
35
|
-
|
36
|
-
Twingly::URL::Normalizer.normalize('no URL')
|
37
|
-
# => []
|
38
|
-
```
|
39
|
-
|
40
22
|
## Tests
|
41
23
|
|
42
24
|
Run tests with
|
data/lib/twingly/url.rb
CHANGED
@@ -1,36 +1,157 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require "addressable/uri"
|
2
|
+
require "public_suffix"
|
3
3
|
|
4
|
-
|
4
|
+
require_relative "url/null_url"
|
5
|
+
require_relative "url/error"
|
5
6
|
|
6
|
-
|
7
|
+
PublicSuffix::List.private_domains = false
|
7
8
|
|
8
9
|
module Twingly
|
9
|
-
|
10
|
-
|
10
|
+
class URL
|
11
|
+
include Comparable
|
12
|
+
|
13
|
+
SCHEMES = %w(http https)
|
14
|
+
ENDS_WITH_SLASH = /\/+$/
|
15
|
+
|
16
|
+
def self.parse(potential_url)
|
17
|
+
potential_url = String(potential_url)
|
18
|
+
potential_url = potential_url.scrub
|
19
|
+
potential_url = potential_url.strip
|
20
|
+
|
21
|
+
internal_parse(potential_url)
|
22
|
+
rescue Twingly::URL::Error, Twingly::URL::Error::ParseError => error
|
23
|
+
NullURL.new
|
24
|
+
end
|
25
|
+
|
26
|
+
def self.internal_parse(potential_url)
|
27
|
+
if potential_url.is_a?(Addressable::URI)
|
28
|
+
addressable_uri = potential_url
|
29
|
+
else
|
30
|
+
addressable_uri = Addressable::URI.heuristic_parse(potential_url)
|
31
|
+
end
|
32
|
+
|
33
|
+
raise Twingly::Error::ParseError if addressable_uri.nil?
|
34
|
+
|
35
|
+
public_suffix_domain = PublicSuffix.parse(addressable_uri.display_uri.host)
|
36
|
+
|
37
|
+
self.new(addressable_uri, public_suffix_domain)
|
38
|
+
rescue Addressable::URI::InvalidURIError, PublicSuffix::DomainInvalid => error
|
39
|
+
error.extend(Twingly::URL::Error)
|
40
|
+
raise
|
41
|
+
end
|
42
|
+
|
43
|
+
def initialize(addressable_uri, public_suffix_domain)
|
44
|
+
unless addressable_uri.is_a?(Addressable::URI)
|
45
|
+
raise ArgumentError, "First parameter must be an Addressable::URI"
|
46
|
+
end
|
47
|
+
|
48
|
+
unless public_suffix_domain.is_a?(PublicSuffix::Domain)
|
49
|
+
raise ArgumentError, "Second parameter must be a PublicSuffix::Domain"
|
50
|
+
end
|
51
|
+
|
52
|
+
@addressable_uri = addressable_uri
|
53
|
+
@public_suffix_domain = public_suffix_domain
|
54
|
+
end
|
55
|
+
|
56
|
+
def scheme
|
57
|
+
addressable_uri.scheme
|
58
|
+
end
|
59
|
+
|
60
|
+
def trd
|
61
|
+
public_suffix_domain.trd
|
62
|
+
end
|
63
|
+
|
64
|
+
def sld
|
65
|
+
public_suffix_domain.sld
|
66
|
+
end
|
67
|
+
|
68
|
+
def tld
|
69
|
+
public_suffix_domain.tld
|
70
|
+
end
|
71
|
+
|
72
|
+
def domain
|
73
|
+
public_suffix_domain.domain
|
74
|
+
end
|
11
75
|
|
12
|
-
|
13
|
-
|
14
|
-
|
76
|
+
def host
|
77
|
+
addressable_uri.host
|
78
|
+
end
|
79
|
+
|
80
|
+
def origin
|
81
|
+
addressable_uri.origin
|
82
|
+
end
|
83
|
+
|
84
|
+
def path
|
85
|
+
addressable_uri.path
|
86
|
+
end
|
87
|
+
|
88
|
+
def without_scheme
|
89
|
+
self.to_s.sub(/\A#{scheme}:/, "")
|
90
|
+
end
|
91
|
+
|
92
|
+
def normalized
|
93
|
+
normalized_url = addressable_uri.dup
|
94
|
+
|
95
|
+
normalized_url.scheme = normalized_scheme
|
96
|
+
normalized_url.host = normalized_host
|
97
|
+
normalized_url.path = normalized_path
|
98
|
+
|
99
|
+
self.class.internal_parse(normalized_url)
|
100
|
+
end
|
101
|
+
|
102
|
+
def normalized_scheme
|
103
|
+
addressable_uri.scheme.downcase
|
104
|
+
end
|
105
|
+
|
106
|
+
def normalized_host
|
107
|
+
host = addressable_uri.normalized_host
|
108
|
+
domain = public_suffix_domain
|
109
|
+
|
110
|
+
unless domain.subdomain?
|
111
|
+
host = "www.#{host}"
|
15
112
|
end
|
113
|
+
|
114
|
+
host = normalize_blogspot(host, domain)
|
115
|
+
|
116
|
+
host
|
117
|
+
end
|
118
|
+
|
119
|
+
def normalized_path
|
120
|
+
path = strip_trailing_slashes(addressable_uri.path)
|
121
|
+
|
122
|
+
(path.empty?) ? "/" : path
|
123
|
+
end
|
124
|
+
|
125
|
+
def valid?
|
126
|
+
addressable_uri && public_suffix_domain && SCHEMES.include?(normalized_scheme)
|
127
|
+
end
|
128
|
+
|
129
|
+
def <=>(other)
|
130
|
+
self.to_s <=> other.to_s
|
16
131
|
end
|
17
132
|
|
18
|
-
def
|
19
|
-
|
20
|
-
UrlObject.new(url, domain)
|
133
|
+
def to_s
|
134
|
+
addressable_uri.to_s
|
21
135
|
end
|
22
136
|
|
23
|
-
def
|
24
|
-
|
25
|
-
|
137
|
+
def inspect
|
138
|
+
sprintf("#<%s:0x%x %s>", self.class.name, __id__, self.to_s)
|
139
|
+
end
|
26
140
|
|
27
|
-
|
28
|
-
|
29
|
-
|
141
|
+
private
|
142
|
+
|
143
|
+
attr_reader :addressable_uri, :public_suffix_domain
|
144
|
+
|
145
|
+
def normalize_blogspot(host, domain)
|
146
|
+
if domain.sld.downcase == "blogspot"
|
147
|
+
host.sub(/\Awww\./i, "").sub(/#{domain.tld}\z/i, "com")
|
148
|
+
else
|
149
|
+
host
|
150
|
+
end
|
30
151
|
end
|
31
152
|
|
32
|
-
def
|
33
|
-
|
153
|
+
def strip_trailing_slashes(path)
|
154
|
+
path.sub(ENDS_WITH_SLASH, "")
|
34
155
|
end
|
35
156
|
end
|
36
157
|
end
|
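data/lib/twingly/url/hasher.rb
CHANGED
(hunk not captured in this extract; +1 -1 per the file list above)
data/lib/twingly/url/null_url.rb
ADDED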
@@ -0,0 +1,30 @@
|
|
1
|
+
module Twingly
|
2
|
+
class URL
|
3
|
+
class NullURL
|
4
|
+
include Comparable
|
5
|
+
|
6
|
+
def method_missing(name, *)
|
7
|
+
error = NoMethodError.new("undefined method `#{name}'")
|
8
|
+
raise error unless Twingly::URL.instance_methods.include?(name)
|
9
|
+
|
10
|
+
""
|
11
|
+
end
|
12
|
+
|
13
|
+
def normalized
|
14
|
+
self
|
15
|
+
end
|
16
|
+
|
17
|
+
def valid?
|
18
|
+
false
|
19
|
+
end
|
20
|
+
|
21
|
+
def <=>(other)
|
22
|
+
self.to_s <=> other.to_s
|
23
|
+
end
|
24
|
+
|
25
|
+
def to_s
|
26
|
+
""
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
data/lib/twingly/url/utilities.rb
CHANGED
@@ -1,12 +1,14 @@
|
|
1
1
|
module Twingly
|
2
|
-
|
2
|
+
class URL
|
3
3
|
module Utilities
|
4
4
|
module_function
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
6
|
+
def extract_valid_urls(text_or_array)
|
7
|
+
potential_urls = Array(text_or_array).flat_map(&:split)
|
8
|
+
potential_urls.map do |potential_url|
|
9
|
+
url = Twingly::URL.parse(potential_url)
|
10
|
+
url if url.valid?
|
11
|
+
end.compact
|
10
12
|
end
|
11
13
|
end
|
12
14
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twingly-url
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Twingly AB
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-10-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -38,20 +38,6 @@ dependencies:
|
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '1.4'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: minitest-reporters
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - "~>"
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '1'
|
48
|
-
type: :development
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - "~>"
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '1'
|
55
41
|
- !ruby/object:Gem::Dependency
|
56
42
|
name: rake
|
57
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -67,19 +53,19 @@ dependencies:
|
|
67
53
|
- !ruby/object:Gem::Version
|
68
54
|
version: '10'
|
69
55
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
56
|
+
name: rspec
|
71
57
|
requirement: !ruby/object:Gem::Requirement
|
72
58
|
requirements:
|
73
59
|
- - "~>"
|
74
60
|
- !ruby/object:Gem::Version
|
75
|
-
version: '
|
61
|
+
version: '3'
|
76
62
|
type: :development
|
77
63
|
prerelease: false
|
78
64
|
version_requirements: !ruby/object:Gem::Requirement
|
79
65
|
requirements:
|
80
66
|
- - "~>"
|
81
67
|
- !ruby/object:Gem::Version
|
82
|
-
version: '
|
68
|
+
version: '3'
|
83
69
|
- !ruby/object:Gem::Dependency
|
84
70
|
name: ruby-prof
|
85
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -102,12 +88,12 @@ extensions: []
|
|
102
88
|
extra_rdoc_files: []
|
103
89
|
files:
|
104
90
|
- README.md
|
105
|
-
- lib/twingly-url-normalizer.rb
|
106
91
|
- lib/twingly/url.rb
|
92
|
+
- lib/twingly/url/error.rb
|
107
93
|
- lib/twingly/url/hasher.rb
|
108
|
-
- lib/twingly/url/
|
94
|
+
- lib/twingly/url/null_url.rb
|
109
95
|
- lib/twingly/url/utilities.rb
|
110
|
-
- lib/version.rb
|
96
|
+
- lib/twingly/version.rb
|
111
97
|
homepage: http://github.com/twingly/twingly-url
|
112
98
|
licenses:
|
113
99
|
- MIT
|
@@ -128,7 +114,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
128
114
|
version: '0'
|
129
115
|
requirements: []
|
130
116
|
rubyforge_project:
|
131
|
-
rubygems_version: 2.4.5
|
117
|
+
rubygems_version: 2.4.5.1
|
132
118
|
signing_key:
|
133
119
|
specification_version: 4
|
134
120
|
summary: Ruby library for URL handling
|
data/lib/twingly/url/normalizer.rb
REMOVED
@@ -1,36 +0,0 @@
|
|
1
|
-
require 'twingly/url'
|
2
|
-
|
3
|
-
module Twingly
|
4
|
-
module URL
|
5
|
-
module Normalizer
|
6
|
-
module_function
|
7
|
-
|
8
|
-
def normalize(potential_urls)
|
9
|
-
extract_urls(potential_urls).map do |potential_url|
|
10
|
-
normalize_url(potential_url)
|
11
|
-
end.compact
|
12
|
-
end
|
13
|
-
|
14
|
-
def extract_urls(potential_urls)
|
15
|
-
Array(potential_urls).map(&:split).flatten
|
16
|
-
end
|
17
|
-
|
18
|
-
def normalize_url(potential_url)
|
19
|
-
result = Twingly::URL.parse(potential_url)
|
20
|
-
|
21
|
-
return nil unless result.valid?
|
22
|
-
|
23
|
-
unless result.domain.subdomain?
|
24
|
-
result.url.host = "www.#{result.domain}"
|
25
|
-
end
|
26
|
-
|
27
|
-
if result.url.path.empty?
|
28
|
-
result.url.path = "/"
|
29
|
-
end
|
30
|
-
|
31
|
-
result.url.to_s.downcase
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|