twingly-url 1.3.4 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -20
- data/lib/twingly/url.rb +141 -20
- data/lib/twingly/url/error.rb +8 -0
- data/lib/twingly/url/hasher.rb +1 -1
- data/lib/twingly/url/null_url.rb +30 -0
- data/lib/twingly/url/utilities.rb +7 -5
- data/lib/twingly/version.rb +5 -0
- metadata +9 -23
- data/lib/twingly-url-normalizer.rb +0 -2
- data/lib/twingly/url/normalizer.rb +0 -36
- data/lib/version.rb +0 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2332efd0ec1df89e43ac8c6dbfe50e32111ee4e1
|
4
|
+
data.tar.gz: f9242f3818d99ef4b49d997898fd22fba951ea9d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 614aac135b4e9c8fe61dbf9f0e5c13928c5e71f7b552bf980f6ae3331a17eb6cfcbd2d6421d5fdaf49bca9fca723d4a188c1585b8e7b77987a6cd8b27d9abcbc
|
7
|
+
data.tar.gz: 9d33a5f6ebb37eb410779cd40350df8bea3f7ffad72f93e8b8015e0dca086c3a86b66110cde34c28ac05a1da08985aeee2b46eba9fd6e0b4dce6f56e5e01e7f2
|
data/README.md
CHANGED
@@ -5,10 +5,7 @@
|
|
5
5
|
Twingly URL tools.
|
6
6
|
|
7
7
|
* `twingly/url` - Parse and validate URLs
|
8
|
-
* `Twingly::URL.parse` - Returns
|
9
|
-
* `Twingly::URL.validate` - Validates a URL
|
10
|
-
* `twingly/url/normalizer` - Normalize URLs
|
11
|
-
* `Twingly::URL::Normalizer.normalize(string)` - Extracts URLs from string (Array)
|
8
|
+
* `Twingly::URL.parse` - Returns a `Twingly::URL` instance (or a `Twingly::URL::NullURL` when the input cannot be parsed)
|
12
9
|
* `twingly/url/hasher` - Generate URL hashes suitable for primary keys
|
13
10
|
* `Twingly::URL::Hasher.taskdb_hash(url)` - MD5 hexdigest
|
14
11
|
* `Twingly::URL::Hasher.blogstream_hash(url)` - MD5 hexdigest
|
@@ -16,27 +13,12 @@ Twingly URL tools.
|
|
16
13
|
* `Twingly::URL::Hasher.autopingdb_hash(url)` - SHA256 64-bit signed, native endian digest
|
17
14
|
* `Twingly::URL::Hasher.pingloggerdb_hash(url)` - SHA256 64-bit unsigned, native endian digest
|
18
15
|
* `twingly/url/utilities` - Utilities to work with URLs
|
19
|
-
* `Twingly::URL::Utilities.
|
16
|
+
* `Twingly::URL::Utilities.extract_valid_urls` - Returns Array of valid `Twingly::URL`
|
20
17
|
|
21
18
|
## Installation
|
22
19
|
|
23
20
|
gem install twingly-url
|
24
21
|
|
25
|
-
## Normalization example
|
26
|
-
|
27
|
-
```ruby
|
28
|
-
require 'twingly/url/normalizer'
|
29
|
-
|
30
|
-
Twingly::URL::Normalizer.normalize('http://duh.se')
|
31
|
-
# => ["http://www.duh.se/"]
|
32
|
-
|
33
|
-
Twingly::URL::Normalizer.normalize('http://duh.se http://blog.twingly.com/')
|
34
|
-
# => ["http://www.duh.se/", "http://blog.twingly.com/"]
|
35
|
-
|
36
|
-
Twingly::URL::Normalizer.normalize('no URL')
|
37
|
-
# => []
|
38
|
-
```
|
39
|
-
|
40
22
|
## Tests
|
41
23
|
|
42
24
|
Run tests with
|
data/lib/twingly/url.rb
CHANGED
@@ -1,36 +1,157 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require "addressable/uri"
|
2
|
+
require "public_suffix"
|
3
3
|
|
4
|
-
|
4
|
+
require_relative "url/null_url"
|
5
|
+
require_relative "url/error"
|
5
6
|
|
6
|
-
|
7
|
+
PublicSuffix::List.private_domains = false
|
7
8
|
|
8
9
|
module Twingly
  # A parsed URL: an Addressable::URI paired with the PublicSuffix::Domain
  # of its host. Instances are normally obtained through URL.parse, which
  # returns a NullURL instead of raising when the input cannot be parsed.
  class URL
    include Comparable

    # Schemes for which #valid? returns true.
    SCHEMES = %w(http https)
    # One or more slashes at the end of a line; stripped by #normalized_path.
    ENDS_WITH_SLASH = /\/+$/

    # Parses +potential_url+ into a Twingly::URL.
    #
    # The input is coerced with String(), scrubbed of invalid byte
    # sequences and stripped of surrounding whitespace before parsing.
    #
    # @param potential_url [#to_s] the text to parse
    # @return [Twingly::URL, Twingly::URL::NullURL] a NullURL when parsing
    #   fails with a Twingly::URL::Error
    def self.parse(potential_url)
      cleaned = String(potential_url).scrub.strip

      internal_parse(cleaned)
    rescue Twingly::URL::Error, Twingly::URL::Error::ParseError
      NullURL.new
    end

    # Builds a Twingly::URL from a String or an existing Addressable::URI.
    #
    # @param potential_url [String, Addressable::URI]
    # @return [Twingly::URL]
    # @raise [Twingly::URL::Error::ParseError] when Addressable yields nil
    # @raise [Addressable::URI::InvalidURIError, PublicSuffix::DomainInvalid]
    #   re-raised after being extended with Twingly::URL::Error so that
    #   URL.parse can rescue them through that module
    def self.internal_parse(potential_url)
      addressable_uri =
        if potential_url.is_a?(Addressable::URI)
          potential_url
        else
          Addressable::URI.heuristic_parse(potential_url)
        end

      # Must be the same constant that URL.parse rescues; a different
      # namespace here would surface as a NameError instead of a NullURL.
      raise Twingly::URL::Error::ParseError if addressable_uri.nil?

      public_suffix_domain = PublicSuffix.parse(addressable_uri.display_uri.host)

      new(addressable_uri, public_suffix_domain)
    rescue Addressable::URI::InvalidURIError, PublicSuffix::DomainInvalid => error
      error.extend(Twingly::URL::Error)
      raise
    end

    # @param addressable_uri [Addressable::URI]
    # @param public_suffix_domain [PublicSuffix::Domain]
    # @raise [ArgumentError] when either argument has the wrong type
    def initialize(addressable_uri, public_suffix_domain)
      unless addressable_uri.is_a?(Addressable::URI)
        raise ArgumentError, "First parameter must be an Addressable::URI"
      end

      unless public_suffix_domain.is_a?(PublicSuffix::Domain)
        raise ArgumentError, "Second parameter must be a PublicSuffix::Domain"
      end

      @addressable_uri      = addressable_uri
      @public_suffix_domain = public_suffix_domain
    end

    # @return the scheme reported by the underlying Addressable::URI
    def scheme
      addressable_uri.scheme
    end

    # @return the +trd+ of the underlying PublicSuffix::Domain
    def trd
      public_suffix_domain.trd
    end

    # @return the +sld+ of the underlying PublicSuffix::Domain
    def sld
      public_suffix_domain.sld
    end

    # @return the +tld+ of the underlying PublicSuffix::Domain
    def tld
      public_suffix_domain.tld
    end

    # @return the +domain+ of the underlying PublicSuffix::Domain
    def domain
      public_suffix_domain.domain
    end

    # @return the host reported by the underlying Addressable::URI
    def host
      addressable_uri.host
    end

    # @return the origin reported by the underlying Addressable::URI
    def origin
      addressable_uri.origin
    end

    # @return the path reported by the underlying Addressable::URI
    def path
      addressable_uri.path
    end

    # Returns the URL string with its leading "<scheme>:" removed.
    #
    # The scheme is escaped before being placed in the pattern so that
    # schemes containing regex metacharacters (e.g. "svn+ssh") are matched
    # literally.
    #
    # @return [String]
    def without_scheme
      to_s.sub(/\A#{Regexp.escape(scheme.to_s)}:/, "")
    end

    # Returns a new Twingly::URL built from a copy of the underlying URI
    # with the normalized scheme, host and path applied.
    #
    # @return [Twingly::URL]
    def normalized
      normalized_url = addressable_uri.dup

      normalized_url.scheme = normalized_scheme
      normalized_url.host   = normalized_host
      normalized_url.path   = normalized_path

      self.class.internal_parse(normalized_url)
    end

    # @return [String] the scheme, downcased
    def normalized_scheme
      addressable_uri.scheme.downcase
    end

    # Returns Addressable's normalized host, prefixed with "www." when the
    # public suffix domain has no subdomain, and then passed through the
    # blogspot rule (see #normalize_blogspot).
    #
    # @return [String]
    def normalized_host
      host   = addressable_uri.normalized_host
      domain = public_suffix_domain

      host = "www.#{host}" unless domain.subdomain?

      normalize_blogspot(host, domain)
    end

    # Returns the path without trailing slashes, or "/" when nothing is left.
    #
    # @return [String]
    def normalized_path
      path = strip_trailing_slashes(addressable_uri.path)

      path.empty? ? "/" : path
    end

    # True when the normalized scheme is one of SCHEMES.
    def valid?
      addressable_uri && public_suffix_domain && SCHEMES.include?(normalized_scheme)
    end

    # Orders URLs by their string representation.
    def <=>(other)
      to_s <=> other.to_s
    end

    # @return [String] the string form of the underlying Addressable::URI
    def to_s
      addressable_uri.to_s
    end

    def inspect
      sprintf("#<%s:0x%x %s>", self.class.name, __id__, to_s)
    end

    private

    attr_reader :addressable_uri, :public_suffix_domain

    # For hosts whose second-level domain is "blogspot": drops a leading
    # "www." and replaces the trailing TLD with "com". Other hosts are
    # returned unchanged. The TLD is escaped so that dots in multi-label
    # suffixes are matched literally.
    def normalize_blogspot(host, domain)
      return host unless domain.sld.downcase == "blogspot"

      host.sub(/\Awww\./i, "").sub(/#{Regexp.escape(domain.tld)}\z/i, "com")
    end

    def strip_trailing_slashes(path)
      path.sub(ENDS_WITH_SLASH, "")
    end
  end
end
|
data/lib/twingly/url/hasher.rb
CHANGED
@@ -0,0 +1,30 @@
|
|
1
|
+
module Twingly
  class URL
    # Null object returned by Twingly::URL.parse when the input cannot be
    # parsed. It answers every public instance method of Twingly::URL with
    # an empty String, is never valid, and normalizes to itself.
    class NullURL
      include Comparable

      # Returns "" for any method that Twingly::URL instances respond to;
      # anything else goes to the default handler and raises NoMethodError.
      def method_missing(name, *)
        return "" if Twingly::URL.instance_methods.include?(name)

        super
      end

      # Keeps respond_to? in agreement with method_missing.
      def respond_to_missing?(name, include_private = false)
        Twingly::URL.instance_methods.include?(name) || super
      end

      # @return [NullURL] self; there is nothing to normalize
      def normalized
        self
      end

      # @return [false] always
      def valid?
        false
      end

      # Orders by string representation, mirroring Twingly::URL#<=>.
      def <=>(other)
        to_s <=> other.to_s
      end

      # @return [String] always the empty string
      def to_s
        ""
      end
    end
  end
end
|
@@ -1,12 +1,14 @@
|
|
1
1
|
module Twingly
  class URL
    # Stateless helpers for working with URLs.
    module Utilities
      module_function

      # Splits the input on whitespace, parses every piece with
      # Twingly::URL.parse and keeps only the results that are valid.
      #
      # Each element is coerced with String() before splitting, matching
      # the coercion Twingly::URL.parse applies, so nil and other
      # non-String elements are tolerated instead of raising.
      #
      # @param text_or_array [String, Array, nil] text containing
      #   whitespace-separated URLs, or a collection of such texts
      # @return [Array] the parsed URLs for which valid? is true
      def extract_valid_urls(text_or_array)
        candidates = Array(text_or_array).flat_map { |text| String(text).split }

        candidates
          .map { |candidate| Twingly::URL.parse(candidate) }
          .select(&:valid?)
      end
    end
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: twingly-url
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Twingly AB
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-10-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -38,20 +38,6 @@ dependencies:
|
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '1.4'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: minitest-reporters
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - "~>"
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '1'
|
48
|
-
type: :development
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - "~>"
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '1'
|
55
41
|
- !ruby/object:Gem::Dependency
|
56
42
|
name: rake
|
57
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -67,19 +53,19 @@ dependencies:
|
|
67
53
|
- !ruby/object:Gem::Version
|
68
54
|
version: '10'
|
69
55
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
56
|
+
name: rspec
|
71
57
|
requirement: !ruby/object:Gem::Requirement
|
72
58
|
requirements:
|
73
59
|
- - "~>"
|
74
60
|
- !ruby/object:Gem::Version
|
75
|
-
version: '
|
61
|
+
version: '3'
|
76
62
|
type: :development
|
77
63
|
prerelease: false
|
78
64
|
version_requirements: !ruby/object:Gem::Requirement
|
79
65
|
requirements:
|
80
66
|
- - "~>"
|
81
67
|
- !ruby/object:Gem::Version
|
82
|
-
version: '
|
68
|
+
version: '3'
|
83
69
|
- !ruby/object:Gem::Dependency
|
84
70
|
name: ruby-prof
|
85
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -102,12 +88,12 @@ extensions: []
|
|
102
88
|
extra_rdoc_files: []
|
103
89
|
files:
|
104
90
|
- README.md
|
105
|
-
- lib/twingly-url-normalizer.rb
|
106
91
|
- lib/twingly/url.rb
|
92
|
+
- lib/twingly/url/error.rb
|
107
93
|
- lib/twingly/url/hasher.rb
|
108
|
-
- lib/twingly/url/
|
94
|
+
- lib/twingly/url/null_url.rb
|
109
95
|
- lib/twingly/url/utilities.rb
|
110
|
-
- lib/version.rb
|
96
|
+
- lib/twingly/version.rb
|
111
97
|
homepage: http://github.com/twingly/twingly-url
|
112
98
|
licenses:
|
113
99
|
- MIT
|
@@ -128,7 +114,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
128
114
|
version: '0'
|
129
115
|
requirements: []
|
130
116
|
rubyforge_project:
|
131
|
-
rubygems_version: 2.4.5
|
117
|
+
rubygems_version: 2.4.5.1
|
132
118
|
signing_key:
|
133
119
|
specification_version: 4
|
134
120
|
summary: Ruby library for URL handling
|
@@ -1,36 +0,0 @@
|
|
1
|
-
require 'twingly/url'
|
2
|
-
|
3
|
-
module Twingly
|
4
|
-
module URL
|
5
|
-
module Normalizer
|
6
|
-
module_function
|
7
|
-
|
8
|
-
def normalize(potential_urls)
|
9
|
-
extract_urls(potential_urls).map do |potential_url|
|
10
|
-
normalize_url(potential_url)
|
11
|
-
end.compact
|
12
|
-
end
|
13
|
-
|
14
|
-
def extract_urls(potential_urls)
|
15
|
-
Array(potential_urls).map(&:split).flatten
|
16
|
-
end
|
17
|
-
|
18
|
-
def normalize_url(potential_url)
|
19
|
-
result = Twingly::URL.parse(potential_url)
|
20
|
-
|
21
|
-
return nil unless result.valid?
|
22
|
-
|
23
|
-
unless result.domain.subdomain?
|
24
|
-
result.url.host = "www.#{result.domain}"
|
25
|
-
end
|
26
|
-
|
27
|
-
if result.url.path.empty?
|
28
|
-
result.url.path = "/"
|
29
|
-
end
|
30
|
-
|
31
|
-
result.url.to_s.downcase
|
32
|
-
end
|
33
|
-
end
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|