twingly-url 1.3.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e71d9c7443a57b9f00f9fa78279ea8e7ffd3c2c7
4
- data.tar.gz: 0e112dff95a6cd5270edadb8787bfbb51d173fde
3
+ metadata.gz: 2332efd0ec1df89e43ac8c6dbfe50e32111ee4e1
4
+ data.tar.gz: f9242f3818d99ef4b49d997898fd22fba951ea9d
5
5
  SHA512:
6
- metadata.gz: cea28a55da2e632b6dffa00b5a1d8717ade55aaefa9739e2b300340ebc17f05bd93da8d36c7064ab2addc60f088852ba3b802b82b46b84c4a80bfe593202e7b4
7
- data.tar.gz: 694eb5dcfaea6fb8601711986aac870d024944b58344d2c351860100f0f870745123bcec0b4773f9487ea3363fade9e759d723b581265433ab04d2672efa6e82
6
+ metadata.gz: 614aac135b4e9c8fe61dbf9f0e5c13928c5e71f7b552bf980f6ae3331a17eb6cfcbd2d6421d5fdaf49bca9fca723d4a188c1585b8e7b77987a6cd8b27d9abcbc
7
+ data.tar.gz: 9d33a5f6ebb37eb410779cd40350df8bea3f7ffad72f93e8b8015e0dca086c3a86b66110cde34c28ac05a1da08985aeee2b46eba9fd6e0b4dce6f56e5e01e7f2
data/README.md CHANGED
@@ -5,10 +5,7 @@
5
5
  Twingly URL tools.
6
6
 
7
7
  * `twingly/url` - Parse and validate URLs
8
- * `Twingly::URL.parse` - Returns a Struct with `#url` and `#domain` accessors
9
- * `Twingly::URL.validate` - Validates a URL
10
- * `twingly/url/normalizer` - Normalize URLs
11
- * `Twingly::URL::Normalizer.normalize(string)` - Extracts URLs from string (Array)
8
+ * `Twingly::URL.parse` - Returns a `Twingly::URL` instance (or a `NullURL` when the input cannot be parsed)
12
9
  * `twingly/url/hasher` - Generate URL hashes suitable for primary keys
13
10
  * `Twingly::URL::Hasher.taskdb_hash(url)` - MD5 hexdigest
14
11
  * `Twingly::URL::Hasher.blogstream_hash(url)` - MD5 hexdigest
@@ -16,27 +13,12 @@ Twingly URL tools.
16
13
  * `Twingly::URL::Hasher.autopingdb_hash(url)` - SHA256 64-bit signed, native endian digest
17
14
  * `Twingly::URL::Hasher.pingloggerdb_hash(url)` - SHA256 64-bit unsigned, native endian digest
18
15
  * `twingly/url/utilities` - Utilities to work with URLs
19
- * `Twingly::URL::Utilities.remove_scheme(url)` - Removes scheme from HTTP/HTTPS URLs (`http://twingly.com` -> `//twingly.com`)
16
+ * `Twingly::URL::Utilities.extract_valid_urls` - Returns an Array of valid `Twingly::URL` instances
20
17
 
21
18
  ## Installation
22
19
 
23
20
  gem install twingly-url
24
21
 
25
- ## Normalization example
26
-
27
- ```ruby
28
- require 'twingly/url/normalizer'
29
-
30
- Twingly::URL::Normalizer.normalize('http://duh.se')
31
- # => ["http://www.duh.se/"]
32
-
33
- Twingly::URL::Normalizer.normalize('http://duh.se http://blog.twingly.com/')
34
- # => ["http://www.duh.se/", "http://blog.twingly.com/"]
35
-
36
- Twingly::URL::Normalizer.normalize('no URL')
37
- # => []
38
- ```
39
-
40
22
  ## Tests
41
23
 
42
24
  Run tests with
data/lib/twingly/url.rb CHANGED
@@ -1,36 +1,157 @@
1
- require 'addressable/uri'
2
- require 'public_suffix'
1
+ require "addressable/uri"
2
+ require "public_suffix"
3
3
 
4
- PublicSuffix::List.private_domains = false
4
+ require_relative "url/null_url"
5
+ require_relative "url/error"
5
6
 
6
- SCHEMES = %w(http https)
7
+ PublicSuffix::List.private_domains = false
7
8
 
8
9
  module Twingly
9
- module URL
10
- module_function
10
+ class URL
11
+ include Comparable
12
+
13
+ SCHEMES = %w(http https)
14
+ ENDS_WITH_SLASH = /\/+$/
15
+
16
+ def self.parse(potential_url)
17
+ potential_url = String(potential_url)
18
+ potential_url = potential_url.scrub
19
+ potential_url = potential_url.strip
20
+
21
+ internal_parse(potential_url)
22
+ rescue Twingly::URL::Error, Twingly::URL::Error::ParseError => error
23
+ NullURL.new
24
+ end
25
+
26
+ def self.internal_parse(potential_url)
27
+ if potential_url.is_a?(Addressable::URI)
28
+ addressable_uri = potential_url
29
+ else
30
+ addressable_uri = Addressable::URI.heuristic_parse(potential_url)
31
+ end
32
+
33
+ raise Twingly::Error::ParseError if addressable_uri.nil?
34
+
35
+ public_suffix_domain = PublicSuffix.parse(addressable_uri.display_uri.host)
36
+
37
+ self.new(addressable_uri, public_suffix_domain)
38
+ rescue Addressable::URI::InvalidURIError, PublicSuffix::DomainInvalid => error
39
+ error.extend(Twingly::URL::Error)
40
+ raise
41
+ end
42
+
43
+ def initialize(addressable_uri, public_suffix_domain)
44
+ unless addressable_uri.is_a?(Addressable::URI)
45
+ raise ArgumentError, "First parameter must be an Addressable::URI"
46
+ end
47
+
48
+ unless public_suffix_domain.is_a?(PublicSuffix::Domain)
49
+ raise ArgumentError, "Second parameter must be a PublicSuffix::Domain"
50
+ end
51
+
52
+ @addressable_uri = addressable_uri
53
+ @public_suffix_domain = public_suffix_domain
54
+ end
55
+
56
+ def scheme
57
+ addressable_uri.scheme
58
+ end
59
+
60
+ def trd
61
+ public_suffix_domain.trd
62
+ end
63
+
64
+ def sld
65
+ public_suffix_domain.sld
66
+ end
67
+
68
+ def tld
69
+ public_suffix_domain.tld
70
+ end
71
+
72
+ def domain
73
+ public_suffix_domain.domain
74
+ end
11
75
 
12
- UrlObject = Struct.new(:url, :domain) do
13
- def valid?
14
- url && domain && SCHEMES.include?(url.normalized_scheme)
76
+ def host
77
+ addressable_uri.host
78
+ end
79
+
80
+ def origin
81
+ addressable_uri.origin
82
+ end
83
+
84
+ def path
85
+ addressable_uri.path
86
+ end
87
+
88
+ def without_scheme
89
+ self.to_s.sub(/\A#{scheme}:/, "")
90
+ end
91
+
92
+ def normalized
93
+ normalized_url = addressable_uri.dup
94
+
95
+ normalized_url.scheme = normalized_scheme
96
+ normalized_url.host = normalized_host
97
+ normalized_url.path = normalized_path
98
+
99
+ self.class.internal_parse(normalized_url)
100
+ end
101
+
102
+ def normalized_scheme
103
+ addressable_uri.scheme.downcase
104
+ end
105
+
106
+ def normalized_host
107
+ host = addressable_uri.normalized_host
108
+ domain = public_suffix_domain
109
+
110
+ unless domain.subdomain?
111
+ host = "www.#{host}"
15
112
  end
113
+
114
+ host = normalize_blogspot(host, domain)
115
+
116
+ host
117
+ end
118
+
119
+ def normalized_path
120
+ path = strip_trailing_slashes(addressable_uri.path)
121
+
122
+ (path.empty?) ? "/" : path
123
+ end
124
+
125
+ def valid?
126
+ addressable_uri && public_suffix_domain && SCHEMES.include?(normalized_scheme)
127
+ end
128
+
129
+ def <=>(other)
130
+ self.to_s <=> other.to_s
16
131
  end
17
132
 
18
- def parse(potential_url)
19
- url, domain = extract_url_and_domain(potential_url)
20
- UrlObject.new(url, domain)
133
+ def to_s
134
+ addressable_uri.to_s
21
135
  end
22
136
 
23
- def extract_url_and_domain(potential_url)
24
- url = Addressable::URI.heuristic_parse(potential_url)
25
- domain = PublicSuffix.parse(url.host) if url
137
+ def inspect
138
+ sprintf("#<%s:0x%x %s>", self.class.name, __id__, self.to_s)
139
+ end
26
140
 
27
- [url, domain]
28
- rescue PublicSuffix::DomainInvalid, Addressable::URI::InvalidURIError
29
- []
141
+ private
142
+
143
+ attr_reader :addressable_uri, :public_suffix_domain
144
+
145
+ def normalize_blogspot(host, domain)
146
+ if domain.sld.downcase == "blogspot"
147
+ host.sub(/\Awww\./i, "").sub(/#{domain.tld}\z/i, "com")
148
+ else
149
+ host
150
+ end
30
151
  end
31
152
 
32
- def validate(potential_url)
33
- parse(potential_url).valid?
153
+ def strip_trailing_slashes(path)
154
+ path.sub(ENDS_WITH_SLASH, "")
34
155
  end
35
156
  end
36
157
  end
@@ -0,0 +1,8 @@
1
+ module Twingly
2
+ class URL
3
+ module Error
4
+ class ParseError < StandardError
5
+ end
6
+ end
7
+ end
8
+ end
@@ -1,7 +1,7 @@
1
1
  require 'digest'
2
2
 
3
3
  module Twingly
4
- module URL
4
+ class URL
5
5
  module Hasher
6
6
  module_function
7
7
 
@@ -0,0 +1,30 @@
1
+ module Twingly
2
+ class URL
3
+ class NullURL
4
+ include Comparable
5
+
6
+ def method_missing(name, *)
7
+ error = NoMethodError.new("undefined method `#{name}'")
8
+ raise error unless Twingly::URL.instance_methods.include?(name)
9
+
10
+ ""
11
+ end
12
+
13
+ def normalized
14
+ self
15
+ end
16
+
17
+ def valid?
18
+ false
19
+ end
20
+
21
+ def <=>(other)
22
+ self.to_s <=> other.to_s
23
+ end
24
+
25
+ def to_s
26
+ ""
27
+ end
28
+ end
29
+ end
30
+ end
@@ -1,12 +1,14 @@
1
1
  module Twingly
2
- module URL
2
+ class URL
3
3
  module Utilities
4
4
  module_function
5
5
 
6
- PROTOCOL_EXPRESSION = /^https?:/i
7
-
8
- def remove_scheme(url)
9
- url.sub(PROTOCOL_EXPRESSION, '')
6
+ def extract_valid_urls(text_or_array)
7
+ potential_urls = Array(text_or_array).flat_map(&:split)
8
+ potential_urls.map do |potential_url|
9
+ url = Twingly::URL.parse(potential_url)
10
+ url if url.valid?
11
+ end.compact
10
12
  end
11
13
  end
12
14
  end
@@ -0,0 +1,5 @@
1
+ module Twingly
2
+ class URL
3
+ VERSION = "2.0.0"
4
+ end
5
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twingly-url
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.4
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Twingly AB
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-06-04 00:00:00.000000000 Z
11
+ date: 2015-10-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -38,20 +38,6 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '1.4'
41
- - !ruby/object:Gem::Dependency
42
- name: minitest-reporters
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - "~>"
46
- - !ruby/object:Gem::Version
47
- version: '1'
48
- type: :development
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - "~>"
53
- - !ruby/object:Gem::Version
54
- version: '1'
55
41
  - !ruby/object:Gem::Dependency
56
42
  name: rake
57
43
  requirement: !ruby/object:Gem::Requirement
@@ -67,19 +53,19 @@ dependencies:
67
53
  - !ruby/object:Gem::Version
68
54
  version: '10'
69
55
  - !ruby/object:Gem::Dependency
70
- name: shoulda-context
56
+ name: rspec
71
57
  requirement: !ruby/object:Gem::Requirement
72
58
  requirements:
73
59
  - - "~>"
74
60
  - !ruby/object:Gem::Version
75
- version: '1'
61
+ version: '3'
76
62
  type: :development
77
63
  prerelease: false
78
64
  version_requirements: !ruby/object:Gem::Requirement
79
65
  requirements:
80
66
  - - "~>"
81
67
  - !ruby/object:Gem::Version
82
- version: '1'
68
+ version: '3'
83
69
  - !ruby/object:Gem::Dependency
84
70
  name: ruby-prof
85
71
  requirement: !ruby/object:Gem::Requirement
@@ -102,12 +88,12 @@ extensions: []
102
88
  extra_rdoc_files: []
103
89
  files:
104
90
  - README.md
105
- - lib/twingly-url-normalizer.rb
106
91
  - lib/twingly/url.rb
92
+ - lib/twingly/url/error.rb
107
93
  - lib/twingly/url/hasher.rb
108
- - lib/twingly/url/normalizer.rb
94
+ - lib/twingly/url/null_url.rb
109
95
  - lib/twingly/url/utilities.rb
110
- - lib/version.rb
96
+ - lib/twingly/version.rb
111
97
  homepage: http://github.com/twingly/twingly-url
112
98
  licenses:
113
99
  - MIT
@@ -128,7 +114,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
128
114
  version: '0'
129
115
  requirements: []
130
116
  rubyforge_project:
131
- rubygems_version: 2.4.5
117
+ rubygems_version: 2.4.5.1
132
118
  signing_key:
133
119
  specification_version: 4
134
120
  summary: Ruby library for URL handling
@@ -1,2 +0,0 @@
1
- warn "twingly-url-normalizer will be removed, use twingly/url/normalizer"
2
- require 'twingly/url/normalizer'
@@ -1,36 +0,0 @@
1
- require 'twingly/url'
2
-
3
- module Twingly
4
- module URL
5
- module Normalizer
6
- module_function
7
-
8
- def normalize(potential_urls)
9
- extract_urls(potential_urls).map do |potential_url|
10
- normalize_url(potential_url)
11
- end.compact
12
- end
13
-
14
- def extract_urls(potential_urls)
15
- Array(potential_urls).map(&:split).flatten
16
- end
17
-
18
- def normalize_url(potential_url)
19
- result = Twingly::URL.parse(potential_url)
20
-
21
- return nil unless result.valid?
22
-
23
- unless result.domain.subdomain?
24
- result.url.host = "www.#{result.domain}"
25
- end
26
-
27
- if result.url.path.empty?
28
- result.url.path = "/"
29
- end
30
-
31
- result.url.to_s.downcase
32
- end
33
- end
34
- end
35
- end
36
-
data/lib/version.rb DELETED
@@ -1,5 +0,0 @@
1
- module Twingly
2
- module URL
3
- VERSION = '1.3.4'
4
- end
5
- end