twingly-url 1.3.4 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e71d9c7443a57b9f00f9fa78279ea8e7ffd3c2c7
4
- data.tar.gz: 0e112dff95a6cd5270edadb8787bfbb51d173fde
3
+ metadata.gz: 2332efd0ec1df89e43ac8c6dbfe50e32111ee4e1
4
+ data.tar.gz: f9242f3818d99ef4b49d997898fd22fba951ea9d
5
5
  SHA512:
6
- metadata.gz: cea28a55da2e632b6dffa00b5a1d8717ade55aaefa9739e2b300340ebc17f05bd93da8d36c7064ab2addc60f088852ba3b802b82b46b84c4a80bfe593202e7b4
7
- data.tar.gz: 694eb5dcfaea6fb8601711986aac870d024944b58344d2c351860100f0f870745123bcec0b4773f9487ea3363fade9e759d723b581265433ab04d2672efa6e82
6
+ metadata.gz: 614aac135b4e9c8fe61dbf9f0e5c13928c5e71f7b552bf980f6ae3331a17eb6cfcbd2d6421d5fdaf49bca9fca723d4a188c1585b8e7b77987a6cd8b27d9abcbc
7
+ data.tar.gz: 9d33a5f6ebb37eb410779cd40350df8bea3f7ffad72f93e8b8015e0dca086c3a86b66110cde34c28ac05a1da08985aeee2b46eba9fd6e0b4dce6f56e5e01e7f2
data/README.md CHANGED
@@ -5,10 +5,7 @@
5
5
  Twingly URL tools.
6
6
 
7
7
  * `twingly/url` - Parse and validate URLs
8
- * `Twingly::URL.parse` - Returns a Struct with `#url` and `#domain` accessors
9
- * `Twingly::URL.validate` - Validates a URL
10
- * `twingly/url/normalizer` - Normalize URLs
11
- * `Twingly::URL::Normalizer.normalize(string)` - Extracts URLs from string (Array)
8
+ * `Twingly::URL.parse` - Returns a `Twingly::URL` instance (a `Twingly::URL::NullURL` if the input cannot be parsed)
12
9
  * `twingly/url/hasher` - Generate URL hashes suitable for primary keys
13
10
  * `Twingly::URL::Hasher.taskdb_hash(url)` - MD5 hexdigest
14
11
  * `Twingly::URL::Hasher.blogstream_hash(url)` - MD5 hexdigest
@@ -16,27 +13,12 @@ Twingly URL tools.
16
13
  * `Twingly::URL::Hasher.autopingdb_hash(url)` - SHA256 64-bit signed, native endian digest
17
14
  * `Twingly::URL::Hasher.pingloggerdb_hash(url)` - SHA256 64-bit unsigned, native endian digest
18
15
  * `twingly/url/utilities` - Utilities to work with URLs
19
- * `Twingly::URL::Utilities.remove_scheme(url)` - Removes scheme from HTTP/HTTPS URLs (`http://twingly.com` -> `//twingly.com`)
16
+ * `Twingly::URL::Utilities.extract_valid_urls` - Returns an Array of valid `Twingly::URL` instances
20
17
 
21
18
  ## Installation
22
19
 
23
20
  gem install twingly-url
24
21
 
25
- ## Normalization example
26
-
27
- ```ruby
28
- require 'twingly/url/normalizer'
29
-
30
- Twingly::URL::Normalizer.normalize('http://duh.se')
31
- # => ["http://www.duh.se/"]
32
-
33
- Twingly::URL::Normalizer.normalize('http://duh.se http://blog.twingly.com/')
34
- # => ["http://www.duh.se/", "http://blog.twingly.com/"]
35
-
36
- Twingly::URL::Normalizer.normalize('no URL')
37
- # => []
38
- ```
39
-
40
22
  ## Tests
41
23
 
42
24
  Run tests with
data/lib/twingly/url.rb CHANGED
@@ -1,36 +1,157 @@
1
- require 'addressable/uri'
2
- require 'public_suffix'
1
+ require "addressable/uri"
2
+ require "public_suffix"
3
3
 
4
- PublicSuffix::List.private_domains = false
4
+ require_relative "url/null_url"
5
+ require_relative "url/error"
5
6
 
6
- SCHEMES = %w(http https)
7
+ PublicSuffix::List.private_domains = false
7
8
 
8
9
  module Twingly
9
- module URL
10
- module_function
10
+ class URL
11
+ include Comparable
12
+
13
+ SCHEMES = %w(http https)
14
+ ENDS_WITH_SLASH = /\/+$/
15
+
16
+ def self.parse(potential_url)
17
+ potential_url = String(potential_url)
18
+ potential_url = potential_url.scrub
19
+ potential_url = potential_url.strip
20
+
21
+ internal_parse(potential_url)
22
+ rescue Twingly::URL::Error, Twingly::URL::Error::ParseError => error
23
+ NullURL.new
24
+ end
25
+
26
+ def self.internal_parse(potential_url)
27
+ if potential_url.is_a?(Addressable::URI)
28
+ addressable_uri = potential_url
29
+ else
30
+ addressable_uri = Addressable::URI.heuristic_parse(potential_url)
31
+ end
32
+
33
+ raise Twingly::Error::ParseError if addressable_uri.nil?
34
+
35
+ public_suffix_domain = PublicSuffix.parse(addressable_uri.display_uri.host)
36
+
37
+ self.new(addressable_uri, public_suffix_domain)
38
+ rescue Addressable::URI::InvalidURIError, PublicSuffix::DomainInvalid => error
39
+ error.extend(Twingly::URL::Error)
40
+ raise
41
+ end
42
+
43
+ def initialize(addressable_uri, public_suffix_domain)
44
+ unless addressable_uri.is_a?(Addressable::URI)
45
+ raise ArgumentError, "First parameter must be an Addressable::URI"
46
+ end
47
+
48
+ unless public_suffix_domain.is_a?(PublicSuffix::Domain)
49
+ raise ArgumentError, "Second parameter must be a PublicSuffix::Domain"
50
+ end
51
+
52
+ @addressable_uri = addressable_uri
53
+ @public_suffix_domain = public_suffix_domain
54
+ end
55
+
56
+ def scheme
57
+ addressable_uri.scheme
58
+ end
59
+
60
+ def trd
61
+ public_suffix_domain.trd
62
+ end
63
+
64
+ def sld
65
+ public_suffix_domain.sld
66
+ end
67
+
68
+ def tld
69
+ public_suffix_domain.tld
70
+ end
71
+
72
+ def domain
73
+ public_suffix_domain.domain
74
+ end
11
75
 
12
- UrlObject = Struct.new(:url, :domain) do
13
- def valid?
14
- url && domain && SCHEMES.include?(url.normalized_scheme)
76
+ def host
77
+ addressable_uri.host
78
+ end
79
+
80
+ def origin
81
+ addressable_uri.origin
82
+ end
83
+
84
+ def path
85
+ addressable_uri.path
86
+ end
87
+
88
+ def without_scheme
89
+ self.to_s.sub(/\A#{scheme}:/, "")
90
+ end
91
+
92
+ def normalized
93
+ normalized_url = addressable_uri.dup
94
+
95
+ normalized_url.scheme = normalized_scheme
96
+ normalized_url.host = normalized_host
97
+ normalized_url.path = normalized_path
98
+
99
+ self.class.internal_parse(normalized_url)
100
+ end
101
+
102
+ def normalized_scheme
103
+ addressable_uri.scheme.downcase
104
+ end
105
+
106
+ def normalized_host
107
+ host = addressable_uri.normalized_host
108
+ domain = public_suffix_domain
109
+
110
+ unless domain.subdomain?
111
+ host = "www.#{host}"
15
112
  end
113
+
114
+ host = normalize_blogspot(host, domain)
115
+
116
+ host
117
+ end
118
+
119
+ def normalized_path
120
+ path = strip_trailing_slashes(addressable_uri.path)
121
+
122
+ (path.empty?) ? "/" : path
123
+ end
124
+
125
+ def valid?
126
+ addressable_uri && public_suffix_domain && SCHEMES.include?(normalized_scheme)
127
+ end
128
+
129
+ def <=>(other)
130
+ self.to_s <=> other.to_s
16
131
  end
17
132
 
18
- def parse(potential_url)
19
- url, domain = extract_url_and_domain(potential_url)
20
- UrlObject.new(url, domain)
133
+ def to_s
134
+ addressable_uri.to_s
21
135
  end
22
136
 
23
- def extract_url_and_domain(potential_url)
24
- url = Addressable::URI.heuristic_parse(potential_url)
25
- domain = PublicSuffix.parse(url.host) if url
137
+ def inspect
138
+ sprintf("#<%s:0x%x %s>", self.class.name, __id__, self.to_s)
139
+ end
26
140
 
27
- [url, domain]
28
- rescue PublicSuffix::DomainInvalid, Addressable::URI::InvalidURIError
29
- []
141
+ private
142
+
143
+ attr_reader :addressable_uri, :public_suffix_domain
144
+
145
+ def normalize_blogspot(host, domain)
146
+ if domain.sld.downcase == "blogspot"
147
+ host.sub(/\Awww\./i, "").sub(/#{domain.tld}\z/i, "com")
148
+ else
149
+ host
150
+ end
30
151
  end
31
152
 
32
- def validate(potential_url)
33
- parse(potential_url).valid?
153
+ def strip_trailing_slashes(path)
154
+ path.sub(ENDS_WITH_SLASH, "")
34
155
  end
35
156
  end
36
157
  end
@@ -0,0 +1,8 @@
1
+ module Twingly
2
+ class URL
3
+ module Error
4
+ class ParseError < StandardError
5
+ end
6
+ end
7
+ end
8
+ end
@@ -1,7 +1,7 @@
1
1
  require 'digest'
2
2
 
3
3
  module Twingly
4
- module URL
4
+ class URL
5
5
  module Hasher
6
6
  module_function
7
7
 
@@ -0,0 +1,30 @@
1
+ module Twingly
2
+ class URL
3
+ class NullURL
4
+ include Comparable
5
+
6
+ def method_missing(name, *)
7
+ error = NoMethodError.new("undefined method `#{name}'")
8
+ raise error unless Twingly::URL.instance_methods.include?(name)
9
+
10
+ ""
11
+ end
12
+
13
+ def normalized
14
+ self
15
+ end
16
+
17
+ def valid?
18
+ false
19
+ end
20
+
21
+ def <=>(other)
22
+ self.to_s <=> other.to_s
23
+ end
24
+
25
+ def to_s
26
+ ""
27
+ end
28
+ end
29
+ end
30
+ end
@@ -1,12 +1,14 @@
1
1
  module Twingly
2
- module URL
2
+ class URL
3
3
  module Utilities
4
4
  module_function
5
5
 
6
- PROTOCOL_EXPRESSION = /^https?:/i
7
-
8
- def remove_scheme(url)
9
- url.sub(PROTOCOL_EXPRESSION, '')
6
+ def extract_valid_urls(text_or_array)
7
+ potential_urls = Array(text_or_array).flat_map(&:split)
8
+ potential_urls.map do |potential_url|
9
+ url = Twingly::URL.parse(potential_url)
10
+ url if url.valid?
11
+ end.compact
10
12
  end
11
13
  end
12
14
  end
@@ -0,0 +1,5 @@
1
+ module Twingly
2
+ class URL
3
+ VERSION = "2.0.0"
4
+ end
5
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: twingly-url
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.4
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Twingly AB
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-06-04 00:00:00.000000000 Z
11
+ date: 2015-10-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -38,20 +38,6 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '1.4'
41
- - !ruby/object:Gem::Dependency
42
- name: minitest-reporters
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - "~>"
46
- - !ruby/object:Gem::Version
47
- version: '1'
48
- type: :development
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - "~>"
53
- - !ruby/object:Gem::Version
54
- version: '1'
55
41
  - !ruby/object:Gem::Dependency
56
42
  name: rake
57
43
  requirement: !ruby/object:Gem::Requirement
@@ -67,19 +53,19 @@ dependencies:
67
53
  - !ruby/object:Gem::Version
68
54
  version: '10'
69
55
  - !ruby/object:Gem::Dependency
70
- name: shoulda-context
56
+ name: rspec
71
57
  requirement: !ruby/object:Gem::Requirement
72
58
  requirements:
73
59
  - - "~>"
74
60
  - !ruby/object:Gem::Version
75
- version: '1'
61
+ version: '3'
76
62
  type: :development
77
63
  prerelease: false
78
64
  version_requirements: !ruby/object:Gem::Requirement
79
65
  requirements:
80
66
  - - "~>"
81
67
  - !ruby/object:Gem::Version
82
- version: '1'
68
+ version: '3'
83
69
  - !ruby/object:Gem::Dependency
84
70
  name: ruby-prof
85
71
  requirement: !ruby/object:Gem::Requirement
@@ -102,12 +88,12 @@ extensions: []
102
88
  extra_rdoc_files: []
103
89
  files:
104
90
  - README.md
105
- - lib/twingly-url-normalizer.rb
106
91
  - lib/twingly/url.rb
92
+ - lib/twingly/url/error.rb
107
93
  - lib/twingly/url/hasher.rb
108
- - lib/twingly/url/normalizer.rb
94
+ - lib/twingly/url/null_url.rb
109
95
  - lib/twingly/url/utilities.rb
110
- - lib/version.rb
96
+ - lib/twingly/version.rb
111
97
  homepage: http://github.com/twingly/twingly-url
112
98
  licenses:
113
99
  - MIT
@@ -128,7 +114,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
128
114
  version: '0'
129
115
  requirements: []
130
116
  rubyforge_project:
131
- rubygems_version: 2.4.5
117
+ rubygems_version: 2.4.5.1
132
118
  signing_key:
133
119
  specification_version: 4
134
120
  summary: Ruby library for URL handling
@@ -1,2 +0,0 @@
1
- warn "twingly-url-normalizer will be removed, use twingly/url/normalizer"
2
- require 'twingly/url/normalizer'
@@ -1,36 +0,0 @@
1
- require 'twingly/url'
2
-
3
- module Twingly
4
- module URL
5
- module Normalizer
6
- module_function
7
-
8
- def normalize(potential_urls)
9
- extract_urls(potential_urls).map do |potential_url|
10
- normalize_url(potential_url)
11
- end.compact
12
- end
13
-
14
- def extract_urls(potential_urls)
15
- Array(potential_urls).map(&:split).flatten
16
- end
17
-
18
- def normalize_url(potential_url)
19
- result = Twingly::URL.parse(potential_url)
20
-
21
- return nil unless result.valid?
22
-
23
- unless result.domain.subdomain?
24
- result.url.host = "www.#{result.domain}"
25
- end
26
-
27
- if result.url.path.empty?
28
- result.url.path = "/"
29
- end
30
-
31
- result.url.to_s.downcase
32
- end
33
- end
34
- end
35
- end
36
-
data/lib/version.rb DELETED
@@ -1,5 +0,0 @@
1
- module Twingly
2
- module URL
3
- VERSION = '1.3.4'
4
- end
5
- end