twingly-url 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +47 -0
- data/lib/twingly-url-normalizer.rb +2 -0
- data/lib/twingly/url.rb +36 -0
- data/lib/twingly/url/hasher.rb +30 -0
- data/lib/twingly/url/normalizer.rb +36 -0
- data/lib/version.rb +5 -0
- metadata +135 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 6748ed6715a818455973b7330c774c4d4e66776a
|
4
|
+
data.tar.gz: bb17c4b3eb99af036257f3668d7048ccb94f3466
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 55194c78c7a3af312d70e88730586b806868b2b9184853bca25cbdcc99e29622113c2fe333512d9f785ed4adefb8c89fe5c4472f3eb6d1e41d28905fcb020257
|
7
|
+
data.tar.gz: 8b9f59dfb2566c42393edc23535d5487e9bda3e0e1342c717dc4ba4a9b74e17f327b0324ee767608af1e54bdb3cc301416f153b705c8c1a253403759eb629914
|
data/README.md
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
# Twingly::URL
|
2
|
+
|
3
|
+
[Build status](https://travis-ci.org/twingly/twingly-url)
|
4
|
+
|
5
|
+
Twingly URL tools.
|
6
|
+
|
7
|
+
* `twingly/url` - Parse and validate URLs
|
8
|
+
* `Twingly::URL.parse` - Returns a Struct with `#url` and `#domain` accessors
|
9
|
+
* `Twingly::URL.validate` - Checks that a URL parses, has a valid domain, and uses the `http` or `https` scheme
|
10
|
+
* `twingly/url/normalizer` - Normalize URLs
|
11
|
+
* `Twingly::URL::Normalizer.normalize(string)` - Splits the string on whitespace and returns an Array of the valid URLs, normalized
|
12
|
+
* `twingly/url/hasher` - Generate URL hashes suitable for primary keys
|
13
|
+
* `Twingly::URL::Hasher.documentdb_hash(url)` - SHA256 unsigned long, native endian digest
|
14
|
+
* `Twingly::URL::Hasher.blogstream_hash(url)` - MD5 hexdigest (first 30 characters, upper-cased)
|
15
|
+
* `Twingly::URL::Hasher.autopingdb_hash(url)` - SHA256 64-bit signed, native endian digest
|
16
|
+
* `Twingly::URL::Hasher.pingloggerdb_hash(url)` - SHA256 64-bit unsigned, native endian digest
|
17
|
+
|
18
|
+
## Normalization example
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
require 'twingly/url/normalizer'
|
22
|
+
|
23
|
+
Twingly::URL::Normalizer.normalize('http://duh.se')
|
24
|
+
# => ["http://www.duh.se/"]
|
25
|
+
|
26
|
+
Twingly::URL::Normalizer.normalize('http://duh.se http://blog.twingly.com/')
|
27
|
+
# => ["http://www.duh.se/", "http://blog.twingly.com/"]
|
28
|
+
|
29
|
+
Twingly::URL::Normalizer.normalize('no URL')
|
30
|
+
# => []
|
31
|
+
```
|
32
|
+
|
33
|
+
## Tests
|
34
|
+
|
35
|
+
Run tests with
|
36
|
+
|
37
|
+
bundle exec rake
|
38
|
+
|
39
|
+
### Profiling
|
40
|
+
|
41
|
+
You can get some profiling by running
|
42
|
+
|
43
|
+
bundle exec rake test:profile
|
44
|
+
|
45
|
+
Note that this isn't a benchmark; we're using [ruby-prof], which will slow things down.
|
46
|
+
|
47
|
+
[ruby-prof]: http://ruby-prof.rubyforge.org/
|
data/lib/twingly/url.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'addressable/uri'
|
2
|
+
require 'public_suffix'
|
3
|
+
|
4
|
+
PublicSuffix::List.private_domains = false
|
5
|
+
|
6
|
+
SCHEMES = %w(http https)
|
7
|
+
|
8
|
+
module Twingly
|
9
|
+
module URL
|
10
|
+
module_function
|
11
|
+
|
12
|
+
UrlObject = Struct.new(:url, :domain) do
|
13
|
+
def valid?
|
14
|
+
url && domain && SCHEMES.include?(url.normalized_scheme)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def parse(potential_url)
|
19
|
+
url, domain = extract_url_and_domain(potential_url)
|
20
|
+
UrlObject.new(url, domain)
|
21
|
+
end
|
22
|
+
|
23
|
+
def extract_url_and_domain(potential_url)
|
24
|
+
url = Addressable::URI.heuristic_parse(potential_url)
|
25
|
+
domain = PublicSuffix.parse(url.host) if url
|
26
|
+
|
27
|
+
[url, domain]
|
28
|
+
rescue PublicSuffix::DomainInvalid, Addressable::URI::InvalidURIError
|
29
|
+
[]
|
30
|
+
end
|
31
|
+
|
32
|
+
def validate(potential_url)
|
33
|
+
parse(potential_url).valid?
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'digest/md5'
|
2
|
+
require 'digest/sha2'
|
3
|
+
|
4
|
+
module Twingly
|
5
|
+
module URL
|
6
|
+
module Hasher
|
7
|
+
module_function
|
8
|
+
|
9
|
+
def taskdb_hash(url)
|
10
|
+
Digest::MD5.hexdigest(url)[0..29].upcase
|
11
|
+
end
|
12
|
+
|
13
|
+
def blogstream_hash(url)
|
14
|
+
Digest::MD5.hexdigest(url)[0..29].upcase
|
15
|
+
end
|
16
|
+
|
17
|
+
def documentdb_hash(url)
|
18
|
+
Digest::SHA256.digest(url).unpack("L!")[0]
|
19
|
+
end
|
20
|
+
|
21
|
+
def autopingdb_hash(url)
|
22
|
+
Digest::SHA256.digest(url).unpack("q")[0]
|
23
|
+
end
|
24
|
+
|
25
|
+
def pingloggerdb_hash(url)
|
26
|
+
Digest::SHA256.digest(url).unpack("Q")[0]
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'twingly/url'
|
2
|
+
|
3
|
+
module Twingly
|
4
|
+
module URL
|
5
|
+
module Normalizer
|
6
|
+
module_function
|
7
|
+
|
8
|
+
def normalize(potential_urls)
|
9
|
+
extract_urls(potential_urls).map do |potential_url|
|
10
|
+
normalize_url(potential_url)
|
11
|
+
end.compact
|
12
|
+
end
|
13
|
+
|
14
|
+
def extract_urls(potential_urls)
|
15
|
+
Array(potential_urls).map(&:split).flatten
|
16
|
+
end
|
17
|
+
|
18
|
+
def normalize_url(potential_url)
|
19
|
+
result = Twingly::URL.parse(potential_url)
|
20
|
+
|
21
|
+
return nil unless result.valid?
|
22
|
+
|
23
|
+
unless result.domain.subdomain?
|
24
|
+
result.url.host = "www.#{result.domain}"
|
25
|
+
end
|
26
|
+
|
27
|
+
if result.url.path.empty?
|
28
|
+
result.url.path = "/"
|
29
|
+
end
|
30
|
+
|
31
|
+
result.url.to_s
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
data/lib/version.rb
ADDED
metadata
ADDED
@@ -0,0 +1,135 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: twingly-url
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.3.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Johan Eckerström
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-11-14 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: addressable
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: public_suffix
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.4'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.4'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: turn
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '10'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '10'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: shoulda-context
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: ruby-prof
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
description: Twingly URL tools
|
98
|
+
email:
|
99
|
+
- johan.eckerstrom@twingly.com
|
100
|
+
executables: []
|
101
|
+
extensions: []
|
102
|
+
extra_rdoc_files: []
|
103
|
+
files:
|
104
|
+
- README.md
|
105
|
+
- lib/twingly-url-normalizer.rb
|
106
|
+
- lib/twingly/url.rb
|
107
|
+
- lib/twingly/url/hasher.rb
|
108
|
+
- lib/twingly/url/normalizer.rb
|
109
|
+
- lib/version.rb
|
110
|
+
homepage: http://github.com/twingly/twingly-url
|
111
|
+
licenses:
|
112
|
+
- MIT
|
113
|
+
metadata: {}
|
114
|
+
post_install_message:
|
115
|
+
rdoc_options: []
|
116
|
+
require_paths:
|
117
|
+
- lib
|
118
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
119
|
+
requirements:
|
120
|
+
- - ">="
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: 1.9.3
|
123
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
124
|
+
requirements:
|
125
|
+
- - ">="
|
126
|
+
- !ruby/object:Gem::Version
|
127
|
+
version: '0'
|
128
|
+
requirements: []
|
129
|
+
rubyforge_project:
|
130
|
+
rubygems_version: 2.2.2
|
131
|
+
signing_key:
|
132
|
+
specification_version: 4
|
133
|
+
summary: Ruby library for URL handling
|
134
|
+
test_files: []
|
135
|
+
has_rdoc:
|