twingly-url 1.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +47 -0
- data/lib/twingly-url-normalizer.rb +2 -0
- data/lib/twingly/url.rb +36 -0
- data/lib/twingly/url/hasher.rb +30 -0
- data/lib/twingly/url/normalizer.rb +36 -0
- data/lib/version.rb +5 -0
- metadata +135 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 6748ed6715a818455973b7330c774c4d4e66776a
|
4
|
+
data.tar.gz: bb17c4b3eb99af036257f3668d7048ccb94f3466
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 55194c78c7a3af312d70e88730586b806868b2b9184853bca25cbdcc99e29622113c2fe333512d9f785ed4adefb8c89fe5c4472f3eb6d1e41d28905fcb020257
|
7
|
+
data.tar.gz: 8b9f59dfb2566c42393edc23535d5487e9bda3e0e1342c717dc4ba4a9b74e17f327b0324ee767608af1e54bdb3cc301416f153b705c8c1a253403759eb629914
|
data/README.md
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
# Twingly::URL
|
2
|
+
|
3
|
+
[![Build Status](https://travis-ci.org/twingly/twingly-url.svg?branch=master)](https://travis-ci.org/twingly/twingly-url)
|
4
|
+
|
5
|
+
Twingly URL tools.
|
6
|
+
|
7
|
+
* `twingly/url` - Parse and validate URLs
|
8
|
+
* `Twingly::URL.parse` - Returns a Struct with `#url` and `#domain` accessors
|
9
|
+
* `Twingly::URL.validate` - Validates a URL
|
10
|
+
* `twingly/url/normalizer` - Normalize URLs
|
11
|
+
* `Twingly::URL::Normalizer.normalize(string)` - Extracts the URLs in a string and returns them normalized (Array); invalid URLs are dropped
|
12
|
+
* `twingly/url/hasher` - Generate URL hashes suitable for primary keys
|
13
|
+
* `Twingly::URL::Hasher.documentdb_hash(url)` - SHA256 unsigned long, native endian digest
|
14
|
+
* `Twingly::URL::Hasher.blogstream_hash(url)` - MD5 hexdigest (first 30 characters, upper-cased)
|
15
|
+
* `Twingly::URL::Hasher.autopingdb_hash(url)` - SHA256 64-bit signed, native endian digest
|
16
|
+
* `Twingly::URL::Hasher.pingloggerdb_hash(url)` - SHA256 64-bit unsigned, native endian digest
|
17
|
+
|
18
|
+
## Normalization example
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
require 'twingly/url/normalizer'
|
22
|
+
|
23
|
+
Twingly::URL::Normalizer.normalize('http://duh.se')
|
24
|
+
# => ["http://www.duh.se/"]
|
25
|
+
|
26
|
+
Twingly::URL::Normalizer.normalize('http://duh.se http://blog.twingly.com/')
|
27
|
+
# => ["http://www.duh.se/", "http://blog.twingly.com/"]
|
28
|
+
|
29
|
+
Twingly::URL::Normalizer.normalize('no URL')
|
30
|
+
# => []
|
31
|
+
```
|
32
|
+
|
33
|
+
## Tests
|
34
|
+
|
35
|
+
Run tests with
|
36
|
+
|
37
|
+
bundle exec rake
|
38
|
+
|
39
|
+
### Profiling
|
40
|
+
|
41
|
+
You can get some profiling by running
|
42
|
+
|
43
|
+
bundle exec rake test:profile
|
44
|
+
|
45
|
+
Note that this isn't a benchmark: we're using [ruby-prof], which will slow things down.
|
46
|
+
|
47
|
+
[ruby-prof]: http://ruby-prof.rubyforge.org/
|
data/lib/twingly/url.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'addressable/uri'
|
2
|
+
require 'public_suffix'
|
3
|
+
|
4
|
+
# Sets PublicSuffix::List.private_domains to false as a side effect of loading this file.
# NOTE(review): presumably this makes PublicSuffix ignore the "private" section of the
# suffix list when parsing domains — confirm against the public_suffix documentation.
PublicSuffix::List.private_domains = false
|
5
|
+
|
6
|
+
# URL schemes accepted by UrlObject#valid?. Defined before `module Twingly` opens,
# i.e. as a top-level constant rather than inside the Twingly namespace.
SCHEMES = %w(http https)
|
7
|
+
|
8
|
+
module Twingly
|
9
|
+
module URL
|
10
|
+
module_function
|
11
|
+
|
12
|
+
# Pairs a parsed URL (`url`) with its parsed domain (`domain`).
# Instances are built by `parse`; both members are nil when
# `extract_url_and_domain` rescued a parse error and returned [].
UrlObject = Struct.new(:url, :domain) do
|
13
|
+
# Truthy only when both members are present and the URL's normalized
# scheme is listed in SCHEMES. Because this is a plain && chain, a missing
# url or domain yields nil rather than false.
def valid?
|
14
|
+
url && domain && SCHEMES.include?(url.normalized_scheme)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
# Parses potential_url and wraps the outcome in a UrlObject.
#
# Always returns a UrlObject: when extract_url_and_domain returns [],
# the destructuring below leaves both url and domain nil, so callers
# should check #valid? before using the members.
def parse(potential_url)
|
19
|
+
url, domain = extract_url_and_domain(potential_url)
|
20
|
+
UrlObject.new(url, domain)
|
21
|
+
end
|
22
|
+
|
23
|
+
# Parses potential_url with Addressable and its host with PublicSuffix.
#
# Returns a two-element array [url, domain] on success, or an empty array
# when either library raises one of the two rescued error classes.
# Any other exception propagates to the caller.
def extract_url_and_domain(potential_url)
|
24
|
+
url = Addressable::URI.heuristic_parse(potential_url)
|
25
|
+
# The `if url` guard leaves domain nil when heuristic_parse returns nil.
# NOTE(review): presumably that happens for nil input — confirm against Addressable docs.
domain = PublicSuffix.parse(url.host) if url
|
26
|
+
|
27
|
+
[url, domain]
|
28
|
+
rescue PublicSuffix::DomainInvalid, Addressable::URI::InvalidURIError
|
29
|
+
# Empty array so that `url, domain = ...` in parse assigns nil to both.
[]
|
30
|
+
end
|
31
|
+
|
32
|
+
# Convenience wrapper: parses potential_url and returns the result of
# UrlObject#valid? (truthy/falsy; may be nil rather than false).
def validate(potential_url)
|
33
|
+
parse(potential_url).valid?
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'digest/md5'
|
2
|
+
require 'digest/sha2'
|
3
|
+
|
4
|
+
module Twingly
|
5
|
+
module URL
|
6
|
+
module Hasher
|
7
|
+
module_function
|
8
|
+
|
9
|
+
# Returns the first 30 characters of the MD5 hex digest of url, upper-cased.
def taskdb_hash(url)
|
10
|
+
Digest::MD5.hexdigest(url)[0..29].upcase
|
11
|
+
end
|
12
|
+
|
13
|
+
# Returns the first 30 characters of the MD5 hex digest of url, upper-cased.
# The computation is identical to taskdb_hash.
def blogstream_hash(url)
|
14
|
+
Digest::MD5.hexdigest(url)[0..29].upcase
|
15
|
+
end
|
16
|
+
|
17
|
+
# Returns an Integer taken from the start of the binary SHA256 digest of url,
# unpacked with the "L!" directive (native unsigned long, native byte order).
# NOTE(review): the width of a native long and the byte order depend on the
# platform, so the value is presumably not portable between machines —
# confirm before comparing hashes produced on different hosts.
def documentdb_hash(url)
|
18
|
+
Digest::SHA256.digest(url).unpack("L!")[0]
|
19
|
+
end
|
20
|
+
|
21
|
+
# Returns an Integer taken from the start of the binary SHA256 digest of url,
# unpacked with the "q" directive (64-bit signed, native byte order), so the
# result may be negative.
def autopingdb_hash(url)
|
22
|
+
Digest::SHA256.digest(url).unpack("q")[0]
|
23
|
+
end
|
24
|
+
|
25
|
+
# Returns an Integer taken from the start of the binary SHA256 digest of url,
# unpacked with the "Q" directive (64-bit unsigned, native byte order).
def pingloggerdb_hash(url)
|
26
|
+
Digest::SHA256.digest(url).unpack("Q")[0]
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'twingly/url'
|
2
|
+
|
3
|
+
module Twingly
|
4
|
+
module URL
|
5
|
+
module Normalizer
|
6
|
+
module_function
|
7
|
+
|
8
|
+
# Normalizes every URL candidate found in potential_urls.
#
# Candidates come from extract_urls; each is passed to normalize_url, and
# the nil results (candidates that did not parse as valid URLs) are removed
# by compact. Returns an Array of normalized URL strings, possibly empty.
def normalize(potential_urls)
|
9
|
+
extract_urls(potential_urls).map do |potential_url|
|
10
|
+
normalize_url(potential_url)
|
11
|
+
end.compact
|
12
|
+
end
|
13
|
+
|
14
|
+
# Turns the input into a flat list of whitespace-separated tokens.
#
# Array() wraps the argument into an array; each element is then split with
# the no-argument #split and the pieces are flattened into a single Array.
# Assumes every element responds to #split (i.e. is a String).
def extract_urls(potential_urls)
|
15
|
+
Array(potential_urls).map(&:split).flatten
|
16
|
+
end
|
17
|
+
|
18
|
+
# Normalizes a single URL candidate.
#
# Returns the normalized URL as a String, or nil when the parsed result is
# not valid (see UrlObject#valid?).
def normalize_url(potential_url)
|
19
|
+
result = Twingly::URL.parse(potential_url)
|
20
|
+
|
21
|
+
return nil unless result.valid?
|
22
|
+
|
23
|
+
# A domain without a subdomain gets "www." prepended to its host.
unless result.domain.subdomain?
|
24
|
+
result.url.host = "www.#{result.domain}"
|
25
|
+
end
|
26
|
+
|
27
|
+
# An empty path is replaced with "/".
if result.url.path.empty?
|
28
|
+
result.url.path = "/"
|
29
|
+
end
|
30
|
+
|
31
|
+
result.url.to_s
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
data/lib/version.rb
ADDED
metadata
ADDED
@@ -0,0 +1,135 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: twingly-url
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.3.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Johan Eckerström
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-11-14 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: addressable
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: public_suffix
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.4'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.4'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: turn
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '10'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '10'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: shoulda-context
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: ruby-prof
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
description: Twingly URL tools
|
98
|
+
email:
|
99
|
+
- johan.eckerstrom@twingly.com
|
100
|
+
executables: []
|
101
|
+
extensions: []
|
102
|
+
extra_rdoc_files: []
|
103
|
+
files:
|
104
|
+
- README.md
|
105
|
+
- lib/twingly-url-normalizer.rb
|
106
|
+
- lib/twingly/url.rb
|
107
|
+
- lib/twingly/url/hasher.rb
|
108
|
+
- lib/twingly/url/normalizer.rb
|
109
|
+
- lib/version.rb
|
110
|
+
homepage: http://github.com/twingly/twingly-url
|
111
|
+
licenses:
|
112
|
+
- MIT
|
113
|
+
metadata: {}
|
114
|
+
post_install_message:
|
115
|
+
rdoc_options: []
|
116
|
+
require_paths:
|
117
|
+
- lib
|
118
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
119
|
+
requirements:
|
120
|
+
- - ">="
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: 1.9.3
|
123
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
124
|
+
requirements:
|
125
|
+
- - ">="
|
126
|
+
- !ruby/object:Gem::Version
|
127
|
+
version: '0'
|
128
|
+
requirements: []
|
129
|
+
rubyforge_project:
|
130
|
+
rubygems_version: 2.2.2
|
131
|
+
signing_key:
|
132
|
+
specification_version: 4
|
133
|
+
summary: Ruby library for URL handling
|
134
|
+
test_files: []
|
135
|
+
has_rdoc:
|