twingly-url 1.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 6748ed6715a818455973b7330c774c4d4e66776a
4
+ data.tar.gz: bb17c4b3eb99af036257f3668d7048ccb94f3466
5
+ SHA512:
6
+ metadata.gz: 55194c78c7a3af312d70e88730586b806868b2b9184853bca25cbdcc99e29622113c2fe333512d9f785ed4adefb8c89fe5c4472f3eb6d1e41d28905fcb020257
7
+ data.tar.gz: 8b9f59dfb2566c42393edc23535d5487e9bda3e0e1342c717dc4ba4a9b74e17f327b0324ee767608af1e54bdb3cc301416f153b705c8c1a253403759eb629914
data/README.md ADDED
@@ -0,0 +1,47 @@
1
+ # Twingly::URL
2
+
3
+ [![Build Status](https://travis-ci.org/twingly/twingly-url.svg?branch=master)](https://travis-ci.org/twingly/twingly-url)
4
+
5
+ Twingly URL tools.
6
+
7
+ * `twingly/url` - Parse and validate URLs
8
+ * `Twingly::URL.parse` - Returns a Struct with `#url` and `#domain` accessors
9
+ * `Twingly::URL.validate` - Validates a URL
10
+ * `twingly/url/normalizer` - Normalize URLs
11
+ * `Twingly::URL::Normalizer.normalize(string)` - Extracts URLs from a string and returns them normalized (Array)
12
+ * `twingly/url/hasher` - Generate URL hashes suitable for primary keys
13
+ * `Twingly::URL::Hasher.documentdb_hash(url)` - SHA256 unsigned long, native endian digest
14
+ * `Twingly::URL::Hasher.blogstream_hash(url)` - MD5 hexdigest (first 30 characters, upper case)
15
+ * `Twingly::URL::Hasher.autopingdb_hash(url)` - SHA256 64-bit signed, native endian digest
16
+ * `Twingly::URL::Hasher.pingloggerdb_hash(url)` - SHA256 64-bit unsigned, native endian digest
17
+
18
+ ## Normalization example
19
+
20
+ ```ruby
21
+ require 'twingly/url/normalizer'
22
+
23
+ Twingly::URL::Normalizer.normalize('http://duh.se')
24
+ # => ["http://www.duh.se/"]
25
+
26
+ Twingly::URL::Normalizer.normalize('http://duh.se http://blog.twingly.com/')
27
+ # => ["http://www.duh.se/", "http://blog.twingly.com/"]
28
+
29
+ Twingly::URL::Normalizer.normalize('no URL')
30
+ # => []
31
+ ```
32
+
33
+ ## Tests
34
+
35
+ Run tests with
36
+
37
+ bundle exec rake
38
+
39
+ ### Profiling
40
+
41
+ You can get some profiling by running
42
+
43
+ bundle exec rake test:profile
44
+
45
+ Note that this isn't a benchmark: we're using [ruby-prof], which will slow things down.
46
+
47
+ [ruby-prof]: http://ruby-prof.rubyforge.org/
@@ -0,0 +1,2 @@
1
# Deprecated entry point: prints a removal notice to stderr when loaded and
# then loads the file named in that notice.
warn("twingly-url-normalizer will be removed, use twingly/url/normalizer")
require "twingly/url/normalizer"
@@ -0,0 +1,36 @@
1
+ require 'addressable/uri'
2
+ require 'public_suffix'
3
+
4
+ PublicSuffix::List.private_domains = false
5
+
6
# URL schemes accepted by Twingly::URL::UrlObject#valid?.
# Frozen so that this shared top-level constant cannot be mutated at runtime,
# which would silently change which URLs are considered valid.
SCHEMES = %w(http https).freeze
7
+
8
module Twingly
  # Parsing and validation helpers. Every method below is exposed as a module
  # function, e.g. Twingly::URL.parse.
  module URL
    module_function

    # Pairs a parsed URL with its parsed domain. Either member may be nil when
    # parsing failed.
    UrlObject = Struct.new(:url, :domain) do
      # Truthy only when both members are set and the URL's normalized scheme
      # is listed in SCHEMES. Returns the missing member (nil) when url or
      # domain is absent, so callers should rely on truthiness only.
      def valid?
        both_present = url && domain
        both_present && SCHEMES.include?(url.normalized_scheme)
      end
    end

    # Builds a UrlObject from +potential_url+. Both members are nil when the
    # input could not be parsed.
    def parse(potential_url)
      UrlObject.new(*extract_url_and_domain(potential_url))
    end

    # Returns a two-element array [url, domain], or an empty array when
    # parsing raises PublicSuffix::DomainInvalid or
    # Addressable::URI::InvalidURIError.
    def extract_url_and_domain(potential_url)
      begin
        uri = Addressable::URI.heuristic_parse(potential_url)
        # The domain is only looked up when heuristic_parse returned something.
        suffix_domain = uri ? PublicSuffix.parse(uri.host) : nil

        [uri, suffix_domain]
      rescue PublicSuffix::DomainInvalid, Addressable::URI::InvalidURIError
        []
      end
    end

    # Shortcut for parse(potential_url).valid?.
    def validate(potential_url)
      url_object = parse(potential_url)
      url_object.valid?
    end
  end
end
@@ -0,0 +1,30 @@
1
+ require 'digest/md5'
2
+ require 'digest/sha2'
3
+
4
module Twingly
  module URL
    # URL hashing helpers, all exposed as module functions
    # (e.g. Twingly::URL::Hasher.taskdb_hash).
    module Hasher
      module_function

      # First 30 characters of the MD5 hexdigest of +url+, upper-cased.
      def taskdb_hash(url)
        hexdigest = Digest::MD5.hexdigest(url)
        hexdigest[0, 30].upcase
      end

      # Same computation as taskdb_hash: first 30 characters of the MD5
      # hexdigest of +url+, upper-cased.
      def blogstream_hash(url)
        hexdigest = Digest::MD5.hexdigest(url)
        hexdigest[0, 30].upcase
      end

      # Leading bytes of the SHA256 digest of +url+ read as a native unsigned
      # long ("L!"); the width of a native long depends on the platform.
      def documentdb_hash(url)
        digest = Digest::SHA256.digest(url)
        digest.unpack("L!").first
      end

      # First 8 bytes of the SHA256 digest of +url+ read as a signed 64-bit
      # integer in native byte order ("q").
      def autopingdb_hash(url)
        digest = Digest::SHA256.digest(url)
        digest.unpack("q").first
      end

      # First 8 bytes of the SHA256 digest of +url+ read as an unsigned 64-bit
      # integer in native byte order ("Q").
      def pingloggerdb_hash(url)
        digest = Digest::SHA256.digest(url)
        digest.unpack("Q").first
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ require 'twingly/url'
2
+
3
module Twingly
  module URL
    # Turns free-form input into a list of normalized URL strings. All methods
    # are exposed as module functions.
    module Normalizer
      module_function

      # Splits +potential_urls+ (a string or an array of strings) on
      # whitespace, normalizes every piece and drops the pieces that are not
      # valid URLs. Always returns an Array.
      def normalize(potential_urls)
        normalized = extract_urls(potential_urls).map do |candidate|
          normalize_url(candidate)
        end

        normalized.compact
      end

      # Whitespace-separated tokens of every given string, as one flat Array.
      def extract_urls(potential_urls)
        token_lists = Array(potential_urls).map(&:split)
        token_lists.flatten
      end

      # Returns the normalized form of +potential_url+ as a String, or nil
      # when Twingly::URL.parse does not consider it valid.
      def normalize_url(potential_url)
        parsed = Twingly::URL.parse(potential_url)
        return nil unless parsed.valid?

        uri = parsed.url

        # Domains without a subdomain get a "www." prefix.
        uri.host = "www.#{parsed.domain}" unless parsed.domain.subdomain?

        # An empty path becomes the root path.
        uri.path = "/" if uri.path.empty?

        uri.to_s
      end
    end
  end
end
36
+
data/lib/version.rb ADDED
@@ -0,0 +1,5 @@
1
module Twingly
  module URL
    # Release number of the twingly-url gem.
    VERSION = "1.3.1"
  end
end
metadata ADDED
@@ -0,0 +1,135 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: twingly-url
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.3.1
5
+ platform: ruby
6
+ authors:
7
+ - Johan Eckerström
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-11-14 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: addressable
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2'
27
+ - !ruby/object:Gem::Dependency
28
+ name: public_suffix
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.4'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.4'
41
+ - !ruby/object:Gem::Dependency
42
+ name: turn
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '10'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '10'
69
+ - !ruby/object:Gem::Dependency
70
+ name: shoulda-context
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1'
83
+ - !ruby/object:Gem::Dependency
84
+ name: ruby-prof
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: Twingly URL tools
98
+ email:
99
+ - johan.eckerstrom@twingly.com
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - README.md
105
+ - lib/twingly-url-normalizer.rb
106
+ - lib/twingly/url.rb
107
+ - lib/twingly/url/hasher.rb
108
+ - lib/twingly/url/normalizer.rb
109
+ - lib/version.rb
110
+ homepage: http://github.com/twingly/twingly-url
111
+ licenses:
112
+ - MIT
113
+ metadata: {}
114
+ post_install_message:
115
+ rdoc_options: []
116
+ require_paths:
117
+ - lib
118
+ required_ruby_version: !ruby/object:Gem::Requirement
119
+ requirements:
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ version: 1.9.3
123
+ required_rubygems_version: !ruby/object:Gem::Requirement
124
+ requirements:
125
+ - - ">="
126
+ - !ruby/object:Gem::Version
127
+ version: '0'
128
+ requirements: []
129
+ rubyforge_project:
130
+ rubygems_version: 2.2.2
131
+ signing_key:
132
+ specification_version: 4
133
+ summary: Ruby library for URL handling
134
+ test_files: []
135
+ has_rdoc: