twingly-url 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 6748ed6715a818455973b7330c774c4d4e66776a
4
+ data.tar.gz: bb17c4b3eb99af036257f3668d7048ccb94f3466
5
+ SHA512:
6
+ metadata.gz: 55194c78c7a3af312d70e88730586b806868b2b9184853bca25cbdcc99e29622113c2fe333512d9f785ed4adefb8c89fe5c4472f3eb6d1e41d28905fcb020257
7
+ data.tar.gz: 8b9f59dfb2566c42393edc23535d5487e9bda3e0e1342c717dc4ba4a9b74e17f327b0324ee767608af1e54bdb3cc301416f153b705c8c1a253403759eb629914
data/README.md ADDED
@@ -0,0 +1,47 @@
1
+ # Twingly::URL
2
+
3
+ [![Build Status](https://travis-ci.org/twingly/twingly-url.svg?branch=master)](https://travis-ci.org/twingly/twingly-url)
4
+
5
+ Twingly URL tools.
6
+
7
+ * `twingly/url` - Parse and validate URLs
8
+ * `Twingly::URL.parse` - Returns a Struct with `#url` and `#domain` accessors
9
+ * `Twingly::URL.validate` - Validates a URL
10
+ * `twingly/url/normalizer` - Normalize URLs
11
+ * `Twingly::URL::Normalizer.normalize(string)` - Extracts the URLs found in a string and returns them normalized, as an Array
12
+ * `twingly/url/hasher` - Generate URL hashes suitable for primary keys
13
+ * `Twingly::URL::Hasher.documentdb_hash(url)` - SHA256 unsigned long, native endian digest
14
+ * `Twingly::URL::Hasher.blogstream_hash(url)` - MD5 hexdigest (first 30 characters, upcased)
15
+ * `Twingly::URL::Hasher.autopingdb_hash(url)` - SHA256 64-bit signed, native endian digest
16
+ * `Twingly::URL::Hasher.pingloggerdb_hash(url)` - SHA256 64-bit unsigned, native endian digest
17
+
18
+ ## Normalization example
19
+
20
+ ```ruby
21
+ require 'twingly/url/normalizer'
22
+
23
+ Twingly::URL::Normalizer.normalize('http://duh.se')
24
+ # => ["http://www.duh.se/"]
25
+
26
+ Twingly::URL::Normalizer.normalize('http://duh.se http://blog.twingly.com/')
27
+ # => ["http://www.duh.se/", "http://blog.twingly.com/"]
28
+
29
+ Twingly::URL::Normalizer.normalize('no URL')
30
+ # => []
31
+ ```
32
+
33
+ ## Tests
34
+
35
+ Run tests with
36
+
37
+ bundle exec rake
38
+
39
+ ### Profiling
40
+
41
+ You can get some profiling by running
42
+
43
+ bundle exec rake test:profile
44
+
45
+ Note that this isn't a benchmark; we're using [ruby-prof], which will slow things down.
46
+
47
+ [ruby-prof]: http://ruby-prof.rubyforge.org/
@@ -0,0 +1,2 @@
1
# Deprecated entry point: emits a deprecation warning and then loads the
# normalizer from its current path.
warn("twingly-url-normalizer will be removed, use twingly/url/normalizer")
require "twingly/url/normalizer"
@@ -0,0 +1,36 @@
1
+ require 'addressable/uri'
2
+ require 'public_suffix'
3
+
4
# Disable private-domain support in the PublicSuffix list (see the
# public_suffix documentation for which suffixes count as private).
PublicSuffix::List.private_domains = false

# URL schemes accepted by Twingly::URL::UrlObject#valid?. Frozen so this
# shared top-level constant cannot be mutated by accident.
SCHEMES = %w(http https).freeze
7
+
8
module Twingly
  # URL parsing and validation helpers built on Addressable and PublicSuffix.
  module URL
    module_function

    # Pairs a parsed URL with its parsed domain. Both members are nil when
    # the input could not be parsed.
    UrlObject = Struct.new(:url, :domain) do
      # Truthy only when both members are present and the URL's normalized
      # scheme is listed in SCHEMES.
      def valid?
        url && domain && SCHEMES.include?(url.normalized_scheme)
      end
    end

    # Parses +potential_url+ and wraps the outcome in a UrlObject.
    #
    # @param potential_url [String] text that may be a URL
    # @return [UrlObject] members are nil when parsing failed
    def parse(potential_url)
      UrlObject.new(*extract_url_and_domain(potential_url))
    end

    # Parses the URL heuristically and looks up its domain.
    #
    # @param potential_url [String] text that may be a URL
    # @return [Array] +[url, domain]+, or an empty Array when either parser
    #   rejects the input
    def extract_url_and_domain(potential_url)
      uri = Addressable::URI.heuristic_parse(potential_url)
      # No URI means there is no host to look up; keep the two-slot shape.
      return [uri, nil] unless uri

      [uri, PublicSuffix.parse(uri.host)]
    rescue PublicSuffix::DomainInvalid, Addressable::URI::InvalidURIError
      []
    end

    # Tells whether +potential_url+ parses into a valid UrlObject.
    #
    # @param potential_url [String] text that may be a URL
    # @return [Boolean, nil] result of UrlObject#valid?
    def validate(potential_url)
      parsed = parse(potential_url)
      parsed.valid?
    end
  end
end
@@ -0,0 +1,30 @@
1
+ require 'digest/md5'
2
+ require 'digest/sha2'
3
+
4
module Twingly
  module URL
    # Digest helpers that turn a URL string into fixed-size hash values.
    module Hasher
      module_function

      # @param url [String]
      # @return [String] first 30 characters of the MD5 hexdigest, upcased
      def taskdb_hash(url)
        md5_hex = Digest::MD5.hexdigest(url)
        md5_hex[0, 30].upcase
      end

      # @param url [String]
      # @return [String] first 30 characters of the MD5 hexdigest, upcased
      def blogstream_hash(url)
        md5_hex = Digest::MD5.hexdigest(url)
        md5_hex[0, 30].upcase
      end

      # @param url [String]
      # @return [Integer] leading bytes of the SHA256 digest read as a
      #   native unsigned long ("L!" directive)
      def documentdb_hash(url)
        sha = Digest::SHA256.digest(url)
        sha.unpack("L!").first
      end

      # @param url [String]
      # @return [Integer] leading bytes of the SHA256 digest read as a
      #   64-bit signed, native endian integer ("q" directive)
      def autopingdb_hash(url)
        sha = Digest::SHA256.digest(url)
        sha.unpack("q").first
      end

      # @param url [String]
      # @return [Integer] leading bytes of the SHA256 digest read as a
      #   64-bit unsigned, native endian integer ("Q" directive)
      def pingloggerdb_hash(url)
        sha = Digest::SHA256.digest(url)
        sha.unpack("Q").first
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ require 'twingly/url'
2
+
3
module Twingly
  module URL
    # Turns whitespace-separated text into a list of normalized URL strings.
    module Normalizer
      module_function

      # Normalizes every candidate found in +potential_urls+ and drops the
      # ones that are not valid URLs.
      #
      # @param potential_urls [String, Array<String>] text containing URLs
      # @return [Array<String>] normalized URLs
      def normalize(potential_urls)
        normalized = extract_urls(potential_urls).map do |candidate|
          normalize_url(candidate)
        end
        normalized.compact
      end

      # Splits the input on whitespace into individual candidates.
      #
      # @param potential_urls [String, Array<String>]
      # @return [Array<String>] one entry per whitespace-separated token
      def extract_urls(potential_urls)
        Array(potential_urls).flat_map(&:split)
      end

      # Normalizes a single candidate: a domain without a subdomain gets a
      # "www." prefix, and an empty path becomes "/".
      #
      # @param potential_url [String]
      # @return [String, nil] the normalized URL, or nil when the candidate
      #   is not valid
      def normalize_url(potential_url)
        parsed = Twingly::URL.parse(potential_url)
        return unless parsed.valid?

        uri = parsed.url
        uri.host = "www.#{parsed.domain}" unless parsed.domain.subdomain?
        uri.path = "/" if uri.path.empty?
        uri.to_s
      end
    end
  end
end
+
data/lib/version.rb ADDED
@@ -0,0 +1,5 @@
1
module Twingly
  module URL
    # Version of the twingly-url gem. Frozen so the constant's string
    # cannot be mutated in place.
    VERSION = '1.3.1'.freeze
  end
end
metadata ADDED
@@ -0,0 +1,135 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: twingly-url
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.3.1
5
+ platform: ruby
6
+ authors:
7
+ - Johan Eckerström
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-11-14 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: addressable
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '2'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '2'
27
+ - !ruby/object:Gem::Dependency
28
+ name: public_suffix
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.4'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.4'
41
+ - !ruby/object:Gem::Dependency
42
+ name: turn
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '10'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '10'
69
+ - !ruby/object:Gem::Dependency
70
+ name: shoulda-context
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1'
83
+ - !ruby/object:Gem::Dependency
84
+ name: ruby-prof
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: Twingly URL tools
98
+ email:
99
+ - johan.eckerstrom@twingly.com
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - README.md
105
+ - lib/twingly-url-normalizer.rb
106
+ - lib/twingly/url.rb
107
+ - lib/twingly/url/hasher.rb
108
+ - lib/twingly/url/normalizer.rb
109
+ - lib/version.rb
110
+ homepage: http://github.com/twingly/twingly-url
111
+ licenses:
112
+ - MIT
113
+ metadata: {}
114
+ post_install_message:
115
+ rdoc_options: []
116
+ require_paths:
117
+ - lib
118
+ required_ruby_version: !ruby/object:Gem::Requirement
119
+ requirements:
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ version: 1.9.3
123
+ required_rubygems_version: !ruby/object:Gem::Requirement
124
+ requirements:
125
+ - - ">="
126
+ - !ruby/object:Gem::Version
127
+ version: '0'
128
+ requirements: []
129
+ rubyforge_project:
130
+ rubygems_version: 2.2.2
131
+ signing_key:
132
+ specification_version: 4
133
+ summary: Ruby library for URL handling
134
+ test_files: []
135
+ has_rdoc: