url_parser 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: d88f4309a1787a5ed3f004e60a85e5f4a5e26765
4
+ data.tar.gz: 78146dbfb19dbec7f5f9169fef026a7b2448c43d
5
+ SHA512:
6
+ metadata.gz: 99801e22611a0d7c78b576e01aacb097c32852fa99775674524b2bf1f000988e64cb00766eaeec083ce16b0cb7c10c9b29745dff7186fca43135276e467c8a05
7
+ data.tar.gz: 4df106696e71b773da7bc7e7997ac8ade0cb4002327cbb107c1b436c420373b1ce55abcd0eed67d0b328385c7d35d190bc921e5e64d264ef26c4ce7232817389
data/.gitignore ADDED
@@ -0,0 +1,22 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ *.bundle
19
+ *.so
20
+ *.o
21
+ *.a
22
+ mkmf.log
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in url_parser.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Matt Solt
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,33 @@
1
+ # UrlParser
2
+
3
+ Combines PostRank-URI, Domainatrix, and other Ruby URL-parsing libraries into a common interface.
4
+
5
+ See also:
6
+ - https://github.com/pauldix/domainatrix
7
+ - https://github.com/postrank-labs/postrank-uri
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ gem 'url_parser'
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install url_parser
22
+
23
+ ## Usage
24
+
25
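+ Wrap a single URL with `UrlParser.new('http://example.com/')` and query it (`#valid?`, `#host`, `#domain`, `#subdomain`, ...), or pull every URL out of a block of text with `UrlParser.call(text)`.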
26
+
27
+ ## Contributing
28
+
29
+ 1. Fork it ( https://github.com/activefx/url_parser/fork )
30
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
31
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
32
+ 4. Push to the branch (`git push origin my-new-feature`)
33
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
@@ -0,0 +1,3 @@
1
module UrlParser
  # Version of the gem; read by the gemspec when building the package.
  # Frozen so that no caller can mutate the shared string in place.
  VERSION = "0.1.0".freeze
end
data/lib/url_parser.rb ADDED
@@ -0,0 +1,150 @@
1
+ require "url_parser/version"
2
+ require "domainatrix"
3
+ require "postrank-uri"
4
+ require "addressable/uri"
5
+
6
class Array

  # Only define the helper when nothing else has already provided
  # Array.wrap, so an existing implementation is never overridden.
  unless respond_to?(:wrap)

    # Coerces +object+ into an Array.
    #
    # * +nil+ becomes an empty array.
    # * An object responding to +to_ary+ is converted with it; if +to_ary+
    #   itself returns +nil+, the object is wrapped in a one-element array.
    # * Anything else is wrapped in a one-element array.
    #
    # @param object [Object, nil] the value to coerce
    # @return [Array]
    def self.wrap(object)
      return [] if object.nil?
      return [object] unless object.respond_to?(:to_ary)

      object.to_ary || [object]
    end

  end

end
19
+
20
# Common interface over PostRank::URI (extraction and cleaning),
# Addressable::URI (component parsing) and Domainatrix (domain and
# subdomain detection).
module UrlParser

  # Marker module mixed into every StandardError that passes through
  # Base#tag_errors, so callers can write `rescue UrlParser::Error`.
  module Error; end

  # Extracts every URL that PostRank::URI.extract finds in +text+.
  #
  # @param text [String] free-form text to scan
  # @return [Array<UrlParser::Base>] one parser per extracted URL
  def self.call(text)
    PostRank::URI.extract(text).map { |url| new(url) }
  end

  # Convenience constructor; see Base#initialize for the options.
  #
  # @param url [String]
  # @param options [Hash]
  # @return [UrlParser::Base]
  def self.new(url, options = {})
    Base.new(url, options)
  end

  class Base

    # https://en.wikipedia.org/wiki/List_of_URI_schemes
    MAJOR_SCHEMES = %w[
      file ftp gopher h323 hdl http https imap magnet
      mailto mms news nntp prospero rsync rtsp rtspu
      sftp shttp sip sips snews svn svn+ssh telnet
      wais
    ].concat(
      # Unofficial schemes
      %w[
        aim callto cvs facetime feed git gtalk irc ircs
        irc6 itms msnim skype ssh smb ymsg mvn
      ]
    ).freeze

    # Schemes accepted by #valid? when no :schemes option is given.
    DEFAULT_SCHEMES = %w[
      http https ftp mailto file ssh feed
      cvs git mvn nntp shttp svn
    ].freeze

    attr_reader :url, :original_url

    # @param url [String] the URL to wrap
    # @param options [Hash]
    # @option options [Array<String>, String] :schemes schemes accepted by
    #   #valid? (defaults to DEFAULT_SCHEMES)
    # @option options [Boolean] :preserve when truthy, +url+ is stored
    #   untouched instead of being passed through PostRank::URI.clean
    # @raise [UrlParser::Error] any StandardError raised while cleaning the
    #   URL, tagged with the Error module
    def initialize(url, options = {})
      tag_errors do
        @schemes      = options.fetch(:schemes) { DEFAULT_SCHEMES }
        @preserve     = !!options[:preserve]
        @original_url = url
        @url          = @preserve ? url : PostRank::URI.clean(url)
      end
    end

    # @return [Array] the accepted schemes, always as an array
    def schemes
      Array.wrap(@schemes)
    end

    # The URL parsed by Addressable::URI.parse, or +nil+ when parsing
    # raises. Failure is deliberately reported as +nil+ (best effort) so
    # that #valid? and the component readers can answer without raising.
    # The outcome, including a failed parse, is computed only once.
    #
    # @return [Addressable::URI, nil]
    def uri
      return @uri if defined?(@uri)

      @uri = begin
        Addressable::URI.parse(url)
      rescue StandardError
        nil
      end
    end

    # Component readers: each delegates to the parsed URI and returns
    # +nil+ when the URL could not be parsed.

    def scheme
      uri.scheme if uri
    end

    def user
      uri.user if uri
    end

    def password
      uri.password if uri
    end

    def host
      uri.host if uri
    end

    def port
      uri.port if uri
    end

    def path
      uri.path if uri
    end

    def query
      uri.query if uri
    end

    def fragment
      uri.fragment if uri
    end

    # @return [Hash] the parsed query parameters; an empty hash when the
    #   URL could not be parsed or the URI reports no query values
    def query_values
      (uri && uri.query_values) || {}
    end

    # A URL is valid when it parses and either its domain is 'localhost'
    # or it has an accepted scheme and a host containing a dot.
    #
    # The parse check runs first so that an unparseable URL is reported as
    # invalid without consulting the domain parser at all.
    #
    # @return [Boolean]
    def valid?
      return false if uri.nil?
      return true if domain == 'localhost'
      return false unless schemes.include?(scheme)

      host.to_s.include?('.')
    end

    # @return [Object] the memoized result of Domainatrix.parse(url)
    # @raise [UrlParser::Error] any StandardError raised by Domainatrix,
    #   tagged with the Error module
    def parser
      tag_errors do
        @parser ||= Domainatrix.parse(url)
      end
    end

    # @return [String] the domain including its public suffix
    def domain
      parser.domain_with_public_suffix
    end

    # The host name without a leading www-style label: the remaining
    # subdomain labels followed by the domain, or just the domain when
    # there is no subdomain.
    #
    # @return [String]
    def subdomain
      prefix = parser.subdomain
      return domain if prefix.empty?

      # String#sub returns a copy, so the parser's own subdomain string is
      # left untouched (removing the domain in place would corrupt the
      # memoized parser for every later call).
      labels = prefix.sub(domain, '').split('.')

      # Anchored so that only a label that *is* ww/www (optionally followed
      # by digits) is dropped, not any label that merely contains "ww".
      labels.shift if labels.first =~ /\Awww?\d*\z/
      (labels << domain).join('.')
    end

    private

    # Runs the block and tags any StandardError it raises with
    # UrlParser::Error before re-raising it unchanged. Only StandardError
    # is tagged, so `rescue UrlParser::Error` can never swallow signals
    # or exit requests such as Interrupt and SystemExit.
    def tag_errors
      yield
    rescue StandardError => error
      error.extend(UrlParser::Error)
      raise
    end

  end

end
@@ -0,0 +1,22 @@
1
+ require "rspec"
2
+
3
+ require "url_parser"
4
+
5
+ # This file was generated by the `rspec --init` command. Conventionally, all
6
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
7
+ # Require this file using `require "spec_helper"` to ensure that it is only
8
+ # loaded once.
9
+ #
10
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
11
RSpec.configure do |rspec|
  # Randomise example order so hidden order dependencies show up. To
  # reproduce a failing order, rerun with the seed printed after the run:
  #     --seed 1234
  rspec.order = 'random'

  # Run only examples tagged :focus, and fall back to running everything
  # when that filter matches nothing.
  rspec.filter_run :focus
  rspec.run_all_when_everything_filtered = true
end
21
+
22
+
@@ -0,0 +1,176 @@
1
+ require 'spec_helper'
2
+
3
# Behavioural specification for the public UrlParser interface.
RSpec.describe UrlParser do

  let(:parser) { UrlParser.new(link) }

  it "must be defined" do
    expect(UrlParser::VERSION).not_to be_nil
  end

  context "::call" do

    let(:link) { 'http://example.com/' }
    let(:text) { "there is a #{link} in here" }
    let(:extractor) { UrlParser.call(text) }

    it "extracts urls from text into an array" do
      expect(extractor.map(&:url)).to include link
    end

    it "initializes each url with the parser" do
      expect(extractor.first).to be_a UrlParser::Base
    end

  end

  context "::new" do

    let(:link) { 'http://example.com/' }

    it "initializes a parser with a url" do
      expect(parser.url).to eq link
    end

    it "cannot initialize invalid urls" do
      # An explicit error class avoids the false positives of a bare
      # `raise_error`, which also passes on NoMethodError, NameError, etc.
      expect { UrlParser.new('http:||bra.ziz') }.to raise_error(StandardError)
    end

    it "adds http by default" do
      expect(UrlParser.new('example.com').url).to eq link
    end

    it "adds http to protocol-less urls" do
      expect(UrlParser.new('//example.com').url).to eq link
    end

    it "any errors raised inherit from UrlParser::Error" do
      expect {
        UrlParser.new('http:||bra.ziz')
      }.to raise_error UrlParser::Error
    end

    context "options" do

      context ":preserve" do

        let(:link) { 'link.to?a=b&utm_source=FeedBurner#stuff' }

        it "is false by default" do
          expect(parser.url).not_to eq parser.original_url
        end

        it "does not clean the url when true" do
          parser = UrlParser.new(link, preserve: true)
          expect(parser.url).to eq parser.original_url
        end

      end

    end

  end

  context "#uri" do

    it "returns a parsed uri" do
      expect(UrlParser.new('http://example.com').uri).to be_a Addressable::URI
    end

  end

  context "#valid?" do

    it "returns false if the url is invalid" do
      expect(UrlParser.new('bullshit')).not_to be_valid
    end

    it "returns false if the url scheme is not in the options" do
      expect(UrlParser.new('telnet://some.com')).not_to be_valid
    end

    it "returns true if the url scheme is in the options" do
      expect(UrlParser.new('telnet://some.com', schemes: ['telnet'])).to be_valid
    end

    it "returns true if the url is valid" do
      expect(UrlParser.new('http://example.com/')).to be_valid
    end

    it "returns true for localhost" do
      expect(UrlParser.new('localhost:5000')).to be_valid
    end

  end

  context "#original_url" do

    let(:link) { 'link.to?a=b&utm_source=FeedBurner#stuff' }

    it "preserves the url input" do
      expect(parser.original_url).to eq link
    end

  end

  context "#url" do

    let(:link) { 'link.to?a=b&utm_source=FeedBurner#stuff' }

    it "returns a url" do
      expect(parser.url).to eq 'http://link.to/?a=b'
    end

    it "attempts to clean and normalize urls" do
      [
        'http://igvita.com/',
        'http://igvita.com///',
        'http://igvita.com/../?#',
        'http://igvita.com/a/../?',
        'http://igvita.com/a/../?utm_source%3Danalytics'
      ].each do |url|
        expect(UrlParser.new(url).url)
          .to eq 'http://igvita.com/'
      end
    end

  end

  context "#domain" do

    let(:link) { 'https://github.com/pauldix/domainatrix' }

    it "returns the domain name with suffix" do
      expect(parser.domain).to eq 'github.com'
    end

  end

  context "#subdomain" do

    let(:link) { 'http://foo.bar.pauldix.co.uk/asdf.html?q=arg' }

    it "returns all subdomains with suffix" do
      expect(parser.subdomain).to eq 'foo.bar.pauldix.co.uk'
    end

    it "returns only the domain if there is no subdomain" do
      url = UrlParser.new('https://github.com/')
      expect(url.subdomain).to eq 'github.com'
    end

    it "does not include www as part of the subdomain" do
      parser = UrlParser.new("http://www.energy.ca.gov/")
      expect(parser.subdomain).to eq 'energy.ca.gov'
    end

    it "does not include any variation of www as part of the subdomain" do
      [ 'ww2', 'www2', 'ww23', 'www23' ].each do |www|
        parser = UrlParser.new("http://#{www}.energy.ca.gov/")
        expect(parser.subdomain).to eq 'energy.ca.gov'
      end
    end

  end

end
@@ -0,0 +1,28 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'url_parser/version'
5
+
6
# Package definition for the url_parser gem.
Gem::Specification.new do |gem|
  # Identity
  gem.name     = "url_parser"
  gem.version  = UrlParser::VERSION
  gem.license  = "MIT"
  gem.homepage = "https://github.com/activefx/url_parser"
  gem.authors  = ["Matt Solt"]
  gem.email    = ["mattsolt@gmail.com"]

  gem.summary     = "Combine PostRank-URI, Domainatrix, and other Ruby url parsing libraries into a common interface."
  gem.description = "Uses PostRank-URI to clean, Addressable to break into components, and Domainatrix to determine domain and subdomain."

  # Packaged files: everything tracked by git; executables and test files
  # are picked out of that list by their directory.
  tracked = `git ls-files -z`.split("\x0")
  gem.files         = tracked
  gem.executables   = tracked.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = tracked.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Development-only tooling
  gem.add_development_dependency "bundler", "~> 1.6"
  gem.add_development_dependency "rake", "~> 10"
  gem.add_development_dependency "rspec", "~> 3.0"

  # Runtime dependencies
  gem.add_dependency "domainatrix", ">= 0.0.11"
  gem.add_dependency "postrank-uri", "~> 1.0"
  gem.add_dependency "addressable", "~> 2.3"
end
metadata ADDED
@@ -0,0 +1,143 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: url_parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Matt Solt
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-08-03 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: domainatrix
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: 0.0.11
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: 0.0.11
69
+ - !ruby/object:Gem::Dependency
70
+ name: postrank-uri
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: addressable
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '2.3'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '2.3'
97
+ description: Uses PostRank-URI to clean, Addressable to break into components, and
98
+ Domainatrix to determine domain and subdomain.
99
+ email:
100
+ - mattsolt@gmail.com
101
+ executables: []
102
+ extensions: []
103
+ extra_rdoc_files: []
104
+ files:
105
+ - ".gitignore"
106
+ - ".rspec"
107
+ - Gemfile
108
+ - LICENSE.txt
109
+ - README.md
110
+ - Rakefile
111
+ - lib/url_parser.rb
112
+ - lib/url_parser/version.rb
113
+ - spec/spec_helper.rb
114
+ - spec/url_parser_spec.rb
115
+ - url_parser.gemspec
116
+ homepage: https://github.com/activefx/url_parser
117
+ licenses:
118
+ - MIT
119
+ metadata: {}
120
+ post_install_message:
121
+ rdoc_options: []
122
+ require_paths:
123
+ - lib
124
+ required_ruby_version: !ruby/object:Gem::Requirement
125
+ requirements:
126
+ - - ">="
127
+ - !ruby/object:Gem::Version
128
+ version: '0'
129
+ required_rubygems_version: !ruby/object:Gem::Requirement
130
+ requirements:
131
+ - - ">="
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
134
+ requirements: []
135
+ rubyforge_project:
136
+ rubygems_version: 2.2.2
137
+ signing_key:
138
+ specification_version: 4
139
+ summary: Combine PostRank-URI, Domainatrix, and other Ruby url parsing libraries into
140
+ a common interface.
141
+ test_files:
142
+ - spec/spec_helper.rb
143
+ - spec/url_parser_spec.rb