uri_parser 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. data/.gitignore +6 -0
  2. data/.rvmrc +1 -0
  3. data/Gemfile +6 -0
  4. data/Rakefile +13 -0
  5. data/ext/uri_parser/basictypes.h +89 -0
  6. data/ext/uri_parser/extconf.h +6 -0
  7. data/ext/uri_parser/extconf.rb +50 -0
  8. data/ext/uri_parser/logging.h +5 -0
  9. data/ext/uri_parser/scoped_ptr.h +322 -0
  10. data/ext/uri_parser/string16.cc +95 -0
  11. data/ext/uri_parser/string16.h +194 -0
  12. data/ext/uri_parser/uri_parser.cc +87 -0
  13. data/ext/uri_parser/url_canon.h +872 -0
  14. data/ext/uri_parser/url_canon_etc.cc +392 -0
  15. data/ext/uri_parser/url_canon_fileurl.cc +215 -0
  16. data/ext/uri_parser/url_canon_host.cc +401 -0
  17. data/ext/uri_parser/url_canon_icu.cc +207 -0
  18. data/ext/uri_parser/url_canon_icu.h +63 -0
  19. data/ext/uri_parser/url_canon_internal.cc +427 -0
  20. data/ext/uri_parser/url_canon_internal.h +453 -0
  21. data/ext/uri_parser/url_canon_internal_file.h +157 -0
  22. data/ext/uri_parser/url_canon_ip.cc +737 -0
  23. data/ext/uri_parser/url_canon_ip.h +101 -0
  24. data/ext/uri_parser/url_canon_mailtourl.cc +137 -0
  25. data/ext/uri_parser/url_canon_path.cc +380 -0
  26. data/ext/uri_parser/url_canon_pathurl.cc +128 -0
  27. data/ext/uri_parser/url_canon_query.cc +189 -0
  28. data/ext/uri_parser/url_canon_relative.cc +572 -0
  29. data/ext/uri_parser/url_canon_stdstring.h +134 -0
  30. data/ext/uri_parser/url_canon_stdurl.cc +211 -0
  31. data/ext/uri_parser/url_common.h +48 -0
  32. data/ext/uri_parser/url_file.h +108 -0
  33. data/ext/uri_parser/url_parse.cc +760 -0
  34. data/ext/uri_parser/url_parse.h +336 -0
  35. data/ext/uri_parser/url_parse_file.cc +243 -0
  36. data/ext/uri_parser/url_parse_internal.h +112 -0
  37. data/ext/uri_parser/url_util.cc +553 -0
  38. data/ext/uri_parser/url_util.h +222 -0
  39. data/lib/uri_parser.rb +28 -0
  40. data/lib/uri_parser/version.rb +3 -0
  41. data/spec/spec_helper.rb +16 -0
  42. data/spec/uri_parser_spec.rb +54 -0
  43. data/uri_parser.gemspec +26 -0
  44. metadata +117 -0
@@ -0,0 +1,222 @@
1
+ // Copyright 2007, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ #ifndef GOOGLEURL_SRC_URL_UTIL_H__
31
+ #define GOOGLEURL_SRC_URL_UTIL_H__
32
+
33
+ #include <string>
34
+
35
+ #include "string16.h"
36
+ #include "url_common.h"
37
+ #include "url_parse.h"
38
+ #include "url_canon.h"
39
+
40
+ namespace url_util {
41
+
42
+ // Init ------------------------------------------------------------------------
43
+
44
+ // Initialization is NOT required, it will be implicitly initialized when first
45
+ // used. However, this implicit initialization is NOT threadsafe. If you are
46
+ // using this library in a threaded environment and don't have a consistent
47
+ // "first call" (an example might be calling "AddStandardScheme" with your
48
+ // special application-specific schemes) then you will want to call initialize
49
+ // before spawning any threads.
50
+ //
51
+ // It is OK to call this function more than once, subsequent calls will simply
52
+ // "noop", unless Shutdown() was called in the mean time. This will also be a
53
+ // "noop" if other calls to the library have forced an initialization
54
+ // beforehand.
55
+ GURL_API void Initialize();
56
+
57
+ // Cleanup is not required, except some strings may leak. For most user
58
+ // applications, this is fine. If you're using it in a library that may get
59
+ // loaded and unloaded, you'll want to unload to properly clean up your
60
+ // library.
61
+ GURL_API void Shutdown();
62
+
63
+ // Schemes --------------------------------------------------------------------
64
+
65
+ // Adds an application-defined scheme to the internal list of "standard" URL
66
+ // schemes. This function is not threadsafe and can not be called concurrently
67
+ // with any other url_util function. It will assert if the list of standard
68
+ // schemes has been locked (see LockStandardSchemes).
69
+ GURL_API void AddStandardScheme(const char* new_scheme);
70
+
71
+ // Sets a flag to prevent future calls to AddStandardScheme from succeeding.
72
+ //
73
+ // This is designed to help prevent errors for multithreaded applications.
74
+ // Normal usage would be to call AddStandardScheme for your custom schemes at
75
+ // the beginning of program initialization, and then LockStandardSchemes. This
76
+ // prevents future callers from mistakenly calling AddStandardScheme when the
77
+ // program is running with multiple threads, where such usage would be
78
+ // dangerous.
79
+ //
80
+ // We could have had AddStandardScheme use a lock instead, but that would add
81
+ // some platform-specific dependencies we don't otherwise have now, and is
82
+ // overkill considering the normal usage is so simple.
83
+ GURL_API void LockStandardSchemes();
84
+
85
+ // Locates the scheme in the given string and places it into |found_scheme|,
86
+ // which may be NULL to indicate the caller does not care about the range.
87
+ //
88
+ // Returns whether the given |compare| scheme matches the scheme found in the
89
+ // input (if any). The |compare| scheme must be a valid canonical scheme or
90
+ // the result of the comparison is undefined.
91
+ GURL_API bool FindAndCompareScheme(const char* str,
92
+ int str_len,
93
+ const char* compare,
94
+ url_parse::Component* found_scheme);
95
+ GURL_API bool FindAndCompareScheme(const char16* str,
96
+ int str_len,
97
+ const char* compare,
98
+ url_parse::Component* found_scheme);
99
+ inline bool FindAndCompareScheme(const std::string& str,
100
+ const char* compare,
101
+ url_parse::Component* found_scheme) {
102
+ return FindAndCompareScheme(str.data(), static_cast<int>(str.size()),
103
+ compare, found_scheme);
104
+ }
105
+ inline bool FindAndCompareScheme(const string16& str,
106
+ const char* compare,
107
+ url_parse::Component* found_scheme) {
108
+ return FindAndCompareScheme(str.data(), static_cast<int>(str.size()),
109
+ compare, found_scheme);
110
+ }
111
+
112
+ // Returns true if the given string represents a standard URL. This means that
113
+ // the scheme is in the list of known standard schemes.
114
+ GURL_API bool IsStandard(const char* spec,
115
+ const url_parse::Component& scheme);
116
+ GURL_API bool IsStandard(const char16* spec,
117
+ const url_parse::Component& scheme);
118
+
119
+ // TODO(brettw) remove this. This is a temporary compatibility hack to avoid
120
+ // breaking the WebKit build when this version is synced via Chrome.
121
+ inline bool IsStandard(const char* spec, int spec_len,
122
+ const url_parse::Component& scheme) {
123
+ return IsStandard(spec, scheme);
124
+ }
125
+
126
+ // URL library wrappers -------------------------------------------------------
127
+
128
+ // Parses the given spec according to the extracted scheme type. Normal users
129
+ // should use the URL object, although this may be useful if performance is
130
+ // critical and you don't want to do the heap allocation for the std::string.
131
+ //
132
+ // As with the url_canon::Canonicalize* functions, the charset converter can
133
+ // be NULL to use UTF-8 (it will be faster in this case).
134
+ //
135
+ // Returns true if a valid URL was produced, false if not. On failure, the
136
+ // output and parsed structures will still be filled and will be consistent,
137
+ // but they will not represent a loadable URL.
138
+ GURL_API bool Canonicalize(const char* spec,
139
+ int spec_len,
140
+ url_canon::CharsetConverter* charset_converter,
141
+ url_canon::CanonOutput* output,
142
+ url_parse::Parsed* output_parsed);
143
+ GURL_API bool Canonicalize(const char16* spec,
144
+ int spec_len,
145
+ url_canon::CharsetConverter* charset_converter,
146
+ url_canon::CanonOutput* output,
147
+ url_parse::Parsed* output_parsed);
148
+
149
+ // Resolves a potentially relative URL relative to the given parsed base URL.
150
+ // The base MUST be valid. The resulting canonical URL and parsed information
151
+ // will be placed in to the given out variables.
152
+ //
153
+ // The relative need not be relative. If we discover that it's absolute, this
154
+ // will produce a canonical version of that URL. See Canonicalize() for more
155
+ // about the charset_converter.
156
+ //
157
+ // Returns true if the output is valid, false if the input could not produce
158
+ // a valid URL.
159
+ GURL_API bool ResolveRelative(const char* base_spec,
160
+ int base_spec_len,
161
+ const url_parse::Parsed& base_parsed,
162
+ const char* relative,
163
+ int relative_length,
164
+ url_canon::CharsetConverter* charset_converter,
165
+ url_canon::CanonOutput* output,
166
+ url_parse::Parsed* output_parsed);
167
+ GURL_API bool ResolveRelative(const char* base_spec,
168
+ int base_spec_len,
169
+ const url_parse::Parsed& base_parsed,
170
+ const char16* relative,
171
+ int relative_length,
172
+ url_canon::CharsetConverter* charset_converter,
173
+ url_canon::CanonOutput* output,
174
+ url_parse::Parsed* output_parsed);
175
+
176
+ // Replaces components in the given VALID input url. The new canonical URL info
177
+ // is written to output and out_parsed.
178
+ //
179
+ // Returns true if the resulting URL is valid.
180
+ GURL_API bool ReplaceComponents(
181
+ const char* spec,
182
+ int spec_len,
183
+ const url_parse::Parsed& parsed,
184
+ const url_canon::Replacements<char>& replacements,
185
+ url_canon::CharsetConverter* charset_converter,
186
+ url_canon::CanonOutput* output,
187
+ url_parse::Parsed* out_parsed);
188
+ GURL_API bool ReplaceComponents(
189
+ const char* spec,
190
+ int spec_len,
191
+ const url_parse::Parsed& parsed,
192
+ const url_canon::Replacements<char16>& replacements,
193
+ url_canon::CharsetConverter* charset_converter,
194
+ url_canon::CanonOutput* output,
195
+ url_parse::Parsed* out_parsed);
196
+
197
+ // String helper functions ----------------------------------------------------
198
+
199
+ // Compare the lower-case form of the given string against the given ASCII
200
+ // string. This is useful for checking whether an input string matches some
201
+ // token, and it is optimized to avoid intermediate string copies.
202
+ //
203
+ // The versions of this function that don't take a b_end assume that the b
204
+ // string is NULL terminated.
205
+ GURL_API bool LowerCaseEqualsASCII(const char* a_begin,
206
+ const char* a_end,
207
+ const char* b);
208
+ GURL_API bool LowerCaseEqualsASCII(const char* a_begin,
209
+ const char* a_end,
210
+ const char* b_begin,
211
+ const char* b_end);
212
+ GURL_API bool LowerCaseEqualsASCII(const char16* a_begin,
213
+ const char16* a_end,
214
+ const char* b);
215
+
216
+ // Unescapes the given string using URL escaping rules.
217
+ GURL_API void DecodeURLEscapeSequences(const char* input, int length,
218
+ url_canon::CanonOutputW* output);
219
+
220
+ } // namespace url_util
221
+
222
+ #endif // GOOGLEURL_SRC_URL_UTIL_H__
data/lib/uri_parser.rb ADDED
@@ -0,0 +1,28 @@
1
+ require 'uri_parser/version'
2
+ require 'uri_parser/uri_parser'
3
+ require 'cgi'
4
+
5
class URIParser
  # Expose #uri under a second name, #normalized.
  alias normalized uri

  # Returns the path followed by "?<query>" when the query is non-empty, or
  # just the path otherwise. The result is memoized.
  def path_and_query
    @path_and_query ||= if query.empty?
      path
    else
      "#{path}?#{query}"
    end
  end

  # Parses the query string into a Hash of CGI-unescaped keys and values.
  # The result is memoized.
  #
  # Edge cases:
  # * Empty pairs (as in "a=b&&c=d") are skipped.
  # * A key without a value ("flag" or "flag=") maps to the empty string.
  # * Only the first "=" separates key from value, so "a=b=c" yields
  #   { "a" => "b=c" }.
  def query_params
    @query_params ||= query.split('&').each_with_object({}) do |pair, params|
      next if pair.empty?

      # Limit the split to two parts so any "=" inside the value is preserved.
      key, value = pair.split('=', 2)

      # value is nil when the pair has no "=" at all; nil.to_s gives "".
      params[CGI.unescape(key)] = CGI.unescape(value.to_s)
    end
  end
end
@@ -0,0 +1,3 @@
1
# Version information for the uri_parser gem.
class URIParser
  # Version string of this release.
  VERSION = '0.0.1'
end
@@ -0,0 +1,16 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ Bundler.setup
4
+
5
+ require 'rspec'
6
+ Dir['./spec/support/**/*.rb'].each { |f| require f }
7
+
8
+ require 'uri_parser'
9
+
10
+ RSpec.configure do |config|
11
+ config.color_enabled = true
12
+ config.debug = true
13
+
14
+ config.filter_run :focus => true
15
+ config.run_all_when_everything_filtered = true
16
+ end
@@ -0,0 +1,54 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'spec_helper'
3
+
4
+ describe URIParser do
5
+ def self.describe_parsed(url, &block)
6
+ describe ".new('#{url}')" do
7
+ subject { described_class.new(url) }
8
+ module_eval(&block)
9
+ end
10
+ end
11
+
12
+ describe '#normalized' do
13
+ it 'is an alias for #uri' do
14
+ uri = URIParser.new('http://foo.com')
15
+ uri.normalized.should == 'http://foo.com/'
16
+ end
17
+ end
18
+
19
+ describe_parsed 'http://example.com/foo/bar?a=b&c=d' do
20
+ its(:scheme) { should == 'http' }
21
+ its(:host) { should == 'example.com' }
22
+ its(:path) { should == '/foo/bar' }
23
+ its(:query) { should == 'a=b&c=d' }
24
+ its(:valid?) { should be_true }
25
+ its(:uri) { should == 'http://example.com/foo/bar?a=b&c=d' }
26
+ its(:path_and_query) { should == '/foo/bar?a=b&c=d' }
27
+ its(:query_params) { should == { 'a' => 'b', 'c' => 'd' } }
28
+ end
29
+
30
+ describe_parsed '@#4ioasfajdkshfas' do
31
+ its(:valid?) { should be_false }
32
+ end
33
+
34
+ describe_parsed 'http://руцентр.рф/Iñtërnâtiônàlizætiøn!?i18n=true' do
35
+ its(:valid?) { should be_true }
36
+ its(:scheme) { should == 'http' }
37
+ its(:host) { should == 'xn--e1aqhcjdv.xn--p1ai' }
38
+ its(:path) { should == '/I%C3%B1t%C3%ABrn%C3%A2ti%C3%B4n%C3%A0liz%C3%A6ti%C3%B8n!' }
39
+ its(:query) { should == 'i18n=true' }
40
+ its(:uri) { should == 'http://xn--e1aqhcjdv.xn--p1ai/I%C3%B1t%C3%ABrn%C3%A2ti%C3%B4n%C3%A0liz%C3%A6ti%C3%B8n!?i18n=true' }
41
+ end
42
+
43
+ describe_parsed 'http://subdomain.bar.com' do
44
+ its(:host) { should == 'subdomain.bar.com' }
45
+ its(:path) { should == '/' }
46
+ its(:query) { should == '' }
47
+ its(:path_and_query) { should == '/' }
48
+ end
49
+
50
+ describe_parsed 'http://domain.com?a param=a value' do
51
+ its(:uri) { should == 'http://domain.com/?a%20param=a%20value' }
52
+ its(:query_params) { should == { 'a param' => 'a value' } }
53
+ end
54
+ end
@@ -0,0 +1,26 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "uri_parser/version"
4
+
5
# Gem specification for uri_parser: metadata, packaged files, the native
# extension entry point and development dependencies.
Gem::Specification.new do |s|
  s.name        = "uri_parser"
  s.version     = URIParser::VERSION
  s.platform    = Gem::Platform::RUBY
  # Use an explicit array: %w[] splits on whitespace and would register
  # "Myron" and "Marston" as two separate authors.
  s.authors     = ["Myron Marston"]
  s.email       = ["myron.marston@gmail.com"]
  s.homepage    = "https://github.com/seomoz/uri_parser"
  s.summary     = "A fast URI parser and normalizer"
  s.description = "Parses and normalizes URIs very quickly, using Google's URI canonicalization library"

  s.rubyforge_project = "uri_parser"

  # The packaged file lists come from git, so the gem must be built from a
  # git checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  # extconf.rb of the native extension, run at install time.
  s.extensions    = ["ext/uri_parser/extconf.rb"]
  s.require_paths = ["lib"]
  s.required_ruby_version = '>= 1.9.2'

  s.add_development_dependency 'rspec', '~> 2.5'
  s.add_development_dependency 'rake-compiler', '~> 0.7.6'
end
metadata ADDED
@@ -0,0 +1,117 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: uri_parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Myron
9
+ - Marston
10
+ autorequire:
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 2011-03-24 00:00:00.000000000 -07:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: rspec
18
+ requirement: &2152411480 !ruby/object:Gem::Requirement
19
+ none: false
20
+ requirements:
21
+ - - ~>
22
+ - !ruby/object:Gem::Version
23
+ version: '2.5'
24
+ type: :development
25
+ prerelease: false
26
+ version_requirements: *2152411480
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake-compiler
29
+ requirement: &2152411020 !ruby/object:Gem::Requirement
30
+ none: false
31
+ requirements:
32
+ - - ~>
33
+ - !ruby/object:Gem::Version
34
+ version: 0.7.6
35
+ type: :development
36
+ prerelease: false
37
+ version_requirements: *2152411020
38
+ description: Parses and normalizes URIs very quickly, using Google's URI canonicalization
39
+ library
40
+ email:
41
+ - myron.marston@gmail.com
42
+ executables: []
43
+ extensions:
44
+ - ext/uri_parser/extconf.rb
45
+ extra_rdoc_files: []
46
+ files:
47
+ - .gitignore
48
+ - .rvmrc
49
+ - Gemfile
50
+ - Rakefile
51
+ - ext/uri_parser/basictypes.h
52
+ - ext/uri_parser/extconf.h
53
+ - ext/uri_parser/extconf.rb
54
+ - ext/uri_parser/logging.h
55
+ - ext/uri_parser/scoped_ptr.h
56
+ - ext/uri_parser/string16.cc
57
+ - ext/uri_parser/string16.h
58
+ - ext/uri_parser/uri_parser.cc
59
+ - ext/uri_parser/url_canon.h
60
+ - ext/uri_parser/url_canon_etc.cc
61
+ - ext/uri_parser/url_canon_fileurl.cc
62
+ - ext/uri_parser/url_canon_host.cc
63
+ - ext/uri_parser/url_canon_icu.cc
64
+ - ext/uri_parser/url_canon_icu.h
65
+ - ext/uri_parser/url_canon_internal.cc
66
+ - ext/uri_parser/url_canon_internal.h
67
+ - ext/uri_parser/url_canon_internal_file.h
68
+ - ext/uri_parser/url_canon_ip.cc
69
+ - ext/uri_parser/url_canon_ip.h
70
+ - ext/uri_parser/url_canon_mailtourl.cc
71
+ - ext/uri_parser/url_canon_path.cc
72
+ - ext/uri_parser/url_canon_pathurl.cc
73
+ - ext/uri_parser/url_canon_query.cc
74
+ - ext/uri_parser/url_canon_relative.cc
75
+ - ext/uri_parser/url_canon_stdstring.h
76
+ - ext/uri_parser/url_canon_stdurl.cc
77
+ - ext/uri_parser/url_common.h
78
+ - ext/uri_parser/url_file.h
79
+ - ext/uri_parser/url_parse.cc
80
+ - ext/uri_parser/url_parse.h
81
+ - ext/uri_parser/url_parse_file.cc
82
+ - ext/uri_parser/url_parse_internal.h
83
+ - ext/uri_parser/url_util.cc
84
+ - ext/uri_parser/url_util.h
85
+ - lib/uri_parser.rb
86
+ - lib/uri_parser/version.rb
87
+ - spec/spec_helper.rb
88
+ - spec/uri_parser_spec.rb
89
+ - uri_parser.gemspec
90
+ has_rdoc: true
91
+ homepage: https://github.com/seomoz/uri_parser
92
+ licenses: []
93
+ post_install_message:
94
+ rdoc_options: []
95
+ require_paths:
96
+ - lib
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ none: false
99
+ requirements:
100
+ - - ! '>='
101
+ - !ruby/object:Gem::Version
102
+ version: 1.9.2
103
+ required_rubygems_version: !ruby/object:Gem::Requirement
104
+ none: false
105
+ requirements:
106
+ - - ! '>='
107
+ - !ruby/object:Gem::Version
108
+ version: '0'
109
+ requirements: []
110
+ rubyforge_project: uri_parser
111
+ rubygems_version: 1.5.2
112
+ signing_key:
113
+ specification_version: 3
114
+ summary: A fast URI parser and normalizer
115
+ test_files:
116
+ - spec/spec_helper.rb
117
+ - spec/uri_parser_spec.rb