uri_scanner 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0c1cf86b6e9135a1fb326524b5069413b1f3bff9
4
+ data.tar.gz: 687da77c9318bbdb712e1bac6d1f164518526c47
5
+ SHA512:
6
+ metadata.gz: 33be598298215e59cb044256b6ccd25a3d40394a5bf0a23c9ea5d173938a095f6fbd8ae7bdf9bfb63fe3ef34202cfda1ed0b692187ec31ab6ad523ce0d25c547
7
+ data.tar.gz: 344bed1a5df1cc61cace654f3d3d34a875383654f0af5076eaf866db00920cbe4f592b3c286bd71087b1a936350762c1f8d4c9fa62b205ce1631771e58914852
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ *.sw?
2
+ .DS_Store
3
+ doc
4
+ pkg
5
+ bin
6
+ Gemfile.lock
7
+ .bundle
8
+ *.rbc
9
+ *.log
10
+ *.dot
11
+ *.svg
12
+ bundle
13
+ .bundle
14
+ specs.out
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --require spec_helper
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in uri_scanner.gemspec
4
+ gemspec
5
+
6
+ gem 'rspec', '~> 3.0'
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2016 Stas Kobzar
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,73 @@
1
+ # UriScanner
2
+
3
+ Simple library that parses URI or scans input text for URIs.
4
+ RFC3986 compliant. SIP URIs parsing implemented following RFC3261.
5
+
6
+ This library is based on [Ragel State Machine Compiler](http://www.colm.net/open-source/ragel/).
7
+ Ragel is great software created by Dr. Adrian D. Thurston.
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ ```ruby
14
+ gem 'uri_scanner'
15
+ ```
16
+
17
+ And then execute:
18
+
19
+ $ bundle
20
+
21
+ Or install it yourself as:
22
+
23
+ $ gem install uri_scanner
24
+
25
+ ## Usage
26
+
27
+ Start with:
28
+ ```ruby
29
+ require 'uri_scanner'
30
+ ```
31
+ There are only four core methods:
32
+
33
+ ```scan```: Scans text and returns an array of the URIs found
34
+ ```ruby
35
+ URIScanner.scan(text)
36
+ ```
37
+
38
+ ```parse_uri```: Parses a URI and returns an object that allows access to the URI segments.
39
+ Raises ```URIParserError```
40
+ ```ruby
41
+ uri = URIScanner.parse_uri(uri_string)
42
+ uri.scheme
43
+ uri.host
44
+ uri.port
45
+ uri.userinfo
46
+ uri.username
47
+ uri.password
48
+ uri.path
49
+ uri.query
50
+ uri.fragment
51
+ uri.param
52
+ uri.header
53
+ ```
54
+
55
+ ```scan_and_parse```: Same as ```scan```, but returns an array of parsed URI objects (see parse_uri)
56
+ ```ruby
57
+ URIScanner.scan_and_parse(text)
58
+ ```
59
+
60
+ ```is_ip_valid?```: Additional method that validates IPv4/IPv6 addresses (RFC3986 ABNF)
61
+ ```ruby
62
+ URIScanner.is_ip_valid?(ip_string)
63
+ ```
64
+
65
+ Check folder "example".
66
+
67
+ ## Contributing
68
+
69
+ 1. Fork it ( https://github.com/[my-github-username]/uri_scanner/fork )
70
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
71
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
72
+ 4. Push to the branch (`git push origin my-new-feature`)
73
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,35 @@
1
+ require 'rake'
2
+ require 'mkmf'
3
+ require 'rspec/core/rake_task'
4
+ require "bundler/gem_tasks"
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task :default do
9
+ Rake::Task['ragel'].invoke
10
+ Rake::Task['spec'].invoke
11
+ end
12
+
13
+ desc "Build ragel machines into ruby classes."
14
+ task :ragel do
15
+ ragel = find_executable('ragel')
16
+ raise ArgumentError, "Ragel executable not found" unless ragel
17
+ Dir["lib/uri_scanner/*.rl"].each do |file|
18
+ sh "#{ragel} -R #{file}"
19
+ end
20
+ end
21
+
22
+ desc "Create transition graphs."
23
+ task :graph do
24
+ format = "svg"
25
+ ragel = find_executable('ragel')
26
+ raise ArgumentError, "Ragel executable not found" unless ragel
27
+ dot = find_executable('dot')
28
+ raise ArgumentError, "Graphviz executable not found" unless dot
29
+ Dir["lib/uri_scanner/*.rl"].each do |file|
30
+ sh "#{ragel} -R #{file}"
31
+ sh "#{ragel} -Vp #{file} -o #{file}.dot"
32
+ sh "#{dot} #{file}.dot -T#{format} -o #{file}.#{format}"
33
+ sh "rm #{file}.dot"
34
+ end
35
+ end
data/example/parse.rb ADDED
@@ -0,0 +1,32 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Parse a single URI and print its segments
4
+ # How to use
5
+ # >$ gem install uri_scanner
6
+ # >$ ruby example/parse.rb URL
7
+ #
8
+ # Try:
9
+ # ruby example/parse.rb "foo://user:pass@example.com:8042/over/there?name=ferret#nose"
10
+ # ruby example/parse.rb "sips:alice:secretW0rd@gateway.com:5061;transport=udp;user=phone;method=REGISTER?subject=sales%20meeting&priority=urgent&to=sales%40city.com"
11
+ #
12
+
13
+ require 'uri_scanner'
14
+
15
+ begin
16
+ raise "Usage: #{__FILE__} URI" unless ARGV.first
17
+ uri = URIScanner.parse_uri ARGV.first
18
+ puts "URI #{ARGV.first} segments:"
19
+ puts "scheme: #{uri.scheme}"
20
+ puts "host: #{uri.host}"
21
+ puts "port: #{uri.port}"
22
+ puts "userinfo: #{uri.userinfo}"
23
+ puts "username: #{uri.username}"
24
+ puts "password: #{uri.password}"
25
+ puts "path: #{uri.path}"
26
+ puts "query: #{uri.query}"
27
+ puts "fragment: #{uri.fragment}"
28
+ puts "param: #{uri.param}"
29
+ puts "header: #{uri.header}"
30
+ rescue Exception => e
31
+ puts e.message
32
+ end
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ # Scan text and get all uris
4
+ # How to use
5
+ # >$ gem install uri_scanner
6
+ # >$ curl https://rubygems.org/ | ruby example/scanner.rb
7
+
8
+ require 'uri_scanner'
9
+
10
+ URIScanner.scan( $stdin.read ).each do |uri|
11
+ puts uri
12
+ end
13
+ # To parse to uri objects:
14
+ # URIScanner.scan_and_parse( $stdin.read ).each ...
@@ -0,0 +1,60 @@
1
+ # Ragel machine : IP address
2
+ # IPv4 and IPv6 addresses
3
+ %%{
4
+ machine ip_addr;
5
+
6
+ # Ragel machine: IPv4 addresses
7
+ # ===
8
+ # Implements RFC 3986 [Section 3.2.2]
9
+ # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
10
+ #
11
+ # dec-octet = DIGIT ; 0-9
12
+ # / %x31-39 DIGIT ; 10-99
13
+ # / "1" 2DIGIT ; 100-199
14
+ # / "2" %x30-34 DIGIT ; 200-249
15
+ # / "25" %x30-35 ; 250-255
16
+ #
17
+ # Additionally, leading zeros in an octet are accepted.
18
+ # For example 001.010.100.1 == 1.10.100.1
19
+ OCTET4 = ("0" | "00")? digit |
20
+ "0"? [1-9] digit |
21
+ "1" digit{2} |
22
+ "2" [0-4] digit |
23
+ "25" [0-5] ;
24
+
25
+ IPv4_ADDR = (OCTET4 "."){3} OCTET4;
26
+
27
+ # IPv6 addresses
28
+ # Implements RFC 3986 [Section 3.2.2]
29
+ # IPv6address = 6( h16 ":" ) ls32
30
+ # / "::" 5( h16 ":" ) ls32
31
+ # / [ h16 ] "::" 4( h16 ":" ) ls32
32
+ # / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
33
+ # / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
34
+ # / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
35
+ # / [ *4( h16 ":" ) h16 ] "::" ls32
36
+ # / [ *5( h16 ":" ) h16 ] "::" h16
37
+ # / [ *6( h16 ":" ) h16 ] "::"
38
+ #
39
+ # ls32 = ( h16 ":" h16 ) / IPv4address
40
+ # ; least-significant 32 bits of address
41
+ #
42
+ # h16 = 1*4HEXDIG
43
+ # ; 16 bits of address represented in hexadecimal
44
+ H16 = xdigit{1,4} ;
45
+ LS32 = (H16 ":" H16) | IPv4_ADDR ;
46
+ # Every bracketed prefix in the ABNF above is optional, hence the trailing ? on each prefix group.
47
+ IPv6_ADDR = (H16 ":"){6} LS32 |
48
+ "::" (H16 ":"){5} LS32 |
49
+ H16? "::" (H16 ":"){4} LS32 |
50
+ ((H16 ":"){,1} H16)? "::" (H16 ":"){3} LS32 |
51
+ ((H16 ":"){,2} H16)? "::" (H16 ":"){2} LS32 |
52
+ ((H16 ":"){,3} H16)? "::" H16 ":" LS32 |
53
+ ((H16 ":"){,4} H16)? "::" LS32 |
54
+ ((H16 ":"){,5} H16)? "::" H16 |
55
+ ((H16 ":"){,6} H16)? "::" ;
56
+
57
+ # IP address
58
+ IP_ADDR = IPv4_ADDR | IPv6_ADDR;
59
+
60
+ }%%
@@ -0,0 +1,49 @@
1
+ %%{
2
+ machine actions;
3
+
4
+ action mark_start{
5
+ mark = p
6
+ }
7
+
8
+ action fetch_scheme{
9
+ @scheme = data[0..p-1]
10
+ }
11
+
12
+ action fetch_host{
13
+ @host = data[mark..p-1]
14
+ }
15
+
16
+ action fetch_userinfo{
17
+ @userinfo = data[mark..p-1] # text between the mark and the current position
18
+ @username, @password = @userinfo.split(":", 2) # limit 2 so colons inside the password are preserved
19
+ }
20
+
21
+ action fetch_port{
22
+ @port = data[mark..p-1].to_i
23
+ }
24
+
25
+ action fetch_path{
26
+ @path = data[mark..p-1]
27
+ }
28
+
29
+ action fetch_query{
30
+ @query = data[mark..p-1]
31
+ }
32
+
33
+ action fetch_fragment{
34
+ @fragment = data[mark..p-1]
35
+ }
36
+
37
+ # Actions for SIP URI
38
+ action fetch_uri_params{
39
+ data[mark..p-1].split(";").each do |param|
40
+ next if param.empty?
41
+ k,v = param.split("=")
42
+ @param[k.to_sym] = v
43
+ end
44
+ }
45
+ action fetch_sipuri_header{
46
+ k, v = data[mark..p-1].split("=", 2) # limit 2 keeps an empty value as an empty string rather than nil
47
+ @header[k.to_sym] = URI::DEFAULT_PARSER.unescape(v) # URI.unescape was removed in Ruby 3.0
48
+ }
49
+ }%%
@@ -0,0 +1,52 @@
1
+ # Ragel machine : SIP URI
2
+ # RFC 3261
3
+ # The SIP URI scheme is not fully compatible with RFC 3986.
4
+ # http://www.ietf.org/mail-archive/web/sip/current/msg26338.html
5
+ # http://www.ietf.org/mail-archive/web/sip/current/msg26385.html
6
+ # What follows is the RFC 3261 ABNF.
7
+
8
+ %%{
9
+ machine sip_uri;
10
+
11
+ ESCAPED = PCT_ENC;
12
+ SIP_UNRESERVED = alnum | [\-_\.!~\*'()];
13
+ USER_UNRESERVED = [&=+$,;?/];
14
+ PARAM_UNRESERVED= [[\]/:&+$];
15
+ TOKEN = (alnum | [\-\.!%*_+`'~]){1,};
16
+ PARAMCHAR = PARAM_UNRESERVED | SIP_UNRESERVED | ESCAPED;
17
+ PNAME = PARAMCHAR{1,};
18
+ PVALUE = PARAMCHAR{1,};
19
+ HNV_UNRESERVED = [[\]/?:+$];
20
+ HNAME = (HNV_UNRESERVED | SIP_UNRESERVED | ESCAPED){1,};
21
+ HVALUE = (HNV_UNRESERVED | SIP_UNRESERVED | ESCAPED)*;
22
+ HEADER = HNAME >mark_start "=" HVALUE %fetch_sipuri_header;
23
+
24
+ TEL_SUBSCRIBER = zlen; # will implement in future
25
+
26
+ USER = ( SIP_UNRESERVED | ESCAPED | USER_UNRESERVED ){1,};
27
+ PASSWORD = ( SIP_UNRESERVED | ESCAPED | [&=+$,] )*;
28
+ SIP_UINFO = ( USER | TEL_SUBSCRIBER ) >mark_start ( ":" PASSWORD )? %fetch_userinfo "@";
29
+ DOMAINLABEL = alnum | (alnum (alnum | "-")* alnum);
30
+ TOPLABEL = alpha | (alpha (alnum | "-")* alnum);
31
+ HOSTNAME = (DOMAINLABEL ".")* TOPLABEL "."?;
32
+ SIPHOST = (HOSTNAME | IPv4_ADDR | IPv6_ADDR) >mark_start %fetch_host;
33
+
34
+ HOSTPORT = SIPHOST (":" digit{1,} >mark_start %fetch_port)? ;
35
+
36
+ OTHER_PARAM = PNAME ("=" PVALUE)?;
37
+ LR_PARAM = "lr";
38
+ MADDR_PARAM = "maddr=" SIPHOST;
39
+ TTL_PARAM = "ttl=" digit{1,3};
40
+ METHOD_PARAM = "method=" TOKEN{1,};
41
+ USER_PARAM = "user=" ( "phone" | "ip" | TOKEN);
42
+ TRANSPORT_PARAM = "transport=" ( "udp" | "tcp" | "sctp" | "tls" | TOKEN );
43
+ URI_PARAM = TRANSPORT_PARAM | USER_PARAM | METHOD_PARAM |
44
+ TTL_PARAM | MADDR_PARAM | LR_PARAM | OTHER_PARAM;
45
+
46
+ URI_PARAMS = ( ";" URI_PARAM )* >mark_start %fetch_uri_params;
47
+
48
+ HEADERS = "?" HEADER ("&" HEADER)*;
49
+
50
+ SIP_URI = ("sip"i [sS]?) %fetch_scheme ":"
51
+ SIP_UINFO? HOSTPORT URI_PARAMS HEADERS?;
52
+ }%%
@@ -0,0 +1,92 @@
1
+ # Ragel machine : URI
2
+ # RFC 3986
3
+ # foo://example.com:8042/over/there?name=ferret#nose
4
+ # \_/ \______________/\_________/ \_________/ \__/
5
+ # | | | | |
6
+ # scheme authority path query fragment
7
+ # | _____________________|__
8
+ # / \ / \
9
+ # urn:example:animal:ferret:nose
10
+
11
+ %%{
12
+ machine uri;
13
+
14
+ # Percent-Encoding. Example: "%20" (space)
15
+ PCT_ENC = "%" xdigit xdigit;
16
+
17
+ # These characters are called "reserved" because
18
+ # they may (or may not) be defined as delimiters by
19
+ # the generic syntax.
20
+ GEN_DELIMS = [:/?#[\]@];
21
+ SUB_DELIMS = [!$&'()*+,;=];
22
+ RESERVED = GEN_DELIMS | SUB_DELIMS;
23
+
24
+ # Characters that are allowed in a URI but do not
25
+ # have a reserved purpose.
26
+ UNRESERVED = alnum | [\-\._~];
27
+
28
+ # URI Scheme
29
+ # scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
30
+ SCHEME = alpha (alnum | [+\-\.])* %fetch_scheme;
31
+
32
+ # Hierarchical element for a naming authority
33
+ # authority = [ userinfo "@" ] host [ ":" port ]
34
+ USERINFO = (UNRESERVED | PCT_ENC | SUB_DELIMS | ":")* ;
35
+ IPv_FUTURE = "v" xdigit{1,} "." (UNRESERVED | SUB_DELIMS | ":"){1,};
36
+ IP_LITERAL = "[" (IPv6_ADDR | IPv_FUTURE) "]";
37
+ REG_NAME = (UNRESERVED | PCT_ENC | SUB_DELIMS)*;
38
+ HOST = (IP_LITERAL | IPv4_ADDR | REG_NAME) >mark_start %fetch_host;
39
+ PORT = digit*;
40
+ AUTHORITY = (USERINFO >mark_start %fetch_userinfo "@")? HOST (":" PORT >mark_start %fetch_port)?;
41
+
42
+ # Path RFC 3986 Section 3.3
43
+ PCHAR = UNRESERVED | PCT_ENC | SUB_DELIMS | ":" | "@";
44
+ SEG_NZ_NC = (UNRESERVED | PCT_ENC | SUB_DELIMS | "@"){1,};
45
+ SEG_NZ = PCHAR{1,};
46
+ SEGMENT = PCHAR*;
47
+ PATH_EMPTY = '\0';
48
+ PATH_ROOTLESS = (SEG_NZ ("/" SEGMENT)*) >mark_start %fetch_path;
49
+ PATH_NOSCHEME = (SEG_NZ_NC ("/" SEGMENT)*) >mark_start %fetch_path;
50
+ PATH_ABSOLUTE = ("/" (SEG_NZ ("/" SEGMENT)*)?) >mark_start %fetch_path;
51
+ PATH_ABEMPTY = ("/" SEGMENT)* >mark_start %fetch_path;
52
+
53
+ PATH = PATH_ABEMPTY | # begins with "/" or is empty
54
+ PATH_ABSOLUTE | # begins with "/" but not "//"
55
+ PATH_NOSCHEME | # begins with a non-colon segment
56
+ PATH_ROOTLESS | # begins with a segment
57
+ PATH_EMPTY ; # zero characters
58
+
59
+ # The query component contains non-hierarchical data
60
+ # Section 3.4
61
+ QUERY = (PCHAR | "/" | "?")* >mark_start %fetch_query;
62
+
63
+ # Fragment
64
+ # Section 3.5
65
+ FRAGMENT = (PCHAR | "/" | "?")* >mark_start %fetch_fragment;
66
+
67
+ # Relative Reference
68
+ # Section 4.2
69
+ REL_PART = ("//" AUTHORITY PATH_ABEMPTY) |
70
+ PATH_ABSOLUTE |
71
+ PATH_NOSCHEME |
72
+ PATH_EMPTY ;
73
+
74
+ REL_REF = REL_PART ("?" QUERY)? ("#" FRAGMENT)?;
75
+
76
+ # Absolute URI
77
+ # Section 4.3
78
+ HIER_PART = ("//" AUTHORITY PATH_ABEMPTY) |
79
+ PATH_ABSOLUTE |
80
+ PATH_ROOTLESS |
81
+ PATH_EMPTY;
82
+
83
+ URI_ABS = SCHEME ":" HIER_PART ("?" QUERY)?;
84
+
85
+ # URI Reference
86
+ # Section 4.1; Section 3
87
+ URI = SCHEME ":" HIER_PART ("?" QUERY)? ("#" FRAGMENT)?;
88
+
89
+ # generic
90
+ URI_REF = URI | REL_REF;
91
+
92
+ }%%