domain_prefix 0.1.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +20 -0
- data/Rakefile +7 -0
- data/VERSION +1 -1
- data/data/effective_tld_names.dat +2027 -275
- data/domain_prefix.gemspec +6 -6
- data/lib/domain_prefix.rb +97 -62
- data/test/sample/test.txt +49 -47
- data/test/{test_tldifier.rb → test_domain_prefix.rb} +8 -6
- metadata +6 -6
- data/README.rdoc +0 -18
data/domain_prefix.gemspec
CHANGED
@@ -5,21 +5,21 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "domain_prefix"
|
8
|
-
s.version = "0.1
|
8
|
+
s.version = "0.2.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["tadman"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-12-12"
|
13
13
|
s.description = "A library to extract information about top-level domain and registered name from generic and international domain names"
|
14
14
|
s.email = "github@tadman.ca"
|
15
15
|
s.extra_rdoc_files = [
|
16
16
|
"LICENSE",
|
17
|
-
"README.
|
17
|
+
"README.md"
|
18
18
|
]
|
19
19
|
s.files = [
|
20
20
|
".document",
|
21
21
|
"LICENSE",
|
22
|
-
"README.
|
22
|
+
"README.md",
|
23
23
|
"Rakefile",
|
24
24
|
"VERSION",
|
25
25
|
"data/effective_tld_names.dat",
|
@@ -28,11 +28,11 @@ Gem::Specification.new do |s|
|
|
28
28
|
"test/helper.rb",
|
29
29
|
"test/sample/README",
|
30
30
|
"test/sample/test.txt",
|
31
|
-
"test/
|
31
|
+
"test/test_domain_prefix.rb"
|
32
32
|
]
|
33
33
|
s.homepage = "http://github.com/twg/domain_prefix"
|
34
34
|
s.require_paths = ["lib"]
|
35
|
-
s.rubygems_version = "1.8.
|
35
|
+
s.rubygems_version = "1.8.24"
|
36
36
|
s.summary = "Domain Prefix Extraction Library"
|
37
37
|
|
38
38
|
if s.respond_to? :specification_version then
|
data/lib/domain_prefix.rb
CHANGED
@@ -1,17 +1,62 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
|
3
3
|
module DomainPrefix
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
4
|
+
SEPARATOR = '.'.freeze
|
5
|
+
|
6
|
+
class Tree < Hash
|
7
|
+
def insert(path)
|
8
|
+
leaf = path.split(SEPARATOR).reverse.inject(self) do |tree, component|
|
9
|
+
# Seeds an element into the tree structure by referencing it
|
10
|
+
tree[component.sub(/^!/, '')] ||= Tree.new
|
11
|
+
end
|
12
|
+
|
13
|
+
if (path.match(/^[\!]/))
|
14
|
+
leaf[:required] = 0
|
15
|
+
else
|
16
|
+
leaf[:required] = 1
|
8
17
|
end
|
18
|
+
|
19
|
+
self
|
9
20
|
end
|
10
21
|
|
11
|
-
def
|
12
|
-
|
13
|
-
|
22
|
+
def follow(path)
|
23
|
+
path = path.to_s.split(SEPARATOR) unless (path.is_a?(Array))
|
24
|
+
path = path.reverse
|
25
|
+
|
26
|
+
index = traverse(path)
|
27
|
+
|
28
|
+
index and index <= path.length and path[0, index].reverse
|
29
|
+
end
|
30
|
+
|
31
|
+
protected
|
32
|
+
def traverse(path, index = 0)
|
33
|
+
component = path[index]
|
34
|
+
|
35
|
+
unless (component)
|
36
|
+
return self[:required] == 0 ? index : nil
|
37
|
+
end
|
38
|
+
|
39
|
+
named_branch = self[component]
|
40
|
+
|
41
|
+
if (named_branch)
|
42
|
+
result = named_branch.traverse(path, index + 1)
|
43
|
+
|
44
|
+
return result if (result)
|
45
|
+
end
|
46
|
+
|
47
|
+
wildcard_branch = self["*"]
|
48
|
+
|
49
|
+
if (wildcard_branch)
|
50
|
+
result = wildcard_branch.traverse(path, index + 1)
|
51
|
+
|
52
|
+
return result if (result)
|
53
|
+
end
|
54
|
+
|
55
|
+
if (!named_branch and !wildcard_branch and self[:required])
|
56
|
+
return index + self[:required]
|
14
57
|
end
|
58
|
+
|
59
|
+
return
|
15
60
|
end
|
16
61
|
end
|
17
62
|
|
@@ -25,40 +70,9 @@ module DomainPrefix
|
|
25
70
|
[ -d.length, d ]
|
26
71
|
end.freeze
|
27
72
|
|
28
|
-
TLD_TREE = TLD_NAMES.inject(
|
29
|
-
|
30
|
-
case (component)
|
31
|
-
when '*'
|
32
|
-
_h
|
33
|
-
when /!(.*)/
|
34
|
-
_h[$1]
|
35
|
-
else
|
36
|
-
_h[component]
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
h
|
73
|
+
TLD_TREE = TLD_NAMES.inject(Tree.new) do |t, name|
|
74
|
+
t.insert(name)
|
41
75
|
end.freeze
|
42
|
-
|
43
|
-
PREFIX_SPEC = Regexp.new(
|
44
|
-
'^(' + TLD_NAMES.collect do |d|
|
45
|
-
Regexp.escape(d).sub(/^\\\*\\\./, '')
|
46
|
-
end.join('|') + ')$'
|
47
|
-
).freeze
|
48
|
-
|
49
|
-
ALLOWED_DOMAIN_PREFIXES = Hash[
|
50
|
-
TLD_NAMES.select do |d|
|
51
|
-
d.match(/^\!/)
|
52
|
-
end.collect do |d|
|
53
|
-
[ d.sub(/^\!/, ''), true ]
|
54
|
-
end
|
55
|
-
].freeze
|
56
|
-
|
57
|
-
DOMAIN_PREFIX_SPEC = Regexp.new(
|
58
|
-
'^(?:[^\.]+\.)*?(([^\.]+)\.(' + TLD_NAMES.collect do |d|
|
59
|
-
Regexp.escape(d).sub(/^\\\*\\\./, '[^\.]+\.')
|
60
|
-
end.join('|') + '))$'
|
61
|
-
).freeze
|
62
76
|
|
63
77
|
NONPUBLIC_TLD = {
|
64
78
|
'local' => true
|
@@ -69,43 +83,64 @@ module DomainPrefix
|
|
69
83
|
domain and domain.downcase
|
70
84
|
end
|
71
85
|
|
72
|
-
def
|
73
|
-
|
86
|
+
def public_tld?(tld)
|
87
|
+
!NONPUBLIC_TLD.key?(tld)
|
88
|
+
end
|
89
|
+
|
90
|
+
def registered_domain(domain, rules = :strict)
|
91
|
+
return unless (domain)
|
74
92
|
|
75
|
-
|
76
|
-
|
77
|
-
domain = m[1]
|
78
|
-
suffix = m[3]
|
93
|
+
components = rfc3492_canonical_domain(domain).split(SEPARATOR)
|
79
94
|
|
80
|
-
return if (
|
81
|
-
|
95
|
+
return if (components.empty? or components.find(&:empty?))
|
96
|
+
|
97
|
+
if (rules == :strict)
|
98
|
+
return unless (self.public_tld?(components.last))
|
99
|
+
end
|
100
|
+
|
101
|
+
suffix = TLD_TREE.follow(components)
|
102
|
+
|
103
|
+
unless (suffix)
|
104
|
+
if (rules == :relaxed and components.length >= 2 and !TLD_TREE[components[-1]])
|
105
|
+
return components.last(2).join(SEPARATOR)
|
106
|
+
else
|
107
|
+
return
|
108
|
+
end
|
109
|
+
end
|
82
110
|
|
83
|
-
|
111
|
+
suffix.join(SEPARATOR)
|
84
112
|
end
|
85
113
|
|
86
114
|
def public_suffix(domain)
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
115
|
+
return unless (domain)
|
116
|
+
|
117
|
+
components = rfc3492_canonical_domain(domain).split(SEPARATOR)
|
118
|
+
|
119
|
+
return if (components.empty? or components.find(&:empty?))
|
120
|
+
|
121
|
+
return unless (public_tld?(components.last))
|
122
|
+
|
123
|
+
suffix = TLD_TREE.follow(components)
|
95
124
|
|
96
|
-
suffix
|
125
|
+
return unless (suffix)
|
126
|
+
|
127
|
+
suffix.shift
|
128
|
+
|
129
|
+
suffix.join(SEPARATOR)
|
97
130
|
end
|
98
131
|
|
99
132
|
def tld(domain)
|
100
133
|
suffix = public_suffix(rfc3492_canonical_domain(domain))
|
101
134
|
|
102
|
-
suffix and suffix.split(
|
135
|
+
suffix and suffix.split(SEPARATOR).last
|
103
136
|
end
|
104
137
|
|
105
138
|
def name(domain)
|
106
|
-
|
107
|
-
|
108
|
-
|
139
|
+
if (domain = registered_domain(domain))
|
140
|
+
domain.split(SEPARATOR).first
|
141
|
+
else
|
142
|
+
nil
|
143
|
+
end
|
109
144
|
end
|
110
145
|
|
111
146
|
extend self
|
data/test/sample/test.txt
CHANGED
@@ -1,76 +1,78 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
// Any copyright is dedicated to the Public Domain.
|
2
|
+
// http://creativecommons.org/publicdomain/zero/1.0/
|
3
3
|
|
4
|
-
|
5
|
-
checkPublicSuffix(
|
6
|
-
|
7
|
-
checkPublicSuffix('COM',
|
4
|
+
// null input.
|
5
|
+
checkPublicSuffix(null, null);
|
6
|
+
// Mixed case.
|
7
|
+
checkPublicSuffix('COM', null);
|
8
8
|
checkPublicSuffix('example.COM', 'example.com');
|
9
9
|
checkPublicSuffix('WwW.example.COM', 'example.com');
|
10
|
-
|
11
|
-
checkPublicSuffix('.com',
|
12
|
-
checkPublicSuffix('.example',
|
13
|
-
checkPublicSuffix('.example.com',
|
14
|
-
checkPublicSuffix('.example.example',
|
15
|
-
|
16
|
-
checkPublicSuffix('example',
|
17
|
-
checkPublicSuffix('example.example',
|
18
|
-
checkPublicSuffix('b.example.example',
|
19
|
-
checkPublicSuffix('a.b.example.example',
|
20
|
-
|
21
|
-
checkPublicSuffix('local',
|
22
|
-
checkPublicSuffix('example.local',
|
23
|
-
checkPublicSuffix('b.example.local',
|
24
|
-
checkPublicSuffix('a.b.example.local',
|
25
|
-
|
26
|
-
checkPublicSuffix('biz',
|
10
|
+
// Leading dot.
|
11
|
+
checkPublicSuffix('.com', null);
|
12
|
+
checkPublicSuffix('.example', null);
|
13
|
+
checkPublicSuffix('.example.com', null);
|
14
|
+
checkPublicSuffix('.example.example', null);
|
15
|
+
// Unlisted TLD.
|
16
|
+
checkPublicSuffix('example', null);
|
17
|
+
checkPublicSuffix('example.example', 'example.example');
|
18
|
+
checkPublicSuffix('b.example.example', 'example.example');
|
19
|
+
checkPublicSuffix('a.b.example.example', 'example.example');
|
20
|
+
// Listed, but non-Internet, TLD.
|
21
|
+
//checkPublicSuffix('local', null);
|
22
|
+
//checkPublicSuffix('example.local', null);
|
23
|
+
//checkPublicSuffix('b.example.local', null);
|
24
|
+
//checkPublicSuffix('a.b.example.local', null);
|
25
|
+
// TLD with only 1 rule.
|
26
|
+
checkPublicSuffix('biz', null);
|
27
27
|
checkPublicSuffix('domain.biz', 'domain.biz');
|
28
28
|
checkPublicSuffix('b.domain.biz', 'domain.biz');
|
29
29
|
checkPublicSuffix('a.b.domain.biz', 'domain.biz');
|
30
|
-
|
31
|
-
checkPublicSuffix('com',
|
30
|
+
// TLD with some 2-level rules.
|
31
|
+
checkPublicSuffix('com', null);
|
32
32
|
checkPublicSuffix('example.com', 'example.com');
|
33
33
|
checkPublicSuffix('b.example.com', 'example.com');
|
34
34
|
checkPublicSuffix('a.b.example.com', 'example.com');
|
35
|
-
checkPublicSuffix('uk.com',
|
35
|
+
checkPublicSuffix('uk.com', null);
|
36
36
|
checkPublicSuffix('example.uk.com', 'example.uk.com');
|
37
37
|
checkPublicSuffix('b.example.uk.com', 'example.uk.com');
|
38
38
|
checkPublicSuffix('a.b.example.uk.com', 'example.uk.com');
|
39
39
|
checkPublicSuffix('test.ac', 'test.ac');
|
40
|
-
|
41
|
-
checkPublicSuffix('cy',
|
42
|
-
checkPublicSuffix('c.cy',
|
40
|
+
// TLD with only 1 (wildcard) rule.
|
41
|
+
checkPublicSuffix('cy', null);
|
42
|
+
checkPublicSuffix('c.cy', null);
|
43
43
|
checkPublicSuffix('b.c.cy', 'b.c.cy');
|
44
44
|
checkPublicSuffix('a.b.c.cy', 'b.c.cy');
|
45
|
-
|
46
|
-
checkPublicSuffix('jp',
|
45
|
+
// More complex TLD.
|
46
|
+
checkPublicSuffix('jp', null);
|
47
47
|
checkPublicSuffix('test.jp', 'test.jp');
|
48
48
|
checkPublicSuffix('www.test.jp', 'test.jp');
|
49
|
-
checkPublicSuffix('ac.jp',
|
49
|
+
checkPublicSuffix('ac.jp', null);
|
50
50
|
checkPublicSuffix('test.ac.jp', 'test.ac.jp');
|
51
51
|
checkPublicSuffix('www.test.ac.jp', 'test.ac.jp');
|
52
|
-
checkPublicSuffix('kyoto.jp',
|
53
|
-
checkPublicSuffix('
|
54
|
-
checkPublicSuffix('
|
55
|
-
checkPublicSuffix('
|
56
|
-
checkPublicSuffix('
|
57
|
-
checkPublicSuffix('
|
58
|
-
checkPublicSuffix('
|
59
|
-
checkPublicSuffix('
|
60
|
-
|
61
|
-
checkPublicSuffix('
|
62
|
-
|
52
|
+
checkPublicSuffix('kyoto.jp', null);
|
53
|
+
checkPublicSuffix('test.kyoto.jp', 'test.kyoto.jp');
|
54
|
+
checkPublicSuffix('ide.kyoto.jp', null);
|
55
|
+
checkPublicSuffix('b.ide.kyoto.jp', 'b.ide.kyoto.jp');
|
56
|
+
checkPublicSuffix('a.b.ide.kyoto.jp', 'b.ide.kyoto.jp');
|
57
|
+
checkPublicSuffix('c.kobe.jp', null);
|
58
|
+
checkPublicSuffix('b.c.kobe.jp', 'b.c.kobe.jp');
|
59
|
+
checkPublicSuffix('a.b.c.kobe.jp', 'b.c.kobe.jp');
|
60
|
+
checkPublicSuffix('city.kobe.jp', 'city.kobe.jp');
|
61
|
+
checkPublicSuffix('www.city.kobe.jp', 'city.kobe.jp');
|
62
|
+
// TLD with a wildcard rule and exceptions.
|
63
|
+
checkPublicSuffix('om', null);
|
64
|
+
checkPublicSuffix('test.om', null);
|
63
65
|
checkPublicSuffix('b.test.om', 'b.test.om');
|
64
66
|
checkPublicSuffix('a.b.test.om', 'b.test.om');
|
65
67
|
checkPublicSuffix('songfest.om', 'songfest.om');
|
66
68
|
checkPublicSuffix('www.songfest.om', 'songfest.om');
|
67
|
-
|
68
|
-
checkPublicSuffix('us',
|
69
|
+
// US K12.
|
70
|
+
checkPublicSuffix('us', null);
|
69
71
|
checkPublicSuffix('test.us', 'test.us');
|
70
72
|
checkPublicSuffix('www.test.us', 'test.us');
|
71
|
-
checkPublicSuffix('ak.us',
|
73
|
+
checkPublicSuffix('ak.us', null);
|
72
74
|
checkPublicSuffix('test.ak.us', 'test.ak.us');
|
73
75
|
checkPublicSuffix('www.test.ak.us', 'test.ak.us');
|
74
|
-
checkPublicSuffix('k12.ak.us',
|
76
|
+
checkPublicSuffix('k12.ak.us', null);
|
75
77
|
checkPublicSuffix('test.k12.ak.us', 'test.k12.ak.us');
|
76
78
|
checkPublicSuffix('www.test.k12.ak.us', 'test.k12.ak.us');
|
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
require_relative 'helper'
|
2
2
|
|
3
3
|
class TestDomainPrefix < Test::Unit::TestCase
|
4
4
|
def test_initialization
|
@@ -14,11 +14,11 @@ class TestDomainPrefix < Test::Unit::TestCase
|
|
14
14
|
'example.on.ca' => %w[ example.on.ca on.ca ],
|
15
15
|
'example.gc.ca' => %w[ example.gc.ca gc.ca ],
|
16
16
|
'example.co.uk' => %w[ example.co.uk co.uk ],
|
17
|
-
'example.au' => [ nil, nil ],
|
18
17
|
'example.com.au' => %w[ example.com.au com.au ],
|
18
|
+
'example.au' => [ nil, nil ],
|
19
19
|
'example.bar.jp' => %w[ bar.jp jp ],
|
20
|
-
'example.bar.hokkaido.jp' =>%w[
|
21
|
-
'example.metro.tokyo.jp' => %w[
|
20
|
+
'example.bar.hokkaido.jp' =>%w[ bar.hokkaido.jp hokkaido.jp ],
|
21
|
+
'example.metro.tokyo.jp' => %w[ metro.tokyo.jp tokyo.jp ]
|
22
22
|
) do |domain|
|
23
23
|
[
|
24
24
|
DomainPrefix.registered_domain(domain),
|
@@ -29,11 +29,13 @@ class TestDomainPrefix < Test::Unit::TestCase
|
|
29
29
|
|
30
30
|
def test_public_suffix_samples
|
31
31
|
sample_data('test.txt').split(/\n/).collect do |line|
|
32
|
+
line.sub!(/\/\/.*/, '')
|
33
|
+
|
32
34
|
case (line)
|
33
35
|
when /checkPublicSuffix\((\S+),\s*(\S+)\)/
|
34
36
|
[ $1, $2 ].collect do |part|
|
35
37
|
case (part)
|
36
|
-
when 'NULL'
|
38
|
+
when 'NULL', 'null'
|
37
39
|
nil
|
38
40
|
else
|
39
41
|
part.gsub(/'/, '')
|
@@ -43,7 +45,7 @@ class TestDomainPrefix < Test::Unit::TestCase
|
|
43
45
|
nil
|
44
46
|
end
|
45
47
|
end.each do |domain, expected|
|
46
|
-
assert_equal expected, DomainPrefix.registered_domain(domain), "#{domain.inspect} -> #{expected.inspect}"
|
48
|
+
assert_equal expected, DomainPrefix.registered_domain(domain, :relaxed), "#{domain.inspect} -> #{expected.inspect}"
|
47
49
|
end
|
48
50
|
end
|
49
51
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: domain_prefix
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1
|
4
|
+
version: 0.2.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-12-12 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: A library to extract information about top-level domain and registered
|
15
15
|
name from generic and international domain names
|
@@ -18,11 +18,11 @@ executables: []
|
|
18
18
|
extensions: []
|
19
19
|
extra_rdoc_files:
|
20
20
|
- LICENSE
|
21
|
-
- README.
|
21
|
+
- README.md
|
22
22
|
files:
|
23
23
|
- .document
|
24
24
|
- LICENSE
|
25
|
-
- README.
|
25
|
+
- README.md
|
26
26
|
- Rakefile
|
27
27
|
- VERSION
|
28
28
|
- data/effective_tld_names.dat
|
@@ -31,7 +31,7 @@ files:
|
|
31
31
|
- test/helper.rb
|
32
32
|
- test/sample/README
|
33
33
|
- test/sample/test.txt
|
34
|
-
- test/
|
34
|
+
- test/test_domain_prefix.rb
|
35
35
|
homepage: http://github.com/twg/domain_prefix
|
36
36
|
licenses: []
|
37
37
|
post_install_message:
|
@@ -52,7 +52,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
52
52
|
version: '0'
|
53
53
|
requirements: []
|
54
54
|
rubyforge_project:
|
55
|
-
rubygems_version: 1.8.
|
55
|
+
rubygems_version: 1.8.24
|
56
56
|
signing_key:
|
57
57
|
specification_version: 3
|
58
58
|
summary: Domain Prefix Extraction Library
|
data/README.rdoc
DELETED
@@ -1,18 +0,0 @@
|
|
1
|
-
= tldifier
|
2
|
-
|
3
|
-
Description goes here.
|
4
|
-
|
5
|
-
== Note on Patches/Pull Requests
|
6
|
-
|
7
|
-
* Fork the project.
|
8
|
-
* Make your feature addition or bug fix.
|
9
|
-
* Add tests for it. This is important so I don't break it in a
|
10
|
-
future version unintentionally.
|
11
|
-
* Commit, do not mess with rakefile, version, or history.
|
12
|
-
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
13
|
-
* Send me a pull request. Bonus points for topic branches.
|
14
|
-
|
15
|
-
== Copyright
|
16
|
-
|
17
|
-
Copyright (c) 2009-2012 Scott Tadman, The Working Group Inc.
|
18
|
-
See LICENSE for details.
|