domain_prefix 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +20 -0
- data/Rakefile +7 -0
- data/VERSION +1 -1
- data/data/effective_tld_names.dat +2027 -275
- data/domain_prefix.gemspec +6 -6
- data/lib/domain_prefix.rb +97 -62
- data/test/sample/test.txt +49 -47
- data/test/{test_tldifier.rb → test_domain_prefix.rb} +8 -6
- metadata +6 -6
- data/README.rdoc +0 -18
data/domain_prefix.gemspec
CHANGED
@@ -5,21 +5,21 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "domain_prefix"
|
8
|
-
s.version = "0.1
|
8
|
+
s.version = "0.2.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["tadman"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-12-12"
|
13
13
|
s.description = "A library to extract information about top-level domain and registered name from generic and international domain names"
|
14
14
|
s.email = "github@tadman.ca"
|
15
15
|
s.extra_rdoc_files = [
|
16
16
|
"LICENSE",
|
17
|
-
"README.
|
17
|
+
"README.md"
|
18
18
|
]
|
19
19
|
s.files = [
|
20
20
|
".document",
|
21
21
|
"LICENSE",
|
22
|
-
"README.
|
22
|
+
"README.md",
|
23
23
|
"Rakefile",
|
24
24
|
"VERSION",
|
25
25
|
"data/effective_tld_names.dat",
|
@@ -28,11 +28,11 @@ Gem::Specification.new do |s|
|
|
28
28
|
"test/helper.rb",
|
29
29
|
"test/sample/README",
|
30
30
|
"test/sample/test.txt",
|
31
|
-
"test/
|
31
|
+
"test/test_domain_prefix.rb"
|
32
32
|
]
|
33
33
|
s.homepage = "http://github.com/twg/domain_prefix"
|
34
34
|
s.require_paths = ["lib"]
|
35
|
-
s.rubygems_version = "1.8.
|
35
|
+
s.rubygems_version = "1.8.24"
|
36
36
|
s.summary = "Domain Prefix Extraction Library"
|
37
37
|
|
38
38
|
if s.respond_to? :specification_version then
|
data/lib/domain_prefix.rb
CHANGED
@@ -1,17 +1,62 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
|
3
3
|
module DomainPrefix
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
4
|
+
SEPARATOR = '.'.freeze
|
5
|
+
|
6
|
+
class Tree < Hash
|
7
|
+
def insert(path)
|
8
|
+
leaf = path.split(SEPARATOR).reverse.inject(self) do |tree, component|
|
9
|
+
# Seeds an element into the tree structure by referencing it
|
10
|
+
tree[component.sub(/^!/, '')] ||= Tree.new
|
11
|
+
end
|
12
|
+
|
13
|
+
if (path.match(/^[\!]/))
|
14
|
+
leaf[:required] = 0
|
15
|
+
else
|
16
|
+
leaf[:required] = 1
|
8
17
|
end
|
18
|
+
|
19
|
+
self
|
9
20
|
end
|
10
21
|
|
11
|
-
def
|
12
|
-
|
13
|
-
|
22
|
+
def follow(path)
|
23
|
+
path = path.to_s.split(SEPARATOR) unless (path.is_a?(Array))
|
24
|
+
path = path.reverse
|
25
|
+
|
26
|
+
index = traverse(path)
|
27
|
+
|
28
|
+
index and index <= path.length and path[0, index].reverse
|
29
|
+
end
|
30
|
+
|
31
|
+
protected
|
32
|
+
def traverse(path, index = 0)
|
33
|
+
component = path[index]
|
34
|
+
|
35
|
+
unless (component)
|
36
|
+
return self[:required] == 0 ? index : nil
|
37
|
+
end
|
38
|
+
|
39
|
+
named_branch = self[component]
|
40
|
+
|
41
|
+
if (named_branch)
|
42
|
+
result = named_branch.traverse(path, index + 1)
|
43
|
+
|
44
|
+
return result if (result)
|
45
|
+
end
|
46
|
+
|
47
|
+
wildcard_branch = self["*"]
|
48
|
+
|
49
|
+
if (wildcard_branch)
|
50
|
+
result = wildcard_branch.traverse(path, index + 1)
|
51
|
+
|
52
|
+
return result if (result)
|
53
|
+
end
|
54
|
+
|
55
|
+
if (!named_branch and !wildcard_branch and self[:required])
|
56
|
+
return index + self[:required]
|
14
57
|
end
|
58
|
+
|
59
|
+
return
|
15
60
|
end
|
16
61
|
end
|
17
62
|
|
@@ -25,40 +70,9 @@ module DomainPrefix
|
|
25
70
|
[ -d.length, d ]
|
26
71
|
end.freeze
|
27
72
|
|
28
|
-
TLD_TREE = TLD_NAMES.inject(
|
29
|
-
|
30
|
-
case (component)
|
31
|
-
when '*'
|
32
|
-
_h
|
33
|
-
when /!(.*)/
|
34
|
-
_h[$1]
|
35
|
-
else
|
36
|
-
_h[component]
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
h
|
73
|
+
TLD_TREE = TLD_NAMES.inject(Tree.new) do |t, name|
|
74
|
+
t.insert(name)
|
41
75
|
end.freeze
|
42
|
-
|
43
|
-
PREFIX_SPEC = Regexp.new(
|
44
|
-
'^(' + TLD_NAMES.collect do |d|
|
45
|
-
Regexp.escape(d).sub(/^\\\*\\\./, '')
|
46
|
-
end.join('|') + ')$'
|
47
|
-
).freeze
|
48
|
-
|
49
|
-
ALLOWED_DOMAIN_PREFIXES = Hash[
|
50
|
-
TLD_NAMES.select do |d|
|
51
|
-
d.match(/^\!/)
|
52
|
-
end.collect do |d|
|
53
|
-
[ d.sub(/^\!/, ''), true ]
|
54
|
-
end
|
55
|
-
].freeze
|
56
|
-
|
57
|
-
DOMAIN_PREFIX_SPEC = Regexp.new(
|
58
|
-
'^(?:[^\.]+\.)*?(([^\.]+)\.(' + TLD_NAMES.collect do |d|
|
59
|
-
Regexp.escape(d).sub(/^\\\*\\\./, '[^\.]+\.')
|
60
|
-
end.join('|') + '))$'
|
61
|
-
).freeze
|
62
76
|
|
63
77
|
NONPUBLIC_TLD = {
|
64
78
|
'local' => true
|
@@ -69,43 +83,64 @@ module DomainPrefix
|
|
69
83
|
domain and domain.downcase
|
70
84
|
end
|
71
85
|
|
72
|
-
def
|
73
|
-
|
86
|
+
def public_tld?(tld)
|
87
|
+
!NONPUBLIC_TLD.key?(tld)
|
88
|
+
end
|
89
|
+
|
90
|
+
def registered_domain(domain, rules = :strict)
|
91
|
+
return unless (domain)
|
74
92
|
|
75
|
-
|
76
|
-
|
77
|
-
domain = m[1]
|
78
|
-
suffix = m[3]
|
93
|
+
components = rfc3492_canonical_domain(domain).split(SEPARATOR)
|
79
94
|
|
80
|
-
return if (
|
81
|
-
|
95
|
+
return if (components.empty? or components.find(&:empty?))
|
96
|
+
|
97
|
+
if (rules == :strict)
|
98
|
+
return unless (self.public_tld?(components.last))
|
99
|
+
end
|
100
|
+
|
101
|
+
suffix = TLD_TREE.follow(components)
|
102
|
+
|
103
|
+
unless (suffix)
|
104
|
+
if (rules == :relaxed and components.length >= 2 and !TLD_TREE[components[-1]])
|
105
|
+
return components.last(2).join(SEPARATOR)
|
106
|
+
else
|
107
|
+
return
|
108
|
+
end
|
109
|
+
end
|
82
110
|
|
83
|
-
|
111
|
+
suffix.join(SEPARATOR)
|
84
112
|
end
|
85
113
|
|
86
114
|
def public_suffix(domain)
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
115
|
+
return unless (domain)
|
116
|
+
|
117
|
+
components = rfc3492_canonical_domain(domain).split(SEPARATOR)
|
118
|
+
|
119
|
+
return if (components.empty? or components.find(&:empty?))
|
120
|
+
|
121
|
+
return unless (public_tld?(components.last))
|
122
|
+
|
123
|
+
suffix = TLD_TREE.follow(components)
|
95
124
|
|
96
|
-
suffix
|
125
|
+
return unless (suffix)
|
126
|
+
|
127
|
+
suffix.shift
|
128
|
+
|
129
|
+
suffix.join(SEPARATOR)
|
97
130
|
end
|
98
131
|
|
99
132
|
def tld(domain)
|
100
133
|
suffix = public_suffix(rfc3492_canonical_domain(domain))
|
101
134
|
|
102
|
-
suffix and suffix.split(
|
135
|
+
suffix and suffix.split(SEPARATOR).last
|
103
136
|
end
|
104
137
|
|
105
138
|
def name(domain)
|
106
|
-
|
107
|
-
|
108
|
-
|
139
|
+
if (domain = registered_domain(domain))
|
140
|
+
domain.split(SEPARATOR).first
|
141
|
+
else
|
142
|
+
nil
|
143
|
+
end
|
109
144
|
end
|
110
145
|
|
111
146
|
extend self
|
data/test/sample/test.txt
CHANGED
@@ -1,76 +1,78 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
// Any copyright is dedicated to the Public Domain.
|
2
|
+
// http://creativecommons.org/publicdomain/zero/1.0/
|
3
3
|
|
4
|
-
|
5
|
-
checkPublicSuffix(
|
6
|
-
|
7
|
-
checkPublicSuffix('COM',
|
4
|
+
// null input.
|
5
|
+
checkPublicSuffix(null, null);
|
6
|
+
// Mixed case.
|
7
|
+
checkPublicSuffix('COM', null);
|
8
8
|
checkPublicSuffix('example.COM', 'example.com');
|
9
9
|
checkPublicSuffix('WwW.example.COM', 'example.com');
|
10
|
-
|
11
|
-
checkPublicSuffix('.com',
|
12
|
-
checkPublicSuffix('.example',
|
13
|
-
checkPublicSuffix('.example.com',
|
14
|
-
checkPublicSuffix('.example.example',
|
15
|
-
|
16
|
-
checkPublicSuffix('example',
|
17
|
-
checkPublicSuffix('example.example',
|
18
|
-
checkPublicSuffix('b.example.example',
|
19
|
-
checkPublicSuffix('a.b.example.example',
|
20
|
-
|
21
|
-
checkPublicSuffix('local',
|
22
|
-
checkPublicSuffix('example.local',
|
23
|
-
checkPublicSuffix('b.example.local',
|
24
|
-
checkPublicSuffix('a.b.example.local',
|
25
|
-
|
26
|
-
checkPublicSuffix('biz',
|
10
|
+
// Leading dot.
|
11
|
+
checkPublicSuffix('.com', null);
|
12
|
+
checkPublicSuffix('.example', null);
|
13
|
+
checkPublicSuffix('.example.com', null);
|
14
|
+
checkPublicSuffix('.example.example', null);
|
15
|
+
// Unlisted TLD.
|
16
|
+
checkPublicSuffix('example', null);
|
17
|
+
checkPublicSuffix('example.example', 'example.example');
|
18
|
+
checkPublicSuffix('b.example.example', 'example.example');
|
19
|
+
checkPublicSuffix('a.b.example.example', 'example.example');
|
20
|
+
// Listed, but non-Internet, TLD.
|
21
|
+
//checkPublicSuffix('local', null);
|
22
|
+
//checkPublicSuffix('example.local', null);
|
23
|
+
//checkPublicSuffix('b.example.local', null);
|
24
|
+
//checkPublicSuffix('a.b.example.local', null);
|
25
|
+
// TLD with only 1 rule.
|
26
|
+
checkPublicSuffix('biz', null);
|
27
27
|
checkPublicSuffix('domain.biz', 'domain.biz');
|
28
28
|
checkPublicSuffix('b.domain.biz', 'domain.biz');
|
29
29
|
checkPublicSuffix('a.b.domain.biz', 'domain.biz');
|
30
|
-
|
31
|
-
checkPublicSuffix('com',
|
30
|
+
// TLD with some 2-level rules.
|
31
|
+
checkPublicSuffix('com', null);
|
32
32
|
checkPublicSuffix('example.com', 'example.com');
|
33
33
|
checkPublicSuffix('b.example.com', 'example.com');
|
34
34
|
checkPublicSuffix('a.b.example.com', 'example.com');
|
35
|
-
checkPublicSuffix('uk.com',
|
35
|
+
checkPublicSuffix('uk.com', null);
|
36
36
|
checkPublicSuffix('example.uk.com', 'example.uk.com');
|
37
37
|
checkPublicSuffix('b.example.uk.com', 'example.uk.com');
|
38
38
|
checkPublicSuffix('a.b.example.uk.com', 'example.uk.com');
|
39
39
|
checkPublicSuffix('test.ac', 'test.ac');
|
40
|
-
|
41
|
-
checkPublicSuffix('cy',
|
42
|
-
checkPublicSuffix('c.cy',
|
40
|
+
// TLD with only 1 (wildcard) rule.
|
41
|
+
checkPublicSuffix('cy', null);
|
42
|
+
checkPublicSuffix('c.cy', null);
|
43
43
|
checkPublicSuffix('b.c.cy', 'b.c.cy');
|
44
44
|
checkPublicSuffix('a.b.c.cy', 'b.c.cy');
|
45
|
-
|
46
|
-
checkPublicSuffix('jp',
|
45
|
+
// More complex TLD.
|
46
|
+
checkPublicSuffix('jp', null);
|
47
47
|
checkPublicSuffix('test.jp', 'test.jp');
|
48
48
|
checkPublicSuffix('www.test.jp', 'test.jp');
|
49
|
-
checkPublicSuffix('ac.jp',
|
49
|
+
checkPublicSuffix('ac.jp', null);
|
50
50
|
checkPublicSuffix('test.ac.jp', 'test.ac.jp');
|
51
51
|
checkPublicSuffix('www.test.ac.jp', 'test.ac.jp');
|
52
|
-
checkPublicSuffix('kyoto.jp',
|
53
|
-
checkPublicSuffix('
|
54
|
-
checkPublicSuffix('
|
55
|
-
checkPublicSuffix('
|
56
|
-
checkPublicSuffix('
|
57
|
-
checkPublicSuffix('
|
58
|
-
checkPublicSuffix('
|
59
|
-
checkPublicSuffix('
|
60
|
-
|
61
|
-
checkPublicSuffix('
|
62
|
-
|
52
|
+
checkPublicSuffix('kyoto.jp', null);
|
53
|
+
checkPublicSuffix('test.kyoto.jp', 'test.kyoto.jp');
|
54
|
+
checkPublicSuffix('ide.kyoto.jp', null);
|
55
|
+
checkPublicSuffix('b.ide.kyoto.jp', 'b.ide.kyoto.jp');
|
56
|
+
checkPublicSuffix('a.b.ide.kyoto.jp', 'b.ide.kyoto.jp');
|
57
|
+
checkPublicSuffix('c.kobe.jp', null);
|
58
|
+
checkPublicSuffix('b.c.kobe.jp', 'b.c.kobe.jp');
|
59
|
+
checkPublicSuffix('a.b.c.kobe.jp', 'b.c.kobe.jp');
|
60
|
+
checkPublicSuffix('city.kobe.jp', 'city.kobe.jp');
|
61
|
+
checkPublicSuffix('www.city.kobe.jp', 'city.kobe.jp');
|
62
|
+
// TLD with a wildcard rule and exceptions.
|
63
|
+
checkPublicSuffix('om', null);
|
64
|
+
checkPublicSuffix('test.om', null);
|
63
65
|
checkPublicSuffix('b.test.om', 'b.test.om');
|
64
66
|
checkPublicSuffix('a.b.test.om', 'b.test.om');
|
65
67
|
checkPublicSuffix('songfest.om', 'songfest.om');
|
66
68
|
checkPublicSuffix('www.songfest.om', 'songfest.om');
|
67
|
-
|
68
|
-
checkPublicSuffix('us',
|
69
|
+
// US K12.
|
70
|
+
checkPublicSuffix('us', null);
|
69
71
|
checkPublicSuffix('test.us', 'test.us');
|
70
72
|
checkPublicSuffix('www.test.us', 'test.us');
|
71
|
-
checkPublicSuffix('ak.us',
|
73
|
+
checkPublicSuffix('ak.us', null);
|
72
74
|
checkPublicSuffix('test.ak.us', 'test.ak.us');
|
73
75
|
checkPublicSuffix('www.test.ak.us', 'test.ak.us');
|
74
|
-
checkPublicSuffix('k12.ak.us',
|
76
|
+
checkPublicSuffix('k12.ak.us', null);
|
75
77
|
checkPublicSuffix('test.k12.ak.us', 'test.k12.ak.us');
|
76
78
|
checkPublicSuffix('www.test.k12.ak.us', 'test.k12.ak.us');
|
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
require_relative 'helper'
|
2
2
|
|
3
3
|
class TestDomainPrefix < Test::Unit::TestCase
|
4
4
|
def test_initialization
|
@@ -14,11 +14,11 @@ class TestDomainPrefix < Test::Unit::TestCase
|
|
14
14
|
'example.on.ca' => %w[ example.on.ca on.ca ],
|
15
15
|
'example.gc.ca' => %w[ example.gc.ca gc.ca ],
|
16
16
|
'example.co.uk' => %w[ example.co.uk co.uk ],
|
17
|
-
'example.au' => [ nil, nil ],
|
18
17
|
'example.com.au' => %w[ example.com.au com.au ],
|
18
|
+
'example.au' => [ nil, nil ],
|
19
19
|
'example.bar.jp' => %w[ bar.jp jp ],
|
20
|
-
'example.bar.hokkaido.jp' =>%w[
|
21
|
-
'example.metro.tokyo.jp' => %w[
|
20
|
+
'example.bar.hokkaido.jp' =>%w[ bar.hokkaido.jp hokkaido.jp ],
|
21
|
+
'example.metro.tokyo.jp' => %w[ metro.tokyo.jp tokyo.jp ]
|
22
22
|
) do |domain|
|
23
23
|
[
|
24
24
|
DomainPrefix.registered_domain(domain),
|
@@ -29,11 +29,13 @@ class TestDomainPrefix < Test::Unit::TestCase
|
|
29
29
|
|
30
30
|
def test_public_suffix_samples
|
31
31
|
sample_data('test.txt').split(/\n/).collect do |line|
|
32
|
+
line.sub!(/\/\/.*/, '')
|
33
|
+
|
32
34
|
case (line)
|
33
35
|
when /checkPublicSuffix\((\S+),\s*(\S+)\)/
|
34
36
|
[ $1, $2 ].collect do |part|
|
35
37
|
case (part)
|
36
|
-
when 'NULL'
|
38
|
+
when 'NULL', 'null'
|
37
39
|
nil
|
38
40
|
else
|
39
41
|
part.gsub(/'/, '')
|
@@ -43,7 +45,7 @@ class TestDomainPrefix < Test::Unit::TestCase
|
|
43
45
|
nil
|
44
46
|
end
|
45
47
|
end.each do |domain, expected|
|
46
|
-
assert_equal expected, DomainPrefix.registered_domain(domain), "#{domain.inspect} -> #{expected.inspect}"
|
48
|
+
assert_equal expected, DomainPrefix.registered_domain(domain, :relaxed), "#{domain.inspect} -> #{expected.inspect}"
|
47
49
|
end
|
48
50
|
end
|
49
51
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: domain_prefix
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1
|
4
|
+
version: 0.2.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-12-12 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: A library to extract information about top-level domain and registered
|
15
15
|
name from generic and international domain names
|
@@ -18,11 +18,11 @@ executables: []
|
|
18
18
|
extensions: []
|
19
19
|
extra_rdoc_files:
|
20
20
|
- LICENSE
|
21
|
-
- README.
|
21
|
+
- README.md
|
22
22
|
files:
|
23
23
|
- .document
|
24
24
|
- LICENSE
|
25
|
-
- README.
|
25
|
+
- README.md
|
26
26
|
- Rakefile
|
27
27
|
- VERSION
|
28
28
|
- data/effective_tld_names.dat
|
@@ -31,7 +31,7 @@ files:
|
|
31
31
|
- test/helper.rb
|
32
32
|
- test/sample/README
|
33
33
|
- test/sample/test.txt
|
34
|
-
- test/
|
34
|
+
- test/test_domain_prefix.rb
|
35
35
|
homepage: http://github.com/twg/domain_prefix
|
36
36
|
licenses: []
|
37
37
|
post_install_message:
|
@@ -52,7 +52,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
52
52
|
version: '0'
|
53
53
|
requirements: []
|
54
54
|
rubyforge_project:
|
55
|
-
rubygems_version: 1.8.
|
55
|
+
rubygems_version: 1.8.24
|
56
56
|
signing_key:
|
57
57
|
specification_version: 3
|
58
58
|
summary: Domain Prefix Extraction Library
|
data/README.rdoc
DELETED
@@ -1,18 +0,0 @@
|
|
1
|
-
= tldifier
|
2
|
-
|
3
|
-
Description goes here.
|
4
|
-
|
5
|
-
== Note on Patches/Pull Requests
|
6
|
-
|
7
|
-
* Fork the project.
|
8
|
-
* Make your feature addition or bug fix.
|
9
|
-
* Add tests for it. This is important so I don't break it in a
|
10
|
-
future version unintentionally.
|
11
|
-
* Commit, do not mess with rakefile, version, or history.
|
12
|
-
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
13
|
-
* Send me a pull request. Bonus points for topic branches.
|
14
|
-
|
15
|
-
== Copyright
|
16
|
-
|
17
|
-
Copyright (c) 2009-2012 Scott Tadman, The Working Group Inc.
|
18
|
-
See LICENSE for details.
|