domain_prefix 0.1.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -5,21 +5,21 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "domain_prefix"
8
- s.version = "0.1.0"
8
+ s.version = "0.2.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["tadman"]
12
- s.date = "2012-02-03"
12
+ s.date = "2012-12-12"
13
13
  s.description = "A library to extract information about top-level domain and registered name from generic and international domain names"
14
14
  s.email = "github@tadman.ca"
15
15
  s.extra_rdoc_files = [
16
16
  "LICENSE",
17
- "README.rdoc"
17
+ "README.md"
18
18
  ]
19
19
  s.files = [
20
20
  ".document",
21
21
  "LICENSE",
22
- "README.rdoc",
22
+ "README.md",
23
23
  "Rakefile",
24
24
  "VERSION",
25
25
  "data/effective_tld_names.dat",
@@ -28,11 +28,11 @@ Gem::Specification.new do |s|
28
28
  "test/helper.rb",
29
29
  "test/sample/README",
30
30
  "test/sample/test.txt",
31
- "test/test_tldifier.rb"
31
+ "test/test_domain_prefix.rb"
32
32
  ]
33
33
  s.homepage = "http://github.com/twg/domain_prefix"
34
34
  s.require_paths = ["lib"]
35
- s.rubygems_version = "1.8.11"
35
+ s.rubygems_version = "1.8.24"
36
36
  s.summary = "Domain Prefix Extraction Library"
37
37
 
38
38
  if s.respond_to? :specification_version then
data/lib/domain_prefix.rb CHANGED
@@ -1,17 +1,62 @@
1
1
  # encoding: UTF-8
2
2
 
3
3
  module DomainPrefix
4
- class TreeHash < Hash
5
- def initialize
6
- super do |h, k|
7
- h[k] = TreeHash.new
4
+ SEPARATOR = '.'.freeze
5
+
6
+ class Tree < Hash
7
+ def insert(path)
8
+ leaf = path.split(SEPARATOR).reverse.inject(self) do |tree, component|
9
+ # Seeds an element into the tree structure by referencing it
10
+ tree[component.sub(/^!/, '')] ||= Tree.new
11
+ end
12
+
13
+ if (path.match(/^[\!]/))
14
+ leaf[:required] = 0
15
+ else
16
+ leaf[:required] = 1
8
17
  end
18
+
19
+ self
9
20
  end
10
21
 
11
- def find_domain(domain)
12
- domain.split('.').inject(self) do |h, component|
13
- h and h.key?(component) ? h[component] : nil
22
+ def follow(path)
23
+ path = path.to_s.split(SEPARATOR) unless (path.is_a?(Array))
24
+ path = path.reverse
25
+
26
+ index = traverse(path)
27
+
28
+ index and index <= path.length and path[0, index].reverse
29
+ end
30
+
31
+ protected
32
+ def traverse(path, index = 0)
33
+ component = path[index]
34
+
35
+ unless (component)
36
+ return self[:required] == 0 ? index : nil
37
+ end
38
+
39
+ named_branch = self[component]
40
+
41
+ if (named_branch)
42
+ result = named_branch.traverse(path, index + 1)
43
+
44
+ return result if (result)
45
+ end
46
+
47
+ wildcard_branch = self["*"]
48
+
49
+ if (wildcard_branch)
50
+ result = wildcard_branch.traverse(path, index + 1)
51
+
52
+ return result if (result)
53
+ end
54
+
55
+ if (!named_branch and !wildcard_branch and self[:required])
56
+ return index + self[:required]
14
57
  end
58
+
59
+ return
15
60
  end
16
61
  end
17
62
 
@@ -25,40 +70,9 @@ module DomainPrefix
25
70
  [ -d.length, d ]
26
71
  end.freeze
27
72
 
28
- TLD_TREE = TLD_NAMES.inject(TreeHash.new) do |h, name|
29
- name.split('.').reverse.inject(h) do |_h, component|
30
- case (component)
31
- when '*'
32
- _h
33
- when /!(.*)/
34
- _h[$1]
35
- else
36
- _h[component]
37
- end
38
- end
39
-
40
- h
73
+ TLD_TREE = TLD_NAMES.inject(Tree.new) do |t, name|
74
+ t.insert(name)
41
75
  end.freeze
42
-
43
- PREFIX_SPEC = Regexp.new(
44
- '^(' + TLD_NAMES.collect do |d|
45
- Regexp.escape(d).sub(/^\\\*\\\./, '')
46
- end.join('|') + ')$'
47
- ).freeze
48
-
49
- ALLOWED_DOMAIN_PREFIXES = Hash[
50
- TLD_NAMES.select do |d|
51
- d.match(/^\!/)
52
- end.collect do |d|
53
- [ d.sub(/^\!/, ''), true ]
54
- end
55
- ].freeze
56
-
57
- DOMAIN_PREFIX_SPEC = Regexp.new(
58
- '^(?:[^\.]+\.)*?(([^\.]+)\.(' + TLD_NAMES.collect do |d|
59
- Regexp.escape(d).sub(/^\\\*\\\./, '[^\.]+\.')
60
- end.join('|') + '))$'
61
- ).freeze
62
76
 
63
77
  NONPUBLIC_TLD = {
64
78
  'local' => true
@@ -69,43 +83,64 @@ module DomainPrefix
69
83
  domain and domain.downcase
70
84
  end
71
85
 
72
- def registered_domain(domain)
73
- m = DOMAIN_PREFIX_SPEC.match(rfc3492_canonical_domain(domain))
86
+ def public_tld?(tld)
87
+ !NONPUBLIC_TLD.key?(tld)
88
+ end
89
+
90
+ def registered_domain(domain, rules = :strict)
91
+ return unless (domain)
74
92
 
75
- return unless (m)
76
-
77
- domain = m[1]
78
- suffix = m[3]
93
+ components = rfc3492_canonical_domain(domain).split(SEPARATOR)
79
94
 
80
- return if (NONPUBLIC_TLD[suffix])
81
- return if (PREFIX_SPEC.match(domain) and !ALLOWED_DOMAIN_PREFIXES[domain])
95
+ return if (components.empty? or components.find(&:empty?))
96
+
97
+ if (rules == :strict)
98
+ return unless (self.public_tld?(components.last))
99
+ end
100
+
101
+ suffix = TLD_TREE.follow(components)
102
+
103
+ unless (suffix)
104
+ if (rules == :relaxed and components.length >= 2 and !TLD_TREE[components[-1]])
105
+ return components.last(2).join(SEPARATOR)
106
+ else
107
+ return
108
+ end
109
+ end
82
110
 
83
- domain
111
+ suffix.join(SEPARATOR)
84
112
  end
85
113
 
86
114
  def public_suffix(domain)
87
- m = DOMAIN_PREFIX_SPEC.match(rfc3492_canonical_domain(domain))
88
-
89
- return unless (m)
90
-
91
- domain = m[1]
92
- suffix = m[3]
93
-
94
- return if (PREFIX_SPEC.match(domain) and !ALLOWED_DOMAIN_PREFIXES[domain])
115
+ return unless (domain)
116
+
117
+ components = rfc3492_canonical_domain(domain).split(SEPARATOR)
118
+
119
+ return if (components.empty? or components.find(&:empty?))
120
+
121
+ return unless (public_tld?(components.last))
122
+
123
+ suffix = TLD_TREE.follow(components)
95
124
 
96
- suffix
125
+ return unless (suffix)
126
+
127
+ suffix.shift
128
+
129
+ suffix.join(SEPARATOR)
97
130
  end
98
131
 
99
132
  def tld(domain)
100
133
  suffix = public_suffix(rfc3492_canonical_domain(domain))
101
134
 
102
- suffix and suffix.split(/\./).last
135
+ suffix and suffix.split(SEPARATOR).last
103
136
  end
104
137
 
105
138
  def name(domain)
106
- m = DOMAIN_PREFIX_SPEC.match(rfc3492_canonical_domain(domain))
107
-
108
- m and m[2]
139
+ if (domain = registered_domain(domain))
140
+ domain.split(SEPARATOR).first
141
+ else
142
+ nil
143
+ end
109
144
  end
110
145
 
111
146
  extend self
data/test/sample/test.txt CHANGED
@@ -1,76 +1,78 @@
1
- # Any copyright is dedicated to the Public Domain.
2
- # http://creativecommons.org/publicdomain/zero/1.0/
1
+ // Any copyright is dedicated to the Public Domain.
2
+ // http://creativecommons.org/publicdomain/zero/1.0/
3
3
 
4
- # NULL input.
5
- checkPublicSuffix(NULL, NULL);
6
- # Mixed case.
7
- checkPublicSuffix('COM', NULL);
4
+ // null input.
5
+ checkPublicSuffix(null, null);
6
+ // Mixed case.
7
+ checkPublicSuffix('COM', null);
8
8
  checkPublicSuffix('example.COM', 'example.com');
9
9
  checkPublicSuffix('WwW.example.COM', 'example.com');
10
- # Leading dot.
11
- checkPublicSuffix('.com', NULL);
12
- checkPublicSuffix('.example', NULL);
13
- checkPublicSuffix('.example.com', NULL);
14
- checkPublicSuffix('.example.example', NULL);
15
- # Unlisted TLD.
16
- checkPublicSuffix('example', NULL);
17
- checkPublicSuffix('example.example', NULL);
18
- checkPublicSuffix('b.example.example', NULL);
19
- checkPublicSuffix('a.b.example.example', NULL);
20
- # Listed, but non-Internet, TLD.
21
- checkPublicSuffix('local', NULL);
22
- checkPublicSuffix('example.local', NULL);
23
- checkPublicSuffix('b.example.local', NULL);
24
- checkPublicSuffix('a.b.example.local', NULL);
25
- # TLD with only 1 rule.
26
- checkPublicSuffix('biz', NULL);
10
+ // Leading dot.
11
+ checkPublicSuffix('.com', null);
12
+ checkPublicSuffix('.example', null);
13
+ checkPublicSuffix('.example.com', null);
14
+ checkPublicSuffix('.example.example', null);
15
+ // Unlisted TLD.
16
+ checkPublicSuffix('example', null);
17
+ checkPublicSuffix('example.example', 'example.example');
18
+ checkPublicSuffix('b.example.example', 'example.example');
19
+ checkPublicSuffix('a.b.example.example', 'example.example');
20
+ // Listed, but non-Internet, TLD.
21
+ //checkPublicSuffix('local', null);
22
+ //checkPublicSuffix('example.local', null);
23
+ //checkPublicSuffix('b.example.local', null);
24
+ //checkPublicSuffix('a.b.example.local', null);
25
+ // TLD with only 1 rule.
26
+ checkPublicSuffix('biz', null);
27
27
  checkPublicSuffix('domain.biz', 'domain.biz');
28
28
  checkPublicSuffix('b.domain.biz', 'domain.biz');
29
29
  checkPublicSuffix('a.b.domain.biz', 'domain.biz');
30
- # TLD with some 2-level rules.
31
- checkPublicSuffix('com', NULL);
30
+ // TLD with some 2-level rules.
31
+ checkPublicSuffix('com', null);
32
32
  checkPublicSuffix('example.com', 'example.com');
33
33
  checkPublicSuffix('b.example.com', 'example.com');
34
34
  checkPublicSuffix('a.b.example.com', 'example.com');
35
- checkPublicSuffix('uk.com', NULL);
35
+ checkPublicSuffix('uk.com', null);
36
36
  checkPublicSuffix('example.uk.com', 'example.uk.com');
37
37
  checkPublicSuffix('b.example.uk.com', 'example.uk.com');
38
38
  checkPublicSuffix('a.b.example.uk.com', 'example.uk.com');
39
39
  checkPublicSuffix('test.ac', 'test.ac');
40
- # TLD with only 1 (wildcard) rule.
41
- checkPublicSuffix('cy', NULL);
42
- checkPublicSuffix('c.cy', NULL);
40
+ // TLD with only 1 (wildcard) rule.
41
+ checkPublicSuffix('cy', null);
42
+ checkPublicSuffix('c.cy', null);
43
43
  checkPublicSuffix('b.c.cy', 'b.c.cy');
44
44
  checkPublicSuffix('a.b.c.cy', 'b.c.cy');
45
- # More complex TLD.
46
- checkPublicSuffix('jp', NULL);
45
+ // More complex TLD.
46
+ checkPublicSuffix('jp', null);
47
47
  checkPublicSuffix('test.jp', 'test.jp');
48
48
  checkPublicSuffix('www.test.jp', 'test.jp');
49
- checkPublicSuffix('ac.jp', NULL);
49
+ checkPublicSuffix('ac.jp', null);
50
50
  checkPublicSuffix('test.ac.jp', 'test.ac.jp');
51
51
  checkPublicSuffix('www.test.ac.jp', 'test.ac.jp');
52
- checkPublicSuffix('kyoto.jp', NULL);
53
- checkPublicSuffix('c.kyoto.jp', NULL);
54
- checkPublicSuffix('b.c.kyoto.jp', 'b.c.kyoto.jp');
55
- checkPublicSuffix('a.b.c.kyoto.jp', 'b.c.kyoto.jp');
56
- checkPublicSuffix('pref.kyoto.jp', 'pref.kyoto.jp'); # Exception rule.
57
- checkPublicSuffix('www.pref.kyoto.jp', 'pref.kyoto.jp'); # Exception rule.
58
- checkPublicSuffix('city.kyoto.jp', 'city.kyoto.jp'); # Exception rule.
59
- checkPublicSuffix('www.city.kyoto.jp', 'city.kyoto.jp'); # Exception rule.
60
- # TLD with a wildcard rule and exceptions.
61
- checkPublicSuffix('om', NULL);
62
- checkPublicSuffix('test.om', NULL);
52
+ checkPublicSuffix('kyoto.jp', null);
53
+ checkPublicSuffix('test.kyoto.jp', 'test.kyoto.jp');
54
+ checkPublicSuffix('ide.kyoto.jp', null);
55
+ checkPublicSuffix('b.ide.kyoto.jp', 'b.ide.kyoto.jp');
56
+ checkPublicSuffix('a.b.ide.kyoto.jp', 'b.ide.kyoto.jp');
57
+ checkPublicSuffix('c.kobe.jp', null);
58
+ checkPublicSuffix('b.c.kobe.jp', 'b.c.kobe.jp');
59
+ checkPublicSuffix('a.b.c.kobe.jp', 'b.c.kobe.jp');
60
+ checkPublicSuffix('city.kobe.jp', 'city.kobe.jp');
61
+ checkPublicSuffix('www.city.kobe.jp', 'city.kobe.jp');
62
+ // TLD with a wildcard rule and exceptions.
63
+ checkPublicSuffix('om', null);
64
+ checkPublicSuffix('test.om', null);
63
65
  checkPublicSuffix('b.test.om', 'b.test.om');
64
66
  checkPublicSuffix('a.b.test.om', 'b.test.om');
65
67
  checkPublicSuffix('songfest.om', 'songfest.om');
66
68
  checkPublicSuffix('www.songfest.om', 'songfest.om');
67
- # US K12.
68
- checkPublicSuffix('us', NULL);
69
+ // US K12.
70
+ checkPublicSuffix('us', null);
69
71
  checkPublicSuffix('test.us', 'test.us');
70
72
  checkPublicSuffix('www.test.us', 'test.us');
71
- checkPublicSuffix('ak.us', NULL);
73
+ checkPublicSuffix('ak.us', null);
72
74
  checkPublicSuffix('test.ak.us', 'test.ak.us');
73
75
  checkPublicSuffix('www.test.ak.us', 'test.ak.us');
74
- checkPublicSuffix('k12.ak.us', NULL);
76
+ checkPublicSuffix('k12.ak.us', null);
75
77
  checkPublicSuffix('test.k12.ak.us', 'test.k12.ak.us');
76
78
  checkPublicSuffix('www.test.k12.ak.us', 'test.k12.ak.us');
@@ -1,4 +1,4 @@
1
- require 'helper'
1
+ require_relative 'helper'
2
2
 
3
3
  class TestDomainPrefix < Test::Unit::TestCase
4
4
  def test_initialization
@@ -14,11 +14,11 @@ class TestDomainPrefix < Test::Unit::TestCase
14
14
  'example.on.ca' => %w[ example.on.ca on.ca ],
15
15
  'example.gc.ca' => %w[ example.gc.ca gc.ca ],
16
16
  'example.co.uk' => %w[ example.co.uk co.uk ],
17
- 'example.au' => [ nil, nil ],
18
17
  'example.com.au' => %w[ example.com.au com.au ],
18
+ 'example.au' => [ nil, nil ],
19
19
  'example.bar.jp' => %w[ bar.jp jp ],
20
- 'example.bar.hokkaido.jp' =>%w[ example.bar.hokkaido.jp bar.hokkaido.jp ],
21
- 'example.metro.tokyo.jp' => %w[ example.metro.tokyo.jp metro.tokyo.jp ]
20
+ 'example.bar.hokkaido.jp' =>%w[ bar.hokkaido.jp hokkaido.jp ],
21
+ 'example.metro.tokyo.jp' => %w[ metro.tokyo.jp tokyo.jp ]
22
22
  ) do |domain|
23
23
  [
24
24
  DomainPrefix.registered_domain(domain),
@@ -29,11 +29,13 @@ class TestDomainPrefix < Test::Unit::TestCase
29
29
 
30
30
  def test_public_suffix_samples
31
31
  sample_data('test.txt').split(/\n/).collect do |line|
32
+ line.sub!(/\/\/.*/, '')
33
+
32
34
  case (line)
33
35
  when /checkPublicSuffix\((\S+),\s*(\S+)\)/
34
36
  [ $1, $2 ].collect do |part|
35
37
  case (part)
36
- when 'NULL'
38
+ when 'NULL', 'null'
37
39
  nil
38
40
  else
39
41
  part.gsub(/'/, '')
@@ -43,7 +45,7 @@ class TestDomainPrefix < Test::Unit::TestCase
43
45
  nil
44
46
  end
45
47
  end.each do |domain, expected|
46
- assert_equal expected, DomainPrefix.registered_domain(domain), "#{domain.inspect} -> #{expected.inspect}"
48
+ assert_equal expected, DomainPrefix.registered_domain(domain, :relaxed), "#{domain.inspect} -> #{expected.inspect}"
47
49
  end
48
50
  end
49
51
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: domain_prefix
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-03 00:00:00.000000000 Z
12
+ date: 2012-12-12 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: A library to extract information about top-level domain and registered
15
15
  name from generic and international domain names
@@ -18,11 +18,11 @@ executables: []
18
18
  extensions: []
19
19
  extra_rdoc_files:
20
20
  - LICENSE
21
- - README.rdoc
21
+ - README.md
22
22
  files:
23
23
  - .document
24
24
  - LICENSE
25
- - README.rdoc
25
+ - README.md
26
26
  - Rakefile
27
27
  - VERSION
28
28
  - data/effective_tld_names.dat
@@ -31,7 +31,7 @@ files:
31
31
  - test/helper.rb
32
32
  - test/sample/README
33
33
  - test/sample/test.txt
34
- - test/test_tldifier.rb
34
+ - test/test_domain_prefix.rb
35
35
  homepage: http://github.com/twg/domain_prefix
36
36
  licenses: []
37
37
  post_install_message:
@@ -52,7 +52,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
52
52
  version: '0'
53
53
  requirements: []
54
54
  rubyforge_project:
55
- rubygems_version: 1.8.11
55
+ rubygems_version: 1.8.24
56
56
  signing_key:
57
57
  specification_version: 3
58
58
  summary: Domain Prefix Extraction Library
data/README.rdoc DELETED
@@ -1,18 +0,0 @@
1
- = tldifier
2
-
3
- Description goes here.
4
-
5
- == Note on Patches/Pull Requests
6
-
7
- * Fork the project.
8
- * Make your feature addition or bug fix.
9
- * Add tests for it. This is important so I don't break it in a
10
- future version unintentionally.
11
- * Commit, do not mess with rakefile, version, or history.
12
- (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
13
- * Send me a pull request. Bonus points for topic branches.
14
-
15
- == Copyright
16
-
17
- Copyright (c) 2009-2012 Scott Tadman, The Working Group Inc.
18
- See LICENSE for details.