domain_prefix 0.1.0 → 0.2.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -5,21 +5,21 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "domain_prefix"
8
- s.version = "0.1.0"
8
+ s.version = "0.2.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["tadman"]
12
- s.date = "2012-02-03"
12
+ s.date = "2012-12-12"
13
13
  s.description = "A library to extract information about top-level domain and registered name from generic and international domain names"
14
14
  s.email = "github@tadman.ca"
15
15
  s.extra_rdoc_files = [
16
16
  "LICENSE",
17
- "README.rdoc"
17
+ "README.md"
18
18
  ]
19
19
  s.files = [
20
20
  ".document",
21
21
  "LICENSE",
22
- "README.rdoc",
22
+ "README.md",
23
23
  "Rakefile",
24
24
  "VERSION",
25
25
  "data/effective_tld_names.dat",
@@ -28,11 +28,11 @@ Gem::Specification.new do |s|
28
28
  "test/helper.rb",
29
29
  "test/sample/README",
30
30
  "test/sample/test.txt",
31
- "test/test_tldifier.rb"
31
+ "test/test_domain_prefix.rb"
32
32
  ]
33
33
  s.homepage = "http://github.com/twg/domain_prefix"
34
34
  s.require_paths = ["lib"]
35
- s.rubygems_version = "1.8.11"
35
+ s.rubygems_version = "1.8.24"
36
36
  s.summary = "Domain Prefix Extraction Library"
37
37
 
38
38
  if s.respond_to? :specification_version then
data/lib/domain_prefix.rb CHANGED
@@ -1,17 +1,62 @@
1
1
  # encoding: UTF-8
2
2
 
3
3
  module DomainPrefix
4
- class TreeHash < Hash
5
- def initialize
6
- super do |h, k|
7
- h[k] = TreeHash.new
4
+ SEPARATOR = '.'.freeze
5
+
6
+ class Tree < Hash
7
+ def insert(path)
8
+ leaf = path.split(SEPARATOR).reverse.inject(self) do |tree, component|
9
+ # Seeds an element into the tree structure by referencing it
10
+ tree[component.sub(/^!/, '')] ||= Tree.new
11
+ end
12
+
13
+ if (path.match(/^[\!]/))
14
+ leaf[:required] = 0
15
+ else
16
+ leaf[:required] = 1
8
17
  end
18
+
19
+ self
9
20
  end
10
21
 
11
- def find_domain(domain)
12
- domain.split('.').inject(self) do |h, component|
13
- h and h.key?(component) ? h[component] : nil
22
+ def follow(path)
23
+ path = path.to_s.split(SEPARATOR) unless (path.is_a?(Array))
24
+ path = path.reverse
25
+
26
+ index = traverse(path)
27
+
28
+ index and index <= path.length and path[0, index].reverse
29
+ end
30
+
31
+ protected
32
+ def traverse(path, index = 0)
33
+ component = path[index]
34
+
35
+ unless (component)
36
+ return self[:required] == 0 ? index : nil
37
+ end
38
+
39
+ named_branch = self[component]
40
+
41
+ if (named_branch)
42
+ result = named_branch.traverse(path, index + 1)
43
+
44
+ return result if (result)
45
+ end
46
+
47
+ wildcard_branch = self["*"]
48
+
49
+ if (wildcard_branch)
50
+ result = wildcard_branch.traverse(path, index + 1)
51
+
52
+ return result if (result)
53
+ end
54
+
55
+ if (!named_branch and !wildcard_branch and self[:required])
56
+ return index + self[:required]
14
57
  end
58
+
59
+ return
15
60
  end
16
61
  end
17
62
 
@@ -25,40 +70,9 @@ module DomainPrefix
25
70
  [ -d.length, d ]
26
71
  end.freeze
27
72
 
28
- TLD_TREE = TLD_NAMES.inject(TreeHash.new) do |h, name|
29
- name.split('.').reverse.inject(h) do |_h, component|
30
- case (component)
31
- when '*'
32
- _h
33
- when /!(.*)/
34
- _h[$1]
35
- else
36
- _h[component]
37
- end
38
- end
39
-
40
- h
73
+ TLD_TREE = TLD_NAMES.inject(Tree.new) do |t, name|
74
+ t.insert(name)
41
75
  end.freeze
42
-
43
- PREFIX_SPEC = Regexp.new(
44
- '^(' + TLD_NAMES.collect do |d|
45
- Regexp.escape(d).sub(/^\\\*\\\./, '')
46
- end.join('|') + ')$'
47
- ).freeze
48
-
49
- ALLOWED_DOMAIN_PREFIXES = Hash[
50
- TLD_NAMES.select do |d|
51
- d.match(/^\!/)
52
- end.collect do |d|
53
- [ d.sub(/^\!/, ''), true ]
54
- end
55
- ].freeze
56
-
57
- DOMAIN_PREFIX_SPEC = Regexp.new(
58
- '^(?:[^\.]+\.)*?(([^\.]+)\.(' + TLD_NAMES.collect do |d|
59
- Regexp.escape(d).sub(/^\\\*\\\./, '[^\.]+\.')
60
- end.join('|') + '))$'
61
- ).freeze
62
76
 
63
77
  NONPUBLIC_TLD = {
64
78
  'local' => true
@@ -69,43 +83,64 @@ module DomainPrefix
69
83
  domain and domain.downcase
70
84
  end
71
85
 
72
- def registered_domain(domain)
73
- m = DOMAIN_PREFIX_SPEC.match(rfc3492_canonical_domain(domain))
86
+ def public_tld?(tld)
87
+ !NONPUBLIC_TLD.key?(tld)
88
+ end
89
+
90
+ def registered_domain(domain, rules = :strict)
91
+ return unless (domain)
74
92
 
75
- return unless (m)
76
-
77
- domain = m[1]
78
- suffix = m[3]
93
+ components = rfc3492_canonical_domain(domain).split(SEPARATOR)
79
94
 
80
- return if (NONPUBLIC_TLD[suffix])
81
- return if (PREFIX_SPEC.match(domain) and !ALLOWED_DOMAIN_PREFIXES[domain])
95
+ return if (components.empty? or components.find(&:empty?))
96
+
97
+ if (rules == :strict)
98
+ return unless (self.public_tld?(components.last))
99
+ end
100
+
101
+ suffix = TLD_TREE.follow(components)
102
+
103
+ unless (suffix)
104
+ if (rules == :relaxed and components.length >= 2 and !TLD_TREE[components[-1]])
105
+ return components.last(2).join(SEPARATOR)
106
+ else
107
+ return
108
+ end
109
+ end
82
110
 
83
- domain
111
+ suffix.join(SEPARATOR)
84
112
  end
85
113
 
86
114
  def public_suffix(domain)
87
- m = DOMAIN_PREFIX_SPEC.match(rfc3492_canonical_domain(domain))
88
-
89
- return unless (m)
90
-
91
- domain = m[1]
92
- suffix = m[3]
93
-
94
- return if (PREFIX_SPEC.match(domain) and !ALLOWED_DOMAIN_PREFIXES[domain])
115
+ return unless (domain)
116
+
117
+ components = rfc3492_canonical_domain(domain).split(SEPARATOR)
118
+
119
+ return if (components.empty? or components.find(&:empty?))
120
+
121
+ return unless (public_tld?(components.last))
122
+
123
+ suffix = TLD_TREE.follow(components)
95
124
 
96
- suffix
125
+ return unless (suffix)
126
+
127
+ suffix.shift
128
+
129
+ suffix.join(SEPARATOR)
97
130
  end
98
131
 
99
132
  def tld(domain)
100
133
  suffix = public_suffix(rfc3492_canonical_domain(domain))
101
134
 
102
- suffix and suffix.split(/\./).last
135
+ suffix and suffix.split(SEPARATOR).last
103
136
  end
104
137
 
105
138
  def name(domain)
106
- m = DOMAIN_PREFIX_SPEC.match(rfc3492_canonical_domain(domain))
107
-
108
- m and m[2]
139
+ if (domain = registered_domain(domain))
140
+ domain.split(SEPARATOR).first
141
+ else
142
+ nil
143
+ end
109
144
  end
110
145
 
111
146
  extend self
data/test/sample/test.txt CHANGED
@@ -1,76 +1,78 @@
1
- # Any copyright is dedicated to the Public Domain.
2
- # http://creativecommons.org/publicdomain/zero/1.0/
1
+ // Any copyright is dedicated to the Public Domain.
2
+ // http://creativecommons.org/publicdomain/zero/1.0/
3
3
 
4
- # NULL input.
5
- checkPublicSuffix(NULL, NULL);
6
- # Mixed case.
7
- checkPublicSuffix('COM', NULL);
4
+ // null input.
5
+ checkPublicSuffix(null, null);
6
+ // Mixed case.
7
+ checkPublicSuffix('COM', null);
8
8
  checkPublicSuffix('example.COM', 'example.com');
9
9
  checkPublicSuffix('WwW.example.COM', 'example.com');
10
- # Leading dot.
11
- checkPublicSuffix('.com', NULL);
12
- checkPublicSuffix('.example', NULL);
13
- checkPublicSuffix('.example.com', NULL);
14
- checkPublicSuffix('.example.example', NULL);
15
- # Unlisted TLD.
16
- checkPublicSuffix('example', NULL);
17
- checkPublicSuffix('example.example', NULL);
18
- checkPublicSuffix('b.example.example', NULL);
19
- checkPublicSuffix('a.b.example.example', NULL);
20
- # Listed, but non-Internet, TLD.
21
- checkPublicSuffix('local', NULL);
22
- checkPublicSuffix('example.local', NULL);
23
- checkPublicSuffix('b.example.local', NULL);
24
- checkPublicSuffix('a.b.example.local', NULL);
25
- # TLD with only 1 rule.
26
- checkPublicSuffix('biz', NULL);
10
+ // Leading dot.
11
+ checkPublicSuffix('.com', null);
12
+ checkPublicSuffix('.example', null);
13
+ checkPublicSuffix('.example.com', null);
14
+ checkPublicSuffix('.example.example', null);
15
+ // Unlisted TLD.
16
+ checkPublicSuffix('example', null);
17
+ checkPublicSuffix('example.example', 'example.example');
18
+ checkPublicSuffix('b.example.example', 'example.example');
19
+ checkPublicSuffix('a.b.example.example', 'example.example');
20
+ // Listed, but non-Internet, TLD.
21
+ //checkPublicSuffix('local', null);
22
+ //checkPublicSuffix('example.local', null);
23
+ //checkPublicSuffix('b.example.local', null);
24
+ //checkPublicSuffix('a.b.example.local', null);
25
+ // TLD with only 1 rule.
26
+ checkPublicSuffix('biz', null);
27
27
  checkPublicSuffix('domain.biz', 'domain.biz');
28
28
  checkPublicSuffix('b.domain.biz', 'domain.biz');
29
29
  checkPublicSuffix('a.b.domain.biz', 'domain.biz');
30
- # TLD with some 2-level rules.
31
- checkPublicSuffix('com', NULL);
30
+ // TLD with some 2-level rules.
31
+ checkPublicSuffix('com', null);
32
32
  checkPublicSuffix('example.com', 'example.com');
33
33
  checkPublicSuffix('b.example.com', 'example.com');
34
34
  checkPublicSuffix('a.b.example.com', 'example.com');
35
- checkPublicSuffix('uk.com', NULL);
35
+ checkPublicSuffix('uk.com', null);
36
36
  checkPublicSuffix('example.uk.com', 'example.uk.com');
37
37
  checkPublicSuffix('b.example.uk.com', 'example.uk.com');
38
38
  checkPublicSuffix('a.b.example.uk.com', 'example.uk.com');
39
39
  checkPublicSuffix('test.ac', 'test.ac');
40
- # TLD with only 1 (wildcard) rule.
41
- checkPublicSuffix('cy', NULL);
42
- checkPublicSuffix('c.cy', NULL);
40
+ // TLD with only 1 (wildcard) rule.
41
+ checkPublicSuffix('cy', null);
42
+ checkPublicSuffix('c.cy', null);
43
43
  checkPublicSuffix('b.c.cy', 'b.c.cy');
44
44
  checkPublicSuffix('a.b.c.cy', 'b.c.cy');
45
- # More complex TLD.
46
- checkPublicSuffix('jp', NULL);
45
+ // More complex TLD.
46
+ checkPublicSuffix('jp', null);
47
47
  checkPublicSuffix('test.jp', 'test.jp');
48
48
  checkPublicSuffix('www.test.jp', 'test.jp');
49
- checkPublicSuffix('ac.jp', NULL);
49
+ checkPublicSuffix('ac.jp', null);
50
50
  checkPublicSuffix('test.ac.jp', 'test.ac.jp');
51
51
  checkPublicSuffix('www.test.ac.jp', 'test.ac.jp');
52
- checkPublicSuffix('kyoto.jp', NULL);
53
- checkPublicSuffix('c.kyoto.jp', NULL);
54
- checkPublicSuffix('b.c.kyoto.jp', 'b.c.kyoto.jp');
55
- checkPublicSuffix('a.b.c.kyoto.jp', 'b.c.kyoto.jp');
56
- checkPublicSuffix('pref.kyoto.jp', 'pref.kyoto.jp'); # Exception rule.
57
- checkPublicSuffix('www.pref.kyoto.jp', 'pref.kyoto.jp'); # Exception rule.
58
- checkPublicSuffix('city.kyoto.jp', 'city.kyoto.jp'); # Exception rule.
59
- checkPublicSuffix('www.city.kyoto.jp', 'city.kyoto.jp'); # Exception rule.
60
- # TLD with a wildcard rule and exceptions.
61
- checkPublicSuffix('om', NULL);
62
- checkPublicSuffix('test.om', NULL);
52
+ checkPublicSuffix('kyoto.jp', null);
53
+ checkPublicSuffix('test.kyoto.jp', 'test.kyoto.jp');
54
+ checkPublicSuffix('ide.kyoto.jp', null);
55
+ checkPublicSuffix('b.ide.kyoto.jp', 'b.ide.kyoto.jp');
56
+ checkPublicSuffix('a.b.ide.kyoto.jp', 'b.ide.kyoto.jp');
57
+ checkPublicSuffix('c.kobe.jp', null);
58
+ checkPublicSuffix('b.c.kobe.jp', 'b.c.kobe.jp');
59
+ checkPublicSuffix('a.b.c.kobe.jp', 'b.c.kobe.jp');
60
+ checkPublicSuffix('city.kobe.jp', 'city.kobe.jp');
61
+ checkPublicSuffix('www.city.kobe.jp', 'city.kobe.jp');
62
+ // TLD with a wildcard rule and exceptions.
63
+ checkPublicSuffix('om', null);
64
+ checkPublicSuffix('test.om', null);
63
65
  checkPublicSuffix('b.test.om', 'b.test.om');
64
66
  checkPublicSuffix('a.b.test.om', 'b.test.om');
65
67
  checkPublicSuffix('songfest.om', 'songfest.om');
66
68
  checkPublicSuffix('www.songfest.om', 'songfest.om');
67
- # US K12.
68
- checkPublicSuffix('us', NULL);
69
+ // US K12.
70
+ checkPublicSuffix('us', null);
69
71
  checkPublicSuffix('test.us', 'test.us');
70
72
  checkPublicSuffix('www.test.us', 'test.us');
71
- checkPublicSuffix('ak.us', NULL);
73
+ checkPublicSuffix('ak.us', null);
72
74
  checkPublicSuffix('test.ak.us', 'test.ak.us');
73
75
  checkPublicSuffix('www.test.ak.us', 'test.ak.us');
74
- checkPublicSuffix('k12.ak.us', NULL);
76
+ checkPublicSuffix('k12.ak.us', null);
75
77
  checkPublicSuffix('test.k12.ak.us', 'test.k12.ak.us');
76
78
  checkPublicSuffix('www.test.k12.ak.us', 'test.k12.ak.us');
@@ -1,4 +1,4 @@
1
- require 'helper'
1
+ require_relative 'helper'
2
2
 
3
3
  class TestDomainPrefix < Test::Unit::TestCase
4
4
  def test_initialization
@@ -14,11 +14,11 @@ class TestDomainPrefix < Test::Unit::TestCase
14
14
  'example.on.ca' => %w[ example.on.ca on.ca ],
15
15
  'example.gc.ca' => %w[ example.gc.ca gc.ca ],
16
16
  'example.co.uk' => %w[ example.co.uk co.uk ],
17
- 'example.au' => [ nil, nil ],
18
17
  'example.com.au' => %w[ example.com.au com.au ],
18
+ 'example.au' => [ nil, nil ],
19
19
  'example.bar.jp' => %w[ bar.jp jp ],
20
- 'example.bar.hokkaido.jp' =>%w[ example.bar.hokkaido.jp bar.hokkaido.jp ],
21
- 'example.metro.tokyo.jp' => %w[ example.metro.tokyo.jp metro.tokyo.jp ]
20
+ 'example.bar.hokkaido.jp' =>%w[ bar.hokkaido.jp hokkaido.jp ],
21
+ 'example.metro.tokyo.jp' => %w[ metro.tokyo.jp tokyo.jp ]
22
22
  ) do |domain|
23
23
  [
24
24
  DomainPrefix.registered_domain(domain),
@@ -29,11 +29,13 @@ class TestDomainPrefix < Test::Unit::TestCase
29
29
 
30
30
  def test_public_suffix_samples
31
31
  sample_data('test.txt').split(/\n/).collect do |line|
32
+ line.sub!(/\/\/.*/, '')
33
+
32
34
  case (line)
33
35
  when /checkPublicSuffix\((\S+),\s*(\S+)\)/
34
36
  [ $1, $2 ].collect do |part|
35
37
  case (part)
36
- when 'NULL'
38
+ when 'NULL', 'null'
37
39
  nil
38
40
  else
39
41
  part.gsub(/'/, '')
@@ -43,7 +45,7 @@ class TestDomainPrefix < Test::Unit::TestCase
43
45
  nil
44
46
  end
45
47
  end.each do |domain, expected|
46
- assert_equal expected, DomainPrefix.registered_domain(domain), "#{domain.inspect} -> #{expected.inspect}"
48
+ assert_equal expected, DomainPrefix.registered_domain(domain, :relaxed), "#{domain.inspect} -> #{expected.inspect}"
47
49
  end
48
50
  end
49
51
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: domain_prefix
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-03 00:00:00.000000000 Z
12
+ date: 2012-12-12 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: A library to extract information about top-level domain and registered
15
15
  name from generic and international domain names
@@ -18,11 +18,11 @@ executables: []
18
18
  extensions: []
19
19
  extra_rdoc_files:
20
20
  - LICENSE
21
- - README.rdoc
21
+ - README.md
22
22
  files:
23
23
  - .document
24
24
  - LICENSE
25
- - README.rdoc
25
+ - README.md
26
26
  - Rakefile
27
27
  - VERSION
28
28
  - data/effective_tld_names.dat
@@ -31,7 +31,7 @@ files:
31
31
  - test/helper.rb
32
32
  - test/sample/README
33
33
  - test/sample/test.txt
34
- - test/test_tldifier.rb
34
+ - test/test_domain_prefix.rb
35
35
  homepage: http://github.com/twg/domain_prefix
36
36
  licenses: []
37
37
  post_install_message:
@@ -52,7 +52,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
52
52
  version: '0'
53
53
  requirements: []
54
54
  rubyforge_project:
55
- rubygems_version: 1.8.11
55
+ rubygems_version: 1.8.24
56
56
  signing_key:
57
57
  specification_version: 3
58
58
  summary: Domain Prefix Extraction Library
data/README.rdoc DELETED
@@ -1,18 +0,0 @@
1
- = tldifier
2
-
3
- Description goes here.
4
-
5
- == Note on Patches/Pull Requests
6
-
7
- * Fork the project.
8
- * Make your feature addition or bug fix.
9
- * Add tests for it. This is important so I don't break it in a
10
- future version unintentionally.
11
- * Commit, do not mess with rakefile, version, or history.
12
- (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
13
- * Send me a pull request. Bonus points for topic branches.
14
-
15
- == Copyright
16
-
17
- Copyright (c) 2009-2012 Scott Tadman, The Working Group Inc.
18
- See LICENSE for details.