swot-ruby 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,83 @@
1
+ ac.bd
2
+ ac.be
3
+ ac.gg
4
+ ac.gn
5
+ ac.il
6
+ ac.in
7
+ ac.jp
8
+ ac.kr
9
+ ac.ma
10
+ ac.me
11
+ ac.mw
12
+ ac.ni
13
+ ac.om
14
+ ac.pg
15
+ ac.pr
16
+ ac.ru
17
+ ac.rw
18
+ ac.sz
19
+ ac.yu
20
+ ac.za
21
+ ac.zm
22
+ ed.ao
23
+ ed.cr
24
+ ed.jp
25
+ edu
26
+ edu.al
27
+ edu.ar
28
+ edu.az
29
+ edu.bb
30
+ edu.bd
31
+ edu.bh
32
+ edu.bs
33
+ edu.bz
34
+ edu.ck
35
+ edu.cn
36
+ edu.dz
37
+ edu.ee
38
+ edu.er
39
+ edu.gh
40
+ edu.hn
41
+ edu.in
42
+ edu.jm
43
+ edu.kn
44
+ edu.kz
45
+ edu.lr
46
+ edu.ly
47
+ edu.me
48
+ edu.mg
49
+ edu.ml
50
+ edu.mv
51
+ edu.mw
52
+ edu.ni
53
+ edu.pa
54
+ edu.pr
55
+ edu.pt
56
+ edu.pw
57
+ edu.qa
58
+ edu.sc
59
+ edu.sd
60
+ edu.sh
61
+ edu.sl
62
+ edu.sy
63
+ edu.ws
64
+ edu.ye
65
+ edu.zm
66
+ es.kr
67
+ g12.br
68
+ hs.kr
69
+ ms.kr
70
+ sc.kr
71
+ sch.ae
72
+ sch.gg
73
+ sch.je
74
+ sch.jo
75
+ sch.lk
76
+ sch.ly
77
+ sch.my
78
+ sch.om
79
+ sch.ps
80
+ sch.sa
81
+ school.za
82
+ vic.edu.au
83
+ urfu.me
@@ -0,0 +1,249 @@
1
+ require 'set'
2
+
3
class Swot
  # Domain suffixes that are treated as reserved for academic institutions.
  # Entries are matched against the public suffix of a domain, so the list
  # holds both true TLDs ("edu") and multi-label suffixes ("ac.uk",
  # "cc.ca.us"). Stored as a frozen Set for fast membership checks.
  ACADEMIC_TLDS = Set.new(%w[
    ac.ae ac.at ac.bd ac.be ac.cn ac.cr ac.cy ac.fj ac.gg ac.gn
    ac.id ac.il ac.in ac.ir ac.jp ac.ke ac.kr ac.ma ac.me ac.mu
    ac.mw ac.mz ac.ni ac.nz ac.om ac.pa ac.pg ac.pr ac.rs ac.ru
    ac.rw ac.sz ac.th ac.tz ac.ug ac.uk ac.yu ac.za ac.zm ac.zw
    cc.al.us cc.ar.us cc.az.us cc.ca.us cc.co.us cc.fl.us cc.ga.us
    cc.hi.us cc.ia.us cc.id.us cc.il.us cc.in.us cc.ks.us cc.ky.us
    cc.la.us cc.md.us cc.me.us cc.mi.us cc.mn.us cc.mo.us cc.ms.us
    cc.mt.us cc.nc.us cc.nd.us cc.ne.us cc.nj.us cc.nm.us cc.nv.us
    cc.ny.us cc.oh.us cc.ok.us cc.or.us cc.pa.us cc.ri.us cc.sc.us
    cc.sd.us cc.tx.us cc.va.us cc.vi.us cc.wa.us cc.wi.us cc.wv.us
    cc.wy.us
    ed.ao ed.cr ed.jp
    edu
    edu.af edu.al edu.ar edu.au edu.az edu.ba edu.bb edu.bd edu.bh edu.bi
    edu.bn edu.bo edu.br edu.bs edu.bt edu.bz edu.ck edu.cn edu.co edu.cu
    edu.do edu.dz edu.ec edu.ee edu.eg edu.er edu.es edu.et edu.ge edu.gh
    edu.gr edu.gt edu.hk edu.hn edu.ht edu.in edu.iq edu.jm edu.jo edu.kg
    edu.kh edu.kn edu.kw edu.ky edu.kz edu.la edu.lb edu.lr edu.lv edu.ly
    edu.me edu.mg edu.mk edu.ml edu.mm edu.mn edu.mo edu.mt edu.mv edu.mw
    edu.mx edu.my edu.ni edu.np edu.om edu.pa edu.pe edu.ph edu.pk edu.pl
    edu.pr edu.ps edu.pt edu.pw edu.py edu.qa edu.rs edu.ru edu.sa edu.sc
    edu.sd edu.sg edu.sh edu.sl edu.sv edu.sy edu.tr edu.tt edu.tw edu.ua
    edu.uy edu.ve edu.vn edu.ws edu.ye edu.zm
    es.kr g12.br hs.kr ms.kr sc.kr sc.ug
    sch.ae sch.gg sch.id sch.ir sch.je sch.jo sch.lk sch.ly sch.my sch.om
    sch.ps sch.sa sch.uk
    school.nz school.za
    tec.ar.us tec.az.us tec.co.us tec.fl.us tec.ga.us tec.ia.us tec.id.us
    tec.il.us tec.in.us tec.ks.us tec.ky.us tec.la.us tec.ma.us tec.md.us
    tec.me.us tec.mi.us tec.mn.us tec.mo.us tec.ms.us tec.mt.us tec.nc.us
    tec.nd.us tec.nh.us tec.nm.us tec.nv.us tec.ny.us tec.oh.us tec.ok.us
    tec.pa.us tec.sc.us tec.sd.us tec.tx.us tec.ut.us tec.vi.us tec.wa.us
    tec.wi.us tec.wv.us
    vic.edu.au
  ]).freeze
end
@@ -0,0 +1,20 @@
1
+ # Module for methods that act on the entire Swot dataset.
2
+
3
module SwotCollectionMethods
  # Returns an Array with the string form of every domain that has a file
  # under Swot.domains_path.
  def all_domains
    each_domain.map(&:to_s)
  end

  # Yields the result of Swot.from_path for every domain file found under
  # Swot.domains_path. Only what exists on disk is enumerated, so domains
  # that are recognised purely through ACADEMIC_TLDS never appear here.
  #
  # Returns an Enumerator over the same values if no block is given.
  def each_domain
    return to_enum(:each_domain) unless block_given?

    # Domain files sit at least one directory deep (<suffix>/<name>.txt).
    # The leading "*/" skips files stored directly in the domains directory
    # (e.g. a stoplist), which would otherwise be yielded as bogus
    # single-label "domains".
    pattern = Pathname.new(Swot.domains_path).join('*/**/*.txt')
    Pathname.glob(pattern) do |path|
      yield(Swot.from_path(path))
    end
  end
end
data/lib/swot.rb ADDED
@@ -0,0 +1,90 @@
1
require "pathname"
require "public_suffix"
require "naughty_or_nice"
require_relative "swot/academic_tlds"
require_relative "swot/collection_methods"
5
+
6
class Swot

  # NOTE(review): the gemspec declares version "1.0.0" while this constant
  # still says "0.4.2" - confirm which value is authoritative.
  VERSION = "0.4.2"

  # These are domains that snuck into the edu registry,
  # but don't pass the education sniff test.
  # Note: a validated domain must not end with a blacklisted string.
  # Surrounding whitespace and blank lines in the stoplist file are ignored
  # so that a stray empty line cannot become an (empty) blacklist entry.
  BLACKLIST = File.readlines(File.join(__dir__, '../data/lib/domains/stoplist.txt'))
                  .map(&:strip)
                  .reject(&:empty?)
                  .freeze

  include NaughtyOrNice
  extend SwotCollectionMethods

  class << self
    alias_method :is_academic?, :valid?
    alias_method :academic?, :valid?

    # Convenience wrapper: builds a Swot for the given email/domain text and
    # returns its institution name (nil if nothing is found).
    def get_institution_name(text)
      Swot.new(text).institution_name
    end
    alias_method :school_name, :get_institution_name

    # Absolute path of the directory holding the per-domain data files.
    # Memoized in a class-level instance variable.
    def domains_path
      @domains_path ||= File.expand_path("../data/lib/domains", __dir__)
    end

    # Returns a new Swot instance for the domain file at the given path.
    # Note that the path must be absolute.
    #
    # The domain is rebuilt from the path relative to domains_path by
    # reversing its components, e.g. "edu/texas/students.txt" becomes
    # "students.texas.edu".
    #
    # Returns a Swot instance, or false if no file exists at the given path.
    def from_path(path_string_or_path)
      path = Pathname.new(path_string_or_path)
      return false unless path.exist?

      path_dir, file = path.relative_path_from(Pathname.new(domains_path)).split
      backwards_path = path_dir.to_s.split('/').push(file.basename('.txt').to_s)
      Swot.new(backwards_path.reverse.join('.'))
    end
  end

  # Figure out if an email or domain belongs to an academic institution.
  #
  # A blacklist hit always wins; otherwise the domain is accepted when its
  # public suffix is listed in ACADEMIC_TLDS or a data file exists for it.
  #
  # Returns true if the domain name belongs to an academic institution;
  # false otherwise.
  def valid?
    return false if domain.nil?
    return false if blacklisted?

    ACADEMIC_TLDS.include?(domain.tld) || academic_domain?
  end

  # Figure out the institution name based on the email address/domain.
  #
  # Looks at the file for the registrable domain first and falls back to the
  # file for the full host name, i.e. the same two locations that
  # academic_domain? accepts.
  #
  # Returns a string with the institution name; nil if nothing is found.
  def institution_name
    @institution_name ||= begin
      source = [file_path, file_extended_path].find { |candidate| File.file?(candidate) }
      source && File.read(source, :mode => "rb", :external_encoding => "UTF-8").strip
    end
  rescue StandardError
    # Best-effort lookup: an unparsable domain or unreadable file means
    # "no name available" rather than an error for the caller.
    nil
  end
  alias_method :school_name, :institution_name
  alias_method :name, :institution_name

  # Figure out if a domain name is a known academic institution, i.e. a data
  # file exists for either the registrable domain or the full host name.
  #
  # Returns true if the domain name belongs to a known academic institution;
  # false otherwise (including when the input could not be parsed).
  def academic_domain?
    return false if domain.nil?
    # `||=` would redo the file checks every time the answer is false.
    return @academic_domain if defined?(@academic_domain)

    @academic_domain = File.exist?(file_path) || File.exist?(file_extended_path)
  end

  private

  # True when the host equals a BLACKLIST entry or is a subdomain of one.
  def blacklisted?
    host = to_s
    BLACKLIST.any? { |entry| host == entry || host.end_with?(".#{entry}") }
  end

  # Data file path derived from domain.domain, labels reversed into
  # directories (e.g. "strath.ac.uk" -> "<domains_path>/uk/ac/strath.txt").
  def file_path
    @file_path ||= File.join(Swot.domains_path, *domain.domain.to_s.split(".").reverse) + ".txt"
  end

  # Data file path derived from the full host name, including subdomains.
  def file_extended_path
    @file_extended_path ||= File.join(Swot.domains_path, *domain.to_s.split(".").reverse) + ".txt"
  end
end
data/swot.gemspec ADDED
@@ -0,0 +1,33 @@
1
+ # -*- encoding: utf-8 -*-
2
Gem::Specification.new do |s|
  s.name = "swot-ruby"
  s.version = "1.0.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Lee Reilly"]
  s.date = "2015-07-10"
  s.description = "Identify email addresses or domains names that belong to colleges or universities. Help automate the process of approving or rejecting academic discounts."
  s.email = "lee@leereilly.net"
  s.extra_rdoc_files = [
    "LICENSE.txt",
    "README.md"
  ]

  # Package everything git tracks plus every domain data file.
  # lib/swot.rb resolves its data directory as "../data/lib/domains" relative
  # to lib/, so that tree must be collected recursively; "lib/domains" is
  # still scanned in case a checkout uses that layout. Only regular files
  # are listed (directories are not valid entries) and duplicates are dropped.
  tracked_files = `git ls-files -z`.split("\x0")
  domain_files = %w[data/lib/domains lib/domains].flat_map do |root|
    Dir.glob(File.join(root, "**", "*")).select { |path| File.file?(path) }
  end
  s.files = (tracked_files + domain_files).uniq

  s.homepage = "https://github.com/kobaltz/swot"
  s.licenses = ["MIT"]
  s.require_paths = ["lib"]
  s.required_ruby_version = Gem::Requirement.new(">= 2.0")
  s.rubygems_version = "2.0.14"
  s.summary = "Identify email addresses or domains names that belong to colleges or universities."
  s.test_files = ["test/helper.rb", "test/test_collection_methods.rb", "test/test_swot.rb"]

  s.add_dependency('public_suffix', ">= 0")
  s.add_dependency('naughty_or_nice', "~> 2.0")
end
33
+
data/test/helper.rb ADDED
@@ -0,0 +1,32 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'minitest/autorun'
11
+
12
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
13
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
14
+ require 'swot'
15
+
16
+ # Extracted from Rails ActiveSupport::Testing::Assertions
17
+ #
18
+ # Assert that an expression is not truthy. Passes if <tt>object</tt> is
19
+ # +nil+ or +false+. "Truthy" means "considered true in a conditional"
20
+ # like <tt>if foo</tt>.
21
+ #
22
+ # assert_not nil # => true
23
+ # assert_not false # => true
24
+ # assert_not 'foo' # => Expected "foo" to be nil or false
25
+ #
26
+ # An error message can be specified.
27
+ #
28
+ # assert_not foo, 'foo should be false'
29
def assert_not(object, message = nil)
  # Fall back to a descriptive default when the caller supplies no message.
  failure_text = message || "Expected #{mu_pp(object)} to be nil or false"
  assert(!object, failure_text)
end
@@ -0,0 +1,44 @@
1
+ require 'helper'
2
+ require 'fileutils'
3
+ require 'tmpdir'
4
+
5
describe 'SwotCollectionMethods' do
  before do
    # Remember whatever path Swot had memoized so `after` can put it back;
    # otherwise later tests would keep reading from the deleted temp dir.
    @original_domains_path = Swot.instance_variable_get(:@domains_path)
    @tmp_dir = Dir.mktmpdir "swot-test"
    Swot.instance_variable_set(:@domains_path, @tmp_dir)
    write_domain_file "students.texas.edu"
    write_domain_file "mit.edu"
  end

  after do
    Swot.instance_variable_set(:@domains_path, @original_domains_path)
    FileUtils.remove_entry @tmp_dir
  end

  # Writes a one-line data file for +domain+ under the temp domains dir,
  # using the reversed-label layout (mit.edu -> edu/mit.txt).
  def write_domain_file(domain, school_name = 'The University')
    *dirs, leaf = domain.split('.').reverse
    path = Pathname.new(@tmp_dir).join(*dirs, "#{leaf}.txt")
    path.dirname.mkpath
    path.open('w') { |f| f.puts school_name }
  end

  describe 'all_domains' do
    it 'gets all domains from files on disk' do
      all_domains = Swot.all_domains
      assert_equal 2, all_domains.size
      assert_includes all_domains, "students.texas.edu"
      assert_includes all_domains, "mit.edu"
    end
  end

  describe 'each_domain' do
    it 'yields a swot instance for every domain file' do
      domains = []
      Swot.each_domain { |d| domains << d }
      assert_equal 2, domains.size
      assert domains.all? { |d| d.is_a?(Swot) }, "every yielded object should be a Swot"
      names = domains.map(&:to_s)
      assert_includes names, "students.texas.edu"
      assert_includes names, "mit.edu"
    end
  end
end
data/test/test_swot.rb ADDED
@@ -0,0 +1,122 @@
1
+ # encoding: UTF-8
2
+ require 'helper'
3
+
4
describe Swot do
  it "recognizes academic email addresses and domains" do
    academic = [
      # email addresses
      'lreilly@stanford.edu',
      'LREILLY@STANFORD.EDU',
      'Lreilly@Stanford.Edu',
      'lreilly@slac.stanford.edu',
      'lreilly@strath.ac.uk',
      'lreilly@soft-eng.strath.ac.uk',
      'lee@ugr.es',
      'lee@uottawa.ca',
      'lee@mother.edu.ru',
      'lee@ucy.ac.cy',
      'dave.kimura@osu.edu',
      'kimura.13@osu.edu',
      'dave@daffodil.ac',
      'dave@cti.za.ac',
      'dave@lsst.ac',
      'dave@dcc.netpoint.com.bd',
      'dave@student.gatewayhs.com',
      # bare domains and URLs
      'stanford.edu',
      'slac.stanford.edu',
      'www.stanford.edu',
      'http://www.stanford.edu',
      'http://www.stanford.edu:9393',
      'strath.ac.uk',
      'soft-eng.strath.ac.uk',
      'ugr.es',
      'uottawa.ca',
      'mother.edu.ru',
      'ucy.ac.cy',
      # surrounding whitespace
      ' stanford.edu',
      'lee@strath.ac.uk ',
      'lee@stud.uni-corvinus.hu',
      # overkill
      'lee@harvard.edu',
      'lee@mail.harvard.edu'
    ]

    non_academic = [
      'support@driftingruby.com',
      'lee@leerilly.net',
      'lee@gmail.com',
      'lee@stanford.edu.com',
      'lee@strath.ac.uk.com',
      'leerilly.net',
      'gmail.com',
      'stanford.edu.com',
      'strath.ac.uk.com',
      nil,
      '',
      'the',
      ' gmail.com '
    ]

    # Report the offending input on failure, like the blacklist test does.
    academic.each do |input|
      assert Swot::is_academic?(input), "#{input.inspect} should be academic"
    end

    non_academic.each do |input|
      assert_not Swot::is_academic?(input), "#{input.inspect} should not be academic"
    end
  end

  it "returns name of valid institution" do
    assert_match "University of Strathclyde", Swot::get_institution_name('lreilly@cs.strath.ac.uk')
    assert_match "BRG Fadingerstraße Linz, Austria", Swot::get_institution_name('lreilly@fadi.at')
  end

  it "returns nil when institution invalid" do
    assert_nil Swot::get_institution_name('foo@shop.com')
  end

  it "test aliased methods" do
    assert Swot::academic?('stanford.edu')
    assert_match "University of Strathclyde", Swot::school_name('lreilly@cs.strath.ac.uk')
  end

  it "fail blacklisted domains" do
    ["si.edu", " si.edu ", "imposter@si.edu", "foo.si.edu", "america.edu", "student.tsu.edu.ph"].each do |domain|
      assert_not Swot::is_academic?(domain), "#{domain} should be denied"
    end
  end

  it "not err on tld-only domains" do
    # The call itself must not raise, and the answer must be falsy.
    assert_not Swot::is_academic?(".com")
  end

  it "does not err on invalid domains" do
    assert_not Swot::is_academic?("foo@bar.invalid")
  end

  # it "contains only text files" do
  #   Dir.glob("lib/domains/**/*") do |file|
  #     if not File.directory?(file)
  #       assert file.end_with?(".txt"), "#{file} should have a .txt extension"
  #     end
  #   end
  # end

  # it "contains no file with an invalid encoding" do
  #   Dir.glob("lib/domains/**/*") do |file|
  #     if not File.directory?(file)
  #       File.open(file, "r") do |fh|
  #         assert fh.read.valid_encoding?, "Invalid encoding for #{file}"
  #       end
  #     end
  #   end
  # end

  # it "contains only file with a single line" do
  #   Dir.glob("lib/domains/**/*") do |file|
  #     if not File.directory?(file)
  #       File.open(file, "r") do |fh|
  #         assert fh.read.lines.count == 1, "#{file} should only contain one line"
  #       end
  #     end
  #   end
  # end
end