swot-ruby 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,83 @@
1
+ ac.bd
2
+ ac.be
3
+ ac.gg
4
+ ac.gn
5
+ ac.il
6
+ ac.in
7
+ ac.jp
8
+ ac.kr
9
+ ac.ma
10
+ ac.me
11
+ ac.mw
12
+ ac.ni
13
+ ac.om
14
+ ac.pg
15
+ ac.pr
16
+ ac.ru
17
+ ac.rw
18
+ ac.sz
19
+ ac.yu
20
+ ac.za
21
+ ac.zm
22
+ ed.ao
23
+ ed.cr
24
+ ed.jp
25
+ edu
26
+ edu.al
27
+ edu.ar
28
+ edu.az
29
+ edu.bb
30
+ edu.bd
31
+ edu.bh
32
+ edu.bs
33
+ edu.bz
34
+ edu.ck
35
+ edu.cn
36
+ edu.dz
37
+ edu.ee
38
+ edu.er
39
+ edu.gh
40
+ edu.hn
41
+ edu.in
42
+ edu.jm
43
+ edu.kn
44
+ edu.kz
45
+ edu.lr
46
+ edu.ly
47
+ edu.me
48
+ edu.mg
49
+ edu.ml
50
+ edu.mv
51
+ edu.mw
52
+ edu.ni
53
+ edu.pa
54
+ edu.pr
55
+ edu.pt
56
+ edu.pw
57
+ edu.qa
58
+ edu.sc
59
+ edu.sd
60
+ edu.sh
61
+ edu.sl
62
+ edu.sy
63
+ edu.ws
64
+ edu.ye
65
+ edu.zm
66
+ es.kr
67
+ g12.br
68
+ hs.kr
69
+ ms.kr
70
+ sc.kr
71
+ sch.ae
72
+ sch.gg
73
+ sch.je
74
+ sch.jo
75
+ sch.lk
76
+ sch.ly
77
+ sch.my
78
+ sch.om
79
+ sch.ps
80
+ sch.sa
81
+ school.za
82
+ vic.edu.au
83
+ urfu.me
@@ -0,0 +1,249 @@
1
+ require 'set'
2
+
3
class Swot
  # Domain suffixes that are guaranteed to belong to academic institutions
  # (for example "edu" or "ac.uk"). Kept as a frozen Set so the list cannot be
  # modified at runtime and membership can be checked with #include?.
  ACADEMIC_TLDS = Set.new(%w[
    ac.ae ac.at ac.bd ac.be ac.cn ac.cr ac.cy ac.fj
    ac.gg ac.gn ac.id ac.il ac.in ac.ir ac.jp ac.ke
    ac.kr ac.ma ac.me ac.mu ac.mw ac.mz ac.ni ac.nz
    ac.om ac.pa ac.pg ac.pr ac.rs ac.ru ac.rw ac.sz
    ac.th ac.tz ac.ug ac.uk ac.yu ac.za ac.zm ac.zw

    cc.al.us cc.ar.us cc.az.us cc.ca.us cc.co.us cc.fl.us cc.ga.us cc.hi.us
    cc.ia.us cc.id.us cc.il.us cc.in.us cc.ks.us cc.ky.us cc.la.us cc.md.us
    cc.me.us cc.mi.us cc.mn.us cc.mo.us cc.ms.us cc.mt.us cc.nc.us cc.nd.us
    cc.ne.us cc.nj.us cc.nm.us cc.nv.us cc.ny.us cc.oh.us cc.ok.us cc.or.us
    cc.pa.us cc.ri.us cc.sc.us cc.sd.us cc.tx.us cc.va.us cc.vi.us cc.wa.us
    cc.wi.us cc.wv.us cc.wy.us

    ed.ao ed.cr ed.jp

    edu
    edu.af edu.al edu.ar edu.au edu.az edu.ba edu.bb edu.bd
    edu.bh edu.bi edu.bn edu.bo edu.br edu.bs edu.bt edu.bz
    edu.ck edu.cn edu.co edu.cu edu.do edu.dz edu.ec edu.ee
    edu.eg edu.er edu.es edu.et edu.ge edu.gh edu.gr edu.gt
    edu.hk edu.hn edu.ht edu.in edu.iq edu.jm edu.jo edu.kg
    edu.kh edu.kn edu.kw edu.ky edu.kz edu.la edu.lb edu.lr
    edu.lv edu.ly edu.me edu.mg edu.mk edu.ml edu.mm edu.mn
    edu.mo edu.mt edu.mv edu.mw edu.mx edu.my edu.ni edu.np
    edu.om edu.pa edu.pe edu.ph edu.pk edu.pl edu.pr edu.ps
    edu.pt edu.pw edu.py edu.qa edu.rs edu.ru edu.sa edu.sc
    edu.sd edu.sg edu.sh edu.sl edu.sv edu.sy edu.tr edu.tt
    edu.tw edu.ua edu.uy edu.ve edu.vn edu.ws edu.ye edu.zm

    es.kr g12.br hs.kr ms.kr sc.kr sc.ug

    sch.ae sch.gg sch.id sch.ir sch.je sch.jo sch.lk
    sch.ly sch.my sch.om sch.ps sch.sa sch.uk

    school.nz school.za

    tec.ar.us tec.az.us tec.co.us tec.fl.us tec.ga.us tec.ia.us tec.id.us tec.il.us
    tec.in.us tec.ks.us tec.ky.us tec.la.us tec.ma.us tec.md.us tec.me.us tec.mi.us
    tec.mn.us tec.mo.us tec.ms.us tec.mt.us tec.nc.us tec.nd.us tec.nh.us tec.nm.us
    tec.nv.us tec.ny.us tec.oh.us tec.ok.us tec.pa.us tec.sc.us tec.sd.us tec.tx.us
    tec.ut.us tec.vi.us tec.wa.us tec.wi.us tec.wv.us

    vic.edu.au
  ]).freeze
end
@@ -0,0 +1,20 @@
1
+ # Module for methods that act on the entire Swot dataset.
2
+
3
module SwotCollectionMethods
  # Lists every domain that has a file on disk.
  #
  # Returns an Array of Strings, obtained by calling #to_s on each object
  # produced by #each_domain.
  def all_domains
    each_domain.map { |swot| swot.to_s }
  end

  # Walks every "*.txt" file found (recursively) below Swot.domains_path and
  # yields the result of Swot.from_path for it. No filtering is done here.
  # NOTE(review): the previous comment said blacklisted and ACADEMIC_TLDS
  # domains are not included - presumably because they have no per-domain
  # file of their own; confirm against the data directory.
  #
  # Returns an Enumerator over the same sequence if no block is given.
  def each_domain
    return enum_for(:each_domain) unless block_given?

    pattern = Pathname.new(Swot.domains_path).join('**/*.txt')
    Pathname.glob(pattern) { |txt_file| yield Swot.from_path(txt_file) }
  end
end
data/lib/swot.rb ADDED
@@ -0,0 +1,90 @@
1
require "pathname"

require "public_suffix"
require "naughty_or_nice"

require_relative "swot/academic_tlds"
require_relative "swot/collection_methods"
5
+
6
class Swot
  # NOTE(review): the gemspec shipped alongside this file declares version
  # "1.0.0"; this constant still says "0.4.2". Confirm which one is intended.
  VERSION = "0.4.2"

  # These are domains that snuck into the edu registry,
  # but don't pass the education sniff test.
  # Note: a validated domain must neither equal a blacklisted entry nor end
  # with ".<entry>". One entry per line is read from stoplist.txt.
  BLACKLIST = File.readlines(File.join(__dir__, '../data/lib/domains/stoplist.txt')).map(&:chomp).freeze

  include NaughtyOrNice
  extend SwotCollectionMethods

  class << self
    # Class-level `valid?` is not defined in this file.
    # NOTE(review): presumably it is supplied by NaughtyOrNice and delegates
    # to the instance-level #valid? below - confirm.
    alias_method :is_academic?, :valid?
    alias_method :academic?, :valid?

    # Looks up the institution name for an email address or domain.
    #
    # text - the String handed to Swot.new.
    #
    # Returns the institution name String, or nil if nothing is found.
    def get_institution_name(text)
      Swot.new(text).institution_name
    end
    alias_method :school_name, :get_institution_name

    # Absolute path of the directory holding the per-domain files. The value
    # is memoized in a class-level instance variable.
    def domains_path
      @domains_path ||= File.expand_path("../data/lib/domains", File.dirname(__FILE__))
    end

    # Returns a new Swot instance for the domain file at the given path.
    # Note that the path must be absolute, because it is made relative to
    # the (absolute) domains_path.
    #
    # The directory components plus the file's base name (without ".txt")
    # are reversed and joined with "." to form the domain, e.g.
    # "<domains_path>/edu/mit.txt" becomes "mit.edu".
    #
    # Returns a Swot instance, or false if no file exists at the given path.
    def from_path(path_string_or_path)
      path = Pathname.new(path_string_or_path)
      return false unless path.exist?

      path_dir, file = path.relative_path_from(Pathname.new(domains_path)).split
      backwards_path = path_dir.to_s.split('/').push(file.basename('.txt').to_s)
      Swot.new(backwards_path.reverse.join('.'))
    end
  end

  # Figure out if an email or domain belongs to an academic institution.
  #
  # The blacklist wins over everything else; after that a domain is accepted
  # when its TLD is listed in ACADEMIC_TLDS or a domain file exists for it.
  #
  # Returns true if the domain name belongs to an academic institution;
  # false otherwise (including when no domain could be determined).
  def valid?
    return false if domain.nil?
    return false if blacklisted?

    ACADEMIC_TLDS.include?(domain.tld) || academic_domain?
  end

  # Figure out the institution name based on the email address/domain.
  #
  # The file for the registrable domain is preferred. If it does not exist,
  # the file for the full host name is used, so that every domain accepted
  # by #academic_domain? can also be named.
  #
  # Returns a String with the institution name; nil if nothing is found.
  def institution_name
    @institution_name ||= begin
      source = [file_path, file_extended_path].find { |candidate| File.file?(candidate) }
      File.read(source, mode: "rb", external_encoding: "UTF-8").strip if source
    rescue StandardError
      # Deliberately best-effort: a missing domain, an unreadable file or
      # undecodable content all mean "no name available".
      nil
    end
  end
  alias_method :school_name, :institution_name
  alias_method :name, :institution_name

  # Figure out if a domain name is a known academic institution, i.e. whether
  # a file exists for either the registrable domain or the full host name.
  #
  # The answer is cached per instance. `defined?` is used instead of `||=`
  # so that a false result is cached as well and the disk is hit only once.
  #
  # Returns true if the domain name belongs to a known academic institution;
  # false otherwise.
  def academic_domain?
    return @academic_domain if defined?(@academic_domain)

    @academic_domain = File.exist?(file_path) || File.exist?(file_extended_path)
  end

  private

  # True when the domain string equals a BLACKLIST entry or ends with
  # ".<entry>". Plain string comparison avoids compiling one Regexp per
  # entry on every validation.
  def blacklisted?
    name = to_s
    BLACKLIST.any? { |entry| name == entry || name.end_with?(".#{entry}") }
  end

  # Path of the file for `domain.domain`: its labels reversed, nested below
  # domains_path, with ".txt" appended.
  def file_path
    @file_path ||= File.join(Swot.domains_path, domain.domain.to_s.split(".").reverse) + ".txt"
  end

  # Same layout as #file_path, but built from `domain.to_s`.
  # NOTE(review): presumably that is the full host name including
  # subdomains - confirm against NaughtyOrNice / PublicSuffix.
  def file_extended_path
    @file_extended_path ||= File.join(Swot.domains_path, domain.to_s.split(".").reverse) + ".txt"
  end
end
data/swot.gemspec ADDED
@@ -0,0 +1,33 @@
1
+ # -*- encoding: utf-8 -*-
2
Gem::Specification.new do |spec|
  # --- Identity -------------------------------------------------------------
  spec.name    = "swot-ruby"
  spec.version = "1.0.0"

  if spec.respond_to?(:required_rubygems_version=)
    spec.required_rubygems_version = Gem::Requirement.new(">= 0")
  end

  # --- Metadata -------------------------------------------------------------
  spec.authors          = ["Lee Reilly"]
  spec.date             = "2015-07-10"
  spec.description      = "Identify email addresses or domains names that belong to colleges or universities. Help automate the process of approving or rejecting academic discounts."
  spec.email            = "lee@leereilly.net"
  spec.extra_rdoc_files = %w[LICENSE.txt README.md]

  # --- Packaged files -------------------------------------------------------
  # Everything tracked by git, the top-level entries of data/lib/domains, and
  # every regular *.txt file below lib/domains.
  tracked_files   = `git ls-files -z`.split("\x0")
  data_entries    = Dir['data/lib/domains/*']
  domain_txt_files = Dir.glob('lib/domains/**/*.txt').select { |path| File.file?(path) }
  spec.files = tracked_files + data_entries + domain_txt_files

  spec.homepage              = "https://github.com/kobaltz/swot"
  spec.licenses              = ["MIT"]
  spec.require_paths         = ["lib"]
  spec.required_ruby_version = Gem::Requirement.new(">= 2.0")
  spec.rubygems_version      = "2.0.14"
  spec.summary               = "Identify email addresses or domains names that belong to colleges or universities."
  spec.test_files            = ["test/helper.rb", "test/test_collection_methods.rb", "test/test_swot.rb"]

  # --- Runtime dependencies -------------------------------------------------
  spec.add_dependency('public_suffix', ">= 0")
  spec.add_dependency('naughty_or_nice', "~> 2.0")
end
33
+
data/test/helper.rb ADDED
@@ -0,0 +1,32 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'minitest/autorun'
11
+
12
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
13
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
14
+ require 'swot'
15
+
16
+ # Extracted from Rails ActiveSupport::Testing::Assertions
17
+ #
18
+ # Assert that an expression is not truthy. Passes if <tt>object</tt> is
19
+ # +nil+ or +false+. "Truthy" means "considered true in a conditional"
20
+ # like <tt>if foo</tt>.
21
+ #
22
+ # assert_not nil # => true
23
+ # assert_not false # => true
24
+ # assert_not 'foo' # => Expected "foo" to be nil or false
25
+ #
26
+ # An error message can be specified.
27
+ #
28
+ # assert_not foo, 'foo should be false'
29
def assert_not(object, message = nil)
  # Build the default failure text only when the caller did not supply one.
  failure_text = message || "Expected #{mu_pp(object)} to be nil or false"
  assert(!object, failure_text)
end
@@ -0,0 +1,44 @@
1
+ require 'helper'
2
+ require 'fileutils'
3
+ require 'tmpdir'
4
+
5
describe 'SwotCollectionMethods' do
  before do
    # Swot memoizes its data directory in @domains_path. Remember the current
    # value so `after` can put it back; otherwise every test that runs later
    # in the same process would keep pointing at the deleted temp directory.
    @original_domains_path = Swot.instance_variable_get(:@domains_path)

    @tmp_dir = Dir.mktmpdir "swot-test"
    Swot.instance_variable_set(:@domains_path, @tmp_dir)
    write_domain_file "students.texas.edu"
    write_domain_file "mit.edu"
  end

  after do
    # Restoring nil is fine: Swot.domains_path recomputes it on next use.
    Swot.instance_variable_set(:@domains_path, @original_domains_path)
    FileUtils.remove_entry @tmp_dir
  end

  # Creates the file for +domain+ inside the temp directory, using the same
  # reversed-label layout Swot reads (e.g. "mit.edu" -> "edu/mit.txt"), and
  # writes +school_name+ into it.
  def write_domain_file(domain, school_name = 'The University')
    parts = domain.split('.').reverse
    parts.last.concat('.txt')
    path = Pathname.new(@tmp_dir).join(*parts)
    path.dirname.mkpath
    path.open('w') { |f| f.puts school_name }
  end

  describe 'all_domains' do
    it 'gets all domains from files on disk' do
      all_domains = Swot.all_domains
      # Minitest's signature is assert_equal(expected, actual).
      assert_equal 2, all_domains.size
      assert_includes all_domains, "students.texas.edu"
      assert_includes all_domains, "mit.edu"
    end
  end

  describe 'each_domain' do
    it 'yields a swot instance for every domain file' do
      domains = []
      Swot.each_domain { |d| domains << d }
      assert_equal 2, domains.size
      assert domains.all? { |d| d.is_a?(Swot) }
      assert_includes domains.map(&:to_s), "students.texas.edu"
      assert_includes domains.map(&:to_s), "mit.edu"
    end
  end
end
data/test/test_swot.rb ADDED
@@ -0,0 +1,122 @@
1
+ # encoding: UTF-8
2
+ require 'helper'
3
+
4
describe Swot do
  # Asserts that Swot.is_academic? accepts every given input.
  def assert_academic(*inputs)
    inputs.each do |input|
      assert Swot.is_academic?(input), "#{input.inspect} should be academic"
    end
  end

  # Asserts that Swot.is_academic? rejects every given input.
  def refute_academic(*inputs)
    inputs.each do |input|
      assert_not Swot.is_academic?(input), "#{input.inspect} should not be academic"
    end
  end

  it "recognizes academic email addresses and domains" do
    # Email addresses
    assert_academic(
      'lreilly@stanford.edu',
      'LREILLY@STANFORD.EDU',
      'Lreilly@Stanford.Edu',
      'lreilly@slac.stanford.edu',
      'lreilly@strath.ac.uk',
      'lreilly@soft-eng.strath.ac.uk',
      'lee@ugr.es',
      'lee@uottawa.ca',
      'lee@mother.edu.ru',
      'lee@ucy.ac.cy',
      'dave.kimura@osu.edu',
      'kimura.13@osu.edu',
      'dave@daffodil.ac',
      'dave@cti.za.ac',
      'dave@lsst.ac',
      'dave@dcc.netpoint.com.bd',
      'dave@student.gatewayhs.com'
    )

    refute_academic(
      'support@driftingruby.com',
      'lee@leerilly.net',
      'lee@gmail.com',
      'lee@stanford.edu.com',
      'lee@strath.ac.uk.com'
    )

    # Bare domains and URLs
    assert_academic(
      'stanford.edu',
      'slac.stanford.edu',
      'www.stanford.edu',
      'http://www.stanford.edu',
      'http://www.stanford.edu:9393',
      'strath.ac.uk',
      'soft-eng.strath.ac.uk',
      'ugr.es',
      'uottawa.ca',
      'mother.edu.ru',
      'ucy.ac.cy'
    )

    refute_academic(
      'leerilly.net',
      'gmail.com',
      'stanford.edu.com',
      'strath.ac.uk.com'
    )

    # Missing or meaningless input
    refute_academic(nil, '', 'the')

    # Surrounding whitespace
    assert_academic(' stanford.edu', 'lee@strath.ac.uk ')
    refute_academic(' gmail.com ')

    assert_academic('lee@stud.uni-corvinus.hu')

    # overkill
    assert_academic('lee@harvard.edu', 'lee@mail.harvard.edu')
  end

  it "returns name of valid institution" do
    assert_match "University of Strathclyde", Swot.get_institution_name('lreilly@cs.strath.ac.uk')
    assert_match "BRG Fadingerstraße Linz, Austria", Swot.get_institution_name('lreilly@fadi.at')
  end

  it "returns nil when institution invalid" do
    assert_not Swot.get_institution_name('foo@shop.com')
  end

  it "test aliased methods" do
    assert Swot.academic?('stanford.edu')
    assert_match "University of Strathclyde", Swot.school_name('lreilly@cs.strath.ac.uk')
  end

  it "fail blacklisted domains" do
    ["si.edu", " si.edu ", "imposter@si.edu", "foo.si.edu", "america.edu"].each do |domain|
      assert_not Swot.is_academic?(domain), "#{domain} should be denied"
    end
    ["student.tsu.edu.ph"].each do |domain|
      assert_not Swot.is_academic?(domain), "#{domain} should be denied"
    end
  end

  it "not err on tld-only domains" do
    # The bare call comes first so that an exception, rather than a wrong
    # answer, is what fails the test if a TLD-only input blows up.
    Swot.is_academic?(".com")
    assert_not Swot.is_academic?(".com")
  end

  it "does not err on invalid domains" do
    assert_not Swot.is_academic?("foo@bar.invalid")
  end

  # The data-directory checks below are disabled.

  # it "contains only text files" do
  #   Dir.glob("lib/domains/**/*") do |file|
  #     if not File.directory?(file)
  #       assert file.end_with?(".txt"), "#{file} should have a .txt extension"
  #     end
  #   end
  # end

  # it "contains no file with an invalid encoding" do
  #   Dir.glob("lib/domains/**/*") do |file|
  #     if not File.directory?(file)
  #       File.open(file, "r") do |fh|
  #         assert fh.read.valid_encoding?, "Invalid encoding for #{file}"
  #       end
  #     end
  #   end
  # end

  # it "contains only file with a single line" do
  #   Dir.glob("lib/domains/**/*") do |file|
  #     if not File.directory?(file)
  #       File.open(file, "r") do |fh|
  #         assert fh.read.lines.count == 1, "#{file} should only contain one line"
  #       end
  #     end
  #   end
  # end
end