te_rex 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0a89389faff81b10c963d6469dfb52e514387813
4
+ data.tar.gz: 443a437ac01c887945eb14af4cc36aee5257977e
5
+ SHA512:
6
+ metadata.gz: a03cd6e851150d5d9fdfced7942b1b7d9e0d20f47c08d5ac81edbd10337568c148b072a41f40b5615c2610d52535a1c6045874cac4c3baee90c18abe58143647
7
+ data.tar.gz: 2efe3da36bb2981b519ef4d5310915f7cb0906f1481a27dca3ab041a2847de0354feca7ec60f9404a5f6837ae0561b4f41b7e3454d892971ba39a64cc0030b86
@@ -0,0 +1,29 @@
1
module TeRex
  module Format
    # Reads a plain-text file and exposes its non-blank lines as
    # whitespace-normalised sentences.
    class BasicFile
      attr_accessor :sentences

      # file_path - path of the text file to read. Nothing is opened until
      #             #scanner is called.
      def initialize(file_path)
        @path = file_path
      end

      # Returns an Array with one String per non-blank line of the file.
      # Each line is split on whitespace and re-joined with single spaces,
      # so leading/trailing whitespace and runs of blanks are collapsed.
      # The result is memoised in @sentences.
      def scanner
        @sentences ||= File.foreach(@path).each_with_object([]) do |raw, collected|
          next if raw.strip.empty?

          collected << raw.split(' ').join(' ')
        end
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
module TeRex
  module Format
    # Reads a file whose tokens look like "word/TAG" and exposes its
    # non-blank lines as sentences with everything after the first '/'
    # of each token removed.
    class BrownFile
      attr_accessor :sentences

      # file_path - path of the tagged text file to read. Nothing is opened
      #             until #scanner is called.
      def initialize(file_path)
        @path = file_path
      end

      # Returns an Array with one String per non-blank line of the file.
      # Each line is split on whitespace, every token is cut at its first
      # '/', and the remaining pieces are joined with single spaces.
      # The result is memoised in @sentences.
      def scanner
        @sentences ||= File.foreach(@path).each_with_object([]) do |raw, collected|
          next if raw.strip.empty?

          untagged = raw.split(' ').map { |token| token.split('/').first }
          collected << untagged.join(' ')
        end
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
module TeRex
  module Format
    # Normalises a category name into the Symbol used as a category key:
    # first character upper-cased, the rest lower-cased ("refund" => :Refund).
    #
    # t - a String or Symbol naming the category.
    #
    # Returns a Symbol.
    # Raises ArgumentError when t is nil.
    def self.category_term(t)
      raise ArgumentError, 'category term must not be nil' if t.nil?

      # to_s first: Symbol has no #intern, so Symbol input used to raise.
      t.to_s.capitalize.to_sym
    end
  end
end
@@ -0,0 +1,36 @@
1
# Helpers that produce characters, numbers and date-like strings for use in
# generating fake data.
module TeRex
  module AlphaNum
    @symbols = ['!','@','#','$','%','^','&','*','(',')','-','+','=','<','>','?','~']

    # Returns a new Array holding 'a'..'z', 'A'..'Z', the punctuation symbols
    # above and the Integers 1..20, in that order.
    def self.gen
      [*('a'..'z'), *('A'..'Z'), *@symbols, *(1..20)]
    end

    # Returns a random date-like String. Day (1-31), month (1-12), a
    # two-digit year (10-99) and a four-digit year (1900-2099) are sampled
    # independently, then one of sixteen orderings/separators is sampled.
    # No calendar validation is done and numbers are not zero-padded.
    def self.date
      day   = (1..31).to_a.sample
      month = (1..12).to_a.sample
      year2 = (10..99).to_a.sample
      year4 = (1900..2099).to_a.sample

      # Three fields followed by the separator that joins them.
      layouts = [
        [day,   month, year2, '-'],
        [day,   month, year2, '/'],
        [day,   month, year4, '/'],
        [day,   month, year4, '-'],
        [year4, month, day,   '-'],
        [year4, month, day,   '/'],
        [year2, month, day,   '/'],
        [year2, month, day,   '-'],
        [month, day,   year2, '-'],
        [month, day,   year2, '/'],
        [month, day,   year4, '/'],
        [month, day,   year4, '-'],
        [year4, day,   month, '-'],
        [year4, day,   month, '/'],
        [year2, day,   month, '/'],
        [year2, day,   month, '-']
      ]

      chosen = layouts.sample
      chosen.first(3).join(chosen.last)
    end
  end
end
@@ -0,0 +1,73 @@
1
+ #
2
+ # Refactor of Lucas Carlson's classifier https://github.com/cardmagic/classifier (Copyright (c) 2005 lucas@rufy.com)
3
+ #
4
module TeRex
  module Classifier
    # Naive Bayes text classifier.
    #
    # Word frequencies are kept per category in @clasif
    # (category Symbol => { word => count }). The number of training
    # documents per category is kept in category_counts, and the number of
    # indexed words seen during training in total_words.
    class Bayes
      attr_accessor :category_counts, :total_words

      # categories - names of the categories to classify into; each one is
      #              normalised with TeRex::Format.category_term.
      def initialize(*categories)
        @clasif = {}
        categories.each { |cat| @clasif[TeRex::Format.category_term(cat)] = {} }
        @total_words = 0
        @category_counts = Hash.new(0)
      end

      # Adds the words of text to a category's frequency table.
      #
      # ctgry - category name (normalised with TeRex::Format.category_term).
      # text  - training document; indexed with BayesData.index_frequency.
      #
      # Raises ArgumentError when the category was not given to the
      # constructor. The check happens before any counter is touched, so a
      # rejected call leaves the classifier unchanged.
      def train(ctgry, text)
        category = TeRex::Format.category_term(ctgry)
        words = @clasif[category]
        raise ArgumentError, "unknown category: #{ctgry.inspect}" if words.nil?

        @category_counts[category] += 1
        BayesData.index_frequency(text).each do |word, count|
          words[word] = words.fetch(word, 0) + count
          @total_words += count
        end
      end

      # Scores text against every category.
      #
      # Returns a Hash of category name (String) => log-score (Float);
      # a higher score is a better match. A category that has no training
      # words scores -Infinity so that it can never outrank a trained one
      # (dividing by its zero word total used to yield +Infinity).
      def classifications(text)
        training_count = @category_counts.values.inject(0) { |sum, n| sum + n }.to_f
        # The index of the text does not depend on the category: build it once.
        words = BayesData.index_frequency(text).keys

        @clasif.each_with_object({}) do |(category, category_words), score|
          name = category.to_s
          total = category_words.values.inject(0) { |sum, n| sum + n }.to_f
          if total.zero?
            score[name] = -Float::INFINITY
            next
          end

          # Words never seen in this category count as 0.1 occurrences.
          likelihood = words.inject(0) do |sum, word|
            sum + Math.log(category_words.fetch(word, 0.1) / total)
          end
          prior = @category_counts.fetch(category, 0.1) / training_count
          score[name] = likelihood + Math.log(prior)
        end
      end

      # Returns the name (String) of the best-scoring category for text, or
      # nil when the classifier has no categories.
      def classify(text)
        best = classifications(text).max_by { |_name, value| value }
        best && best.first
      end

      # Returns the category names as an Array of Strings.
      def categories
        @clasif.keys.map { |c| c.to_s }
      end

      # Describes how well each category is trained.
      #
      # Returns an Array with one entry per category:
      #   [under_trained (true/false), category Symbol, { "description" => {...} }]
      # The threshold is total_words divided by the number of categories
      # that have training documents; a category's ratio is total_words
      # divided by its own document count. A category is flagged when its
      # ratio exceeds the threshold. Both divisions are done in floating
      # point, and a category without documents gets an infinite ratio
      # instead of raising ZeroDivisionError.
      def training_description
        trained = category_counts.keys.count
        max_threshold = trained.zero? ? 0.0 : @total_words.to_f / trained

        @clasif.keys.map do |term|
          cc = category_counts.fetch(term, 0)
          train_ratio = cc.zero? ? Float::INFINITY : @total_words.to_f / cc
          [(train_ratio > max_threshold), term, "description" => {"training_ratio" => "#{train_ratio}", "threshold" => "#{max_threshold}", "category_counts" => "#{cc}", "total_words" => "#{@total_words}"}]
        end
      end

      # Returns the training_description entries that are flagged as
      # under-trained. Despite the trailing '?', the result is an Array
      # (empty when no category is flagged), not a boolean.
      def under_trained?
        training_description.select { |entry| entry.first }
      end
    end
  end
end
@@ -0,0 +1,100 @@
1
+ require 'fast_stemmer'
2
+
3
module TeRex
  module Classifier
    # Text clean-up and word-frequency indexing used by the Bayes classifier.
    # All methods are class-level.
    class BayesData
      class << self
        # Deletes the punctuation characters listed in the pattern below.
        # Characters that are not listed (for example % ' - ~) are kept.
        def remove_punct(s)
          s.gsub(/(\,)|(\?)|(\.)|(\!)|(\;)|(\:)|(\")|(\@)|(\#)|(\$)|(\^)|(\&)|(\*)|(\()|(\))|(\_)|(\=)|(\+)|(\[)|(\])|(\\)|(\|)|(\<)|(\>)|(\/)|(\`)|(\{)|(\})/, '')
        end

        # Deletes every run of digits that is followed by two word
        # characters. That covers ordinals such as 1st, 23rd, 42nd, and also
        # any number of three or more digits.
        def remove_cardinal(s)
          s.gsub(/\d+\w{2}/, '')
        end

        # Replaces date/time-looking tokens (09MAR04, 02-23-14, 2014/03/05,
        # 12:01, digits directly followed by AM/PM, and digits at the start
        # of a line) with the word 'datetime'.
        def date_time(s)
          s.gsub(/(^\d+)|(\s\d+(AM|PM))|(\d{2}\w{3}\d{2})|(\d{2}\:\d{2})|(\d{2,4}\-\d{2,4}-\d{2,4})|(\d{1,3}\/\d{2,4}\/\d{2,4})|(\d+\:\d+)/, 'datetime')
        end

        # Replaces money-looking tokens ($60, 120.00, $423.89) with the word
        # 'moneyterm'.
        def money_term(s)
          s.gsub(/(\$\d+\.\d+)|(\$\d+)|(\d+\.\d+)/, 'moneyterm')
        end

        # Returns a Hash of word Symbol => count that combines the stemmed
        # index with the unstemmed one. Where both contain the same key the
        # unstemmed count is the one kept.
        def index_frequency(text)
          stemmed = clean_stemmed_filtered_index(text)
          unstemmed = clean_filtered_index(text)
          stemmed.merge(unstemmed)
        end

        # Runs the clean-up steps in a fixed order: date/time replacement,
        # money replacement, punctuation removal, then remove_cardinal.
        # Returns the cleaned String.
        def clean(text)
          remove_cardinal(remove_punct(money_term(date_time(text))))
        end

        # Frequency index of the cleaned text: downcased, stop words
        # dropped, words stemmed.
        def clean_stemmed_filtered_index(text)
          stemmed_filtered_index(clean(text).split)
        end

        # Frequency index of the cleaned text: downcased, stop words
        # dropped, no stemming.
        def clean_filtered_index(text)
          filtered_index(clean(text).split)
        end

        # Frequency index of the cleaned text with no downcasing, stemming
        # or stop-word filtering.
        def clean_naive_index(text)
          naive_index(clean(text).split)
        end

        private

        # Counts the stem of every downcased word that is not in
        # TeRex::StopWord::LIST. No minimum word length is applied.
        def stemmed_filtered_index(word_array)
          word_array.each_with_object(Hash.new(0)) do |word, idx|
            lowered = word.downcase
            next if TeRex::StopWord::LIST.include?(lowered)

            idx[lowered.stem.intern] += 1
          end
        end

        # Counts every downcased word that is not in TeRex::StopWord::LIST.
        # No minimum word length is applied.
        def filtered_index(word_array)
          word_array.each_with_object(Hash.new(0)) do |word, idx|
            lowered = word.downcase
            next if TeRex::StopWord::LIST.include?(lowered)

            idx[lowered.intern] += 1
          end
        end

        # Counts every word exactly as given.
        def naive_index(word_array)
          word_array.each_with_object(Hash.new(0)) do |word, idx|
            idx[word.intern] += 1
          end
        end
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
module TeRex
  module Corpus
    # A corpus built from the files matching a glob, split into a training
    # partition and a testing partition.
    class Body
      attr_accessor :files, :sample_size, :training, :testing

      # glob  - pattern handed to Dir[] to find the corpus files.
      # klass - class used to wrap each file; it is instantiated with the
      #         file path and must respond to #scanner, returning the
      #         file's sentences.
      def initialize(glob, klass)
        @glob = glob
        @klass = klass
      end

      # Loads the files and fills the training and testing partitions,
      # which are then available through #training and #testing.
      # Returns the testing sentences.
      def build
        get_files
        # The *_sentences variables are the names this method originally
        # wrote to; they are still set in case something reads them.
        @training = @training_sentences = partition_train
        @testing = @testing_sentences = partition_test
      end

      # Wraps every file matching the glob in klass (once; the list is
      # memoised) and sets sample_size to 75% of the file count, rounded.
      # Paths are sorted so that the split does not depend on the order in
      # which the file system lists them.
      # Returns the Array of wrapped files.
      def get_files
        @files ||= Dir[@glob].sort.map { |path| @klass.new(path) }
        @sample_size = (@files.count * 0.75).round
        @files
      end

      # Sentences of the first sample_size files.
      # Requires get_files (or the files/sample_size writers) to have run.
      def partition_train
        @files.first(@sample_size).map { |file| file.scanner }.flatten
      end

      # Sentences of the files after the first sample_size ones, so the two
      # partitions never share a file. Empty when no file is left over.
      def partition_test
        @files.drop(@sample_size).map { |file| file.scanner }.flatten
      end
    end
  end
end
41
+
42
+
@@ -0,0 +1,68 @@
1
module TeRex
  # Holds the stop-word list used when building filtered word indexes.
  class StopWord
    # Lower-case words to ignore: common function words, month and weekday
    # names with abbreviations, and the am/pm markers.
    LIST = %w[
      a am an and are as at be by do for in into it it's its of so
      than that that's the what what's where which
      january february march april may june july august september
      october november december
      jan feb mar apr aug sept nov dec
      monday mon tuesday tue wednesday wed thursday thur friday fri
      saturday sat sunday sun
      pm am
    ]
  end
end
@@ -0,0 +1,3 @@
1
module TeRex
  # Release number of the te_rex gem.
  VERSION = "0.0.4"
end
data/lib/te_rex.rb ADDED
@@ -0,0 +1,11 @@
1
+ require_relative "format/format"
2
+ require_relative "format/corpus/brown_file"
3
+ require_relative "format/corpus/basic_file"
4
+ require_relative "te_rex/stop_word"
5
+ require_relative "te_rex/alpha_num"
6
+ require_relative "te_rex/bayes_data"
7
+ require_relative "te_rex/bayes"
8
+ require_relative "te_rex/corpus"
9
+
10
# Top-level namespace of the gem; its contents come from the files required
# above.
module TeRex; end
@@ -0,0 +1,18 @@
1
+ require_relative "../lib/te_rex"
2
# Checks that TeRex::AlphaNum.gen contains each expected group of elements.
class AlphaNumTest < MicroTest::Test
  test "generates array of lowercase roman characters" do
    alphabet = TeRex::AlphaNum.gen
    ('a'..'z').each { |letter| assert alphabet.include?(letter) }
  end

  test "generates array of uppercase roman characters" do
    alphabet = TeRex::AlphaNum.gen
    ('A'..'Z').each { |letter| assert alphabet.include?(letter) }
  end

  test "generates array of integers 1-20" do
    alphabet = TeRex::AlphaNum.gen
    (1..20).each { |number| assert alphabet.include?(number) }
  end
end
@@ -0,0 +1,70 @@
1
+ require_relative "../lib/te_rex"
2
# Exercises the text clean-up and indexing helpers of
# TeRex::Classifier::BayesData.
class BayesDataTest < MicroTest::Test

  test "punctuation is removed (except %)" do
    s1 = "This * punctuation se%ntence ).!"
    s2 = "Much $ in @ this } [ sentence too?"
    s3 = "And I$ have c#des in |his one with 100% refund too@>."

    s11 = TeRex::Classifier::BayesData.remove_punct(s1)
    s22 = TeRex::Classifier::BayesData.remove_punct(s2)
    s33 = TeRex::Classifier::BayesData.remove_punct(s3)

    # remove_punct deletes characters without touching the spaces around
    # them, so a removed stand-alone symbol leaves its two spaces behind.
    assert s11 == "This  punctuation se%ntence "
    assert s22 == "Much  in  this   sentence too"
    assert s33 == "And I have cdes in his one with 100% refund too"
  end

  test "datetime is removed and replaced" do
    s1 = "This $140 will be paid on 09/14/2014"
    s2 = "I get $20.00 on 2014-05-21 and on 09MAR04"
    s3 = "I'll pay you $60.21 on 06-20-2014"

    s11 = TeRex::Classifier::BayesData.date_time(s1)
    s22 = TeRex::Classifier::BayesData.date_time(s2)
    s33 = TeRex::Classifier::BayesData.date_time(s3)

    assert s11 == "This $140 will be paid on datetime"
    assert s22 == "I get $20.00 on datetime and on datetime"
    assert s33 == "I'll pay you $60.21 on datetime"
  end

  test "moneyterm is removed and replaced" do
    s1 = "$140 will be paid on 09/14/2014 with $60"
    s2 = "I get $20.00 on 2014-05-21 and on 09MAR04"
    s3 = "You'll make $1234.73 on 06-20-2014"

    s11 = TeRex::Classifier::BayesData.money_term(s1)
    s22 = TeRex::Classifier::BayesData.money_term(s2)
    s33 = TeRex::Classifier::BayesData.money_term(s3)

    assert s11 == "moneyterm will be paid on 09/14/2014 with moneyterm"
    assert s22 == "I get moneyterm on 2014-05-21 and on 09MAR04"
    assert s33 == "You'll make moneyterm on 06-20-2014"
  end

  test "cleaned text does what we want" do
    s1 = "$140 will be paid on 09/14/2014 with $60"
    s2 = "I get $20.00 on 2014-05-21 and on 09MAR04 with %49 and &*%^)"
    s3 = "And I$ have c#des in |his one wi%th 100% refund too@>."

    s11 = TeRex::Classifier::BayesData.clean(s1)
    s22 = TeRex::Classifier::BayesData.clean(s2)
    s33 = TeRex::Classifier::BayesData.clean(s3)

    assert s11 == "moneyterm will be paid on datetime with moneyterm"
    assert s22 == "I get moneyterm on datetime and on datetime with %49 and %"
    # "100" is dropped by remove_cardinal (digits followed by two word chars).
    assert s33 == "And I have cdes in his one wi%th % refund too"
  end

  test "index frequency has correct counts" do
    s = "Here is a sentence $141.34 that that $60 that 123.56 I need & & ^ % $c#@ to check the index is correct and okay."
    result = TeRex::Classifier::BayesData.index_frequency(s)

    assert result[:moneyterm] == 3
    assert result[:sentenc] == 1
    assert result[:sentence] == 1
  end

end
@@ -0,0 +1,130 @@
1
+ require_relative "../lib/te_rex"
2
# Trains a Bayes classifier on a handful of hotel cancellation policies per
# class and checks how it classifies exact, shortened and unrelated texts.
class SparseBayesTest < MicroTest::Test
  # Training documents, grouped by the class they are trained under.
  @@refund = [
    "Free cancellation before 1201 AM on 9/17/14! If you cancel or change your reservation after 1201 AM on 9/17/14 the hotel will charge you for the total cost of your reservation.",
    "ALL RESERVATIONS MUST BE CANCELLED 24 HOURS PRIOR TO HOST TIME UNLESS DEPOSIT REQUIRED IF THIS RESERVATION HAS BEEN MADE ELECTRONICALLY PLEASE CANCEL IT ELECTRONICALLY TO AVOID CONFUSION AND A NO SHOW BILL. POLICY SUBJECT TO CHANGE. .",
    "Free cancellation before 800 PM on 9/20/14! If you cancel or change your reservation after 800 PM on 9/20/14 the hotel will charge you $158. If you cancel or change your reservation after 800 PM on 9/21/14 the hotel will charge you for the total cost of your reservation."
  ]

  @@partrefund = [
    "If you cancel or change your reservation before 1201 AM on 10/21/14 the hotel will charge you $57. If you cancel or change your reservation after 1201 AM on 10/21/14 the hotel will charge you $335. If you cancel or change your reservation after 1201 AM on 10/24/14 the hotel will charge you for the total cost of your reservation.",
    "If you cancel or change your reservation before 1201 AM on 9/10/14 the hotel will charge you $225. If you cancel or change your reservation after 1201 AM on 9/10/14 the hotel will charge you for the total cost of your reservation.",
    "Cancellations or changes made before 4:00 PM Eastern Time on Sep 11, 2014 are subject to a 1 Night Room & Tax penalty. Cancellations or changes made after 4:00 PM Eastern Time on Sep 11, 2014 are subject to a 1 Night Room & Tax penalty. The property makes no refunds for no shows or early checkouts."
  ]

  @@norefund = [
    "This reservation is non-refundable. Cancellations or changes made at any time are subject to a 100% charge.",
    "This rate is non-refundable and cannot be changed or cancelled - if you do choose to change or cancel this booking you will not be refunded any of the payment.",
    "For the room type and rate that you've selected you are not allowed to change or cancel your reservation. If you cancel your room you will still be charged for the full reservation amount."
  ]

  @@unknown = [
    "The cancellation policy will be determined when the rate is validated."
  ]

  # One classifier shared by every test, trained once when the class loads.
  @@cls = TeRex::Classifier::Bayes.new("Refund", "Partrefund", "Nonrefund", "Unknown")
  [
    ["Refund", @@refund],
    ["Partrefund", @@partrefund],
    ["Nonrefund", @@norefund],
    ["Unknown", @@unknown]
  ].each do |label, documents|
    documents.each { |document| @@cls.train(label, document) }
  end

  test "Sparse Data Set Test: Random exact match sould classify correctly" do
    picked = [@@refund, @@partrefund, @@norefund, @@unknown].map { |documents| documents.sample }
    refund_label, partial_label, non_label, unknown_label = picked.map { |document| @@cls.classify(document) }

    assert refund_label == "Refund"
    assert partial_label == "Partrefund"
    assert non_label == "Nonrefund"
    assert unknown_label == "Unknown"

    assert refund_label != "Partrefund"
    assert partial_label != "Refund"
    assert non_label != "Unknown"
    assert unknown_label != "Nonrefund"
  end

  test "Sparse Data Set Test: Non-canonical examples should return unknown" do
    examples = [
      "You will get a full refund and free cancellation",
      "You will get a partial refund and be charged",
      "You will get non refund",
      "You will get a nonsense am I writing here."
    ]

    labels = examples.map { |example| @@cls.classify(example) }

    labels.each { |label| assert label == "Unknown" }
  end

  test "Sparse Data Set Test: Micro examples should return correct classification" do
    refund_label = @@cls.classify("Free cancellation before")
    partial_label = @@cls.classify("If you cancel or change your reservation before")
    non_label = @@cls.classify("non-refund")
    unknown_label = @@cls.classify("policy rate validated.")

    assert refund_label == "Refund"
    assert partial_label == "Partrefund"
    assert non_label == "Nonrefund"
    assert unknown_label == "Unknown"

    assert refund_label != "Partrefund"
    assert partial_label != "Refund"
    assert non_label != "Unknown"
    assert unknown_label != "Nonrefund"
  end

  test "Sparse Data Set Test: Micro examples should NOT match fake classes" do
    first_label = @@cls.classify("free cancellation")
    second_label = @@cls.classify("partial refund")
    third_label = @@cls.classify("no refund")
    fourth_label = @@cls.classify("policy rate validated.")

    assert first_label != "Computers"
    assert second_label != "Science"
    assert third_label != "Entertainment"
    assert fourth_label != "Sports"
  end

  test "Sparse Data Set Test: Category counts are equivalent with number of training data per class" do
    counts = @@cls.category_counts

    assert counts[:Refund] == @@refund.count
    assert counts[:Partrefund] == @@partrefund.count
    assert counts[:Nonrefund] == @@norefund.count
    assert counts[:Unknown] == @@unknown.count
  end

  test "Sparse Data Set Test: All SPARSE Training classes should be undertrained... " do
    flagged = @@cls.under_trained?
    assert flagged.count == 4
  end
end
130
+
@@ -0,0 +1,35 @@
1
module TeRex
  module Train
    # Sample cancellation-policy texts for the non-refundable class.
    # Several texts appear more than once; the repetitions are kept as they
    # are.
    NONREFUND = [
      "This reservation is non-refundable. Cancellations or changes made at any time are subject to a 100% charge.",
      "This reservation is non-refundable. Cancellations made at any time are subject to a 100% charge.",
      "This reservation is non-refundable.",
      "This reservation is non-refundable. Cancellations or changes made at any time are subject to a 100% charge.",
      "This rate is non-refundable and cannot be changed or cancelled - if you do choose to change or cancel this booking you will not be refunded any of the payment.",
      "This reservation is non-refundable. Cancellations made at any time are subject to a 100% charge.",
      "This reservation is non-refundable.",
      "This reservation is non-refundable. Cancellations or changes made at any time are subject to a 100% charge.",
      "This reservation is non-refundable. Cancellations made at any time are subject to a 100% charge.",
      "This reservation is non-refundable.",
      "This rate is non-refundable and cannot be changed or cancelled - if you choose to change or cancel this booking you will not be refunded any of the payment.",
      "This rate is non-refundable and cannot be changed or cancelled - if you do choose to cancel this booking you will not be refunded any of the payment.",
      "This rate is non-refundable and cannot be changed or cancelled - if you do choose to change or cancel this booking you will not be refunded any payment.",
      "This rate is non-refundable and cannot be changed or cancelled - if you do choose to change or cancel this booking you will not be refunded any of the payment.",
      "This rate is non-refundable and cannot be changed or cancelled - if you do choose to change or cancel this booking you will not be refunded any of the payment.",
      "This rate is non-refundable and cannot be changed or cancelled - if you do choose to change or cancel this booking you will not be refunded any of the payment.",
      "For the room type and rate that you've selected you are not allowed to change or cancel your reservation. If you cancel your room you will still be charged for the full reservation amount.",
      "For the room type and rate that you've selected you are not allowed to change or cancel your reservation. If you cancel your room you will still be charged for the full reservation amount.",
      "For the room type and rate that you've selected you are not allowed to change or cancel your reservation. If you cancel your room you will still be charged for the full reservation amount.",
      "This rate is non-refundable and cannot be changed or cancelled - if you do choose to change or cancel this booking you will not be refunded any of the payment.",
      "For the room type and rate that you've selected you are not allowed to change or cancel your reservation. If you cancel your room you will still be charged for the full reservation amount.",
      "For the room type and rate that you've selected you are not allowed to change or cancel your reservation. If you cancel your room you will still be charged for the full reservation amount.",
      "For the room type and rate that you've selected you are not allowed to change or cancel your reservation. If you cancel your room you will still be charged for the full reservation amount.",
      "This reservation is non-refundable. Cancellations or changes made at any time are subject to a 100% charge. Refunds are not available for early check-out.",
      "Non-refundable. Cancellations or changes made at any time are subject to a 100% charge. We are sorry but refunds are not available for early check-out.",
      "This reservation is non-refundable. Cancellations or changes made at any time are subject to a 100% charge. We are sorry but refunds are not available for early check-out.",
      "This reservation is non-refundable. Cancellations or changes made at any time are subject to a 100% charge. We are sorry but refunds are not available for early check-out.",
      "This reservation is non-refundable. Cancellations or changes made at any time are subject to a 100% charge. We are sorry but refunds are not available for early check-out.",
      "This reservation is non-refundable. Cancellations or changes made at any time are subject to a 100% charge. We are sorry but refunds are not available for early check-out."
    ]
  end
end
@@ -0,0 +1,30 @@
1
module TeRex
  module Train
    # Sample cancellation-policy texts for the partial-refund class.
    # Several texts appear more than once; the repetitions are kept as they
    # are.
    PARTREFUND = [
      "If you cancel or change your reservation before 1201 AM on 10/21/14 the hotel will charge you $57. If you cancel or change your reservation after 1201 AM on 10/21/14 the hotel will charge you $335. If you cancel or change your reservation after 1201 AM on 10/24/14 the hotel will charge you for the total cost of your reservation.",
      "If you cancel or change your reservation before 4:00 PM on 9/22/14, the hotel will charge you $388. If you cancel or change your reservation after 4:00 PM on 9/22/14, the hotel will charge you for the total cost of your reservation. ",
      "If you cancel or change your reservation before 1201 AM on 9/10/14 the hotel will charge you $225. If you cancel or change your reservation after 1201 AM on 9/10/14 the hotel will charge you for the total cost of your reservation.",
      "Cancellations or changes made before 4:00 PM Eastern Time on Sep 11, 2014 are subject to a 1 Night Room & Tax penalty. Cancellations or changes made after 4:00 PM Eastern Time on Sep 11, 2014 are subject to a 1 Night Room & Tax penalty. The property makes no refunds for no shows or early checkouts.",
      "If you cancel or change your reservation before 3:00 PM on 07/14/14, the hotel will charge you $189. If you cancel or change your reservation after 3:00 PM on 07/14/14, the hotel will charge you $225. If you cancel or change your reservation after 3:00 PM on 07/17/14 the hotel will charge you for the total cost of your reservation. ",
      "If you cancel or change your reservation before 3:00 PM on 7\\/31\\/14, the hotel will charge you $305. If you cancel or change your reservation after 3:00 PM on 7\\/31\\/14, the hotel will charge you for the total cost of your reservation. ",
      "If you cancel or change your reservation before 4:00 PM on 9/22/14, the hotel will charge you $245. If you cancel or change your reservation after 4:00 PM on 9/22/14, the hotel will charge you for the total cost of your reservation. ",
      "If you cancel or change your reservation before 4:00 PM on 9/24/14, the hotel will charge you $233. If you cancel or change your reservation after 4:00 PM on 9/24/14, the hotel will charge you for the total cost of your reservation. ",
      "If you cancel or change your reservation before 12:00 PM on 1/05/15, the hotel will charge you $86. If you cancel or change your reservation after 12:00 PM on 1/05/15, the hotel will charge you $625. If you cancel or change your reservation after 12:00 PM on 1/07/15 the hotel will charge you for the total cost of your reservation. ",
      "If you cancel or change your reservation before 4:00 PM on 9/21/14, the hotel will charge you $215. If you cancel or change your reservation after 4:00 PM on 9/21/14, the hotel will charge you for the total cost of your reservation. ",
      "If you cancel or change your reservation before 6:00 PM on 04/17/14, the hotel will charge you $187. If you cancel or change your reservation after 6:00 PM on 04/17/14, the hotel will charge you for the total cost of your reservation. ",
      "If you cancel or change your reservation before 16:00 PM on 3/31/14, the hotel will charge you $89. If you cancel or change your reservation after 16:00 PM on 3/31/14,, the hotel will charge you for the total cost of your reservation. ",
      "If you cancel or change your reservation before 6:00 AM on 12/07/14, the hotel will charge you $5675. If you cancel or change your reservation after 6:00 AM on 12/07/14, the hotel will charge you for the total cost of your reservation. ",
      "If you cancel or change your reservation before 3:00 PM on 07/14/14, the hotel will charge you $189. If you cancel or change your reservation after 3:00 PM on 07/14/14, the hotel will charge you $225. If you cancel or change your reservation after 3:00 PM on 07/17/14 the hotel will charge you for the total cost of your reservation. ",
      "If you cancel or change your reservation before 7:00 AM on 06/09/14, the hotel will charge you $509. If you cancel or change your reservation after 7:00 AM on 06/09/14, the hotel will charge you for the total cost of your reservation. ",
      "If you cancel or change your reservation before 9:00 PM on 5/23/14, the hotel will charge you $1018. If you cancel or change your reservation after 9:00 PM on 5/23/14, the hotel will charge you for the total cost of your reservation. ",
      "If you cancel or change your reservation before 10:00 PM on 5/31/14, the hotel will charge you $215. If you cancel or change your reservation after 10:00 PM on 5/31/14, the hotel will charge you $425. If you cancel or change your reservation after 10:00 PM on 6/03/14 the hotel will charge you for the total cost of your reservation. ",
      "If you cancel or change your reservation before 8:00 AM on 12/17/14, the hotel will charge you $15. If you cancel or change your reservation after 8:00 AM on 12/17/14, the hotel will charge you for the total cost of your reservation. ",
      "If you cancel or change your reservation before 12:00 PM on 1/05/15, the hotel will charge you $86. If you cancel or change your reservation after 12:00 PM on 1/05/15, the hotel will charge you $625. If you cancel or change your reservation after 12:00 PM on 1/07/15 the hotel will charge you for the total cost of your reservation. ",
      "If you cancel or change your reservation before 2:00 PM on 11/14/14, the hotel will charge you $57. If you cancel or change your reservation after 2:00 PM on 11/14/14, the hotel will charge you for the total cost of your reservation. ",
      "If you cancel or change your reservation before 2:00 PM on 11/14/14, the hotel will charge you $57. If you cancel or change your reservation after 2:00 PM on 11/14/14, the hotel will charge you for the total cost of your reservation. ",
      "If you cancel or change your reservation before 12:00 PM on 1/05/15, the hotel will charge you $86. If you cancel or change your reservation after 12:00 PM on 1/05/15, the hotel will charge you $625. If you cancel or change your reservation after 12:00 PM on 1/07/15 the hotel will charge you for the total cost of your reservation. ",
      "If you cancel or change your reservation before 4:00 PM on 2/21/15, the hotel will charge you $115. If you cancel or change your reservation after 4:00 PM on 2/21/15, the hotel will charge you for the total cost of your reservation. ",
      "If you cancel or change your reservation before 4:00 PM on 2/14/15, the hotel will charge you $318. If you cancel or change your reservation after 4:00 PM on 2/14/15, the hotel will charge you for the total cost of your reservation. ",
    ]
  end
end
@@ -0,0 +1,32 @@
1
+ module TeRex
2
+ module Train # Namespace holding the training-sample constants read by the classifier tests.
3
+ REFUND = [ # Cancellation-policy texts; TrainedBayesTest feeds each entry to the classifier under the "Refund" category.
4
+ "Free cancellation before 1201 AM on 9/17/14! If you cancel or change your reservation after 1201 AM on 9/17/14 the hotel will charge you for the total cost of your reservation. ",
5
+ "Free cancellation before 12:01 AM on 10/29/14! If you cancel or change your reservation after 12:01 AM on 10/29/14, the hotel will charge you $194. If you cancel or change your reservation after 12:01 AM on 10/30/14, the hotel will charge you for the total cost of your reservation.",
6
+ "If you cancel or change your reservation after 12:01 AM on 9/12/14, the hotel will charge you for the total cost of your reservation. ",
7
+ "Cancellations or changes made within 1 day prior to 4:00 PM local hotel time on the day of arrival are subject to a $142.93 charge. Cancellations or changes made after 4:00 PM local hotel time on the day of arrival are subject to a 100% charge. We are sorry but refunds are not available for early check-out.",
8
+ "We understand that sometimes your travel plans change. We do not charge a change or cancel fee. However, this property (Courtyard by Marriott Traverse City) imposes the following penalty to its customers that we are required to pass on: Cancellations or changes made after 5:00 PM ((GMT-05:00) Eastern Time (US &amp; Canada)) on Jul 9, 2014 are subject to a 1 Night Room &amp; Tax penalty. The property makes no refunds for no shows or early checkouts.",
9
+ "Free cancellation before 800 PM on 9/20/14! If you cancel or change your reservation after 800 PM on 9/20/14 the hotel will charge you $158. If you cancel or change your reservation after 800 PM on 9/21/14 the hotel will charge you for the total cost of your reservation. ",
10
+ "Any cancellation received within 2 days prior to arrival date will incur the first night charge. Failure to arrive at your hotel will be treated as a No-Show and will incur the first night charge (Hotel policy).",
11
+ "For the room type you've selected you can cancel your reservation for a full refund up until noon on Friday September 12th (local hotel time). If you decide to cancel your reservation anytime between noon on Friday September 12th and noon on Saturday September 13th (local hotel time) the hotel requires payment for the first night's stay. You will be charged for the first night's stay including taxes and fees. Any remaining amount will be refunded to you. Refunds or cancellations are not available after noon local hotel time on your day of arrival (Saturday September 13th). ",
12
+ "THIS PROPERTY REQUIRES A NOTIFICATION OF CANCELLATION BY 4PM HOTEL TIME 1 DAY PRIOR TO ARRIVAL TO AVOID A PENALTY.",
13
+ "You can cancel free of charge up until the cancellation window. Cancellations or changes made after 4:00 PM Eastern Time on Sep 12, 2014 are subject to a 1 Night Room & Tax penalty. The property makes no refunds for no shows or early checkouts.",
14
+ "For the room type you've selected you can cancel your reservation for a full refund up until noon on Monday September 15th (local hotel time). If you decide to cancel your reservation anytime between noon on Monday September 15th and noon on Wednesday September 17th (local hotel time) the hotel requires payment for the first night's stay. You will be charged for the first night's stay including taxes and fees. Any remaining amount will be refunded to you. Refunds or cancellations are not available after noon local hotel time on your day of arrival (Wednesday September 17th).",
15
+ "-08FEB04 - END - CANCEL BY 0 DAYS PRIOR TO ARRIVAL, LOCAL HOTEL TIME TO AVOID A CANCELLATION PENALTY CANCEL POLICIES AND EARLY DEPARTURE FEES VARY BY HOTEL. SINCE A HOTEL CAN SET A CANCELLATION POLICY OF UP TO 30 DAYS IN ADVANCE PLEASE REVIEW POLICY PRIOR TO BOOKING TO AVOID POSSIBLE CHARGE.",
16
+ "IF CANCELLATION IS NECESSARY, TO AVOID BILLING YOU MUST RECEIVE A CANCEL NUMBER BY 6 PM ON THE DATE OF ARRIVAL. CANCEL TIMES AND DATES MAY VARY. CONTACT THE RESERVED MOTEL 6 FOR SPECIFIC POLICIES THAT MAY AFFECT YOUR RESERVATION. . -EARLY CHECKOUT POLICY -CONTACT LOCATION FOR MORE INFORMATION. .",
17
+ "-1800 HOTEL TIME, DAY OF ARRIVAL TO AVOID BILLING OF 1 NIGHT ROOM AND TAX OR FORFEITURE OF DEPOSIT",
18
+ "RESERVATIONS MUST BE CANCELLED 1DAY/24HOURS PRIOR TO ARRIVAL TO AVOID A PENALTY OF ONE NIGHT ROOM AND TAX. INDIVIDUAL PLANS MAY HAVE VARING CANCEL POLICIES. EARLY DEPARTURE FEE 1NIGHTS STAY. .",
19
+ "06NOV03 - END - CANCEL BY 1 DAYS PRIOR TO 1600 HOURS ON DAY OF ARRIVAL, LOCAL HOTEL TIME TO AVOID A CANCELLATION PENALTY CANCELLATION POLICY TEXT - DUE TO SEASONAL VARIATIONS, MOST ACCURATE CANCEL POLICY RETURNED UPON BOOKINGS. INTERNATIONAL MAXIMUM 3 ROOMS PER RESERVATION. MAXIMUM 3 PERSONS PER ROOM ONLY IF A ROLLAWAY IS AVAILABLE. DOMESTIC MAXIMUM 5 ROOMS PER RESERVATIONS. MAXIMUM 5 PERSONS PER ROOM ONLY IF ROLLAWAY IS AVAILABLE.",
20
+ "RESERVATIONS MUST BE CANCELED *24 HOURS* PRIOR TO ARRIVAL DATE TO AVOID A PENALTY OF ONE NIGHT ROOM AND TAX CHARGE INDIVIDUAL PLANS MAY VARY SEE PLAN DISPLAY FOR MORE INFORMATION **100USD EARLY DEPARTURE FEE** .",
21
+ "ALL RESERVATIONS MUST BE CANCELLED 24 HOURS PRIOR TO HOST TIME UNLESS DEPOSIT REQUIRED IF THIS RESERVATION HAS BEEN MADE ELECTRONICALLY PLEASE CANCEL IT ELECTRONICALLY TO AVOID CONFUSION AND A NO SHOW BILL. POLICY SUBJECT TO CHANGE. .",
22
+ "ALL RESERVATIONS MUST BE CANCELLED 48 HOURS PRIOR TO ARRIVAL HOST TIME IF THIS RESERVATION HAS BEEN MADE ELECTRONICALLY PLEASE CANCEL IT ELECTRONICALLY TO AVOID CONFUSION AND A NO SHOW BILL POLICY SUBJECT TO CHANGE .",
23
+ "IF CANCELLATION IS NECESSARY, TO AVOID BILLING YOU MUST RECEIVE A CANCEL NUMBER BY 6 PM ON THE DATE OF ARRIVAL. CANCEL TIMES AND DATES MAY VARY. CONTACT THE RESERVED MOTEL 6 FOR SPECIFIC POLICIES THAT MAY AFFECT YOUR RESERVATION. . -EARLY CHECKOUT POLICY -CONTACT LOCATION FOR MORE INFORMATION. .o",
24
+ "RESERVATIONS MUST BE CANCELED *72 HOURS* PRIOR TO ARRIVAL LOCAL HOTEL TIME TO AVOID A PENALTY OF ONE NIGHT ROOM AND TAX CHARGE INDIVIDAUL PLANS MAY VARY SEE PLAN DISPLAY FOR MORE INFORMATION **75USD EARLY DEPARTURE FEE**",
25
+ "- 12 NOON HOTEL TIME DAY OF ARRIVAL TO AVOID BILLING OF 1NT ROOM AND TAX OR FORFEITURE OF DEPOSIT",
26
+ "RESERVATIONS MUST BE CANCELLED BY 1600/4PM DAY OF ARRIVAL LOCAL HOTEL TIME TO AVOID A PENALTY OF ONE NIGHT ROOM AND TAX CHARGE. NOTE- INDIVIDUAL PLANS MAY VARY. PLEASE READ INDIVIDUAL DISPLAY FOR MORE INFORMATION. **50.00USD EARLY DEPARTURE FEE** .",
27
+ "- RESERVATIONS BOOKED 8 DAYS OR MORE PRIOR TO ARRIVAL MUST BE CANCELLED 7 DAYS PRIOR TO ARRIVAL TO RECEIVE REFUND - RESERVATIONS BOOKED WITHIN 7 DAYS OF ARRIVAL FORFEIT DEPOSIT OF 3 NIGHTS ROOM AND TAX",
28
+ "ALL RESERVATIONS MUST BE CANCELLED 24 HOURS PRIOR TO HOST TIME UNLESS DEPOSIT REQUIRED IF THIS RESERVATION HAS BEEN MADE ELECTRONICALLY PLEASE CANCEL IT ELECTRONICALLY TO AVOID CONFUSION AND A NO SHOW BILL POLICY SUBJECT TO CHANGE .",
29
+ "PLEASE REFER TO RATE DETAILS AT THE TIME OF BOOKING FOR CANCELLATION POLICY. PLEASE NOTE THAT POLICY IS SUBJECT TO CHANGES WITHOUT NOTICE."
30
+ ]
31
+ end
32
+ end
@@ -0,0 +1,9 @@
1
+ module TeRex
2
+ module Train # Namespace holding the training-sample constants read by the classifier tests.
3
+ UNKNOWN = [ # Policy texts that state no concrete refund terms; TrainedBayesTest trains the "Unknown" category with each entry.
4
+ "The cancellation policy will be determined when the rate is validated.",
5
+ "-CANCEL POLICY MAY VARY BY DAY OF WEEK AND SEASON. THE MOST ACCURATE CANCEL POLICY IS ADVISED DURING BOOKING PROCESS. IN CASE OF A NO-SHOW THE CREDIT CARD WILL BE CHARGED ONE NIGHT STAY. OUR SYSTEM ACKNOWLEDGES ALL PROPERLY CANCELED RESERVATIONS BY RETURNING A CANCELLATION NUMBER. DO NOT ASSUME YOUR RESERVATION IS CANCELED IF YOU HAVE NOT RECEIVED A CANCELLATION NUMBER IN YOUR PNR OR BOOKING FILE. IF YOU DO NOT RECEIVE A CANCELLATION NUMBER, PLEASE CALL THE CHOICE GDS DEPARTMENT AT 1-866-953-4570",
6
+ "-14JAN02 - END - CANCEL POLICIES VARY BY HOTEL. SINCE A HOTEL CAN SET A CANCELLATION POLICY OF UP TO 30 DAYS IN ADVANCE, PLEASE REVIEW POLICY PRIOR TO BOOKING TO AVOID POSSIBLE CHARGE."
7
+ ]
8
+ end
9
+ end
@@ -0,0 +1,140 @@
1
+ require_relative "../lib/te_rex"
2
+ class TrainedBayesTest < MicroTest::Test
3
+
4
+ #Dir["#{File.dirname(__FILE__)}/test_modules/**/*.rb"].each { |f| load(f) if !!(f =~ /^[^\.].+\.rb/)}
5
+
6
+ @@refund = TeRex::Train::REFUND
7
+ @@partrefund = TeRex::Train::PARTREFUND
8
+ @@norefund = TeRex::Train::NONREFUND
9
+ @@unknown = TeRex::Train::UNKNOWN
10
+
11
+ @@cls = TeRex::Classifier::Bayes.new("Refund", "Partrefund", "Nonrefund", "Unknown")
12
+ @@refund.each {|txt| @@cls.train("Refund", txt) }
13
+ @@partrefund.each {|txt| @@cls.train("Partrefund", txt) }
14
+ @@norefund.each {|txt| @@cls.train("Nonrefund", txt) }
15
+ @@unknown.each {|txt| @@cls.train("Unknown", txt) }
16
+
17
+ test "Training Data Set Test: Random exact match sould classify correctly (but we are lenient on partrefund/refund)" do
18
+
19
+ s_refund = @@refund.sample
20
+ s_partial = @@partrefund.sample
21
+ s_non = @@norefund.sample
22
+ s_unk = @@unknown.sample
23
+
24
+ s_refund1 = @@cls.classify(s_refund)
25
+ s_partial1 = @@cls.classify(s_partial)
26
+ s_non1 = @@cls.classify(s_non)
27
+ s_unk1= @@cls.classify(s_unk)
28
+
29
+ # We are lenient on Partrefund || Refund but we still want to see when it fails
30
+ assert s_refund1 == "Refund" || "Partrefund"
31
+ # We are lenient on Refund || Partrefund because of the non-distinctness of the two.
32
+ assert s_partial1 == "Partrefund" || "Refund"
33
+ assert s_non1 == "Nonrefund"
34
+ assert s_unk1 == "Unknown"
35
+
36
+ # We are lenient on Partrefund || Refund but we still want to see when it fails
37
+ assert s_refund1 != "Partrefund"
38
+ # We are lenient on Refund || Partrefund but we still want to see when it fails
39
+ assert s_partial1 != "Refund"
40
+ assert s_non1 != "Unknown"
41
+ assert s_unk1 != "Nonrefund"
42
+ end
43
+
44
+
45
+ test "Training Data Set Test: Non-canonical examples should classify correctly" do
46
+
47
+ refund_s1 = "You will get a full refund and free cancellation"
48
+ partrefund_s1 = "You will get a refund if you cancel or change your reservation before 0201 AM on 01/31/14"
49
+ norefund_s1 = "You will get a non-refund"
50
+ unk_s1 = "You will get a nonsense am I writing here."
51
+
52
+ refund_s11 = @@cls.classify(refund_s1)
53
+ partrefund_s11 = @@cls.classify(partrefund_s1)
54
+ norefund_s11 = @@cls.classify(norefund_s1)
55
+ unk_s11 = @@cls.classify(unk_s1)
56
+
57
+ assert refund_s11 == "Refund"
58
+ assert partrefund_s11 == "Partrefund"
59
+ assert norefund_s11 == "Nonrefund"
60
+ assert unk_s11 == "Unknown"
61
+ end
62
+
63
+ test "Training Data Set Test: Micro examples should return correct classification" do
64
+
65
+ s1 = "free cancellation"
66
+ s2 = "If you cancel or change your reservation before"
67
+ s3 = "non-refund"
68
+ s4 = "policy rate validated."
69
+
70
+ s11 = @@cls.classify(s1)
71
+ s22 = @@cls.classify(s2)
72
+ s33 = @@cls.classify(s3)
73
+ s44 = @@cls.classify(s4)
74
+
75
+ assert s11 == "Refund"
76
+ assert s22 == "Partrefund"
77
+ assert s33 == "Nonrefund"
78
+ assert s44 == "Unknown"
79
+
80
+ assert s11 != "Partrefund"
81
+ assert s22 != "Refund"
82
+ assert s33 != "Unknown"
83
+ assert s44 != "Nonrefund"
84
+ end
85
+
86
+ test "Training Data Set Test: Micro examples should NOT match fake classes" do
87
+
88
+ s1 = "free cancellation"
89
+ s2 = "partial refund"
90
+ s3 = "no refund"
91
+ s4 = "policy rate validated."
92
+
93
+ s11 = @@cls.classify(s1)
94
+ s22 = @@cls.classify(s2)
95
+ s33 = @@cls.classify(s3)
96
+ s44 = @@cls.classify(s4)
97
+
98
+ assert s11 != "Computers"
99
+ assert s22 != "Science"
100
+ assert s33 != "Entertainment"
101
+ assert s44 != "Sports"
102
+ end
103
+
104
+ test "Training Data Set Test: Ambiguous examples should return 'Unknown'" do
105
+
106
+ s1 = "gobbly goop droop blithely toadwakle Grimpleshtein uf Varendorrf vun muscilaty"
107
+ s2 = "The United States announced on Tuesday it will send 3,000 troops to help tackle the Ebola outbreak as part of a ramped-up plan, including a major deployment in Liberia."
108
+ s3 = "United Parcel Service Inc is almost doubling the number of seasonal employees it hires for this year's holiday shopping season as it aims to avoid a repeat of last year's network breakdown."
109
+ s4 = "Alberto Contador wrapped up his third Vuelta a España triumph when he comfortably held on to his overall lead in the 21st and final stage time trial in a rain-soaked Santiago de Compostela on Sunday."
110
+
111
+ s11 = @@cls.classify(s1)
112
+ s22 = @@cls.classify(s2)
113
+ s33 = @@cls.classify(s3)
114
+ s44 = @@cls.classify(s4)
115
+
116
+ assert s11 == "Unknown"
117
+ assert s22 == "Unknown"
118
+ assert s33 == "Unknown"
119
+ assert s44 == "Unknown"
120
+ end
121
+
122
+ test "Training Data Set Test: Category counts are equivalent with number of training data per class" do
123
+
124
+ assert @@cls.category_counts[:Refund] == @@refund.count
125
+ assert @@cls.category_counts[:Partrefund] == @@partrefund.count
126
+ assert @@cls.category_counts[:Nonrefund] == @@norefund.count
127
+ assert @@cls.category_counts[:Unknown] == @@unknown.count
128
+
129
+ end
130
+
131
+ test "Sparse Data Set Test: Training categories should NOT be undertrained... except 'Unknown'" do
132
+ info = @@cls.training_description
133
+ puts "\nUndertraining data for SPARSE DATA SET: #{info}"
134
+ res = @@cls.under_trained?
135
+ assert res[0].include? :Unknown
136
+ end
137
+
138
+ end
139
+
140
+
metadata ADDED
@@ -0,0 +1,231 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: te_rex
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.4
5
+ platform: ruby
6
+ authors:
7
+ - Joshua Bowles
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-09-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: fast-stemmer
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.0.2
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '1.0'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.0.2
33
+ - !ruby/object:Gem::Dependency
34
+ name: bundler
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '1.5'
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: 1.5.3
43
+ type: :development
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: '1.5'
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 1.5.3
53
+ - !ruby/object:Gem::Dependency
54
+ name: rake
55
+ requirement: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - "~>"
58
+ - !ruby/object:Gem::Version
59
+ version: '10.3'
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: 10.3.2
63
+ type: :development
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - "~>"
68
+ - !ruby/object:Gem::Version
69
+ version: '10.3'
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: 10.3.2
73
+ - !ruby/object:Gem::Dependency
74
+ name: micro_test
75
+ requirement: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - "~>"
78
+ - !ruby/object:Gem::Version
79
+ version: '0.4'
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: 0.4.4
83
+ type: :development
84
+ prerelease: false
85
+ version_requirements: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '0.4'
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: 0.4.4
93
+ - !ruby/object:Gem::Dependency
94
+ name: pry
95
+ requirement: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - "~>"
98
+ - !ruby/object:Gem::Version
99
+ version: '0.10'
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: 0.10.1
103
+ type: :development
104
+ prerelease: false
105
+ version_requirements: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: '0.10'
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ version: 0.10.1
113
+ - !ruby/object:Gem::Dependency
114
+ name: pry-debugger
115
+ requirement: !ruby/object:Gem::Requirement
116
+ requirements:
117
+ - - "~>"
118
+ - !ruby/object:Gem::Version
119
+ version: '0.2'
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ version: 0.2.3
123
+ type: :development
124
+ prerelease: false
125
+ version_requirements: !ruby/object:Gem::Requirement
126
+ requirements:
127
+ - - "~>"
128
+ - !ruby/object:Gem::Version
129
+ version: '0.2'
130
+ - - ">="
131
+ - !ruby/object:Gem::Version
132
+ version: 0.2.3
133
+ - !ruby/object:Gem::Dependency
134
+ name: pry-rescue
135
+ requirement: !ruby/object:Gem::Requirement
136
+ requirements:
137
+ - - "~>"
138
+ - !ruby/object:Gem::Version
139
+ version: '1.4'
140
+ - - ">="
141
+ - !ruby/object:Gem::Version
142
+ version: 1.4.1
143
+ type: :development
144
+ prerelease: false
145
+ version_requirements: !ruby/object:Gem::Requirement
146
+ requirements:
147
+ - - "~>"
148
+ - !ruby/object:Gem::Version
149
+ version: '1.4'
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: 1.4.1
153
+ - !ruby/object:Gem::Dependency
154
+ name: pry-stack_explorer
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - "~>"
158
+ - !ruby/object:Gem::Version
159
+ version: '0.4'
160
+ - - ">="
161
+ - !ruby/object:Gem::Version
162
+ version: 0.4.9.1
163
+ type: :development
164
+ prerelease: false
165
+ version_requirements: !ruby/object:Gem::Requirement
166
+ requirements:
167
+ - - "~>"
168
+ - !ruby/object:Gem::Version
169
+ version: '0.4'
170
+ - - ">="
171
+ - !ruby/object:Gem::Version
172
+ version: 0.4.9.1
173
+ description: Simple text processing for small data sets.
174
+ email:
175
+ - jbowayles@gmail.com
176
+ executables: []
177
+ extensions: []
178
+ extra_rdoc_files: []
179
+ files:
180
+ - lib/format/corpus/basic_file.rb
181
+ - lib/format/corpus/brown_file.rb
182
+ - lib/format/format.rb
183
+ - lib/te_rex.rb
184
+ - lib/te_rex/alpha_num.rb
185
+ - lib/te_rex/bayes.rb
186
+ - lib/te_rex/bayes_data.rb
187
+ - lib/te_rex/corpus.rb
188
+ - lib/te_rex/stop_word.rb
189
+ - lib/te_rex/version.rb
190
+ - test/alpha_num_test.rb
191
+ - test/bayes_data_test.rb
192
+ - test/sparse_bayes_test.rb
193
+ - test/test_modules/nonrefund.rb
194
+ - test/test_modules/partrefund.rb
195
+ - test/test_modules/refund.rb
196
+ - test/test_modules/unknown.rb
197
+ - test/trained_bayes_test.rb
198
+ homepage: ''
199
+ licenses:
200
+ - MIT
201
+ metadata: {}
202
+ post_install_message:
203
+ rdoc_options: []
204
+ require_paths:
205
+ - lib
206
+ required_ruby_version: !ruby/object:Gem::Requirement
207
+ requirements:
208
+ - - ">="
209
+ - !ruby/object:Gem::Version
210
+ version: '0'
211
+ required_rubygems_version: !ruby/object:Gem::Requirement
212
+ requirements:
213
+ - - ">="
214
+ - !ruby/object:Gem::Version
215
+ version: '0'
216
+ requirements: []
217
+ rubyforge_project:
218
+ rubygems_version: 2.4.1
219
+ signing_key:
220
+ specification_version: 4
221
+ summary: Basic NLP stuff for small data sets. Naive Bayes classification and corpora
222
+ loading.
223
+ test_files:
224
+ - test/alpha_num_test.rb
225
+ - test/bayes_data_test.rb
226
+ - test/sparse_bayes_test.rb
227
+ - test/test_modules/nonrefund.rb
228
+ - test/test_modules/partrefund.rb
229
+ - test/test_modules/refund.rb
230
+ - test/test_modules/unknown.rb
231
+ - test/trained_bayes_test.rb