te_rex 0.0.11 → 0.0.12
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/format/basic_file.rb +3 -2
- data/lib/format/brown_file.rb +3 -2
- data/lib/format/error_file.rb +3 -2
- data/lib/format/format.rb +1 -1
- data/lib/te_rex/bayes.rb +1 -2
- data/lib/te_rex/corpus.rb +11 -11
- data/lib/te_rex/version.rb +1 -1
- data/test/corpus_test.rb +5 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: eba4b0e904897695a6ffe59f676cd41129d9f9d3
|
4
|
+
data.tar.gz: b9714ec30ca138fed335894c95503c113c8e605d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2f0e6f419f099c3f485c1b829d1e5421935092af35fc2184fdd9a70de0f7127cbf95bcfc100e6e8f0d0efc7eeda15f4b8cc5f696211c852a92933220ae2894ff
|
7
|
+
data.tar.gz: 3f017ed552f0fdcb0ba89a0ffcc188f76c13780571b67ca4edd254980605ff5166af536610da2d3b2570b8bb5c39a3b1e0dc57d1873c1de89b370447b2426859
|
data/lib/format/basic_file.rb
CHANGED
@@ -2,10 +2,11 @@ module TeRex
|
|
2
2
|
module Format
|
3
3
|
class BasicFile
|
4
4
|
|
5
|
-
attr_reader :sentences, :path
|
5
|
+
attr_reader :sentences, :path, :category
|
6
6
|
|
7
|
-
def initialize(file_path)
|
7
|
+
def initialize(file_path, klass)
|
8
8
|
@path = file_path
|
9
|
+
@category = klass
|
9
10
|
end
|
10
11
|
|
11
12
|
# Each line of file with Array object,
|
data/lib/format/brown_file.rb
CHANGED
@@ -2,10 +2,11 @@ module TeRex
|
|
2
2
|
module Format
|
3
3
|
class BrownFile
|
4
4
|
|
5
|
-
|
5
|
+
attr_reader :sentences, :path, :category
|
6
6
|
|
7
|
-
def initialize(file_path)
|
7
|
+
def initialize(file_path, klass)
|
8
8
|
@path = file_path
|
9
|
+
@category = klass
|
9
10
|
end
|
10
11
|
|
11
12
|
# Each line of file with Array object,
|
data/lib/format/error_file.rb
CHANGED
@@ -3,12 +3,13 @@ module TeRex
|
|
3
3
|
require 'csv'
|
4
4
|
class ErrorFile
|
5
5
|
|
6
|
-
attr_reader :sentences, :path
|
6
|
+
attr_reader :sentences, :path, :category
|
7
7
|
|
8
8
|
@@csv_conf = {:headers => true}
|
9
9
|
|
10
|
-
def initialize(file_path)
|
10
|
+
def initialize(file_path, klass)
|
11
11
|
@path = file_path
|
12
|
+
@category = klass
|
12
13
|
end
|
13
14
|
|
14
15
|
# Each row of csv as Array object, strip it and return
|
data/lib/format/format.rb
CHANGED
data/lib/te_rex/bayes.rb
CHANGED
@@ -5,8 +5,7 @@ module TeRex
|
|
5
5
|
module Classifier
|
6
6
|
class Bayes
|
7
7
|
|
8
|
-
|
9
|
-
attr_reader :messages
|
8
|
+
attr_reader :category_counts, :total_words, :messages
|
10
9
|
|
11
10
|
# categories = [{:tag => "Thing1", :msg => "Thing1 message"}, {:tag => "Thing2", :msg => "Thing2 message"}]
|
12
11
|
# initialize({:tag => "Refund", :msg => "You'll get a refund"}, {:tag => "Nonrefund", :msg => "You won't get a refund"})
|
data/lib/te_rex/corpus.rb
CHANGED
@@ -30,18 +30,24 @@ module TeRex
|
|
30
30
|
end
|
31
31
|
|
32
32
|
def sentence_partition
|
33
|
-
#super_set = build_superset
|
34
33
|
corpus_set = partition_files_for_sentences
|
35
|
-
#@sample_size = (superset.count.to_f * 0.75).round
|
36
|
-
@sample_size = 0.0
|
37
34
|
@training = partition_training_by_sentence(corpus_set)
|
38
35
|
@testing = partition_test_by_sentence(corpus_set)
|
39
|
-
count_all
|
36
|
+
c = count_all
|
37
|
+
@sample_size = (c.to_f * 0.75)
|
38
|
+
c
|
39
|
+
end
|
40
|
+
|
41
|
+
def build_superset
|
42
|
+
@set.reduce([]) do |memo,formatter|
|
43
|
+
memo << formatter.sentences
|
44
|
+
end.flatten
|
40
45
|
end
|
41
46
|
|
47
|
+
private
|
42
48
|
def define_set
|
43
49
|
@set ||= Dir[@glob].map do |file|
|
44
|
-
@format_klass.new(file)
|
50
|
+
@format_klass.new(file, @category_klass)
|
45
51
|
end
|
46
52
|
@set
|
47
53
|
end
|
@@ -72,12 +78,6 @@ module TeRex
|
|
72
78
|
c_set.sample(c_set.count * 0.25)
|
73
79
|
end
|
74
80
|
|
75
|
-
def build_superset
|
76
|
-
@set.reduce([]) do |memo,formatter|
|
77
|
-
memo << formatter.sentences
|
78
|
-
end.flatten
|
79
|
-
end
|
80
|
-
|
81
81
|
def count_all
|
82
82
|
counter = 0
|
83
83
|
@set.map{|f| counter += f.sentences.count}
|
data/lib/te_rex/version.rb
CHANGED
data/test/corpus_test.rb
CHANGED
@@ -48,6 +48,11 @@ class CorpusTest < MicroTest::Test
|
|
48
48
|
assert (22...27).map{|i| i}.include?((ratio * 100).to_i)
|
49
49
|
end
|
50
50
|
|
51
|
+
test "sample size equals size of training set" do
|
52
|
+
sample = @@sent_corpus.total_sentences.to_f * 0.75
|
53
|
+
assert @@sent_corpus.sample_size == sample
|
54
|
+
end
|
55
|
+
|
51
56
|
test "sentence counts are correct" do
|
52
57
|
assert @@sent_corpus.set.count == 3
|
53
58
|
assert @@sent_corpus.training.count == 9
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: te_rex
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.11
|
4
|
+
version: 0.0.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Joshua Bowles
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-11-
|
11
|
+
date: 2014-11-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: fast-stemmer
|