te_rex 0.0.11 → 0.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/format/basic_file.rb +3 -2
- data/lib/format/brown_file.rb +3 -2
- data/lib/format/error_file.rb +3 -2
- data/lib/format/format.rb +1 -1
- data/lib/te_rex/bayes.rb +1 -2
- data/lib/te_rex/corpus.rb +11 -11
- data/lib/te_rex/version.rb +1 -1
- data/test/corpus_test.rb +5 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: eba4b0e904897695a6ffe59f676cd41129d9f9d3
|
4
|
+
data.tar.gz: b9714ec30ca138fed335894c95503c113c8e605d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2f0e6f419f099c3f485c1b829d1e5421935092af35fc2184fdd9a70de0f7127cbf95bcfc100e6e8f0d0efc7eeda15f4b8cc5f696211c852a92933220ae2894ff
|
7
|
+
data.tar.gz: 3f017ed552f0fdcb0ba89a0ffcc188f76c13780571b67ca4edd254980605ff5166af536610da2d3b2570b8bb5c39a3b1e0dc57d1873c1de89b370447b2426859
|
data/lib/format/basic_file.rb
CHANGED
@@ -2,10 +2,11 @@ module TeRex
|
|
2
2
|
module Format
|
3
3
|
class BasicFile
|
4
4
|
|
5
|
-
attr_reader :sentences, :path
|
5
|
+
attr_reader :sentences, :path, :category
|
6
6
|
|
7
|
-
def initialize(file_path)
|
7
|
+
def initialize(file_path, klass)
|
8
8
|
@path = file_path
|
9
|
+
@category = klass
|
9
10
|
end
|
10
11
|
|
11
12
|
# Each line of file with Array object,
|
data/lib/format/brown_file.rb
CHANGED
@@ -2,10 +2,11 @@ module TeRex
|
|
2
2
|
module Format
|
3
3
|
class BrownFile
|
4
4
|
|
5
|
-
|
5
|
+
attr_reader :sentences, :path, :category
|
6
6
|
|
7
|
-
def initialize(file_path)
|
7
|
+
def initialize(file_path, klass)
|
8
8
|
@path = file_path
|
9
|
+
@category = klass
|
9
10
|
end
|
10
11
|
|
11
12
|
# Each line of file with Array object,
|
data/lib/format/error_file.rb
CHANGED
@@ -3,12 +3,13 @@ module TeRex
|
|
3
3
|
require 'csv'
|
4
4
|
class ErrorFile
|
5
5
|
|
6
|
-
attr_reader :sentences, :path
|
6
|
+
attr_reader :sentences, :path, :category
|
7
7
|
|
8
8
|
@@csv_conf = {:headers => true}
|
9
9
|
|
10
|
-
def initialize(file_path)
|
10
|
+
def initialize(file_path, klass)
|
11
11
|
@path = file_path
|
12
|
+
@category = klass
|
12
13
|
end
|
13
14
|
|
14
15
|
# Each row of csv as Array object, strip it and return
|
data/lib/format/format.rb
CHANGED
data/lib/te_rex/bayes.rb
CHANGED
@@ -5,8 +5,7 @@ module TeRex
|
|
5
5
|
module Classifier
|
6
6
|
class Bayes
|
7
7
|
|
8
|
-
|
9
|
-
attr_reader :messages
|
8
|
+
attr_reader :category_counts, :total_words, :messages
|
10
9
|
|
11
10
|
# categories = [{:tag => "Thing1", :msg => "Thing1 message"}, {:tag => "Thing2", :msg => "Thing2 message"}]
|
12
11
|
# initialize({:tag => "Refund", :msg => "You'll get a refund"}, {:tag => "Nonrefund", :msg => "You won't get a refund"})
|
data/lib/te_rex/corpus.rb
CHANGED
@@ -30,18 +30,24 @@ module TeRex
|
|
30
30
|
end
|
31
31
|
|
32
32
|
def sentence_partition
|
33
|
-
#super_set = build_superset
|
34
33
|
corpus_set = partition_files_for_sentences
|
35
|
-
#@sample_size = (superset.count.to_f * 0.75).round
|
36
|
-
@sample_size = 0.0
|
37
34
|
@training = partition_training_by_sentence(corpus_set)
|
38
35
|
@testing = partition_test_by_sentence(corpus_set)
|
39
|
-
count_all
|
36
|
+
c = count_all
|
37
|
+
@sample_size = (c.to_f * 0.75)
|
38
|
+
c
|
39
|
+
end
|
40
|
+
|
41
|
+
def build_superset
|
42
|
+
@set.reduce([]) do |memo,formatter|
|
43
|
+
memo << formatter.sentences
|
44
|
+
end.flatten
|
40
45
|
end
|
41
46
|
|
47
|
+
private
|
42
48
|
def define_set
|
43
49
|
@set ||= Dir[@glob].map do |file|
|
44
|
-
@format_klass.new(file)
|
50
|
+
@format_klass.new(file, @category_klass)
|
45
51
|
end
|
46
52
|
@set
|
47
53
|
end
|
@@ -72,12 +78,6 @@ module TeRex
|
|
72
78
|
c_set.sample(c_set.count * 0.25)
|
73
79
|
end
|
74
80
|
|
75
|
-
def build_superset
|
76
|
-
@set.reduce([]) do |memo,formatter|
|
77
|
-
memo << formatter.sentences
|
78
|
-
end.flatten
|
79
|
-
end
|
80
|
-
|
81
81
|
def count_all
|
82
82
|
counter = 0
|
83
83
|
@set.map{|f| counter += f.sentences.count}
|
data/lib/te_rex/version.rb
CHANGED
data/test/corpus_test.rb
CHANGED
@@ -48,6 +48,11 @@ class CorpusTest < MicroTest::Test
|
|
48
48
|
assert (22...27).map{|i| i}.include?((ratio * 100).to_i)
|
49
49
|
end
|
50
50
|
|
51
|
+
test "sample size equals size of training set" do
|
52
|
+
sample = @@sent_corpus.total_sentences.to_f * 0.75
|
53
|
+
assert @@sent_corpus.sample_size == sample
|
54
|
+
end
|
55
|
+
|
51
56
|
test "sentence counts are correct" do
|
52
57
|
assert @@sent_corpus.set.count == 3
|
53
58
|
assert @@sent_corpus.training.count == 9
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: te_rex
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.11
|
4
|
+
version: 0.0.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Joshua Bowles
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-11-
|
11
|
+
date: 2014-11-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: fast-stemmer
|