NaiveText 0.3.3 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +25 -20
- data/lib/NaiveText.rb +2 -5
- data/lib/NaiveText/CategoriesFactory.rb +23 -10
- data/lib/NaiveText/Category.rb +1 -1
- data/lib/NaiveText/Example.rb +19 -0
- data/lib/NaiveText/ExamplesFactory.rb +14 -0
- data/lib/NaiveText/ExamplesGroup.rb +18 -10
- data/lib/NaiveText/PropabilityCalculator.rb +19 -15
- data/lib/NaiveText/PropabilityCollection.rb +16 -5
- data/lib/NaiveText/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 24121c5efbc4119de814e59e3080b3770c822929
|
4
|
+
data.tar.gz: 2638006388fe41e21918f6859545efdb5e9f4526
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 18fecd3d98353ed3d8010af35795a7bda536bf48cd072ad9674b807f78d8988e8f2b7babc3f56bd7f2d6f4caee614998450ade5dada44cfef7eb79b746f84c7a
|
7
|
+
data.tar.gz: c94b0036e3ce3f145ac08beca161ec0d39227ab229938bb5466acbc8f8d0364808f0115dcb6aba162ead52d815da577565681bea71339ef735aa6b6775463f53
|
data/README.md
CHANGED
@@ -5,8 +5,8 @@ A naive Bayes Textclassifier written in Ruby
|
|
5
5
|
1. What does it do?
|
6
6
|
----
|
7
7
|
|
8
|
-
It sorts
|
9
|
-
The algorithm bases its decisions on
|
8
|
+
It sorts texts into predefined categories (e.g. interesting/boring).
|
9
|
+
The algorithm bases its decisions on classified training data (text files, ActiveRecord models, ...).
|
10
10
|
|
11
11
|
## Installation
|
12
12
|
|
@@ -26,34 +26,39 @@ Or install it yourself as:
|
|
26
26
|
|
27
27
|
## Usage
|
28
28
|
|
29
|
-
|
29
|
+
The algorithm needs some examples for training. An example is an object with an id that responds to the text message with a string (e.g. ActiveRecord models with a text attribute will do).
|
30
|
+
You can also use local files as examples (via ExamplesFactory.from_files('path/to/dir')).
|
31
|
+
|
32
|
+
|
33
|
+
|
34
|
+
### Example
|
35
|
+
|
36
|
+
Let's pretend you are writing some kind of forum. A user can write posts and can vote them up or down.
|
37
|
+
|
38
|
+
|
39
|
+
We will build a system which predicts whether a new post is interesting to the user or whether it will bore them to sleep.
|
40
|
+
|
41
|
+
In your system (a Rails app, of course) you have a *Post* model with a text attribute containing the post's content. There are also two scopes on Post: *up_voted* and *down_voted*, which return all up/down voted posts.
|
42
|
+
|
30
43
|
|
31
|
-
Next up, the code:
|
32
44
|
|
33
45
|
```ruby
|
34
46
|
require 'NaiveText'
|
35
|
-
```
|
36
|
-
Now build the systems with your categories and training texts:
|
37
47
|
|
38
|
-
|
39
|
-
|
40
|
-
{name: 'boring', path: 'spec/training/negative'}]
|
41
|
-
classifier = NaiveText.build(categories_config)
|
42
|
-
```
|
43
|
-
Now you can start classifying texts:
|
48
|
+
interesting_examples = Post.up_voted.to_a
|
49
|
+
boring_examples = Post.down_voted.to_a
|
44
50
|
|
45
|
-
|
46
|
-
|
47
|
-
classifier.classify('Seems to be boring')
|
48
|
-
```
|
49
|
-
Classify will return a category-object on which you can call name to get the name of the category as a string.
|
51
|
+
categories = [{name: 'interesting', examples: interesting_examples},
|
52
|
+
{name: 'boring', examples: boring_examples}];
|
50
53
|
|
51
|
-
|
52
|
-
|
54
|
+
classifier = NaiveText.build(categories: categories)
|
55
|
+
|
56
|
+
category = classifier.classify(new_interesting_post.text)
|
53
57
|
category.name
|
54
58
|
=> 'interesting'
|
55
59
|
```
|
56
|
-
|
60
|
+
Check out the full example and some more in the
|
61
|
+
[NaiveText-example repo](https://github.com/RicciFlowing/NaiveText-examples).
|
57
62
|
Have fun using it!
|
58
63
|
|
59
64
|
|
data/lib/NaiveText.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
require "NaiveText/version"
|
2
|
+
require "NaiveText/Example"
|
3
|
+
require "NaiveText/ExamplesFactory"
|
2
4
|
require "NaiveText/ExamplesGroup"
|
3
5
|
require "NaiveText/PropabilityCollection"
|
4
6
|
require "NaiveText/PropabilityCalculator"
|
@@ -12,12 +14,7 @@ require "NaiveText/CategoriesFactory"
|
|
12
14
|
module NaiveText
|
13
15
|
|
14
16
|
def self.build(config)
|
15
|
-
begin
|
16
17
|
@categories = CategoriesFactory.build(config)
|
17
18
|
@test_classifier = TextClassifier.new(categories: @categories)
|
18
|
-
rescue
|
19
|
-
puts "Their seems to be an error in your config.
|
20
|
-
The expectedt format is [{name: name_of_category, path: path_to_trainings_data}]"
|
21
|
-
end
|
22
19
|
end
|
23
20
|
end
|
@@ -1,17 +1,30 @@
|
|
1
1
|
class CategoriesFactory
|
2
2
|
def self.build(config)
|
3
3
|
categories = []
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
4
|
+
if config.is_a?(Array)
|
5
|
+
puts "The format [{name: name_of_category, path: path_to_trainings_data}] is deprecated and will be removed in future versions. Use the following arguments instead: categories: [name: 'the name', examples:'An example']"
|
6
|
+
config.each do |category_config|
|
7
|
+
begin
|
8
|
+
examples = ExamplesFactory.from_files(category_config[:path])
|
9
|
+
group = ExamplesGroup.new(examples: examples)
|
10
|
+
categories << Category.new(name: category_config[:name], examples: group)
|
11
|
+
rescue
|
12
|
+
puts "You haven't provided trainingsdata for the category" + category_config[:name]
|
13
|
+
puts "This category was not created."
|
14
|
+
end
|
12
15
|
end
|
16
|
+
Categories.new(categories: categories)
|
17
|
+
else
|
18
|
+
config[:categories].each do |category_config|
|
19
|
+
begin
|
20
|
+
group = ExamplesGroup.new(examples: category_config[:examples])
|
21
|
+
categories << Category.new(name: category_config[:name], examples: group)
|
22
|
+
rescue
|
23
|
+
puts "You haven't provided trainingsdata for the category" + category_config[:name]
|
24
|
+
puts "This category was not created."
|
25
|
+
end
|
26
|
+
end
|
27
|
+
Categories.new(categories: categories)
|
13
28
|
end
|
14
|
-
|
15
|
-
Categories.new(categories: categories)
|
16
29
|
end
|
17
30
|
end
|
data/lib/NaiveText/Category.rb
CHANGED
@@ -0,0 +1,19 @@
|
|
1
|
+
class Example
|
2
|
+
attr_reader :text
|
3
|
+
|
4
|
+
def initialize(args)
|
5
|
+
load_text(args)
|
6
|
+
end
|
7
|
+
|
8
|
+
private
|
9
|
+
def load_text(args)
|
10
|
+
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
class FileExample < Example
|
15
|
+
private
|
16
|
+
def load_text(args)
|
17
|
+
@text = File.read(args[:path])
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
class ExamplesFactory
|
2
|
+
def self.from_files(dir_path)
|
3
|
+
begin
|
4
|
+
examples = []
|
5
|
+
Dir.foreach(dir_path) do |file_path|
|
6
|
+
next if file_path == '.' or file_path == '..'
|
7
|
+
examples.push FileExample.new(path: dir_path+'/'+file_path)
|
8
|
+
end
|
9
|
+
rescue
|
10
|
+
puts "Failed laoding" + dir_path
|
11
|
+
end
|
12
|
+
examples
|
13
|
+
end
|
14
|
+
end
|
@@ -1,14 +1,16 @@
|
|
1
1
|
class ExamplesGroup
|
2
|
-
def initialize(
|
3
|
-
@
|
4
|
-
|
2
|
+
def initialize(args)
|
3
|
+
@examples = args[:examples] || []
|
4
|
+
load_text
|
5
|
+
split_text_into_words
|
6
|
+
format_words
|
5
7
|
if @words.length == 0
|
6
8
|
raise 'Empty_Trainingsdata'
|
7
9
|
end
|
8
10
|
end
|
9
11
|
|
10
12
|
def count(word)
|
11
|
-
@words.count(word)
|
13
|
+
@words.count(word.downcase)
|
12
14
|
end
|
13
15
|
|
14
16
|
def word_count
|
@@ -17,12 +19,18 @@ class ExamplesGroup
|
|
17
19
|
|
18
20
|
private
|
19
21
|
|
20
|
-
def load_text
|
21
|
-
text =
|
22
|
-
|
23
|
-
|
24
|
-
text += File.read(path +'/'+ example_file)
|
22
|
+
def load_text
|
23
|
+
@text = ''
|
24
|
+
@examples.each do |example|
|
25
|
+
@text += ' ' + example.text
|
25
26
|
end
|
26
|
-
|
27
|
+
end
|
28
|
+
|
29
|
+
def split_text_into_words
|
30
|
+
@words = @text.split(/\W+/)
|
31
|
+
end
|
32
|
+
|
33
|
+
def format_words
|
34
|
+
@words.map! {|word| word.downcase}
|
27
35
|
end
|
28
36
|
end
|
@@ -6,34 +6,40 @@ class PropabilityCalculator
|
|
6
6
|
|
7
7
|
def get_propabilities_for(text)
|
8
8
|
calculateProbabilities(text)
|
9
|
-
normalize unless @propabilities.sum
|
9
|
+
normalize unless @propabilities.sum <= 0
|
10
10
|
@propabilities
|
11
11
|
end
|
12
12
|
|
13
13
|
|
14
14
|
private
|
15
|
-
def
|
16
|
-
minimum
|
15
|
+
def protect_factor(factor)
|
16
|
+
[factor, minimum].max
|
17
17
|
end
|
18
18
|
|
19
|
-
def
|
20
|
-
|
21
|
-
factor = minimum
|
22
|
-
end
|
23
|
-
factor
|
19
|
+
def minimum
|
20
|
+
1.to_f/(10*@categories.total_word_count)
|
24
21
|
end
|
25
22
|
|
26
23
|
def calculateProbabilities(text)
|
24
|
+
set_apriori_propabilities
|
27
25
|
list_of_words = text.split(/\W+/)
|
28
|
-
@categories.each do |category|
|
29
|
-
@propabilities.set(category: category, value: p_apriori(category))
|
30
|
-
end
|
31
|
-
|
32
26
|
list_of_words.each do |word|
|
33
27
|
@categories.each do |category|
|
34
|
-
@propabilities.multiply(category: category, factor:
|
28
|
+
@propabilities.multiply(category: category, factor: protect_factor(category.p(word)) )
|
35
29
|
end
|
36
30
|
end
|
31
|
+
remove_minimum(text)
|
32
|
+
end
|
33
|
+
|
34
|
+
def set_apriori_propabilities
|
35
|
+
@categories.each do |category|
|
36
|
+
@propabilities.set(category: category, value: p_apriori(category))
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
def remove_minimum(text)
|
41
|
+
times = text.split(/\W+/).length
|
42
|
+
@propabilities.greater_then(minimum**times)
|
37
43
|
end
|
38
44
|
|
39
45
|
def normalize
|
@@ -44,6 +50,4 @@ class PropabilityCalculator
|
|
44
50
|
def p_apriori(category)
|
45
51
|
@categories.p_apriori(category)
|
46
52
|
end
|
47
|
-
|
48
|
-
|
49
53
|
end
|
@@ -28,14 +28,28 @@ class PropabilityCollection
|
|
28
28
|
end
|
29
29
|
|
30
30
|
def category_with_max
|
31
|
-
|
32
|
-
|
31
|
+
if @propabilities.max > 0
|
32
|
+
id = @propabilities.find_index(@propabilities.max)
|
33
|
+
@categories.find {|category| category.id == id}
|
34
|
+
else
|
35
|
+
NullCategory.new
|
36
|
+
end
|
33
37
|
end
|
34
38
|
|
35
39
|
def max
|
36
40
|
@propabilities.max
|
37
41
|
end
|
38
42
|
|
43
|
+
def greater_then(value)
|
44
|
+
@propabilities.map! do |p|
|
45
|
+
if p > value
|
46
|
+
p
|
47
|
+
else
|
48
|
+
0
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
39
53
|
def sum
|
40
54
|
@propabilities.reduce(:+)
|
41
55
|
end
|
@@ -61,7 +75,4 @@ class PropabilityCollection
|
|
61
75
|
@propabilities << 0
|
62
76
|
end
|
63
77
|
end
|
64
|
-
|
65
|
-
|
66
|
-
|
67
78
|
end
|
data/lib/NaiveText/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: NaiveText
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- RicciFlowing
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-10-
|
11
|
+
date: 2015-10-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -102,6 +102,8 @@ files:
|
|
102
102
|
- lib/NaiveText/Categories.rb
|
103
103
|
- lib/NaiveText/CategoriesFactory.rb
|
104
104
|
- lib/NaiveText/Category.rb
|
105
|
+
- lib/NaiveText/Example.rb
|
106
|
+
- lib/NaiveText/ExamplesFactory.rb
|
105
107
|
- lib/NaiveText/ExamplesGroup.rb
|
106
108
|
- lib/NaiveText/PropabilityCalculator.rb
|
107
109
|
- lib/NaiveText/PropabilityCollection.rb
|