NaiveText 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +25 -20
- data/lib/NaiveText.rb +2 -5
- data/lib/NaiveText/CategoriesFactory.rb +23 -10
- data/lib/NaiveText/Category.rb +1 -1
- data/lib/NaiveText/Example.rb +19 -0
- data/lib/NaiveText/ExamplesFactory.rb +14 -0
- data/lib/NaiveText/ExamplesGroup.rb +18 -10
- data/lib/NaiveText/PropabilityCalculator.rb +19 -15
- data/lib/NaiveText/PropabilityCollection.rb +16 -5
- data/lib/NaiveText/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 24121c5efbc4119de814e59e3080b3770c822929
|
4
|
+
data.tar.gz: 2638006388fe41e21918f6859545efdb5e9f4526
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 18fecd3d98353ed3d8010af35795a7bda536bf48cd072ad9674b807f78d8988e8f2b7babc3f56bd7f2d6f4caee614998450ade5dada44cfef7eb79b746f84c7a
|
7
|
+
data.tar.gz: c94b0036e3ce3f145ac08beca161ec0d39227ab229938bb5466acbc8f8d0364808f0115dcb6aba162ead52d815da577565681bea71339ef735aa6b6775463f53
|
data/README.md
CHANGED
@@ -5,8 +5,8 @@ A naive Bayes Textclassifier written in Ruby
|
|
5
5
|
1. What does it do?
|
6
6
|
----
|
7
7
|
|
8
|
-
It sorts
|
9
|
-
The algorithm bases its decisions on
|
8
|
+
It sorts texts into predefined categories (e.g. interesting/boring).
|
9
|
+
The algorithm bases its decisions on classified training data (text files, ActiveRecord models, ...).
|
10
10
|
|
11
11
|
## Installation
|
12
12
|
|
@@ -26,34 +26,39 @@ Or install it yourself as:
|
|
26
26
|
|
27
27
|
## Usage
|
28
28
|
|
29
|
-
|
29
|
+
The algorithm needs some examples for training. An example is an object with an id that responds to the `text` message with a string (e.g. ActiveRecord models with a text attribute will do).
|
30
|
+
You can also use local files as examples (via ExamplesFactory.from_files('path/to/dir')).
|
31
|
+
|
32
|
+
|
33
|
+
|
34
|
+
### Example
|
35
|
+
|
36
|
+
Let's pretend you are writing some kind of forum. A user can write posts and can vote them up or down.
|
37
|
+
|
38
|
+
|
39
|
+
We will build a system which predicts whether a new post is interesting to the user or whether it will bore them to sleep.
|
40
|
+
|
41
|
+
In your system (a Rails app, of course) you have a *Post* model with a text attribute containing the post's content. There are also two scopes on Post: *up_voted* and *down_voted*, which return all up/down voted posts.
|
42
|
+
|
30
43
|
|
31
|
-
Next up, the code:
|
32
44
|
|
33
45
|
```ruby
|
34
46
|
require 'NaiveText'
|
35
|
-
```
|
36
|
-
Now build the systems with your categories and training texts:
|
37
47
|
|
38
|
-
|
39
|
-
|
40
|
-
{name: 'boring', path: 'spec/training/negative'}]
|
41
|
-
classifier = NaiveText.build(categories_config)
|
42
|
-
```
|
43
|
-
Now you can start classifying texts:
|
48
|
+
interesting_examples = Post.up_voted.to_a
|
49
|
+
boring_examples = Post.down_voted.to_a
|
44
50
|
|
45
|
-
|
46
|
-
|
47
|
-
classifier.classify('Seems to be boring')
|
48
|
-
```
|
49
|
-
Classify will return a category-object on which you can call name to get the name of the category as a string.
|
51
|
+
categories = [{name: 'interesting', examples: interesting_examples},
|
52
|
+
{name: 'boring', examples: boring_examples}];
|
50
53
|
|
51
|
-
|
52
|
-
|
54
|
+
classifier = NaiveText.build(categories: categories)
|
55
|
+
|
56
|
+
category = classifier.classify(new_interesting_post.text)
|
53
57
|
category.name
|
54
58
|
=> 'interesting'
|
55
59
|
```
|
56
|
-
|
60
|
+
Check out the full example and some more in the
|
61
|
+
[NaiveText-example repo](https://github.com/RicciFlowing/NaiveText-examples).
|
57
62
|
Have fun using it!
|
58
63
|
|
59
64
|
|
data/lib/NaiveText.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
require "NaiveText/version"
|
2
|
+
require "NaiveText/Example"
|
3
|
+
require "NaiveText/ExamplesFactory"
|
2
4
|
require "NaiveText/ExamplesGroup"
|
3
5
|
require "NaiveText/PropabilityCollection"
|
4
6
|
require "NaiveText/PropabilityCalculator"
|
@@ -12,12 +14,7 @@ require "NaiveText/CategoriesFactory"
|
|
12
14
|
module NaiveText
|
13
15
|
|
14
16
|
def self.build(config)
|
15
|
-
begin
|
16
17
|
@categories = CategoriesFactory.build(config)
|
17
18
|
@test_classifier = TextClassifier.new(categories: @categories)
|
18
|
-
rescue
|
19
|
-
puts "Their seems to be an error in your config.
|
20
|
-
The expectedt format is [{name: name_of_category, path: path_to_trainings_data}]"
|
21
|
-
end
|
22
19
|
end
|
23
20
|
end
|
@@ -1,17 +1,30 @@
|
|
1
1
|
class CategoriesFactory
|
2
2
|
def self.build(config)
|
3
3
|
categories = []
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
4
|
+
if config.is_a?(Array)
|
5
|
+
puts "The format [{name: name_of_category, path: path_to_trainings_data}] is deprecated and will be removed in future versions. Use the following arguments instead: categories: [name: 'the name', examples:'An example']"
|
6
|
+
config.each do |category_config|
|
7
|
+
begin
|
8
|
+
examples = ExamplesFactory.from_files(category_config[:path])
|
9
|
+
group = ExamplesGroup.new(examples: examples)
|
10
|
+
categories << Category.new(name: category_config[:name], examples: group)
|
11
|
+
rescue
|
12
|
+
puts "You haven't provided trainingsdata for the category" + category_config[:name]
|
13
|
+
puts "This category was not created."
|
14
|
+
end
|
12
15
|
end
|
16
|
+
Categories.new(categories: categories)
|
17
|
+
else
|
18
|
+
config[:categories].each do |category_config|
|
19
|
+
begin
|
20
|
+
group = ExamplesGroup.new(examples: category_config[:examples])
|
21
|
+
categories << Category.new(name: category_config[:name], examples: group)
|
22
|
+
rescue
|
23
|
+
puts "You haven't provided trainingsdata for the category" + category_config[:name]
|
24
|
+
puts "This category was not created."
|
25
|
+
end
|
26
|
+
end
|
27
|
+
Categories.new(categories: categories)
|
13
28
|
end
|
14
|
-
|
15
|
-
Categories.new(categories: categories)
|
16
29
|
end
|
17
30
|
end
|
data/lib/NaiveText/Category.rb
CHANGED
@@ -0,0 +1,19 @@
|
|
1
|
+
class Example
|
2
|
+
attr_reader :text
|
3
|
+
|
4
|
+
def initialize(args)
|
5
|
+
load_text(args)
|
6
|
+
end
|
7
|
+
|
8
|
+
private
|
9
|
+
def load_text(args)
|
10
|
+
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
class FileExample < Example
|
15
|
+
private
|
16
|
+
def load_text(args)
|
17
|
+
@text = File.read(args[:path])
|
18
|
+
end
|
19
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
class ExamplesFactory
|
2
|
+
def self.from_files(dir_path)
|
3
|
+
begin
|
4
|
+
examples = []
|
5
|
+
Dir.foreach(dir_path) do |file_path|
|
6
|
+
next if file_path == '.' or file_path == '..'
|
7
|
+
examples.push FileExample.new(path: dir_path+'/'+file_path)
|
8
|
+
end
|
9
|
+
rescue
|
10
|
+
puts "Failed laoding" + dir_path
|
11
|
+
end
|
12
|
+
examples
|
13
|
+
end
|
14
|
+
end
|
@@ -1,14 +1,16 @@
|
|
1
1
|
class ExamplesGroup
|
2
|
-
def initialize(
|
3
|
-
@
|
4
|
-
|
2
|
+
def initialize(args)
|
3
|
+
@examples = args[:examples] || []
|
4
|
+
load_text
|
5
|
+
split_text_into_words
|
6
|
+
format_words
|
5
7
|
if @words.length == 0
|
6
8
|
raise 'Empty_Trainingsdata'
|
7
9
|
end
|
8
10
|
end
|
9
11
|
|
10
12
|
def count(word)
|
11
|
-
@words.count(word)
|
13
|
+
@words.count(word.downcase)
|
12
14
|
end
|
13
15
|
|
14
16
|
def word_count
|
@@ -17,12 +19,18 @@ class ExamplesGroup
|
|
17
19
|
|
18
20
|
private
|
19
21
|
|
20
|
-
def load_text
|
21
|
-
text =
|
22
|
-
|
23
|
-
|
24
|
-
text += File.read(path +'/'+ example_file)
|
22
|
+
def load_text
|
23
|
+
@text = ''
|
24
|
+
@examples.each do |example|
|
25
|
+
@text += ' ' + example.text
|
25
26
|
end
|
26
|
-
|
27
|
+
end
|
28
|
+
|
29
|
+
def split_text_into_words
|
30
|
+
@words = @text.split(/\W+/)
|
31
|
+
end
|
32
|
+
|
33
|
+
def format_words
|
34
|
+
@words.map! {|word| word.downcase}
|
27
35
|
end
|
28
36
|
end
|
@@ -6,34 +6,40 @@ class PropabilityCalculator
|
|
6
6
|
|
7
7
|
def get_propabilities_for(text)
|
8
8
|
calculateProbabilities(text)
|
9
|
-
normalize unless @propabilities.sum
|
9
|
+
normalize unless @propabilities.sum <= 0
|
10
10
|
@propabilities
|
11
11
|
end
|
12
12
|
|
13
13
|
|
14
14
|
private
|
15
|
-
def
|
16
|
-
minimum
|
15
|
+
def protect_factor(factor)
|
16
|
+
[factor, minimum].max
|
17
17
|
end
|
18
18
|
|
19
|
-
def
|
20
|
-
|
21
|
-
factor = minimum
|
22
|
-
end
|
23
|
-
factor
|
19
|
+
def minimum
|
20
|
+
1.to_f/(10*@categories.total_word_count)
|
24
21
|
end
|
25
22
|
|
26
23
|
def calculateProbabilities(text)
|
24
|
+
set_apriori_propabilities
|
27
25
|
list_of_words = text.split(/\W+/)
|
28
|
-
@categories.each do |category|
|
29
|
-
@propabilities.set(category: category, value: p_apriori(category))
|
30
|
-
end
|
31
|
-
|
32
26
|
list_of_words.each do |word|
|
33
27
|
@categories.each do |category|
|
34
|
-
@propabilities.multiply(category: category, factor:
|
28
|
+
@propabilities.multiply(category: category, factor: protect_factor(category.p(word)) )
|
35
29
|
end
|
36
30
|
end
|
31
|
+
remove_minimum(text)
|
32
|
+
end
|
33
|
+
|
34
|
+
def set_apriori_propabilities
|
35
|
+
@categories.each do |category|
|
36
|
+
@propabilities.set(category: category, value: p_apriori(category))
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
def remove_minimum(text)
|
41
|
+
times = text.split(/\W+/).length
|
42
|
+
@propabilities.greater_then(minimum**times)
|
37
43
|
end
|
38
44
|
|
39
45
|
def normalize
|
@@ -44,6 +50,4 @@ class PropabilityCalculator
|
|
44
50
|
def p_apriori(category)
|
45
51
|
@categories.p_apriori(category)
|
46
52
|
end
|
47
|
-
|
48
|
-
|
49
53
|
end
|
@@ -28,14 +28,28 @@ class PropabilityCollection
|
|
28
28
|
end
|
29
29
|
|
30
30
|
def category_with_max
|
31
|
-
|
32
|
-
|
31
|
+
if @propabilities.max > 0
|
32
|
+
id = @propabilities.find_index(@propabilities.max)
|
33
|
+
@categories.find {|category| category.id == id}
|
34
|
+
else
|
35
|
+
NullCategory.new
|
36
|
+
end
|
33
37
|
end
|
34
38
|
|
35
39
|
def max
|
36
40
|
@propabilities.max
|
37
41
|
end
|
38
42
|
|
43
|
+
def greater_then(value)
|
44
|
+
@propabilities.map! do |p|
|
45
|
+
if p > value
|
46
|
+
p
|
47
|
+
else
|
48
|
+
0
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
39
53
|
def sum
|
40
54
|
@propabilities.reduce(:+)
|
41
55
|
end
|
@@ -61,7 +75,4 @@ class PropabilityCollection
|
|
61
75
|
@propabilities << 0
|
62
76
|
end
|
63
77
|
end
|
64
|
-
|
65
|
-
|
66
|
-
|
67
78
|
end
|
data/lib/NaiveText/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: NaiveText
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- RicciFlowing
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-10-
|
11
|
+
date: 2015-10-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -102,6 +102,8 @@ files:
|
|
102
102
|
- lib/NaiveText/Categories.rb
|
103
103
|
- lib/NaiveText/CategoriesFactory.rb
|
104
104
|
- lib/NaiveText/Category.rb
|
105
|
+
- lib/NaiveText/Example.rb
|
106
|
+
- lib/NaiveText/ExamplesFactory.rb
|
105
107
|
- lib/NaiveText/ExamplesGroup.rb
|
106
108
|
- lib/NaiveText/PropabilityCalculator.rb
|
107
109
|
- lib/NaiveText/PropabilityCollection.rb
|