NaiveText 0.4.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +1 -0
- data/README.md +2 -5
- data/lib/NaiveText.rb +2 -2
- data/lib/NaiveText/{PropabilityCalculator.rb → ProbabilityCalculator.rb} +12 -12
- data/lib/NaiveText/{PropabilityCollection.rb → ProbabilityCollection.rb} +14 -14
- data/lib/NaiveText/TextClassifier.rb +9 -4
- data/lib/NaiveText/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 99c7ed0d2ea0ab00ce13e284e537474e1bd48b5a
|
4
|
+
data.tar.gz: 1feb4b6118ccfde5c54daca91c1386fbdcfe8e1b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 810c0f40cdd3852010a8bdbc831c6b8591ea47ddb0ea7154e64899c682cbbf875a7b17b4d0676a809a080f14f9d69810cbe4798c2540b57c2ae0851224458365
|
7
|
+
data.tar.gz: a28bd4c31239537888d85cca8020ceca2506886a0e4d6bf6f57e78ed383e89bf5adebfebf56bdeda39649c4b5dfd01e8f43d6833101659c6cbc06921a0cd6480
|
data/CHANGELOG.md
CHANGED
@@ -3,6 +3,7 @@ All notable changes to this project will be documented in this file.
|
|
3
3
|
This project adheres to [Semantic Versioning](http://semver.org/).
|
4
4
|
|
5
5
|
## [Unreleased]
|
6
|
+
- Fixed a typo in the interface of TextClassifier: propabilities --> probabilities. The old method name is deprecated.
|
6
7
|
|
7
8
|
## [0.4.1] - 2015-10-29
|
8
9
|
### Added
|
data/README.md
CHANGED
@@ -2,8 +2,8 @@
|
|
2
2
|
|
3
3
|
NaiveText is a text classifier gem written in Ruby and made to be easy to integrate into your Rails app.
|
4
4
|
|
5
|
-
|
6
|
-
|
5
|
+
## What is it good for?
|
6
|
+
|
7
7
|
Text classifiers are used in many areas of IT. They filter spam, predict what a user wants to buy, detect which language a text is written in, ...
|
8
8
|
|
9
9
|
The kind of classifier included in NaiveText uses existing text examples (junk-marked e-mails, already bought products, texts in different languages, ...) to calculate which category (spam/e-mail, interesting_product/not_interesting_product, ...) an unknown text belongs to.
|
@@ -35,13 +35,10 @@ You can also use local files as examples (via ExamplesFactory.from_files('path/t
|
|
35
35
|
|
36
36
|
Let's pretend you are writing some kind of forum. A user can write posts and can vote them up or down.
|
37
37
|
|
38
|
-
|
39
38
|
We will build a system which predicts whether a new post is interesting to the user or whether it will bore him to sleep.
|
40
39
|
|
41
40
|
In your system (a Rails app, of course) you have a *Post* model with a text attribute containing the post's content. There are also two scopes on Post: *up_voted* and *down_voted*, which return all up/down voted posts.
|
42
41
|
|
43
|
-
|
44
|
-
|
45
42
|
```ruby
|
46
43
|
require 'NaiveText'
|
47
44
|
|
data/lib/NaiveText.rb
CHANGED
@@ -2,8 +2,8 @@ require "NaiveText/version"
|
|
2
2
|
require "NaiveText/Example"
|
3
3
|
require "NaiveText/ExamplesFactory"
|
4
4
|
require "NaiveText/ExamplesGroup"
|
5
|
-
require "NaiveText/
|
6
|
-
require "NaiveText/
|
5
|
+
require "NaiveText/ProbabilityCollection"
|
6
|
+
require "NaiveText/ProbabilityCalculator"
|
7
7
|
require "NaiveText/TextClassifier"
|
8
8
|
require "NaiveText/Category"
|
9
9
|
require "NaiveText/Categories"
|
@@ -1,13 +1,13 @@
|
|
1
|
-
class
|
1
|
+
class ProbabilityCalculator
|
2
2
|
def initialize(args)
|
3
3
|
@categories = args[:categories] || []
|
4
|
-
@
|
4
|
+
@probabilities = ProbabilityCollection.new(categories: @categories)
|
5
5
|
end
|
6
6
|
|
7
|
-
def
|
7
|
+
def get_probabilities_for(text)
|
8
8
|
calculateProbabilities(text)
|
9
|
-
normalize unless @
|
10
|
-
@
|
9
|
+
normalize unless @probabilities.sum <= 0
|
10
|
+
@probabilities
|
11
11
|
end
|
12
12
|
|
13
13
|
|
@@ -21,30 +21,30 @@ class PropabilityCalculator
|
|
21
21
|
end
|
22
22
|
|
23
23
|
def calculateProbabilities(text)
|
24
|
-
|
24
|
+
set_apriori_probabilities
|
25
25
|
list_of_words = text.split(/\W+/)
|
26
26
|
list_of_words.each do |word|
|
27
27
|
@categories.each do |category|
|
28
|
-
@
|
28
|
+
@probabilities.multiply(category: category, factor: protect_factor(category.p(word)) )
|
29
29
|
end
|
30
30
|
end
|
31
31
|
remove_minimum(text)
|
32
32
|
end
|
33
33
|
|
34
|
-
def
|
34
|
+
def set_apriori_probabilities
|
35
35
|
@categories.each do |category|
|
36
|
-
@
|
36
|
+
@probabilities.set(category: category, value: p_apriori(category))
|
37
37
|
end
|
38
38
|
end
|
39
39
|
|
40
40
|
def remove_minimum(text)
|
41
41
|
times = text.split(/\W+/).length
|
42
|
-
@
|
42
|
+
@probabilities.greater_then(minimum**times)
|
43
43
|
end
|
44
44
|
|
45
45
|
def normalize
|
46
|
-
normalization_factor = 1.to_f / @
|
47
|
-
@
|
46
|
+
normalization_factor = 1.to_f / @probabilities.sum
|
47
|
+
@probabilities.multiply(factor: normalization_factor)
|
48
48
|
end
|
49
49
|
|
50
50
|
def p_apriori(category)
|
@@ -1,35 +1,35 @@
|
|
1
|
-
class
|
1
|
+
class ProbabilityCollection
|
2
2
|
def initialize(args)
|
3
3
|
@categories = args[:categories] || []
|
4
4
|
initialize_ids
|
5
|
-
@
|
6
|
-
|
5
|
+
@probabilities = []
|
6
|
+
initalize_probabilities(@ids)
|
7
7
|
end
|
8
8
|
|
9
9
|
def find(category)
|
10
|
-
return @
|
10
|
+
return @probabilities[category.id]
|
11
11
|
end
|
12
12
|
|
13
13
|
|
14
14
|
def set(args)
|
15
15
|
category = args[:category]
|
16
16
|
value = args[:value]
|
17
|
-
@
|
17
|
+
@probabilities[category.id] = value
|
18
18
|
end
|
19
19
|
|
20
20
|
def multiply(args)
|
21
21
|
category = args[:category]
|
22
22
|
factor = args[:factor]
|
23
23
|
if category
|
24
|
-
@
|
24
|
+
@probabilities[category.id] *= factor
|
25
25
|
else
|
26
|
-
@
|
26
|
+
@probabilities.map! {|el| el*factor}
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
30
30
|
def category_with_max
|
31
|
-
if @
|
32
|
-
id = @
|
31
|
+
if @probabilities.max > 0
|
32
|
+
id = @probabilities.find_index(@probabilities.max)
|
33
33
|
@categories.find {|category| category.id == id}
|
34
34
|
else
|
35
35
|
NullCategory.new
|
@@ -37,11 +37,11 @@ class PropabilityCollection
|
|
37
37
|
end
|
38
38
|
|
39
39
|
def max
|
40
|
-
@
|
40
|
+
@probabilities.max
|
41
41
|
end
|
42
42
|
|
43
43
|
def greater_then(value)
|
44
|
-
@
|
44
|
+
@probabilities.map! do |p|
|
45
45
|
if p > value
|
46
46
|
p
|
47
47
|
else
|
@@ -51,7 +51,7 @@ class PropabilityCollection
|
|
51
51
|
end
|
52
52
|
|
53
53
|
def sum
|
54
|
-
@
|
54
|
+
@probabilities.reduce(:+)
|
55
55
|
end
|
56
56
|
|
57
57
|
def to_s
|
@@ -70,9 +70,9 @@ class PropabilityCollection
|
|
70
70
|
@ids = @categories.map { |category| category.id }
|
71
71
|
end
|
72
72
|
|
73
|
-
def
|
73
|
+
def initalize_probabilities(ids)
|
74
74
|
ids.max.times do
|
75
|
-
@
|
75
|
+
@probabilities << 0
|
76
76
|
end
|
77
77
|
end
|
78
78
|
end
|
@@ -2,20 +2,25 @@ class TextClassifier
|
|
2
2
|
attr_reader :categories
|
3
3
|
def initialize( args )
|
4
4
|
@categories = args[:categories]
|
5
|
-
@calculator = args[:calculator] ||
|
5
|
+
@calculator = args[:calculator] || ProbabilityCalculator.new(categories: @categories)
|
6
6
|
end
|
7
7
|
|
8
8
|
def classify(text)
|
9
9
|
get_category_for(text)
|
10
10
|
end
|
11
11
|
|
12
|
+
def probabilities(text)
|
13
|
+
@calculator.get_probabilities_for(text)
|
14
|
+
end
|
15
|
+
|
12
16
|
def propabilities(text)
|
13
|
-
|
17
|
+
puts "This notation is deprecated in will be removed in later versions. Please use probabilities (4th character b instead of p)"
|
18
|
+
probabilities(text)
|
14
19
|
end
|
15
20
|
|
16
21
|
private
|
17
22
|
def get_category_for(text)
|
18
|
-
|
19
|
-
|
23
|
+
probabilities = @calculator.get_probabilities_for(text)
|
24
|
+
probabilities.category_with_max
|
20
25
|
end
|
21
26
|
end
|
data/lib/NaiveText/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: NaiveText
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.
|
4
|
+
version: 0.4.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- RicciFlowing
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-11-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -107,8 +107,8 @@ files:
|
|
107
107
|
- lib/NaiveText/Example.rb
|
108
108
|
- lib/NaiveText/ExamplesFactory.rb
|
109
109
|
- lib/NaiveText/ExamplesGroup.rb
|
110
|
-
- lib/NaiveText/
|
111
|
-
- lib/NaiveText/
|
110
|
+
- lib/NaiveText/ProbabilityCalculator.rb
|
111
|
+
- lib/NaiveText/ProbabilityCollection.rb
|
112
112
|
- lib/NaiveText/TextClassifier.rb
|
113
113
|
- lib/NaiveText/version.rb
|
114
114
|
homepage: https://github.com/RicciFlowing/NaiveText
|