NaiveText 0.6.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/Gemfile +3 -4
- data/Guardfile +3 -4
- data/NaiveText.gemspec +12 -12
- data/Rakefile +1 -2
- data/bin/console +3 -3
- data/lib/NaiveText.rb +12 -15
- data/lib/NaiveText/Categories.rb +5 -6
- data/lib/NaiveText/CategoriesFactory.rb +10 -29
- data/lib/NaiveText/Category.rb +2 -3
- data/lib/NaiveText/Example.rb +3 -2
- data/lib/NaiveText/ExamplesFactory.rb +8 -8
- data/lib/NaiveText/ExamplesGroup.rb +15 -17
- data/lib/NaiveText/ProbabilityCalculator.rb +22 -22
- data/lib/NaiveText/ProbabilityCollection.rb +16 -16
- data/lib/NaiveText/TextClassifier.rb +2 -6
- data/lib/NaiveText/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 50e030f17d9a465122b843bd773747c02eee7488
|
4
|
+
data.tar.gz: a6b6ac823fb3ac1e190a2fad37871e258f76bf5e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 795b9f38baa41fb7899070394832d1d520f63711eae619e0995550984293bb631bb83059826d92444841a17df7297f566371c21d611a5433fff4ee5b3802e224
|
7
|
+
data.tar.gz: 7cd4d3aa96d4b237e98b062250e3cd61536eb705977c088bc4fb004275d4de6f58d3e9b598c7f4b4877fc29e726f3730d7c1614ba4934670faa786812529b17e
|
data/CHANGELOG.md
CHANGED
@@ -2,6 +2,15 @@
|
|
2
2
|
All notable changes to this project will be documented in this file.
|
3
3
|
This project adheres to [Semantic Versioning](http://semver.org/).
|
4
4
|
|
5
|
+
## [1.0.0]- 2016-1-5
|
6
|
+
### Changed
|
7
|
+
- Split up the integration specs. Removed some duplication in the specs.
|
8
|
+
- Refactored the specs to be more concise.
|
9
|
+
- Cleaned the source code to be more readable.
|
10
|
+
### Deleted
|
11
|
+
- Removed old and deprecated array option for CategoriesFactory
|
12
|
+
- Removed the old, misspelled `propabilities` call on TextClassifier (use `probabilities` instead)
|
13
|
+
|
5
14
|
## [0.6.0]- 2015-11-30
|
6
15
|
### Added
|
7
16
|
- Added an optional language_model that makes it possible to compare words based on their word stem (e.g. 'testing', 'tests' and 'tested' all match the stem 'test').
|
data/Gemfile
CHANGED
@@ -3,7 +3,6 @@ source 'https://rubygems.org'
|
|
3
3
|
# Specify your gem's dependencies in NaiveText.gemspec
|
4
4
|
gemspec
|
5
5
|
|
6
|
-
|
7
|
-
spec.add_development_dependency
|
8
|
-
spec.add_development_dependency
|
9
|
-
spec.add_development_dependency "guard-rubocop"
|
6
|
+
spec.add_development_dependency 'guard'
|
7
|
+
spec.add_development_dependency 'guard-rspec'
|
8
|
+
spec.add_development_dependency 'guard-rubocop'
|
data/Guardfile
CHANGED
@@ -24,8 +24,8 @@
|
|
24
24
|
# * zeus: 'zeus rspec' (requires the server to be started separately)
|
25
25
|
# * 'just' rspec: 'rspec'
|
26
26
|
|
27
|
-
guard :rspec, cmd:
|
28
|
-
require
|
27
|
+
guard :rspec, cmd: 'bundle exec rspec' do
|
28
|
+
require 'guard/rspec/dsl'
|
29
29
|
dsl = Guard::RSpec::Dsl.new(self)
|
30
30
|
|
31
31
|
# Feel free to open issues for suggestions and improvements
|
@@ -39,10 +39,9 @@ guard :rspec, cmd: "bundle exec rspec" do
|
|
39
39
|
# Ruby files
|
40
40
|
ruby = dsl.ruby
|
41
41
|
dsl.watch_spec_files_for(ruby.lib_files)
|
42
|
-
|
43
42
|
end
|
44
43
|
|
45
44
|
guard :rubocop, keep_failed: false do
|
46
|
-
watch(
|
45
|
+
watch(/(.+\.rb)$/)
|
47
46
|
watch(%r{(?:.+/)?\.rubocop\.yml$}) { |m| File.dirname(m[0]) }
|
48
47
|
end
|
data/NaiveText.gemspec
CHANGED
@@ -4,27 +4,27 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
4
|
require 'NaiveText/version'
|
5
5
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
|
-
spec.name =
|
7
|
+
spec.name = 'NaiveText'
|
8
8
|
spec.version = NaiveText::VERSION
|
9
|
-
spec.authors = [
|
10
|
-
spec.email = [
|
9
|
+
spec.authors = ['RicciFlowing']
|
10
|
+
spec.email = ['benjamin@mathe-sellin.de']
|
11
11
|
|
12
|
-
spec.summary =
|
13
|
-
spec.description =
|
14
|
-
spec.homepage =
|
15
|
-
spec.licenses
|
12
|
+
spec.summary = 'A text classifier written in ruby'
|
13
|
+
spec.description = 'NaiveText is a text classifier gem written in ruby and made to be easily integratable in your Rails app.'
|
14
|
+
spec.homepage = 'https://github.com/RicciFlowing/NaiveText'
|
15
|
+
spec.licenses = ['MIT']
|
16
16
|
|
17
17
|
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
18
|
-
spec.bindir =
|
18
|
+
spec.bindir = 'exe'
|
19
19
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
|
-
spec.require_paths = [
|
20
|
+
spec.require_paths = ['lib']
|
21
21
|
|
22
22
|
spec.required_ruby_version = '>= 2.0.0'
|
23
23
|
|
24
24
|
if spec.respond_to?(:metadata)
|
25
|
-
spec.metadata['allowed_push_host'] =
|
25
|
+
spec.metadata['allowed_push_host'] = 'https://rubygems.org'
|
26
26
|
end
|
27
27
|
|
28
|
-
spec.add_development_dependency
|
29
|
-
spec.add_development_dependency
|
28
|
+
spec.add_development_dependency 'bundler', '~> 1.8'
|
29
|
+
spec.add_development_dependency 'rake', '~> 10.0'
|
30
30
|
end
|
data/Rakefile
CHANGED
@@ -1,2 +1 @@
|
|
1
|
-
require
|
2
|
-
|
1
|
+
require 'bundler/gem_tasks'
|
data/bin/console
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
require
|
4
|
-
require
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'NaiveText'
|
5
5
|
|
6
6
|
# You can add fixtures and/or initialization code here to make experimenting
|
7
7
|
# with your gem easier. You can also use a different console, if you like.
|
@@ -10,5 +10,5 @@ require "NaiveText"
|
|
10
10
|
# require "pry"
|
11
11
|
# Pry.start
|
12
12
|
|
13
|
-
require
|
13
|
+
require 'irb'
|
14
14
|
IRB.start
|
data/lib/NaiveText.rb
CHANGED
@@ -1,20 +1,17 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require
|
8
|
-
require
|
9
|
-
require
|
10
|
-
require
|
11
|
-
|
12
|
-
|
1
|
+
require 'NaiveText/version'
|
2
|
+
require 'NaiveText/Example'
|
3
|
+
require 'NaiveText/ExamplesFactory'
|
4
|
+
require 'NaiveText/ExamplesGroup'
|
5
|
+
require 'NaiveText/ProbabilityCollection'
|
6
|
+
require 'NaiveText/ProbabilityCalculator'
|
7
|
+
require 'NaiveText/TextClassifier'
|
8
|
+
require 'NaiveText/Category'
|
9
|
+
require 'NaiveText/Categories'
|
10
|
+
require 'NaiveText/CategoriesFactory'
|
13
11
|
|
14
12
|
module NaiveText
|
15
|
-
|
16
13
|
def self.build(config)
|
17
|
-
|
18
|
-
|
14
|
+
@categories = CategoriesFactory.build(config)
|
15
|
+
@test_classifier = TextClassifier.new(categories: @categories)
|
19
16
|
end
|
20
17
|
end
|
data/lib/NaiveText/Categories.rb
CHANGED
@@ -17,14 +17,13 @@ class Categories
|
|
17
17
|
end
|
18
18
|
|
19
19
|
def total_word_count
|
20
|
-
@categories.inject(0) { |count, category
|
20
|
+
@categories.inject(0) { |count, category| count + category.word_count }
|
21
21
|
end
|
22
22
|
|
23
23
|
private
|
24
24
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
25
|
+
def calculate_apriori_propability_for(category)
|
26
|
+
sum_of_words = @categories.inject(0) { |sum, category| sum + category.word_count }
|
27
|
+
category.word_count.to_f / sum_of_words
|
28
|
+
end
|
30
29
|
end
|
@@ -2,36 +2,17 @@ class CategoriesFactory
|
|
2
2
|
def self.build(config)
|
3
3
|
categories = []
|
4
4
|
default = nil
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
puts "This category was not created."
|
15
|
-
end
|
5
|
+
config[:categories].each do |category_config|
|
6
|
+
begin
|
7
|
+
group = ExamplesGroup.new(examples: category_config[:examples], language_model: config[:language_model])
|
8
|
+
category = Category.new(name: category_config[:name], examples: group, weight: category_config[:weight])
|
9
|
+
categories << category
|
10
|
+
default = category if category_config[:name] == config[:default]
|
11
|
+
rescue
|
12
|
+
puts "You haven't provided trainingsdata for the category" + category_config[:name]
|
13
|
+
puts 'This category was not created.'
|
16
14
|
end
|
17
|
-
Categories.new(categories: categories)
|
18
|
-
|
19
|
-
|
20
|
-
else
|
21
|
-
config[:categories].each do |category_config|
|
22
|
-
begin
|
23
|
-
group = ExamplesGroup.new(examples: category_config[:examples], language_model: config[:language_model] )
|
24
|
-
category = Category.new(name: category_config[:name], examples: group, weight: category_config[:weight])
|
25
|
-
categories << category
|
26
|
-
if category_config[:name] == config[:default]
|
27
|
-
default = category
|
28
|
-
end
|
29
|
-
rescue
|
30
|
-
puts "You haven't provided trainingsdata for the category" + category_config[:name]
|
31
|
-
puts "This category was not created."
|
32
|
-
end
|
33
|
-
end
|
34
|
-
Categories.new(categories: categories, default: default )
|
35
15
|
end
|
16
|
+
Categories.new(categories: categories, default: default)
|
36
17
|
end
|
37
18
|
end
|
data/lib/NaiveText/Category.rb
CHANGED
@@ -3,7 +3,6 @@ class Category
|
|
3
3
|
|
4
4
|
attr_reader :name, :id, :weight
|
5
5
|
|
6
|
-
|
7
6
|
def initialize(args)
|
8
7
|
@name = args[:name]
|
9
8
|
@examples = args[:examples]
|
@@ -14,7 +13,7 @@ class Category
|
|
14
13
|
end
|
15
14
|
|
16
15
|
def p(word)
|
17
|
-
if
|
16
|
+
if @examples.word_count > 0
|
18
17
|
@examples.count(word).to_f / @examples.word_count
|
19
18
|
else
|
20
19
|
0
|
@@ -35,7 +34,7 @@ class NullCategory
|
|
35
34
|
attr_reader :id
|
36
35
|
|
37
36
|
def initialize
|
38
|
-
@name =
|
37
|
+
@name = 'No category'
|
39
38
|
@id = 0
|
40
39
|
end
|
41
40
|
end
|
data/lib/NaiveText/Example.rb
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
class ExamplesFactory
|
2
2
|
def self.from_files(dir_path)
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
end
|
9
|
-
rescue
|
10
|
-
puts "Failed loading" + dir_path
|
3
|
+
begin
|
4
|
+
examples = []
|
5
|
+
Dir.foreach(dir_path) do |file_path|
|
6
|
+
next if file_path == '.' || file_path == '..'
|
7
|
+
examples.push FileExample.new(path: dir_path + '/' + file_path)
|
11
8
|
end
|
9
|
+
rescue
|
10
|
+
puts 'Failed loading' + dir_path
|
11
|
+
end
|
12
12
|
examples
|
13
13
|
end
|
14
14
|
end
|
@@ -1,13 +1,11 @@
|
|
1
1
|
class ExamplesGroup
|
2
2
|
def initialize(args)
|
3
3
|
@examples = args[:examples].to_a || []
|
4
|
-
@language_model = args[:language_model] ||
|
4
|
+
@language_model = args[:language_model] || ->(str) { str }
|
5
5
|
load_text
|
6
6
|
split_text_into_words
|
7
7
|
format_words
|
8
|
-
if @words.length == 0
|
9
|
-
raise 'Empty_Trainingsdata'
|
10
|
-
end
|
8
|
+
fail 'Empty_Trainingsdata' if @words.length == 0
|
11
9
|
end
|
12
10
|
|
13
11
|
def count(word)
|
@@ -20,20 +18,20 @@ class ExamplesGroup
|
|
20
18
|
|
21
19
|
private
|
22
20
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
end
|
21
|
+
def load_text
|
22
|
+
@text = ''
|
23
|
+
@examples.each do |example|
|
24
|
+
@text += ' ' + example.text
|
28
25
|
end
|
26
|
+
end
|
29
27
|
|
30
|
-
|
31
|
-
|
32
|
-
|
28
|
+
def split_text_into_words
|
29
|
+
@words = @text.split(/\W+/)
|
30
|
+
end
|
33
31
|
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
32
|
+
def format_words
|
33
|
+
@words.map!(&:downcase)
|
34
|
+
@words.map! { |word| @language_model.call(word) }
|
35
|
+
@words
|
36
|
+
end
|
39
37
|
end
|
@@ -9,35 +9,35 @@ class ProbabilityCalculator
|
|
9
9
|
@probabilities.normalize
|
10
10
|
end
|
11
11
|
|
12
|
-
|
13
12
|
private
|
14
|
-
def protect_factor(factor)
|
15
|
-
[factor, minimum].max
|
16
|
-
end
|
17
13
|
|
18
|
-
|
19
|
-
|
20
|
-
|
14
|
+
def protect_factor(factor)
|
15
|
+
[factor, minimum].max
|
16
|
+
end
|
21
17
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
list_of_words.each do |word|
|
26
|
-
@categories.each do |category|
|
27
|
-
@probabilities.multiply(category: category, factor: protect_factor(category.p(word)) )
|
28
|
-
end
|
29
|
-
end
|
30
|
-
remove_minimum(text)
|
31
|
-
end
|
18
|
+
def minimum
|
19
|
+
1.to_f / (10 * @categories.total_word_count)
|
20
|
+
end
|
32
21
|
|
33
|
-
|
22
|
+
def calculateProbabilities(text)
|
23
|
+
set_apriori_probabilities
|
24
|
+
list_of_words = text.split(/\W+/)
|
25
|
+
list_of_words.each do |word|
|
34
26
|
@categories.each do |category|
|
35
|
-
@probabilities.
|
27
|
+
@probabilities.multiply(category: category, factor: protect_factor(category.p(word)))
|
36
28
|
end
|
37
29
|
end
|
30
|
+
remove_minimum(text)
|
31
|
+
end
|
38
32
|
|
39
|
-
|
40
|
-
|
41
|
-
@probabilities.
|
33
|
+
def set_apriori_probabilities
|
34
|
+
@categories.each do |category|
|
35
|
+
@probabilities.set(category: category, value: @categories.p_apriori(category))
|
42
36
|
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def remove_minimum(text)
|
40
|
+
times = text.split(/\W+/).length
|
41
|
+
@probabilities.greater_then(minimum**times)
|
42
|
+
end
|
43
43
|
end
|
@@ -1,18 +1,17 @@
|
|
1
1
|
class ProbabilityCollection
|
2
2
|
def initialize(args)
|
3
|
-
@categories
|
3
|
+
@categories = args[:categories] || []
|
4
4
|
initialize_ids
|
5
5
|
@probabilities = []
|
6
6
|
initalize_probabilities(@ids)
|
7
7
|
end
|
8
8
|
|
9
9
|
def find(category)
|
10
|
-
|
10
|
+
@probabilities[category.id]
|
11
11
|
end
|
12
12
|
|
13
|
-
|
14
13
|
def set(args)
|
15
|
-
category
|
14
|
+
category = args[:category]
|
16
15
|
value = args[:value]
|
17
16
|
@probabilities[category.id] = value
|
18
17
|
end
|
@@ -23,14 +22,14 @@ class ProbabilityCollection
|
|
23
22
|
if category
|
24
23
|
@probabilities[category.id] *= factor
|
25
24
|
else
|
26
|
-
@probabilities.map! {|el| el*factor}
|
25
|
+
@probabilities.map! { |el| el * factor }
|
27
26
|
end
|
28
27
|
end
|
29
28
|
|
30
29
|
def normalize
|
31
|
-
if
|
32
|
-
normalization_factor = 1.to_f /
|
33
|
-
|
30
|
+
if sum > 0
|
31
|
+
normalization_factor = 1.to_f / sum
|
32
|
+
multiply(factor: normalization_factor)
|
34
33
|
end
|
35
34
|
self
|
36
35
|
end
|
@@ -38,7 +37,7 @@ class ProbabilityCollection
|
|
38
37
|
def category_with_max
|
39
38
|
if @probabilities.max > 0
|
40
39
|
id = @probabilities.find_index(@probabilities.max)
|
41
|
-
@categories.find {|category| category.id == id}
|
40
|
+
@categories.find { |category| category.id == id }
|
42
41
|
else
|
43
42
|
@categories.default
|
44
43
|
end
|
@@ -50,11 +49,11 @@ class ProbabilityCollection
|
|
50
49
|
|
51
50
|
def greater_then(value)
|
52
51
|
@probabilities.map! do |p|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
52
|
+
if p > value
|
53
|
+
p
|
54
|
+
else
|
55
|
+
0
|
56
|
+
end
|
58
57
|
end
|
59
58
|
end
|
60
59
|
|
@@ -67,15 +66,16 @@ class ProbabilityCollection
|
|
67
66
|
@categories.each do |category|
|
68
67
|
result << category.to_s
|
69
68
|
result << ':'
|
70
|
-
result <<
|
69
|
+
result << find(category).to_s
|
71
70
|
result << "\n"
|
72
71
|
end
|
73
72
|
result
|
74
73
|
end
|
75
74
|
|
76
75
|
private
|
76
|
+
|
77
77
|
def initialize_ids
|
78
|
-
@ids =
|
78
|
+
@ids = @categories.map(&:id)
|
79
79
|
end
|
80
80
|
|
81
81
|
def initalize_probabilities(ids)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
class TextClassifier
|
2
2
|
attr_reader :categories
|
3
|
-
def initialize(
|
3
|
+
def initialize(args)
|
4
4
|
@categories = args[:categories]
|
5
5
|
@calculator = args[:calculator] || ProbabilityCalculator.new(categories: @categories)
|
6
6
|
end
|
@@ -13,12 +13,8 @@ class TextClassifier
|
|
13
13
|
@calculator.get_probabilities_for(text)
|
14
14
|
end
|
15
15
|
|
16
|
-
|
17
|
-
puts "This notation is deprecated in will be removed in later versions. Please use probabilities (4th character b instead of p)"
|
18
|
-
probabilities(text)
|
19
|
-
end
|
16
|
+
private
|
20
17
|
|
21
|
-
private
|
22
18
|
def get_category_for(text)
|
23
19
|
probabilities = @calculator.get_probabilities_for(text)
|
24
20
|
@categories.each do |category|
|
data/lib/NaiveText/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: NaiveText
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- RicciFlowing
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-01-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|