NaiveText 0.6.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/Gemfile +3 -4
- data/Guardfile +3 -4
- data/NaiveText.gemspec +12 -12
- data/Rakefile +1 -2
- data/bin/console +3 -3
- data/lib/NaiveText.rb +12 -15
- data/lib/NaiveText/Categories.rb +5 -6
- data/lib/NaiveText/CategoriesFactory.rb +10 -29
- data/lib/NaiveText/Category.rb +2 -3
- data/lib/NaiveText/Example.rb +3 -2
- data/lib/NaiveText/ExamplesFactory.rb +8 -8
- data/lib/NaiveText/ExamplesGroup.rb +15 -17
- data/lib/NaiveText/ProbabilityCalculator.rb +22 -22
- data/lib/NaiveText/ProbabilityCollection.rb +16 -16
- data/lib/NaiveText/TextClassifier.rb +2 -6
- data/lib/NaiveText/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 50e030f17d9a465122b843bd773747c02eee7488
|
4
|
+
data.tar.gz: a6b6ac823fb3ac1e190a2fad37871e258f76bf5e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 795b9f38baa41fb7899070394832d1d520f63711eae619e0995550984293bb631bb83059826d92444841a17df7297f566371c21d611a5433fff4ee5b3802e224
|
7
|
+
data.tar.gz: 7cd4d3aa96d4b237e98b062250e3cd61536eb705977c088bc4fb004275d4de6f58d3e9b598c7f4b4877fc29e726f3730d7c1614ba4934670faa786812529b17e
|
data/CHANGELOG.md
CHANGED
@@ -2,6 +2,15 @@
|
|
2
2
|
All notable changes to this project will be documented in this file.
|
3
3
|
This project adheres to [Semantic Versioning](http://semver.org/).
|
4
4
|
|
5
|
+
## [1.0.0]- 2016-1-5
|
6
|
+
### Changed
|
7
|
+
- Split up the integration specs. Removed some duplication in the specs.
|
8
|
+
- Refactored the specs to be more concise.
|
9
|
+
- Cleaned the source code to be more readable.
|
10
|
+
### Deleted
|
11
|
+
- Removed old and deprecated array option for CategoriesFactory
|
12
|
+
- Removed the old, misspelled `propabilities` method from TextClassifier (use `probabilities` instead)
|
13
|
+
|
5
14
|
## [0.6.0]- 2015-11-30
|
6
15
|
### Added
|
7
16
|
- Added an optional language_model, which makes it possible to compare words based on their word stem (e.g. 'testing', 'tests' and 'tested' are all matched with the stem 'test').
|
data/Gemfile
CHANGED
@@ -3,7 +3,6 @@ source 'https://rubygems.org'
|
|
3
3
|
# Specify your gem's dependencies in NaiveText.gemspec
|
4
4
|
gemspec
|
5
5
|
|
6
|
-
|
7
|
-
spec.add_development_dependency
|
8
|
-
spec.add_development_dependency
|
9
|
-
spec.add_development_dependency "guard-rubocop"
|
6
|
+
spec.add_development_dependency 'guard'
|
7
|
+
spec.add_development_dependency 'guard-rspec'
|
8
|
+
spec.add_development_dependency 'guard-rubocop'
|
data/Guardfile
CHANGED
@@ -24,8 +24,8 @@
|
|
24
24
|
# * zeus: 'zeus rspec' (requires the server to be started separately)
|
25
25
|
# * 'just' rspec: 'rspec'
|
26
26
|
|
27
|
-
guard :rspec, cmd: "bundle exec rspec" do
|
28
|
-
require
|
27
|
+
guard :rspec, cmd: 'bundle exec rspec' do
|
28
|
+
require 'guard/rspec/dsl'
|
29
29
|
dsl = Guard::RSpec::Dsl.new(self)
|
30
30
|
|
31
31
|
# Feel free to open issues for suggestions and improvements
|
@@ -39,10 +39,9 @@ guard :rspec, cmd: "bundle exec rspec" do
|
|
39
39
|
# Ruby files
|
40
40
|
ruby = dsl.ruby
|
41
41
|
dsl.watch_spec_files_for(ruby.lib_files)
|
42
|
-
|
43
42
|
end
|
44
43
|
|
45
44
|
guard :rubocop, keep_failed: false do
|
46
|
-
watch(
|
45
|
+
watch(/(.+\.rb)$/)
|
47
46
|
watch(%r{(?:.+/)?\.rubocop\.yml$}) { |m| File.dirname(m[0]) }
|
48
47
|
end
|
data/NaiveText.gemspec
CHANGED
@@ -4,27 +4,27 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
4
|
require 'NaiveText/version'
|
5
5
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
|
-
spec.name =
|
7
|
+
spec.name = 'NaiveText'
|
8
8
|
spec.version = NaiveText::VERSION
|
9
|
-
spec.authors = [
|
10
|
-
spec.email = [
|
9
|
+
spec.authors = ['RicciFlowing']
|
10
|
+
spec.email = ['benjamin@mathe-sellin.de']
|
11
11
|
|
12
|
-
spec.summary =
|
13
|
-
spec.description =
|
14
|
-
spec.homepage =
|
15
|
-
spec.licenses
|
12
|
+
spec.summary = 'A text classifier written in ruby'
|
13
|
+
spec.description = 'NaiveText is a text classifier gem written in ruby and made to be easily integratable in your Rails app.'
|
14
|
+
spec.homepage = 'https://github.com/RicciFlowing/NaiveText'
|
15
|
+
spec.licenses = ['MIT']
|
16
16
|
|
17
17
|
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
18
|
-
spec.bindir =
|
18
|
+
spec.bindir = 'exe'
|
19
19
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
|
-
spec.require_paths = [
|
20
|
+
spec.require_paths = ['lib']
|
21
21
|
|
22
22
|
spec.required_ruby_version = '>= 2.0.0'
|
23
23
|
|
24
24
|
if spec.respond_to?(:metadata)
|
25
|
-
spec.metadata['allowed_push_host'] =
|
25
|
+
spec.metadata['allowed_push_host'] = 'https://rubygems.org'
|
26
26
|
end
|
27
27
|
|
28
|
-
spec.add_development_dependency
|
29
|
-
spec.add_development_dependency
|
28
|
+
spec.add_development_dependency 'bundler', '~> 1.8'
|
29
|
+
spec.add_development_dependency 'rake', '~> 10.0'
|
30
30
|
end
|
data/Rakefile
CHANGED
@@ -1,2 +1 @@
|
|
1
|
-
require
|
2
|
-
|
1
|
+
require 'bundler/gem_tasks'
|
data/bin/console
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
require
|
4
|
-
require "NaiveText"
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'NaiveText'
|
5
5
|
|
6
6
|
# You can add fixtures and/or initialization code here to make experimenting
|
7
7
|
# with your gem easier. You can also use a different console, if you like.
|
@@ -10,5 +10,5 @@ require "NaiveText"
|
|
10
10
|
# require "pry"
|
11
11
|
# Pry.start
|
12
12
|
|
13
|
-
require
|
13
|
+
require 'irb'
|
14
14
|
IRB.start
|
data/lib/NaiveText.rb
CHANGED
@@ -1,20 +1,17 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
4
|
-
require
|
5
|
-
require
|
6
|
-
require
|
7
|
-
require
|
8
|
-
require
|
9
|
-
require
|
10
|
-
require
|
11
|
-
|
12
|
-
|
1
|
+
require 'NaiveText/version'
|
2
|
+
require 'NaiveText/Example'
|
3
|
+
require 'NaiveText/ExamplesFactory'
|
4
|
+
require 'NaiveText/ExamplesGroup'
|
5
|
+
require 'NaiveText/ProbabilityCollection'
|
6
|
+
require 'NaiveText/ProbabilityCalculator'
|
7
|
+
require 'NaiveText/TextClassifier'
|
8
|
+
require 'NaiveText/Category'
|
9
|
+
require 'NaiveText/Categories'
|
10
|
+
require 'NaiveText/CategoriesFactory'
|
13
11
|
|
14
12
|
module NaiveText
|
15
|
-
|
16
13
|
def self.build(config)
|
17
|
-
|
18
|
-
|
14
|
+
@categories = CategoriesFactory.build(config)
|
15
|
+
@test_classifier = TextClassifier.new(categories: @categories)
|
19
16
|
end
|
20
17
|
end
|
data/lib/NaiveText/Categories.rb
CHANGED
@@ -17,14 +17,13 @@ class Categories
|
|
17
17
|
end
|
18
18
|
|
19
19
|
def total_word_count
|
20
|
-
@categories.inject(0) { |count, category
|
20
|
+
@categories.inject(0) { |count, category| count + category.word_count }
|
21
21
|
end
|
22
22
|
|
23
23
|
private
|
24
24
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
25
|
+
def calculate_apriori_propability_for(category)
|
26
|
+
sum_of_words = @categories.inject(0) { |sum, category| sum + category.word_count }
|
27
|
+
category.word_count.to_f / sum_of_words
|
28
|
+
end
|
30
29
|
end
|
@@ -2,36 +2,17 @@ class CategoriesFactory
|
|
2
2
|
def self.build(config)
|
3
3
|
categories = []
|
4
4
|
default = nil
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
puts "This category was not created."
|
15
|
-
end
|
5
|
+
config[:categories].each do |category_config|
|
6
|
+
begin
|
7
|
+
group = ExamplesGroup.new(examples: category_config[:examples], language_model: config[:language_model])
|
8
|
+
category = Category.new(name: category_config[:name], examples: group, weight: category_config[:weight])
|
9
|
+
categories << category
|
10
|
+
default = category if category_config[:name] == config[:default]
|
11
|
+
rescue
|
12
|
+
puts "You haven't provided trainingsdata for the category" + category_config[:name]
|
13
|
+
puts 'This category was not created.'
|
16
14
|
end
|
17
|
-
Categories.new(categories: categories)
|
18
|
-
|
19
|
-
|
20
|
-
else
|
21
|
-
config[:categories].each do |category_config|
|
22
|
-
begin
|
23
|
-
group = ExamplesGroup.new(examples: category_config[:examples], language_model: config[:language_model] )
|
24
|
-
category = Category.new(name: category_config[:name], examples: group, weight: category_config[:weight])
|
25
|
-
categories << category
|
26
|
-
if category_config[:name] == config[:default]
|
27
|
-
default = category
|
28
|
-
end
|
29
|
-
rescue
|
30
|
-
puts "You haven't provided trainingsdata for the category" + category_config[:name]
|
31
|
-
puts "This category was not created."
|
32
|
-
end
|
33
|
-
end
|
34
|
-
Categories.new(categories: categories, default: default )
|
35
15
|
end
|
16
|
+
Categories.new(categories: categories, default: default)
|
36
17
|
end
|
37
18
|
end
|
data/lib/NaiveText/Category.rb
CHANGED
@@ -3,7 +3,6 @@ class Category
|
|
3
3
|
|
4
4
|
attr_reader :name, :id, :weight
|
5
5
|
|
6
|
-
|
7
6
|
def initialize(args)
|
8
7
|
@name = args[:name]
|
9
8
|
@examples = args[:examples]
|
@@ -14,7 +13,7 @@ class Category
|
|
14
13
|
end
|
15
14
|
|
16
15
|
def p(word)
|
17
|
-
if
|
16
|
+
if @examples.word_count > 0
|
18
17
|
@examples.count(word).to_f / @examples.word_count
|
19
18
|
else
|
20
19
|
0
|
@@ -35,7 +34,7 @@ class NullCategory
|
|
35
34
|
attr_reader :id
|
36
35
|
|
37
36
|
def initialize
|
38
|
-
@name =
|
37
|
+
@name = 'No category'
|
39
38
|
@id = 0
|
40
39
|
end
|
41
40
|
end
|
data/lib/NaiveText/Example.rb
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
class ExamplesFactory
|
2
2
|
def self.from_files(dir_path)
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
end
|
9
|
-
rescue
|
10
|
-
puts "Failed loading" + dir_path
|
3
|
+
begin
|
4
|
+
examples = []
|
5
|
+
Dir.foreach(dir_path) do |file_path|
|
6
|
+
next if file_path == '.' || file_path == '..'
|
7
|
+
examples.push FileExample.new(path: dir_path + '/' + file_path)
|
11
8
|
end
|
9
|
+
rescue
|
10
|
+
puts 'Failed loading' + dir_path
|
11
|
+
end
|
12
12
|
examples
|
13
13
|
end
|
14
14
|
end
|
@@ -1,13 +1,11 @@
|
|
1
1
|
class ExamplesGroup
|
2
2
|
def initialize(args)
|
3
3
|
@examples = args[:examples].to_a || []
|
4
|
-
@language_model = args[:language_model] ||
|
4
|
+
@language_model = args[:language_model] || ->(str) { str }
|
5
5
|
load_text
|
6
6
|
split_text_into_words
|
7
7
|
format_words
|
8
|
-
if @words.length == 0
|
9
|
-
raise 'Empty_Trainingsdata'
|
10
|
-
end
|
8
|
+
fail 'Empty_Trainingsdata' if @words.length == 0
|
11
9
|
end
|
12
10
|
|
13
11
|
def count(word)
|
@@ -20,20 +18,20 @@ class ExamplesGroup
|
|
20
18
|
|
21
19
|
private
|
22
20
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
end
|
21
|
+
def load_text
|
22
|
+
@text = ''
|
23
|
+
@examples.each do |example|
|
24
|
+
@text += ' ' + example.text
|
28
25
|
end
|
26
|
+
end
|
29
27
|
|
30
|
-
|
31
|
-
|
32
|
-
|
28
|
+
def split_text_into_words
|
29
|
+
@words = @text.split(/\W+/)
|
30
|
+
end
|
33
31
|
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
32
|
+
def format_words
|
33
|
+
@words.map!(&:downcase)
|
34
|
+
@words.map! { |word| @language_model.call(word) }
|
35
|
+
@words
|
36
|
+
end
|
39
37
|
end
|
@@ -9,35 +9,35 @@ class ProbabilityCalculator
|
|
9
9
|
@probabilities.normalize
|
10
10
|
end
|
11
11
|
|
12
|
-
|
13
12
|
private
|
14
|
-
def protect_factor(factor)
|
15
|
-
[factor, minimum].max
|
16
|
-
end
|
17
13
|
|
18
|
-
|
19
|
-
|
20
|
-
|
14
|
+
def protect_factor(factor)
|
15
|
+
[factor, minimum].max
|
16
|
+
end
|
21
17
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
list_of_words.each do |word|
|
26
|
-
@categories.each do |category|
|
27
|
-
@probabilities.multiply(category: category, factor: protect_factor(category.p(word)) )
|
28
|
-
end
|
29
|
-
end
|
30
|
-
remove_minimum(text)
|
31
|
-
end
|
18
|
+
def minimum
|
19
|
+
1.to_f / (10 * @categories.total_word_count)
|
20
|
+
end
|
32
21
|
|
33
|
-
|
22
|
+
def calculateProbabilities(text)
|
23
|
+
set_apriori_probabilities
|
24
|
+
list_of_words = text.split(/\W+/)
|
25
|
+
list_of_words.each do |word|
|
34
26
|
@categories.each do |category|
|
35
|
-
@probabilities.
|
27
|
+
@probabilities.multiply(category: category, factor: protect_factor(category.p(word)))
|
36
28
|
end
|
37
29
|
end
|
30
|
+
remove_minimum(text)
|
31
|
+
end
|
38
32
|
|
39
|
-
|
40
|
-
|
41
|
-
@probabilities.
|
33
|
+
def set_apriori_probabilities
|
34
|
+
@categories.each do |category|
|
35
|
+
@probabilities.set(category: category, value: @categories.p_apriori(category))
|
42
36
|
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def remove_minimum(text)
|
40
|
+
times = text.split(/\W+/).length
|
41
|
+
@probabilities.greater_then(minimum**times)
|
42
|
+
end
|
43
43
|
end
|
@@ -1,18 +1,17 @@
|
|
1
1
|
class ProbabilityCollection
|
2
2
|
def initialize(args)
|
3
|
-
@categories
|
3
|
+
@categories = args[:categories] || []
|
4
4
|
initialize_ids
|
5
5
|
@probabilities = []
|
6
6
|
initalize_probabilities(@ids)
|
7
7
|
end
|
8
8
|
|
9
9
|
def find(category)
|
10
|
-
|
10
|
+
@probabilities[category.id]
|
11
11
|
end
|
12
12
|
|
13
|
-
|
14
13
|
def set(args)
|
15
|
-
category
|
14
|
+
category = args[:category]
|
16
15
|
value = args[:value]
|
17
16
|
@probabilities[category.id] = value
|
18
17
|
end
|
@@ -23,14 +22,14 @@ class ProbabilityCollection
|
|
23
22
|
if category
|
24
23
|
@probabilities[category.id] *= factor
|
25
24
|
else
|
26
|
-
@probabilities.map! {|el| el*factor}
|
25
|
+
@probabilities.map! { |el| el * factor }
|
27
26
|
end
|
28
27
|
end
|
29
28
|
|
30
29
|
def normalize
|
31
|
-
if
|
32
|
-
normalization_factor = 1.to_f /
|
33
|
-
|
30
|
+
if sum > 0
|
31
|
+
normalization_factor = 1.to_f / sum
|
32
|
+
multiply(factor: normalization_factor)
|
34
33
|
end
|
35
34
|
self
|
36
35
|
end
|
@@ -38,7 +37,7 @@ class ProbabilityCollection
|
|
38
37
|
def category_with_max
|
39
38
|
if @probabilities.max > 0
|
40
39
|
id = @probabilities.find_index(@probabilities.max)
|
41
|
-
@categories.find {|category| category.id == id}
|
40
|
+
@categories.find { |category| category.id == id }
|
42
41
|
else
|
43
42
|
@categories.default
|
44
43
|
end
|
@@ -50,11 +49,11 @@ class ProbabilityCollection
|
|
50
49
|
|
51
50
|
def greater_then(value)
|
52
51
|
@probabilities.map! do |p|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
52
|
+
if p > value
|
53
|
+
p
|
54
|
+
else
|
55
|
+
0
|
56
|
+
end
|
58
57
|
end
|
59
58
|
end
|
60
59
|
|
@@ -67,15 +66,16 @@ class ProbabilityCollection
|
|
67
66
|
@categories.each do |category|
|
68
67
|
result << category.to_s
|
69
68
|
result << ':'
|
70
|
-
result <<
|
69
|
+
result << find(category).to_s
|
71
70
|
result << "\n"
|
72
71
|
end
|
73
72
|
result
|
74
73
|
end
|
75
74
|
|
76
75
|
private
|
76
|
+
|
77
77
|
def initialize_ids
|
78
|
-
@ids =
|
78
|
+
@ids = @categories.map(&:id)
|
79
79
|
end
|
80
80
|
|
81
81
|
def initalize_probabilities(ids)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
class TextClassifier
|
2
2
|
attr_reader :categories
|
3
|
-
def initialize(
|
3
|
+
def initialize(args)
|
4
4
|
@categories = args[:categories]
|
5
5
|
@calculator = args[:calculator] || ProbabilityCalculator.new(categories: @categories)
|
6
6
|
end
|
@@ -13,12 +13,8 @@ class TextClassifier
|
|
13
13
|
@calculator.get_probabilities_for(text)
|
14
14
|
end
|
15
15
|
|
16
|
-
|
17
|
-
puts "This notation is deprecated in will be removed in later versions. Please use probabilities (4th character b instead of p)"
|
18
|
-
probabilities(text)
|
19
|
-
end
|
16
|
+
private
|
20
17
|
|
21
|
-
private
|
22
18
|
def get_category_for(text)
|
23
19
|
probabilities = @calculator.get_probabilities_for(text)
|
24
20
|
@categories.each do |category|
|
data/lib/NaiveText/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: NaiveText
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.0
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- RicciFlowing
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-01-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|