NaiveText 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9a4ff5607deb99718f721a0e6b62636feb91f007
4
+ data.tar.gz: 6a0dd76a73cc56784bd333b6766ae2375ff6154d
5
+ SHA512:
6
+ metadata.gz: ad7d2e7dc253ebef99f9ee9fd2dcf87b63f41cd0fc7800102cc071b780a22ace3f554a939fcc90ee65acdeeeea4def99bfe86d3813ff9e59aa2d12f2067a2510
7
+ data.tar.gz: 4e54cb6f0bd1a0d091bd2b19e0f5afd10cdc85ea37c7cef74d2629d9613fe3a71e540da9fe41f7ed1329b7133933be168e913526c59cafa2563bf569c60b84da
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.2.0
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in NaiveText.gemspec
4
+ gemspec
data/Guardfile ADDED
@@ -0,0 +1,48 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
+ ## Uncomment and set this to only include directories you want to watch
5
+ # directories %w(app lib config test spec features) \
6
+ # .select{|d| Dir.exists?(d) ? d : UI.warning("Directory #{d} does not exist")}
7
+
8
+ ## Note: if you are using the `directories` clause above and you are not
9
+ ## watching the project directory ('.'), then you will want to move
10
+ ## the Guardfile to a watched dir and symlink it back, e.g.
11
+ #
12
+ # $ mkdir config
13
+ # $ mv Guardfile config/
14
+ # $ ln -s config/Guardfile .
15
+ #
16
+ # and, you'll have to watch "config/Guardfile" instead of "Guardfile"
17
+
18
+ # Note: The cmd option is now required due to the increasing number of ways
19
+ # rspec may be run, below are examples of the most common uses.
20
+ # * bundler: 'bundle exec rspec'
21
+ # * bundler binstubs: 'bin/rspec'
22
+ # * spring: 'bin/rspec' (This will use spring if running and you have
23
+ # installed the spring binstubs per the docs)
24
+ # * zeus: 'zeus rspec' (requires the server to be started separately)
25
+ # * 'just' rspec: 'rspec'
26
+
27
+ guard :rspec, cmd: "bundle exec rspec" do
28
+ require "guard/rspec/dsl"
29
+ dsl = Guard::RSpec::Dsl.new(self)
30
+
31
+ # Feel free to open issues for suggestions and improvements
32
+
33
+ # RSpec files
34
+ rspec = dsl.rspec
35
+ watch(rspec.spec_helper) { rspec.spec_dir }
36
+ watch(rspec.spec_support) { rspec.spec_dir }
37
+ watch(rspec.spec_files)
38
+
39
+ # Ruby files
40
+ ruby = dsl.ruby
41
+ dsl.watch_spec_files_for(ruby.lib_files)
42
+
43
+ end
44
+
45
+ guard :rubocop, keep_failed: false do
46
+ watch(%r{(.+\.rb)$})
47
+ watch(%r{(?:.+/)?\.rubocop\.yml$}) { |m| File.dirname(m[0]) }
48
+ end
data/NaiveText.gemspec ADDED
@@ -0,0 +1,30 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'NaiveText/version'
5
+
6
# Gem specification for NaiveText: package metadata, the list of files to
# ship, and the development-time dependencies.
Gem::Specification.new do |spec|
  spec.name          = "NaiveText"
  spec.version       = NaiveText::VERSION
  spec.authors       = ["RicciFlowing"]
  spec.email         = ["benjamin@mathe-sellin.de"]

  spec.summary       = "A NaiveText text classifier"
  spec.description   = "Sort texts based on example texts in predefined categories"
  spec.homepage      = "https://github.com/RicciFlowing/NaiveText"

  # Ship every git-tracked file except those under test/, spec/ or features/.
  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Only set the push host when this RubyGems version supports metadata.
  if spec.respond_to?(:metadata)
    spec.metadata['allowed_push_host'] = "https://rubygems.org"
  end

  spec.add_development_dependency "bundler", "~> 1.8"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "guard"
  spec.add_development_dependency "guard-rspec"
  spec.add_development_dependency "guard-rubocop"
end
data/README.md ADDED
@@ -0,0 +1,75 @@
1
+ # NaiveText
2
+
3
+ A naive Bayes text classifier written in Ruby.
4
+
5
+ 1. What does it do?
6
+ ----
7
+
8
+ It sorts texts into predefined categories (e.g. interesting/boring).
9
+ The algorithm bases its decisions on example texts that you provide.
10
+
11
+ ## Installation
12
+
13
+ Add this line to your application's Gemfile:
14
+
15
+ ```ruby
16
+ gem 'NaiveText'
17
+ ```
18
+
19
+ And then execute:
20
+
21
+ $ bundle
22
+
23
+ Or install it yourself as:
24
+
25
+ $ gem install NaiveText
26
+
27
+ ## Usage
28
+
29
+ In brief the process is as follows:
30
+
31
+ Create a directory (naming convention: training) containing one subdirectory for every category (e.g. 'interesting' and 'boring').
32
+ Fill the directories with examples of interesting/boring texts. This will be the training material for the algorithm.
33
+
34
+ Next up, the code:
35
+
36
+ ```ruby
37
+ require 'NaiveText'
38
+ ```
39
+ Now build the system with your categories and training texts:
40
+
41
+ ```ruby
42
+ categories_config = [{name: 'interesting', path: 'spec/training/positive'},
43
+ {name: 'boring', path: 'spec/training/negative'}]
44
+ NaiveText.build(categories_config)
45
+ ```
46
+ Now you can start classifying texts:
47
+
48
+ ```ruby
49
+ NaiveText.classify('Seems to be interesting')
50
+ NaiveText.classify('Seems to be boring')
51
+ ```
52
+ `classify` returns a category object; call `name` on it to get the name of the category as a string.
53
+
54
+ ```ruby
55
+ category = NaiveText.classify('Something interesting')
56
+ category.name
57
+ => 'interesting'
58
+ ```
59
+
60
+ Have fun using it!
61
+
62
+
63
+ ## Development
64
+
65
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
66
+
67
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
68
+
69
+ ## Contributing
70
+
71
+ 1. Fork it ( https://github.com/RicciFlowing/NaiveText/fork )
72
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
73
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
74
+ 4. Push to the branch (`git push origin my-new-feature`)
75
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "NaiveText"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
6
+
7
+ # Do any other automated setup that you need to do here
data/lib/NaiveText.rb ADDED
@@ -0,0 +1,32 @@
1
+ require "NaiveText/version"
2
+ require "NaiveText/ExamplesGroup"
3
+ require "NaiveText/PropabilityCollection"
4
+ require "NaiveText/PropabilityCalculator"
5
+ require "NaiveText/TextClassifier"
6
+ require "NaiveText/Text"
7
+ require "NaiveText/Category"
8
+ require "NaiveText/Categories"
9
+ require "NaiveText/CategoriesFactory"
10
+
11
# Entry point of the gem: trains a classifier from example texts and then
# classifies new texts with it.
#
# The module extends itself, so +build+ and +classify+ are called directly on
# +NaiveText+ and the trained classifier is kept in the module's own instance
# variables (one classifier per process).
module NaiveText
  extend self

  # Builds the categories and the classifier from a configuration.
  #
  # Errors are reported on standard output instead of being raised, so that
  # a bad configuration does not crash the caller.
  #
  # @param config [Array<Hash>] one hash per category, each with the keys
  #   +:name+ (category name) and +:path+ (location of the training data)
  # @return [TextClassifier, nil] the classifier that was built, or +nil+
  #   when the configuration could not be processed
  def build(config)
    @categories = CategoriesFactory.build(config)
    @test_classifier = TextClassifier.new(categories: @categories)
  rescue StandardError => e
    # Include the underlying error so the user can see what actually failed.
    puts "There seems to be an error in your config (#{e.class}: #{e.message}).\n" \
         "The expected format is [{name: name_of_category, path: path_to_trainings_data}]"
  end

  # Classifies a text with the classifier created by +build+.
  #
  # @param text [String] the text to classify
  # @return [Object, nil] whatever the classifier returns for the text (a
  #   category object according to the README), or +nil+ when no classifier
  #   has been built yet or classification failed (a message is printed)
  def classify(text)
    if @test_classifier.nil?
      puts "No classifier available. Did you call NaiveText.build before using classify?"
      return
    end

    @text = Text.new(text: text, classifier: @test_classifier)
    @text.classify
  rescue StandardError => e
    puts "An error occurred while classifying (#{e.class}: #{e.message}). " \
         "Did you call NaiveText.build before using classify?"
  end
end
@@ -0,0 +1,23 @@
1
# Enumerable collection of category objects that can also compute the
# a-priori probability of one of its members.
class Categories
  include Enumerable

  # @param args [Hash] options; +:categories+ is the list of category
  #   objects (defaults to an empty list)
  def initialize(args)
    @categories = args[:categories] || []
  end

  # A-priori probability of +category+: its share of the words of all
  # categories in this collection.
  #
  # @param category [#word_count] the category to weigh
  # @return [Float] +category.word_count+ divided by the total word count,
  #   or 0.0 when the collection contains no words at all
  def p_apriori(category)
    calculate_apriori_propability_for(category)
  end

  # Yields every category; required by Enumerable.
  def each(&block)
    @categories.each(&block)
  end

  private

  def calculate_apriori_propability_for(category)
    total_words = @categories.inject(0) { |sum, member| sum + member.word_count }
    # Without any training words the ratio would be 0/0 (NaN).
    return 0.0 if total_words.zero?

    category.word_count.to_f / total_words
  end
end
@@ -0,0 +1,12 @@
1
# Builds a Categories collection from a configuration.
class CategoriesFactory
  # Creates one Category (backed by an ExamplesGroup loaded from the
  # configured path) per configuration entry.
  #
  # @param config [Array<Hash>] one hash per category with the keys +:name+
  #   and +:path+
  # @return [Categories] the categories in the order of the configuration
  def self.build(config)
    categories = config.map do |category_config|
      examples = ExamplesGroup.new(category_config[:path])
      Category.new(name: category_config[:name], examples: examples)
    end

    Categories.new(categories: categories)
  end
end
@@ -0,0 +1,21 @@
1
# A named category backed by a group of example texts. Every instance gets a
# process-wide unique, increasing integer id (starting at 1).
class Category
  # Last id handed out. Stored as an instance variable of the class itself
  # rather than a @@class variable, which would be shared with subclasses.
  @last_id = 0

  class << self
    # Returns the next unused id. Internal helper for +initialize+.
    def next_id
      @last_id += 1
    end
  end

  attr_reader :name
  attr_reader :id

  # @param args [Hash] +:name+ is the category name, +:examples+ an object
  #   responding to +count(word)+ and +word_count+
  def initialize(args)
    @name = args[:name]
    @examples = args[:examples]
    # Always ask Category itself so subclasses share one id sequence.
    @id = Category.next_id
  end

  # Relative frequency of +word+ within the examples of this category.
  #
  # @param word [String]
  # @return [Float] occurrences of +word+ divided by the number of example
  #   words, or 0.0 when the category has no example words
  def p(word)
    total = word_count
    # Avoid 0/0 (NaN) for a category without any training words.
    return 0.0 if total.zero?

    @examples.count(word).to_f / total
  end

  # @return [Integer] number of words in the examples of this category
  def word_count
    @examples.word_count
  end
end
@@ -0,0 +1,25 @@
1
# The example (training) texts of one category: all regular files found
# directly inside a directory, split into words.
class ExamplesGroup
  # Reads the files in +path+ and indexes their words.
  #
  # @param path [String] directory containing the example files
  # @raise [SystemCallError] when the directory or a file cannot be read
  def initialize(path)
    @text = load_text(path)
    # split can yield a leading "" when the text starts with a non-word
    # character; that is not a word and must not be counted.
    @words = @text.split(/\W+/).reject(&:empty?)
    # Pre-computed occurrences so that count is a hash lookup instead of a
    # scan over every word.
    @counts = @words.each_with_object(Hash.new(0)) { |word, counts| counts[word] += 1 }
  end

  # @param word [String]
  # @return [Integer] how often +word+ occurs in the examples
  def count(word)
    @counts.fetch(word, 0)
  end

  # @return [Integer] total number of words in the examples
  def word_count
    @words.size
  end

  private

  # Returns the contents of all regular files directly inside +path+.
  # Files are joined with a newline so that the last word of one file and
  # the first word of the next are not glued together; subdirectories
  # (including '.' and '..') are skipped.
  def load_text(path)
    files = Dir.entries(path).sort.map { |entry| File.join(path, entry) }
    files.select { |file| File.file?(file) }
         .map { |file| File.read(file) }
         .join("\n")
  end
end
@@ -0,0 +1,36 @@
1
# Computes, for a list of words, one normalised probability per category:
# the a-priori probability of the category multiplied by the per-category
# probability of every word.
class PropabilityCalculator
  # @param args [Hash] +:categories+ is a collection responding to +each+
  #   and +p_apriori(category)+ (defaults to an empty list)
  def initialize(args)
    @categories = args[:categories] || []
    @propabilities = PropabilityCollection.new(categories: @categories)
  end

  # @param words [Array<String>] the words of the text to classify
  # @return [PropabilityCollection] the probabilities of all categories.
  #   The same collection object is reused and overwritten on every call.
  def get_propabilities_for(words)
    calculate_propabilities(words)
  end

  private

  def calculate_propabilities(list_of_words)
    # Start every category at its a-priori probability ...
    @categories.each do |category|
      @propabilities.set(category: category, value: p_apriori(category))
    end

    # ... and multiply in the probability of each word.
    list_of_words.each do |word|
      @categories.each do |category|
        @propabilities.multiply(category: category, factor: category.p(word))
      end
    end

    normalize
    @propabilities
  end

  # Scales the probabilities so that they add up to 1. When they add up to
  # zero (or there is nothing to add up) they are left untouched, because
  # dividing by zero would turn every value into NaN.
  def normalize
    total = @propabilities.sum
    return if total.nil? || total.zero?

    @propabilities.multiply(factor: 1.0 / total)
  end

  def p_apriori(category)
    @categories.p_apriori(category)
  end
end
@@ -0,0 +1,52 @@
1
# Holds one probability per category and offers the arithmetic needed by
# the calculator (set, multiply, sum, max).
#
# Probabilities are stored in a Hash keyed by category id. Category ids start
# at 1 and keep growing for the lifetime of the process, so they are not
# usable as indices into a fixed-size array.
class PropabilityCollection
  # @param args [Hash] +:categories+ is a collection of objects responding
  #   to +id+ (defaults to an empty list); every category starts at 0
  def initialize(args)
    @categories = args[:categories] || []
    @propabilities = Hash.new(0)
    @categories.each { |category| @propabilities[category.id] = 0 }
  end

  # @param category [#id]
  # @return [Numeric] the probability stored for +category+ (0 if none)
  def find(category)
    @propabilities[category.id]
  end

  # Stores a probability.
  #
  # @param args [Hash] +:category+ and the +:value+ to store for it
  def set(args)
    category = args[:category]
    value = args[:value]
    @propabilities[category.id] = value
  end

  # Multiplies the probability of +:category+ by +:factor+; when no
  # category is given, every stored probability is multiplied.
  #
  # @param args [Hash] +:factor+ and optionally +:category+
  def multiply(args)
    category = args[:category]
    factor = args[:factor]
    if category
      @propabilities[category.id] *= factor
    else
      @propabilities.each_key { |id| @propabilities[id] *= factor }
    end
  end

  # @return [Object, nil] the category with the highest probability (the
  #   first one in collection order on ties), or +nil+ when no category has
  #   a probability greater than zero
  def max
    best = nil
    best_value = 0
    @categories.each do |category|
      value = @propabilities[category.id]
      # NaN compares false here and is therefore never selected.
      next unless value > best_value

      best = category
      best_value = value
    end
    best
  end

  # @return [Numeric] the sum of all stored probabilities (0 when empty)
  def sum
    @propabilities.values.inject(0) { |total, value| total + value }
  end
end
@@ -0,0 +1,32 @@
1
# A text to classify, given directly as a string and/or loaded from a file.
class Text
  # @return [Array<String>] the words of the text
  attr_reader :words

  # @param args [Hash] +:text+ is the text itself (default ""), +:path+ an
  #   optional file whose content is appended to the text, +:classifier+
  #   the object used by +classify+ (must respond to +get_category_for+)
  def initialize(args)
    @text = args[:text] || ""
    path = args[:path]
    @classifier = args[:classifier]
    # If both path and text are given, both are concatenated.
    @text += load_text(path) if path

    # split can yield a leading "" when the text starts with a non-word
    # character; drop it so it is not treated as a word.
    @words = @text.split(/\W+/).reject(&:empty?)
  end

  # @return [String] the first 50 characters of the text
  def sample
    @text.slice(0, 50)
  end

  # Asks the classifier for the category of this text.
  #
  # @return [Object] whatever the classifier returns for the words
  # @raise [ArgumentError] when the text was created without a classifier
  def classify
    raise ArgumentError, "no classifier given: pass classifier: when creating the Text" unless @classifier

    @classifier.get_category_for(words)
  end

  private

  # Reads the file at +path+. When the file cannot be read a hint is
  # printed and an empty string is returned, so that the text given
  # directly (if any) is still usable.
  def load_text(path)
    File.read(path)
  rescue StandardError
    puts "You tried to load the file #{path} for classification. This file was not found.\n" \
         "Please make sure, that the path is correctly spelled and that you have reading-access to the path given"
    ""
  end
end
@@ -0,0 +1,12 @@
1
# Picks the most probable category for a list of words, using a probability
# calculator.
class TextClassifier
  # @param args [Hash] +:categories+ are the categories to choose from;
  #   +:calculator+ optionally replaces the default PropabilityCalculator
  #   (it must respond to +get_propabilities_for+). The hash may be omitted.
  def initialize(args = {})
    @categories = args[:categories]
    @calculator = args[:calculator] || PropabilityCalculator.new(categories: @categories)
  end

  # @param list_of_words [Array<String>] the words of the text
  # @return [Object] the +max+ of the probabilities returned by the
  #   calculator
  def get_category_for(list_of_words)
    @calculator.get_propabilities_for(list_of_words).max
  end
end
@@ -0,0 +1,3 @@
1
module NaiveText
  # Version of the gem; read by the gemspec. Frozen so the constant cannot
  # be mutated in place.
  VERSION = "0.1.0".freeze
end
metadata ADDED
@@ -0,0 +1,134 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: NaiveText
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - RicciFlowing
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2015-10-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.8'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.8'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: guard
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: guard-rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: guard-rubocop
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: Sort texts based on expample texts in predefined categories
84
+ email:
85
+ - benjamin@mathe-sellin.de
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - ".gitignore"
91
+ - ".rspec"
92
+ - ".travis.yml"
93
+ - Gemfile
94
+ - Guardfile
95
+ - NaiveText.gemspec
96
+ - README.md
97
+ - Rakefile
98
+ - bin/console
99
+ - bin/setup
100
+ - lib/NaiveText.rb
101
+ - lib/NaiveText/Categories.rb
102
+ - lib/NaiveText/CategoriesFactory.rb
103
+ - lib/NaiveText/Category.rb
104
+ - lib/NaiveText/ExamplesGroup.rb
105
+ - lib/NaiveText/PropabilityCalculator.rb
106
+ - lib/NaiveText/PropabilityCollection.rb
107
+ - lib/NaiveText/Text.rb
108
+ - lib/NaiveText/TextClassifier.rb
109
+ - lib/NaiveText/version.rb
110
+ homepage: https://github.com/RicciFlowing/NaiveText
111
+ licenses: []
112
+ metadata:
113
+ allowed_push_host: https://rubygems.org
114
+ post_install_message:
115
+ rdoc_options: []
116
+ require_paths:
117
+ - lib
118
+ required_ruby_version: !ruby/object:Gem::Requirement
119
+ requirements:
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ version: '0'
123
+ required_rubygems_version: !ruby/object:Gem::Requirement
124
+ requirements:
125
+ - - ">="
126
+ - !ruby/object:Gem::Version
127
+ version: '0'
128
+ requirements: []
129
+ rubyforge_project:
130
+ rubygems_version: 2.4.6
131
+ signing_key:
132
+ specification_version: 4
133
+ summary: A NaiveText text classifier
134
+ test_files: []