NaiveText 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9a4ff5607deb99718f721a0e6b62636feb91f007
4
+ data.tar.gz: 6a0dd76a73cc56784bd333b6766ae2375ff6154d
5
+ SHA512:
6
+ metadata.gz: ad7d2e7dc253ebef99f9ee9fd2dcf87b63f41cd0fc7800102cc071b780a22ace3f554a939fcc90ee65acdeeeea4def99bfe86d3813ff9e59aa2d12f2067a2510
7
+ data.tar.gz: 4e54cb6f0bd1a0d091bd2b19e0f5afd10cdc85ea37c7cef74d2629d9613fe3a71e540da9fe41f7ed1329b7133933be168e913526c59cafa2563bf569c60b84da
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,3 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.2.0
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in NaiveText.gemspec
4
+ gemspec
data/Guardfile ADDED
@@ -0,0 +1,48 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
+ ## Uncomment and set this to only include directories you want to watch
5
+ # directories %w(app lib config test spec features) \
6
+ # .select{|d| Dir.exists?(d) ? d : UI.warning("Directory #{d} does not exist")}
7
+
8
+ ## Note: if you are using the `directories` clause above and you are not
9
+ ## watching the project directory ('.'), then you will want to move
10
+ ## the Guardfile to a watched dir and symlink it back, e.g.
11
+ #
12
+ # $ mkdir config
13
+ # $ mv Guardfile config/
14
+ # $ ln -s config/Guardfile .
15
+ #
16
+ # and, you'll have to watch "config/Guardfile" instead of "Guardfile"
17
+
18
+ # Note: The cmd option is now required due to the increasing number of ways
19
+ # rspec may be run, below are examples of the most common uses.
20
+ # * bundler: 'bundle exec rspec'
21
+ # * bundler binstubs: 'bin/rspec'
22
+ # * spring: 'bin/rspec' (This will use spring if running and you have
23
+ # installed the spring binstubs per the docs)
24
+ # * zeus: 'zeus rspec' (requires the server to be started separately)
25
+ # * 'just' rspec: 'rspec'
26
+
27
+ guard :rspec, cmd: "bundle exec rspec" do
28
+ require "guard/rspec/dsl"
29
+ dsl = Guard::RSpec::Dsl.new(self)
30
+
31
+ # Feel free to open issues for suggestions and improvements
32
+
33
+ # RSpec files
34
+ rspec = dsl.rspec
35
+ watch(rspec.spec_helper) { rspec.spec_dir }
36
+ watch(rspec.spec_support) { rspec.spec_dir }
37
+ watch(rspec.spec_files)
38
+
39
+ # Ruby files
40
+ ruby = dsl.ruby
41
+ dsl.watch_spec_files_for(ruby.lib_files)
42
+
43
+ end
44
+
45
+ guard :rubocop, keep_failed: false do
46
+ watch(%r{(.+\.rb)$})
47
+ watch(%r{(?:.+/)?\.rubocop\.yml$}) { |m| File.dirname(m[0]) }
48
+ end
data/NaiveText.gemspec ADDED
@@ -0,0 +1,30 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'NaiveText/version'
5
+
6
# Gem specification for NaiveText. The version is read from
# NaiveText::VERSION, which must already be loaded when this block runs.
Gem::Specification.new do |spec|
  spec.name          = "NaiveText"
  spec.version       = NaiveText::VERSION
  spec.authors       = ["RicciFlowing"]
  spec.email         = ["benjamin@mathe-sellin.de"]

  spec.summary       = "A NaiveText text classifier"
  spec.description   = "Sort texts based on example texts in predefined categories"
  spec.homepage      = "https://github.com/RicciFlowing/NaiveText"

  # Package every file tracked by git, except tests, specs and features.
  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir        = "exe"
  # Executables are whatever tracked files live under exe/.
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Restrict pushes to rubygems.org where the RubyGems version supports metadata.
  if spec.respond_to?(:metadata)
    spec.metadata['allowed_push_host'] = "https://rubygems.org"
  end

  spec.add_development_dependency "bundler", "~> 1.8"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "guard"
  spec.add_development_dependency "guard-rspec"
  spec.add_development_dependency "guard-rubocop"
end
data/README.md ADDED
@@ -0,0 +1,75 @@
1
+ # NaiveText
2
+
3
+ A naive Bayes text classifier written in Ruby.
4
+
5
+ 1. What does it do?
6
+ ----
7
+
8
+ It sorts texts into predefined categories (e.g. interesting/boring).
9
+ The Algorithm bases the decisions on given text examples.
10
+
11
+ ## Installation
12
+
13
+ Add this line to your application's Gemfile:
14
+
15
+ ```ruby
16
+ gem 'NaiveText'
17
+ ```
18
+
19
+ And then execute:
20
+
21
+ $ bundle
22
+
23
+ Or install it yourself as:
24
+
25
+ $ gem install NaiveText
26
+
27
+ ## Usage
28
+
29
+ In brief the process is as follows:
30
+
31
+ Create a directory (naming convention: training) containing subdirectories for every category (i.e. 'interesting' and 'boring').
32
+ Fill the directories with examples of interesting/boring texts. These will be the training material for the algorithm.
33
+
34
+ Next up, the code:
35
+
36
+ ```ruby
37
+ require 'NaiveText'
38
+ ```
39
+ Now build the system with your categories and training texts:
40
+
41
+ ```ruby
42
+ categories_config = [{name: 'interesting', path: 'spec/training/positive'},
43
+ {name: 'boring', path: 'spec/training/negative'}]
44
+ NaiveText.build(categories_config)
45
+ ```
46
+ Now you can start classifying texts:
47
+
48
+ ```ruby
49
+ NaiveText.classify('Seems to be interesting')
50
+ NaiveText.classify('Seems to be boring')
51
+ ```
52
+ `classify` returns a category object; call `name` on it to get the name of the category as a string.
53
+
54
+ ```ruby
55
+ category = NaiveText.classify('Something interesting')
56
+ category.name
57
+ => 'interesting'
58
+ ```
59
+
60
+ Have fun using it!
61
+
62
+
63
+ ## Development
64
+
65
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
66
+
67
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
68
+
69
+ ## Contributing
70
+
71
+ 1. Fork it ( https://github.com/RicciFlowing/NaiveText/fork )
72
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
73
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
74
+ 4. Push to the branch (`git push origin my-new-feature`)
75
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/gem_tasks"
2
+
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "NaiveText"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
6
+
7
+ # Do any other automated setup that you need to do here
data/lib/NaiveText.rb ADDED
@@ -0,0 +1,32 @@
1
+ require "NaiveText/version"
2
+ require "NaiveText/ExamplesGroup"
3
+ require "NaiveText/PropabilityCollection"
4
+ require "NaiveText/PropabilityCalculator"
5
+ require "NaiveText/TextClassifier"
6
+ require "NaiveText/Text"
7
+ require "NaiveText/Category"
8
+ require "NaiveText/Categories"
9
+ require "NaiveText/CategoriesFactory"
10
+
11
# Module-level facade of the gem: trains a classifier from a category
# configuration (+build+) and classifies texts with it (+classify+).
#
# Both methods are deliberately best-effort: instead of raising they print a
# hint to standard output and return nil when something goes wrong.
module NaiveText
  extend self

  # Builds the categories and the classifier used by later +classify+ calls.
  #
  # State is only replaced when the whole build succeeds, so a failed call
  # never leaves a half-updated classifier behind.
  #
  # @param config [Array<Hash>] one hash per category with the keys
  #   +:name+ and +:path+ (directory holding the training texts)
  # @return [TextClassifier, nil] the classifier, or nil if the build failed
  def build(config)
    categories = CategoriesFactory.build(config)
    classifier = TextClassifier.new(categories: categories)
    @categories = categories
    @test_classifier = classifier
  rescue StandardError => error
    # Keep the original "print a hint, return nil" contract, but include the
    # underlying error so the cause is not hidden.
    puts "There seems to be an error in your config (#{error.class}: #{error.message}).\n" \
         "The expected format is [{name: name_of_category, path: path_to_trainings_data}]"
  end

  # Classifies the given text with the classifier created by +build+.
  #
  # @param text [String] the text to classify
  # @return [Object, nil] the category returned by the classifier, or nil if
  #   +build+ has not been called successfully or classification failed
  def classify(text)
    # Detect the missing build explicitly rather than relying on a
    # downstream exception.
    unless @test_classifier
      puts "An error occurred. Did you call NaiveText.build before using classify?"
      return nil
    end

    @text = Text.new(text: text, classifier: @test_classifier)
    @text.classify
  rescue StandardError => error
    puts "An error occurred while classifying the text (#{error.class}: #{error.message})."
  end
end
@@ -0,0 +1,23 @@
1
# Enumerable collection of category objects. Each category is expected to
# respond to +word_count+.
class Categories
  include Enumerable

  # @param args [Hash] options; +:categories+ is the list of category
  #   objects (defaults to an empty list)
  def initialize(args)
    @categories = args[:categories] || []
  end

  # A-priori probability of a category: its share of all example words.
  #
  # @param category [#word_count] the category to weigh
  # @return [Float] share of the total word count, 0.0 if there are no words
  def p_apriori(category)
    calculate_apriori_propability_for(category)
  end

  # Yields every category in turn (basis for the Enumerable methods).
  def each(&block)
    @categories.each(&block)
  end

  private

  def calculate_apriori_propability_for(category)
    total_words = @categories.inject(0) { |sum, member| sum + member.word_count }
    # Without any training words the ratio would be NaN; report "no weight".
    return 0.0 if total_words.zero?

    category.word_count.to_f / total_words
  end
end
@@ -0,0 +1,12 @@
1
# Builds a Categories collection from a plain configuration array.
class CategoriesFactory
  # @param config [Array<Hash>] one hash per category with the keys
  #   +:name+ (category name) and +:path+ (passed to ExamplesGroup.new)
  # @return [Categories] collection holding one Category per config entry,
  #   in the order given
  def self.build(config)
    categories = config.map do |category_config|
      examples = ExamplesGroup.new(category_config[:path])
      Category.new(name: category_config[:name], examples: examples)
    end

    Categories.new(categories: categories)
  end
end
@@ -0,0 +1,21 @@
1
# A named category backed by a group of example texts. The examples object
# is expected to respond to +count(word)+ and +word_count+.
class Category
  # Process-wide counter: every Category ever created gets the next id,
  # starting at 1.
  @@id_counter = 1

  attr_reader :name
  attr_reader :id

  # @param args [Hash] options; +:name+ is the category name and
  #   +:examples+ the examples object for this category
  def initialize(args)
    @name = args[:name]
    @examples = args[:examples]
    @id = @@id_counter
    @@id_counter += 1
  end

  # Relative frequency of +word+ among this category's example words.
  #
  # @param word [String] the word to look up
  # @return [Float] occurrences divided by the total word count, or 0.0
  #   when the category has no example words (avoids a NaN result)
  def p(word)
    total = @examples.word_count
    return 0.0 if total.zero?

    @examples.count(word).to_f / total
  end

  # @return [Integer] number of words in this category's examples
  def word_count
    @examples.word_count
  end
end
@@ -0,0 +1,25 @@
1
# The training material of one category: all words found in the files of a
# directory.
class ExamplesGroup
  # @param path [String] directory containing the example text files
  # @raise [SystemCallError] if the directory or one of its files cannot be read
  def initialize(path)
    @text = load_text(path)
    # A text starting with a separator would otherwise yield an empty "word".
    @words = @text.split(/\W+/).reject(&:empty?)
  end

  # @param word [String] the word to look up (exact, case-sensitive match)
  # @return [Integer] how often +word+ occurs in the examples
  def count(word)
    @words.count(word)
  end

  # @return [Integer] total number of words in the examples
  def word_count
    @words.count
  end

  private

  # Reads every regular file directly inside +path+ and joins the contents.
  # Files are read in sorted order so the result is deterministic, and they
  # are joined with a newline so the last word of one file cannot merge with
  # the first word of the next. Sub-directories (including "." and "..") are
  # skipped.
  def load_text(path)
    example_files = Dir.entries(path).sort
                       .map { |entry| File.join(path, entry) }
                       .select { |file| File.file?(file) }
    example_files.map { |file| File.read(file) }.join("\n")
  end
end
@@ -0,0 +1,36 @@
1
# Computes, for a list of words, the normalised probability of every
# category: a-priori probability times the per-word probabilities.
class PropabilityCalculator
  # @param args [Hash] options; +:categories+ is an enumerable of
  #   categories that also responds to +p_apriori(category)+ (defaults to an
  #   empty list)
  def initialize(args)
    @categories = args[:categories] || []
    @propabilities = PropabilityCollection.new(categories: @categories)
  end

  # @param words [Array<String>] the words of the text to classify
  # @return [PropabilityCollection] the collection holding one probability
  #   per category; the same object is reused and overwritten on every call
  def get_propabilities_for(words)
    calculate_probabilities(words)
  end

  private

  def calculate_probabilities(list_of_words)
    # Start every category at its a-priori probability ...
    @categories.each do |category|
      @propabilities.set(category: category, value: p_apriori(category))
    end

    # ... and scale it by the probability of each word in that category.
    list_of_words.each do |word|
      @categories.each do |category|
        @propabilities.multiply(category: category, factor: category.p(word))
      end
    end

    normalize
    @propabilities
  end

  # Scales the probabilities so that they add up to 1. If nothing matched
  # (the sum is zero or not a usable number) the values are left untouched,
  # because dividing by zero would turn every probability into NaN.
  def normalize
    total = @propabilities.sum
    return unless total && total > 0

    @propabilities.multiply(factor: 1.0 / total)
  end

  def p_apriori(category)
    @categories.p_apriori(category)
  end
end
@@ -0,0 +1,52 @@
1
# Holds one probability value per category, keyed by the category's id.
# Categories are expected to respond to +id+ with a number.
class PropabilityCollection
  # @param args [Hash] options; +:categories+ is an enumerable of
  #   categories (defaults to an empty list). Every category starts at 0.
  def initialize(args)
    @categories = args[:categories] || []
    # A hash keyed by id avoids sizing an array by the highest id, which
    # left the highest id uninitialised and failed for an empty list.
    @propabilities = {}
    @categories.each { |category| @propabilities[category.id] = 0 }
  end

  # @param category [#id] the category to look up
  # @return [Numeric] the stored probability, 0 if nothing was stored yet
  def find(category)
    @propabilities.fetch(category.id, 0)
  end

  # Stores a probability for a category.
  #
  # @param args [Hash] +:category+ and the new +:value+
  def set(args)
    category = args[:category]
    value = args[:value]
    @propabilities[category.id] = value
  end

  # Multiplies the probability of one category, or of all categories when
  # no +:category+ is given, by +:factor+.
  #
  # @param args [Hash] +:factor+ and optionally +:category+
  def multiply(args)
    category = args[:category]
    factor = args[:factor]
    if category
      @propabilities[category.id] = find(category) * factor
    else
      @propabilities.keys.each { |id| @propabilities[id] *= factor }
    end
  end

  # @return [Object, nil] the category with the highest probability; on a
  #   tie the one with the lowest id. nil when no category has a
  #   probability greater than zero (NaN counts as "not greater").
  def max
    candidates = @categories.select { |category| usable?(find(category)) }
    candidates.max_by { |category| [find(category), -category.id] }
  end

  # @return [Numeric] sum of all stored probabilities (0 when empty)
  def sum
    @propabilities.values.reduce(0, :+)
  end

  private

  # True for a number that can win +max+: numeric and strictly positive.
  def usable?(value)
    value.is_a?(Numeric) && value > 0
  end
end
@@ -0,0 +1,32 @@
1
# A text to classify, given directly, loaded from a file, or both.
class Text
  attr_reader :words

  # @param args [Hash] options:
  #   +:text+ the text itself (defaults to an empty string),
  #   +:path+ optional file whose content is appended to +:text+,
  #   +:classifier+ object responding to +get_category_for(words)+
  #   (defaults to a new TextClassifier without categories)
  def initialize(args)
    # TextClassifier#initialize takes an options hash, so pass an empty one.
    @classifier = args[:classifier] || TextClassifier.new({})

    # If both text and path are given the two are joined with a newline, so
    # the last word of the text cannot merge with the first word of the file.
    parts = [args[:text] || ""]
    parts << load_text(args[:path]) if args[:path]
    @text = parts.reject(&:empty?).join("\n")

    # A text starting with a separator would otherwise yield an empty "word".
    @words = @text.split(/\W+/).reject(&:empty?)
  end

  # @return [String] the first 50 characters of the text
  def sample
    @text.slice(0, 50)
  end

  # @return [Object] whatever the classifier returns for this text's words
  def classify
    @classifier.get_category_for(words)
  end

  private

  # Reads the file at +path+. On failure a hint is printed and an empty
  # string is returned, so construction continues with the remaining text.
  def load_text(path)
    File.read(path)
  rescue SystemCallError, IOError
    puts "You tried to load the file #{path} for classification. This file was not found.\n" \
         "Please make sure, that the path is correctly spelled and that you have reading-access to the path given"
    ""
  end
end
@@ -0,0 +1,12 @@
1
# Picks the most probable category for a list of words.
class TextClassifier
  # @param args [Hash] options (may be omitted):
  #   +:categories+ the categories to choose from,
  #   +:calculator+ object responding to +get_propabilities_for(words)+
  #   (defaults to a PropabilityCalculator for the given categories)
  def initialize(args = {})
    @categories = args[:categories]
    @calculator = args[:calculator] || PropabilityCalculator.new(categories: @categories)
  end

  # @param list_of_words [Array<String>] the words of the text to classify
  # @return [Object] the +max+ of the probabilities returned by the calculator
  def get_category_for(list_of_words)
    propabilities = @calculator.get_propabilities_for(list_of_words)
    propabilities.max
  end
end
@@ -0,0 +1,3 @@
1
module NaiveText
  # Version of the gem; frozen so the constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
metadata ADDED
@@ -0,0 +1,134 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: NaiveText
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - RicciFlowing
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2015-10-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.8'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.8'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: guard
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: guard-rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: guard-rubocop
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: Sort texts based on expample texts in predefined categories
84
+ email:
85
+ - benjamin@mathe-sellin.de
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files:
90
+ - ".gitignore"
91
+ - ".rspec"
92
+ - ".travis.yml"
93
+ - Gemfile
94
+ - Guardfile
95
+ - NaiveText.gemspec
96
+ - README.md
97
+ - Rakefile
98
+ - bin/console
99
+ - bin/setup
100
+ - lib/NaiveText.rb
101
+ - lib/NaiveText/Categories.rb
102
+ - lib/NaiveText/CategoriesFactory.rb
103
+ - lib/NaiveText/Category.rb
104
+ - lib/NaiveText/ExamplesGroup.rb
105
+ - lib/NaiveText/PropabilityCalculator.rb
106
+ - lib/NaiveText/PropabilityCollection.rb
107
+ - lib/NaiveText/Text.rb
108
+ - lib/NaiveText/TextClassifier.rb
109
+ - lib/NaiveText/version.rb
110
+ homepage: https://github.com/RicciFlowing/NaiveText
111
+ licenses: []
112
+ metadata:
113
+ allowed_push_host: https://rubygems.org
114
+ post_install_message:
115
+ rdoc_options: []
116
+ require_paths:
117
+ - lib
118
+ required_ruby_version: !ruby/object:Gem::Requirement
119
+ requirements:
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ version: '0'
123
+ required_rubygems_version: !ruby/object:Gem::Requirement
124
+ requirements:
125
+ - - ">="
126
+ - !ruby/object:Gem::Version
127
+ version: '0'
128
+ requirements: []
129
+ rubyforge_project:
130
+ rubygems_version: 2.4.6
131
+ signing_key:
132
+ specification_version: 4
133
+ summary: A NaiveText text classifier
134
+ test_files: []