fisher_classifier 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/README.md +134 -12
- data/README_EN.md +29 -0
- data/examples/classifier_initializer.rb +39 -0
- data/lib/fisher_classifier.rb +8 -15
- data/lib/fisher_classifier/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
OWYzNGVmOGRhZGYzYjg5ZGQ3ZGU5OGY2NWI3YTk1MjllM2IyZDNkYw==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
YWRlNDQxZWE0M2E2ZjdiNTA1NWM2YTE0ZDhhM2MyNDQxZWEwNzJlMQ==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
ODJiMjE4NjNiOWVjOTc5NjQyY2M0MGY1ZTJlOTZkNjNhNTc3YmQxMTI5OGQx
|
10
|
+
NGU2ZTcxZjI3YTM0YjI2NjY0NWJmMGUwODllZDEyNDYzZDNlYzhlMDFmMWRi
|
11
|
+
YTI0MzYyOWM4N2JlOTIzZDUwNTU3YWE0NDA5YWVhZjAwMTBlZWU=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ZTMyNDEyODYxYTAyYmExYzk0MDE1NDllMWU0NjNmM2RlMjFmMWFlMGI4YWNi
|
14
|
+
YjRiOGYzOWUwMTA4NWZlZDhlOGFlZGI2YjE2NWYwMzM0ZDFiZGNhYTFlMGFm
|
15
|
+
N2FiZmRlODIzNDg3NTY4ZDAzYzQ4OGEwYWI2ZTM0MWEwMjA5OTY=
|
data/README.md
CHANGED
@@ -1,29 +1,151 @@
|
|
1
1
|
# FisherClassifier
|
2
2
|
|
3
|
-
|
3
|
+
Реализация статистического классификатора документов на основе линейного дискриминанта Фишера.
|
4
4
|
|
5
|
-
|
5
|
+
Предоставляет прозрачный DSL для конфигурирования с возможностью определить:
|
6
6
|
|
7
|
-
|
7
|
+
* набор категорий;
|
8
|
+
* способ определения признаков;
|
9
|
+
* коэффициент для подсчета взвешенной вероятности;
|
10
|
+
* минимальный порог для определения принадлежности к категории;
|
11
|
+
* любое хранилище статистики.
|
12
|
+
|
13
|
+
Подробнее с теорией:
|
14
|
+
* [Байесовский классификатор](http://www.machinelearning.ru/wiki/index.php?title=%D0%91%D0%B0%D0%B9%D0%B5%D1%81%D0%BE%D0%B2%D1%81%D0%BA%D0%B8%D0%B9_%D0%BA%D0%BB%D0%B0%D1%81%D1%81%D0%B8%D1%84%D0%B8%D0%BA%D0%B0%D1%82%D0%BE%D1%80)
|
15
|
+
* [Линейный дискриминант Фишера](http://www.machinelearning.ru/wiki/index.php?title=%D0%9B%D0%B8%D0%BD%D0%B5%D0%B9%D0%BD%D1%8B%D0%B9_%D0%B4%D0%B8%D1%81%D0%BA%D1%80%D0%B8%D0%BC%D0%B8%D0%BD%D0%B0%D0%BD%D1%82_%D0%A4%D0%B8%D1%88%D0%B5%D1%80%D0%B0)
|
16
|
+
|
17
|
+
## Установка
|
18
|
+
|
19
|
+
Добавить в Gemfile:
|
8
20
|
|
9
21
|
gem 'fisher_classifier'
|
10
22
|
|
11
|
-
|
23
|
+
Выполнить:
|
12
24
|
|
13
25
|
$ bundle
|
14
26
|
|
15
|
-
|
27
|
+
Или поставить как гем:
|
16
28
|
|
17
29
|
$ gem install fisher_classifier
|
18
30
|
|
19
|
-
##
|
31
|
+
## Try Before You Buy™
|
32
|
+
|
33
|
+
Попробовать можно в консоли, вот так:
|
34
|
+
|
35
|
+
$ irb
|
36
|
+
|
37
|
+
1.9.3-p448 :002 > require 'fisher_classifier'
|
38
|
+
1.9.3-p448 :003 > cl = FisherClassifier.create_in_memory
|
39
|
+
1.9.3-p448 :005 > cl.train('Nobody owns the water.','good')
|
40
|
+
=> ["Nobody", "owns", "the", "water."]
|
41
|
+
1.9.3-p448 :006 > cl.train('the quick rabbit jumps fences','good')
|
42
|
+
=> ["the", "quick", "rabbit", "jumps", "fences"]
|
43
|
+
1.9.3-p448 :007 > cl.train('buy pharmaceuticals now','bad')
|
44
|
+
=> ["buy", "pharmaceuticals", "now"]
|
45
|
+
1.9.3-p448 :008 > cl.train('make quick money at the online casino','bad')
|
46
|
+
=> ["make", "quick", "money", "at", "the", "online", "casino"]
|
47
|
+
1.9.3-p448 :009 > cl.train('the quick brown fox jumps','good')
|
48
|
+
=> ["the", "quick", "brown", "fox", "jumps"]
|
49
|
+
1.9.3-p448 :015 > cl.train('online trading with forex','bad')
|
50
|
+
=> ["online", "trading", "with", "forex"]
|
51
|
+
1.9.3-p448 :008 > cl.classify('the quick money with forex now')
|
52
|
+
=> :bad
|
53
|
+
1.9.3-p448 :009 > cl.classify('quck mouse runs from fox')
|
54
|
+
=> :good
|
55
|
+
|
56
|
+
В данном примере в качестве хранилища используется оперативная память.
|
57
|
+
|
58
|
+
## DSL
|
59
|
+
|
60
|
+
### Определение признаков
|
61
|
+
|
62
|
+
```ruby
|
63
|
+
get_features do |text|
|
64
|
+
# Выделить набор признаков из текста
|
65
|
+
end
|
66
|
+
```
|
67
|
+
|
68
|
+
### Обучение
|
69
|
+
|
70
|
+
```ruby
|
71
|
+
inc_feature do |feature, category|
|
72
|
+
# Увеличить счетчик кол-ва использований признака в категории
|
73
|
+
end
|
74
|
+
|
75
|
+
inc_category do |category|
|
76
|
+
# Увеличить счетчик кол-ва использований категории
|
77
|
+
end
|
78
|
+
```
|
79
|
+
|
80
|
+
### Классификация
|
81
|
+
|
82
|
+
```ruby
|
83
|
+
# Предполагаемая вероятность (Вероятность признака, если он ни разу не появлялся)
|
84
|
+
assumed_prob 0.4
|
85
|
+
|
86
|
+
# Порог. Минимальное значение вероятности принадлежности текста к категории
|
87
|
+
fisher_threshold 0.1
|
88
|
+
|
89
|
+
categories do
|
90
|
+
# Возможные категории
|
91
|
+
end
|
92
|
+
|
93
|
+
category_count do |category|
|
94
|
+
# Кол-во использований категории
|
95
|
+
end
|
96
|
+
|
97
|
+
features_count do |feature, category|
|
98
|
+
# Кол-во использований признака в категории
|
99
|
+
end
|
100
|
+
|
101
|
+
default_category do
|
102
|
+
# Категория по умолчанию
|
103
|
+
end
|
104
|
+
```
|
105
|
+
|
106
|
+
## Rails (Active Record)
|
107
|
+
|
108
|
+
Миграция (db/migrate/20131106143644_create_classifier_features.rb):
|
109
|
+
|
110
|
+
```ruby
|
111
|
+
class CreateClassifierFeatures < ActiveRecord::Migration
|
112
|
+
def change
|
113
|
+
create_table :classifier_features do |t|
|
114
|
+
t.string :name
|
115
|
+
t.string :category
|
116
|
+
t.integer :count, default: 1
|
117
|
+
end
|
118
|
+
end
|
119
|
+
end
|
120
|
+
```
|
121
|
+
|
122
|
+
Модель (app/models/classifier_feature.rb):
|
123
|
+
|
124
|
+
```ruby
|
125
|
+
class ClassifierFeature < ActiveRecord::Base
|
126
|
+
validates :category, presence: true
|
127
|
+
validates :name, presence: true, uniqueness: {:scope => :category}
|
128
|
+
|
129
|
+
def self.categories
|
130
|
+
[:good, :bad]
|
131
|
+
end
|
132
|
+
end
|
133
|
+
```
|
134
|
+
|
135
|
+
Инициалайзер:
|
136
|
+
|
137
|
+
[config/initializers/classifier.rb](https://github.com/Andrew8xx8/fisher_classifier/blob/master/examples/classifier_initializer.rb)
|
138
|
+
|
139
|
+
Использование:
|
20
140
|
|
21
|
-
|
141
|
+
$ rails c
|
142
|
+
1.9.3-p448 :009 > Classifier.train('the quick brown fox jumps', :good)
|
143
|
+
1.9.3-p448 :009 > Classifier.classify('the quick brown fox jumps')
|
22
144
|
|
23
|
-
##
|
145
|
+
## Если хочется что-то исправить
|
24
146
|
|
25
|
-
1.
|
26
|
-
2.
|
27
|
-
3.
|
28
|
-
4.
|
147
|
+
1. Форкни
|
148
|
+
2. Зафигач фиче-ветку (`git checkout -b my-new-feature`)
|
149
|
+
3. Коммить изменения (`git commit -am 'Add some feature'`)
|
150
|
+
4. Пуш ветку (`git push origin my-new-feature`)
|
29
151
|
5. Create new Pull Request
|
data/README_EN.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# FisherClassifier
|
2
|
+
|
3
|
+
TODO: Write a gem description
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'fisher_classifier'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install fisher_classifier
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
TODO: Write usage instructions here
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
@@ -0,0 +1,39 @@
|
|
1
|
+
Classifier = FisherClassifier.create do
|
2
|
+
assumed_prob 0.4
|
3
|
+
fisher_threshold 0.1
|
4
|
+
|
5
|
+
inc_feature do |feature, category|
|
6
|
+
feature = ClassifierFeature.find_or_initialize_by(name: feature, category: category)
|
7
|
+
feature.count += 1 if feature
|
8
|
+
feature.save
|
9
|
+
end
|
10
|
+
|
11
|
+
get_features do |text|
|
12
|
+
if text
|
13
|
+
text.to_s.split(' ').map { |s| s.downcase }
|
14
|
+
else
|
15
|
+
[]
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
categories do
|
20
|
+
ClassifierFeature.categories
|
21
|
+
end
|
22
|
+
|
23
|
+
category_count do |category|
|
24
|
+
ClassifierFeature.where(category: category).count
|
25
|
+
end
|
26
|
+
|
27
|
+
features_count do |feature, category|
|
28
|
+
f = ClassifierFeature.find_by(name: feature, category: category)
|
29
|
+
if f
|
30
|
+
f.count
|
31
|
+
else
|
32
|
+
0
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
default_category do
|
37
|
+
"bad"
|
38
|
+
end
|
39
|
+
end
|
data/lib/fisher_classifier.rb
CHANGED
@@ -19,24 +19,17 @@ module FisherClassifier
|
|
19
19
|
|
20
20
|
inc_feature do |feature, category|
|
21
21
|
@features[category] ||= {}
|
22
|
-
|
23
|
-
|
24
|
-
@features[category][feature] += 1
|
25
|
-
else
|
26
|
-
@features[category][feature] = 1
|
27
|
-
end
|
22
|
+
@features[category][feature] ||= 0
|
23
|
+
@features[category][feature] += 1
|
28
24
|
end
|
29
25
|
|
30
26
|
inc_category do |category|
|
31
|
-
|
32
|
-
|
33
|
-
else
|
34
|
-
@categories[category] = 1
|
35
|
-
end
|
27
|
+
@categories[category] ||= 0
|
28
|
+
@categories[category] += 1
|
36
29
|
end
|
37
30
|
|
38
31
|
get_features do |text|
|
39
|
-
text.split(' ')
|
32
|
+
text.split(' ').map { |s| s.downcase }
|
40
33
|
end
|
41
34
|
|
42
35
|
categories do
|
@@ -44,8 +37,8 @@ module FisherClassifier
|
|
44
37
|
end
|
45
38
|
|
46
39
|
category_count do |category|
|
47
|
-
if @
|
48
|
-
@categories[category]
|
40
|
+
if @categories.has_key?(category)
|
41
|
+
@categories[category]
|
49
42
|
else
|
50
43
|
0
|
51
44
|
end
|
@@ -53,7 +46,7 @@ module FisherClassifier
|
|
53
46
|
|
54
47
|
features_count do |feature, category|
|
55
48
|
if @features.has_key?(category) && @features[category].has_key?(feature)
|
56
|
-
@features[category][feature]
|
49
|
+
@features[category][feature]
|
57
50
|
else
|
58
51
|
0
|
59
52
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fisher_classifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew8xx8
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-12-
|
11
|
+
date: 2013-12-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -64,7 +64,9 @@ files:
|
|
64
64
|
- Gemfile
|
65
65
|
- LICENSE.txt
|
66
66
|
- README.md
|
67
|
+
- README_EN.md
|
67
68
|
- Rakefile
|
69
|
+
- examples/classifier_initializer.rb
|
68
70
|
- fisher_classifier.gemspec
|
69
71
|
- lib/fisher_classifier.rb
|
70
72
|
- lib/fisher_classifier/classifier.rb
|