yanbi-ml 0.2.4 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +27 -0
- data/lib/corpus.rb +28 -2
- data/lib/dictionary.rb +40 -0
- data/lib/version.rb +1 -1
- data/lib/yanbi.rb +2 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c5a2754ccc8b5d063fe67e47a3a1794195ba6e2b
|
4
|
+
data.tar.gz: 23a09668b5f69f61a9889eb0b59d285aa8ddf680
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 41c3ab772fd3bb90de3860d6833bac4a08c91bd4f0b5ae43e8db39a6ed281cd1b5e379ad74271346e9dfee3ae76d95e01fcfae27640d0dfe5d0804904a4a8e64
|
7
|
+
data.tar.gz: a93add553c7928a0eb8f34f25fedd6309479a3981dbd8e6a838708a96af824ee8b436879950223ce27d14a05be339a172fc10e1b0e570acecf8487d778c2aa23
|
data/README.md
CHANGED
@@ -181,6 +181,33 @@ docs.each_doc do |d|
|
|
181
181
|
end
|
182
182
|
```
|
183
183
|
|
184
|
+
There's also a single, global bag of words that contains all of the words seen in every document in the corpus. This is accessed (surprisingly) through the 'all' attribute.
|
185
|
+
|
186
|
+
```ruby
|
187
|
+
# Non unique global list of words
|
188
|
+
docs.all.words
|
189
|
+
|
190
|
+
# Unique global list of words
|
191
|
+
docs.all.words.uniq
|
192
|
+
```
|
193
|
+
|
194
|
+
Note that this global word bag is updated whenever you remove words while iterating through documents with each_doc.
|
195
|
+
|
196
|
+
|
197
|
+
## Dictionaries
|
198
|
+
|
199
|
+
Speaking of a global list of words, the corpus class also allows you to capture a snapshot of the unique list of words in a set of documents as a dictionary object. This object can then be used to encode strings as integer arrays of indices:
|
200
|
+
|
201
|
+
```ruby
|
202
|
+
my_dictionary = docs.to_index
|
203
|
+
|
204
|
+
# Get an integer mapping of the words in this string
|
205
|
+
indices = my_dictionary.to_idx('the quick brown fox')
|
206
|
+
```
|
207
|
+
|
208
|
+
Words not present in the dictionary are returned as nil. This is useful for working with other types of classifiers that might not be capable of accepting straight text.
|
209
|
+
|
210
|
+
|
184
211
|
## Feature thresholds
|
185
212
|
|
186
213
|
A method on the classifier is provided to prune infrequently seen features. This is often one of the first things recommended for improving the accuracy of a classifier in real world applications. Note that when you prune features, there's no un-pruning afterwards - so be sure you actually want to do it!
|
data/lib/corpus.rb
CHANGED
@@ -21,6 +21,7 @@ module Yanbi
|
|
21
21
|
|
22
22
|
# Sets up an empty corpus.
#
# klass - bag class, instantiated once with no arguments to hold the
#         corpus-wide bag of words (defaults to WordBag).
def initialize(klass=WordBag)
  # Per-document storage; both lists start out empty.
  @docs, @bags = [], []

  # No dictionary has been built yet; to_index creates it on demand.
  @index = nil

  # Global bag of words for the whole corpus.
  @all = klass.new
end
|
@@ -52,14 +53,39 @@ module Yanbi
|
|
52
53
|
@bags << @all.class.new(doc)
|
53
54
|
@all.add_text doc
|
54
55
|
@docs << doc
|
56
|
+
@index = nil
|
55
57
|
end
|
56
58
|
end
|
57
59
|
|
58
60
|
# Yields every document in the corpus to the given block as (bag, doc):
# the document's word bag and the matching entry of @docs (nil when @docs
# has no entry at that position).
#
# The block may remove words from a bag. The total word count across all
# bags is compared before and after the iteration; if it changed, the
# corpus-wide bag is rebuilt via rebuild_all.
#
# NOTE(review): pairing relies on @bags and @docs being appended in step,
# as the visible add code does - confirm for any other code path.
def each_doc
  before = 0
  after = 0

  # @bags holds plain bag objects, so destructuring an element as
  # |bag, doc| always produced a nil doc. Pair each bag with its source
  # document explicitly instead.
  @bags.zip(@docs).each do |bag, doc|
    before += bag.words.count
    yield bag, doc
    after += bag.words.count
  end

  rebuild_all if before != after
end
|
72
|
+
|
73
|
+
# Returns a Yanbi::Dictionary built from the unique words of the
# corpus-wide bag. The dictionary is created on first use and the same
# object is handed back until @index is reset to nil.
def to_index
  return @index unless @index.nil?

  vocabulary = all.words.uniq
  @index = Yanbi::Dictionary.new(vocabulary, @all.class)
end
|
81
|
+
|
82
|
+
private
|
83
|
+
|
84
|
+
# Rebuilds the corpus-wide bag (@all) from scratch out of the words
# currently held by each per-document bag.
def rebuild_all
  # The memoized dictionary was derived from the old global bag; drop it
  # so the next to_index call reflects the rebuilt word list instead of
  # returning a stale snapshot.
  @index = nil

  @all = @all.class.new
  @bags.each do |bag|
    @all.add_text bag.words.join(' ')
  end
end
|
63
90
|
end
|
64
|
-
|
65
91
|
end
|
data/lib/dictionary.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
# Author:: Robert Dormer (mailto:rdormer@gmail.com)
|
2
|
+
# Copyright:: Copyright (c) 2016 Robert Dormer
|
3
|
+
# License:: MIT
|
4
|
+
|
5
|
+
# Class for storing word dictionaries created from wordbags
|
6
|
+
# and corpuses. Includes methods to encode strings as integer
|
7
|
+
# index arrays into the dictionary.
|
8
|
+
|
9
|
+
$: << File.dirname(__FILE__)
|
10
|
+
require 'yaml'
|
11
|
+
|
12
|
+
module Yanbi
  # Snapshot of a vocabulary that maps each word to an integer index, plus
  # the bag class used to split text into words when encoding strings.
  class Dictionary
    # Builds the word => index table.
    #
    # w     - Array of words. A word's position in the array becomes its
    #         index; if a word occurs more than once, its last position wins.
    # klass - bag class. It is instantiated with a string and must expose
    #         the resulting words through #words.
    def initialize(w, klass)
      @index = {}
      @klass = klass
      w.each_with_index { |word, i| @index[word] = i }
    end

    # Returns the bag class used by #to_idx.
    #
    # Backed by @klass rather than @bag_class so that the instance variable
    # names written by #save stay the same as before.
    def bag_class
      @klass
    end

    # Replaces the bag class used by #to_idx.
    def bag_class=(klass)
      @klass = klass
    end

    # Encodes a string as an array of dictionary indices.
    #
    # doc - String to encode; it is tokenized by the bag class.
    #
    # Returns one entry per word produced by the bag: the word's index, or
    # nil for words that are not in the dictionary.
    def to_idx(doc)
      bag = @klass.new(doc)
      bag.words.map { |word| @index[word] }
    end

    # Loads a dictionary previously written by #save.
    #
    # fname - path of the file without the '.yml' extension.
    #
    # Raises LoadError if the file does not deserialize to a Dictionary.
    #
    # SECURITY: this deserializes arbitrary Ruby objects from the file.
    # Only load dictionaries from trusted sources.
    def self.load(fname)
      text = File.read(fname + '.yml')

      # Since Psych 4, YAML.load is the safe loader and refuses to
      # instantiate custom classes such as this one; fall back to
      # YAML.load only on older versions that lack unsafe_load.
      c = YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(text) : YAML.load(text)

      raise LoadError, "#{fname}.yml does not contain a #{self.name}" unless c.is_a? self
      c
    end

    # Writes this dictionary as YAML to "<name>.yml".
    def save(name)
      File.open(name + '.yml', 'w') do |out|
        YAML.dump(self, out)
      end
    end
  end
end
|
data/lib/version.rb
CHANGED
data/lib/yanbi.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yanbi-ml
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Robert Dormer
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-01-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -82,6 +82,7 @@ files:
|
|
82
82
|
- lib/bayes/bayes.rb
|
83
83
|
- lib/bayes/fisher.rb
|
84
84
|
- lib/corpus.rb
|
85
|
+
- lib/dictionary.rb
|
85
86
|
- lib/version.rb
|
86
87
|
- lib/wordbags/diadbag.rb
|
87
88
|
- lib/wordbags/stembag.rb
|