yanbi-ml 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +27 -0
- data/lib/corpus.rb +28 -2
- data/lib/dictionary.rb +40 -0
- data/lib/version.rb +1 -1
- data/lib/yanbi.rb +2 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c5a2754ccc8b5d063fe67e47a3a1794195ba6e2b
|
4
|
+
data.tar.gz: 23a09668b5f69f61a9889eb0b59d285aa8ddf680
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 41c3ab772fd3bb90de3860d6833bac4a08c91bd4f0b5ae43e8db39a6ed281cd1b5e379ad74271346e9dfee3ae76d95e01fcfae27640d0dfe5d0804904a4a8e64
|
7
|
+
data.tar.gz: a93add553c7928a0eb8f34f25fedd6309479a3981dbd8e6a838708a96af824ee8b436879950223ce27d14a05be339a172fc10e1b0e570acecf8487d778c2aa23
|
data/README.md
CHANGED
@@ -181,6 +181,33 @@ docs.each_doc do |d|
|
|
181
181
|
end
|
182
182
|
```
|
183
183
|
|
184
|
+
There's also a single, global bag of words that contains all of the words seen in every document in the corpus. This is accessed (surprisingly) through the 'all' attribute.
|
185
|
+
|
186
|
+
```ruby
|
187
|
+
# Non unique global list of words
|
188
|
+
docs.all.words
|
189
|
+
|
190
|
+
# Unique global list of words
|
191
|
+
docs.all.words.uniq
|
192
|
+
```
|
193
|
+
|
194
|
+
Note that this global word bag is updated whenever you remove words while iterating through documents with each_doc.
|
195
|
+
|
196
|
+
|
197
|
+
## Dictionaries
|
198
|
+
|
199
|
+
Speaking of a global list of words, the corpus class also allows you to capture a snapshot of the unique list of words in a set of documents as a dictionary object. This object can then be used to encode strings as integer arrays of indices:
|
200
|
+
|
201
|
+
```ruby
|
202
|
+
my_dictionary = docs.to_index
|
203
|
+
|
204
|
+
# Get an integer mapping of the words in this string
|
205
|
+
indices = my_dictionary.to_idx('the quick brown fox')
|
206
|
+
```
|
207
|
+
|
208
|
+
Words not present in the dictionary will be returned as nils. Encoding text as integer indices in this way is useful for working with other types of classifiers that might not be capable of accepting straight text.
|
209
|
+
|
210
|
+
|
184
211
|
## Feature thresholds
|
185
212
|
|
186
213
|
A method on the classifier is provided to prune infrequently seen features. This is often one of the first things recommended for improving the accuracy of a classifier in real world applications. Note that when you prune features, there's no un-pruning afterwards - so be sure you actually want to do it!
|
data/lib/corpus.rb
CHANGED
@@ -21,6 +21,7 @@ module Yanbi
|
|
21
21
|
|
22
22
|
# Sets up an empty corpus.
#
# klass - word bag class used for the global bag; later code also uses
#         @all.class to build the per-document bags and the dictionary.
def initialize(klass=WordBag)
  @all = klass.new   # global bag holding the words of every document
  @index = nil       # cached dictionary, built on demand by to_index
  @docs, @bags = [], []
end
|
@@ -52,14 +53,39 @@ module Yanbi
|
|
52
53
|
@bags << @all.class.new(doc)
|
53
54
|
@all.add_text doc
|
54
55
|
@docs << doc
|
56
|
+
@index = nil
|
55
57
|
end
|
56
58
|
end
|
57
59
|
|
58
60
|
# Yields every document's word bag together with its source text, then
# rebuilds the global bag if the block changed the size of any bag.
#
# Blocks that only take one argument (each_doc { |bag| ... }) keep working;
# the second argument is simply ignored by them.
#
# Fixes over the previous version:
# * @bags only holds bags, so destructuring each element as |bag, doc|
#   always produced doc == nil. The text is now taken from @docs.
# * Sizes are compared per bag instead of as one grand total, so a word
#   removed from one bag and added to another is no longer missed.
def each_doc
  changed = false

  # NOTE(review): assumes @bags and @docs are appended in lockstep, as the
  # visible document-adding code does. If some other path adds a bag without
  # a doc, the corresponding doc here is nil - confirm against the full file.
  @bags.zip(@docs).each do |bag, doc|
    size_before = bag.words.count
    yield bag, doc
    changed = true if bag.words.count != size_before
  end

  rebuild_all if changed
end
|
72
|
+
|
73
|
+
# Returns a dictionary of the unique words currently in the global bag.
# The dictionary is built on first use and the same object is returned
# until @index is reset to nil elsewhere in this class.
def to_index
  @index ||= Yanbi::Dictionary.new(all.words.uniq, @all.class)
end
|
81
|
+
|
82
|
+
private
|
83
|
+
|
84
|
+
# Recreates the global word bag from the current contents of every
# per-document bag.
#
# Also drops the cached dictionary: it was built from the old global bag,
# so keeping it would make to_index return words that no longer exist.
# Dictionary objects already handed out to callers are not affected.
def rebuild_all
  @all = @all.class.new
  @index = nil

  @bags.each do |bag|
    # each bag's words are re-added as space-joined text
    @all.add_text bag.words.join(' ')
  end
end
|
63
90
|
end
|
64
|
-
|
65
91
|
end
|
data/lib/dictionary.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
# Author:: Robert Dormer (mailto:rdormer@gmail.com)
|
2
|
+
# Copyright:: Copyright (c) 2016 Robert Dormer
|
3
|
+
# License:: MIT
|
4
|
+
|
5
|
+
# Class for storing word dictionaries created from wordbags
|
6
|
+
# and corpuses. Includes methods to encode strings as integer
|
7
|
+
# index arrays into the dictionary.
|
8
|
+
|
9
|
+
$: << File.dirname(__FILE__)
|
10
|
+
require 'yaml'
|
11
|
+
|
12
|
+
module Yanbi
  # Maps each word of a word list to its integer position and encodes
  # strings as arrays of those positions.
  class Dictionary
    # Builds the word => index table.
    #
    # w     - array of words; a word's index is its position in this array
    #         (if a word appears twice, the later position wins).
    # klass - object responding to new(text) whose result responds to
    #         words; used by to_idx to tokenize input.
    def initialize(w, klass)
      @index = {}
      @klass = klass
      w.each_with_index { |word, position| @index[word] = position }
    end

    # The bag class used to tokenize text in to_idx.
    #
    # These accessors are backed by @klass. They were previously generated
    # by attr_accessor, which used a separate @bag_class variable that
    # nothing else read or wrote, so the reader always returned nil and the
    # writer had no effect on encoding. @klass keeps its name so the
    # instance variable layout written by save is unchanged.
    def bag_class
      @klass
    end

    def bag_class=(klass)
      @klass = klass
    end

    # Encodes doc as an array of dictionary indices, one per word produced
    # by the bag class. Words missing from the dictionary become nil.
    def to_idx(doc)
      bag = @klass.new(doc)
      bag.words.map { |w| @index[w] }
    end

    # Loads a dictionary previously written by save from "<fname>.yml".
    #
    # Raises LoadError when the file does not contain a Dictionary.
    #
    # SECURITY: this deserializes arbitrary Ruby objects, so only load
    # files you trust. Full object loading is required because the file
    # holds a serialized Dictionary instance; under Psych 4 plain YAML.load
    # is a safe load and would reject it, hence unsafe_load when available.
    def self.load(fname)
      raw = File.read("#{fname}.yml")
      c = YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(raw) : YAML.load(raw)
      raise LoadError, "#{fname}.yml does not contain a #{name}" unless c.is_a? self
      c
    end

    # Writes this dictionary as YAML to "<name>.yml".
    def save(name)
      File.open("#{name}.yml", 'w') do |out|
        YAML.dump(self, out)
      end
    end
  end
end
|
data/lib/version.rb
CHANGED
data/lib/yanbi.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: yanbi-ml
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Robert Dormer
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-01-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -82,6 +82,7 @@ files:
|
|
82
82
|
- lib/bayes/bayes.rb
|
83
83
|
- lib/bayes/fisher.rb
|
84
84
|
- lib/corpus.rb
|
85
|
+
- lib/dictionary.rb
|
85
86
|
- lib/version.rb
|
86
87
|
- lib/wordbags/diadbag.rb
|
87
88
|
- lib/wordbags/stembag.rb
|