yanbi-ml 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: dd991aa7bffd5dd520fc71f40954b3603c8390a0
4
- data.tar.gz: ff8d4ae0a916964c018d325f5c07ca75bcd5dc3f
3
+ metadata.gz: c5a2754ccc8b5d063fe67e47a3a1794195ba6e2b
4
+ data.tar.gz: 23a09668b5f69f61a9889eb0b59d285aa8ddf680
5
5
  SHA512:
6
- metadata.gz: 8ee84e2f9a7c28dba34ab09af791f5be93b019317b2839597f40a55b3e88a1795e0e8474e6615a9c8dcfb708c3d72fac64920136c495e24c5c0a575233770b84
7
- data.tar.gz: 2f874ba50d8f5538b3941be0f70cedfc5a0936a810d2cba0ac0a598e86b4dc220a9bc5c561affd701c4d020fd80a3a41cfc3ac1338bcfaa9dc573a233544227f
6
+ metadata.gz: 41c3ab772fd3bb90de3860d6833bac4a08c91bd4f0b5ae43e8db39a6ed281cd1b5e379ad74271346e9dfee3ae76d95e01fcfae27640d0dfe5d0804904a4a8e64
7
+ data.tar.gz: a93add553c7928a0eb8f34f25fedd6309479a3981dbd8e6a838708a96af824ee8b436879950223ce27d14a05be339a172fc10e1b0e570acecf8487d778c2aa23
data/README.md CHANGED
@@ -181,6 +181,33 @@ docs.each_doc do |d|
181
181
  end
182
182
  ```
183
183
 
184
+ There's also a single, global bag of words that contains all of the words seen in every document in the corpus. This is accessed (surprisingly) through the 'all' attribute.
185
+
186
+ ```ruby
187
+ # Non unique global list of words
188
+ docs.all.words
189
+
190
+ # Unique global list of words
191
+ docs.all.words.uniq
192
+ ```
193
+
194
+ Note that this global word bag is updated whenever you remove words while iterating through documents with each_doc.
195
+
196
+
197
+ ## Dictionaries
198
+
199
+ Speaking of a global list of words, the corpus class also allows you to capture a snapshot of the unique list of words in a set of documents as a dictionary object. This object can then be used to encode strings as integer arrays of indices:
200
+
201
+ ```ruby
202
+ my_dictionary = docs.to_index
203
+
204
+ # Get an integer mapping of the words in this string
205
+ indices = my_dictionary.to_idx('the quick brown fox')
206
+ ```
207
+
208
+ Words not present in the dictionary are returned as nil. Encoding text as integers this way is useful for working with other types of classifiers that might not be capable of accepting straight text.
209
+
210
+
184
211
  ## Feature thresholds
185
212
 
186
213
  A method on the classifier is provided to prune infrequently seen features. This is often one of the first things recommended for improving the accuracy of a classifier in real world applications. Note that when you prune features, there's no un-pruning afterwards - so be sure you actually want to do it!
@@ -21,6 +21,7 @@ module Yanbi
21
21
 
22
22
  def initialize(klass=WordBag)
23
23
  @all = klass.new
24
+ @index = nil
24
25
  @docs = []
25
26
  @bags = []
26
27
  end
@@ -52,14 +53,39 @@ module Yanbi
52
53
  @bags << @all.class.new(doc)
53
54
  @all.add_text doc
54
55
  @docs << doc
56
+ @index = nil
55
57
  end
56
58
  end
57
59
 
58
60
  def each_doc
61
+ before = 0
62
+ after = 0
63
+
64
+ @bags.each do |bag, doc|
65
+ before += bag.words.count
66
+ yield bag, doc
67
+ after += bag.words.count
68
+ end
69
+
70
+ rebuild_all if before != after
71
+ end
72
+
73
# Returns the dictionary for this corpus. It is built from the unique
# words of the global bag on first use and kept in @index, so later calls
# return the same object until @index is cleared.
def to_index
  return @index unless @index.nil?

  @index = Yanbi::Dictionary.new(all.words.uniq, @all.class)
end
81
+
82
+ private
83
+
84
# Replaces the global bag with a fresh instance of the same class and
# refills it from the words currently held by every per-document bag.
# NOTE(review): words are re-joined and passed through add_text again;
# confirm this is safe for bag classes that transform their input.
def rebuild_all
  @all = @all.class.new
  # The dictionary cached by to_index was built from the old word list;
  # drop it so the next to_index call reflects the rebuilt bag.
  @index = nil
  @bags.each { |bag| @all.add_text bag.words.join(' ') }
end
63
90
  end
64
-
65
91
  end
@@ -0,0 +1,40 @@
1
+ # Author:: Robert Dormer (mailto:rdormer@gmail.com)
2
+ # Copyright:: Copyright (c) 2016 Robert Dormer
3
+ # License:: MIT
4
+
5
+ # Class for storing word dictionaries created from wordbags
6
+ # and corpuses. Includes methods to encode strings as integer
7
+ # index arrays into the dictionary.
8
+
9
+ $: << File.dirname(__FILE__)
10
+ require 'yaml'
11
+
12
module Yanbi
  # Maps each word of a fixed vocabulary to an integer index and encodes
  # strings as arrays of those indices.
  class Dictionary
    # w     - array of words; each word is assigned its position in the
    #         array (a repeated word keeps its last position).
    # klass - bag class; klass.new(text).words is used to split text into
    #         words when encoding.
    def initialize(w, klass)
      @klass = klass
      @index = {}
      w.each_with_index { |word, position| @index[word] = position }
    end

    # The bag class used to tokenize text in to_idx. Backed by @klass so
    # that the accessor reflects (and can change) the class actually used,
    # and so that previously saved dictionaries keep their ivar layout.
    def bag_class
      @klass
    end

    def bag_class=(klass)
      @klass = klass
    end

    # Encodes doc as an array of dictionary indices, one per word produced
    # by the bag class. Words missing from the dictionary map to nil.
    def to_idx(doc)
      @klass.new(doc).words.map { |word| @index[word] }
    end

    # Reads a dictionary previously written by #save from "<fname>.yml".
    #
    # Raises LoadError if the file does not deserialize to an instance of
    # this class.
    #
    # SECURITY: this deserializes arbitrary Ruby objects. Only load files
    # you wrote yourself; never point this at untrusted input.
    def self.load(fname)
      yaml = File.read(fname + '.yml')
      # Newer Psych makes YAML.load safe by default, which rejects the
      # serialized Dictionary; unsafe_load restores the old behaviour.
      c = YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(yaml) : YAML.load(yaml)
      raise LoadError, "#{fname}.yml does not contain a #{name}" unless c.is_a? self
      c
    end

    # Serializes this dictionary as YAML to "<name>.yml".
    def save(name)
      File.open(name + '.yml', 'w') do |out|
        YAML.dump(self, out)
      end
    end
  end
end
@@ -3,5 +3,5 @@
3
3
  # License:: MIT
4
4
 
5
5
  module Yanbi
6
- VERSION = "0.2.4"
6
+ VERSION = "0.3.0"
7
7
  end
@@ -13,5 +13,7 @@ Dir[base + "/bayes/**/*.rb"].each do |c|
13
13
  require c
14
14
  end
15
15
 
16
+ require 'dictionary'
16
17
  require 'corpus'
17
18
  require 'version'
19
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: yanbi-ml
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Robert Dormer
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-07-31 00:00:00.000000000 Z
11
+ date: 2017-01-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -82,6 +82,7 @@ files:
82
82
  - lib/bayes/bayes.rb
83
83
  - lib/bayes/fisher.rb
84
84
  - lib/corpus.rb
85
+ - lib/dictionary.rb
85
86
  - lib/version.rb
86
87
  - lib/wordbags/diadbag.rb
87
88
  - lib/wordbags/stembag.rb