n_grams_generator 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,124 @@
1
+ #encoding:utf-8
2
+
3
# Generates n-grams from a sequence of tokens. A token may be a single value
# or an Enumerable of alternative values (for example a word together with
# its part-of-speech tag); in the latter case every combination of
# alternatives is emitted as a separate n-gram.
class NGramsGenerator

  attr_reader :all_ngrams, :ns, :auto_clear, :no_count

  # Creates NGramsGenerator object.
  #
  # @param [Number, Array<Number>] ns the lengths of generated n-grams
  # @param [Hash] options the additional options
  # @option options [true, false] :no_count keep every generated n-gram in a
  #   list (per length) instead of counting occurrences
  # @option options [true, false] :auto_clear forget previously generated
  #   n-grams at the start of every #ngrams call with non-empty data
  def initialize(ns, options={})
    @ns = ns.kind_of?(Enumerable) ? ns : [ns]

    @auto_clear = options[:auto_clear] || false
    @no_count = options[:no_count] || false

    @all_ngrams = empty_store
  end

  # Generates n-grams according to lengths specified during construction and
  # adds them to the n-grams collected by earlier calls.
  #
  #   NGramsGenerator.new(3).ngrams(['Alice', 'has', 'a', 'cat'])
  #   # => {
  #     3 => { ['Alice', 'has', 'a'] => 1, ['has', 'a', 'cat'] => 1 }
  #   }
  #   NGramsGenerator.new([2, 3], :no_count => true).ngrams(['Alice', 'has', 'a', 'cat'])
  #   # => {
  #     2 => [ ['Alice', 'has'], ['has', 'a'], ['a', 'cat'] ],
  #     3 => [ ['Alice', 'has', 'a'], ['has', 'a', 'cat'] ]
  #   }
  #   NGramsGenerator.new(2).ngrams([['very', :adv], ['nice', :adj], ['job', :noun]])
  #   # => {
  #     2 => {
  #       ["very", "nice"] => 1,
  #       [:adv, "nice"] => 1,
  #       ["very", :adj] => 1,
  #       [:adv, :adj] => 1,
  #       ["nice", "job"] => 1,
  #       [:adj, "job"] => 1,
  #       ["nice", :noun] => 1,
  #       [:adj, :noun] => 1
  #     }
  #   }
  #
  # @param [Array<Object>, Array<Array<Object>>] data
  # @return [Hash] the generated ngrams by n-number; when data is nil or
  #   empty the currently stored n-grams are returned untouched
  def ngrams(data)
    return @all_ngrams if data.nil? || data.empty?

    clear_ngrams if @auto_clear

    # Normalise every token to a collection of alternatives.
    tokens = data.map { |item| item.kind_of?(Enumerable) ? item : [item] }

    new_ngrams = {}
    flat_ngrams(tokens).each do |n, windows|
      new_ngrams[n] = windows.flat_map { |window| multiply_ngram(window) }
    end

    @all_ngrams = add_new_ngrams(new_ngrams)
  end

  # Converts n-grams grouped by length into occurrence counts.
  #
  # @param [Hash] ngrams n-grams by length; each value is either a list of
  #   n-grams or an already counted Hash (which is returned as a copy)
  # @return [Hash] for every configured length a Hash of n-gram => count
  def count_ngrams(ngrams=@all_ngrams)
    @ns.inject({}) do |counts, n|
      stored = ngrams[n] || []
      # In counting mode the stored value is already a Hash of counts;
      # counting its [ngram, count] pairs again would be meaningless.
      counts[n] = stored.kind_of?(Hash) ? stored.dup : count_array_elements(stored)
      counts
    end
  end

  # Forgets all collected n-grams. A fresh store is installed instead of
  # emptying the old one, so results already handed out by #ngrams keep
  # their content.
  def clear_ngrams
    @all_ngrams = empty_store
  end

  private

  # Builds the top-level store. Reading a length that has no entry yields an
  # empty array; it is frozen because that single object is shared by all
  # missing keys.
  def empty_store
    Hash.new([].freeze)
  end

  # Slices the token list into consecutive windows for every configured
  # length. Lengths that produce no window get no entry in the result.
  def flat_ngrams(data)
    @ns.inject({}) do |by_length, n|
      windows = data.each_cons(n).to_a
      by_length[n] = windows unless windows.empty?
      by_length
    end
  end

  # Expands a window of tokens into every combination of their alternatives
  # (Cartesian product), each combination exactly once. The product is taken
  # over the reversed window so that the first position varies fastest. A
  # window containing a token without any alternative yields no n-grams.
  def multiply_ngram(ngram)
    slowest, *faster = ngram.map { |token| token.to_a }.reverse
    slowest.product(*faster).map { |combination| combination.reverse }
  end

  # Merges freshly generated n-grams into the stored ones: lists are
  # concatenated in :no_count mode, otherwise occurrence counts are summed.
  def add_new_ngrams(ngrams)
    if @no_count
      @all_ngrams.merge(ngrams) { |_n, stored, fresh| stored + fresh }
    else
      @all_ngrams.merge(count_ngrams(ngrams)) do |_n, stored, fresh|
        stored.merge(fresh) { |_ngram, c1, c2| c1 + c2 }
      end
    end
  end

  # Counts occurrences of each element; unseen elements read as 0.
  def count_array_elements(ary)
    ary.inject(Hash.new(0)) { |counts, elem| counts[elem] += 1; counts }
  end

end
@@ -0,0 +1,114 @@
1
+ require File.expand_path(File.dirname(__FILE__)) + '/../lib/n_grams_generator'
2
+ require 'test/unit'
3
+
4
# Unit tests for NGramsGenerator: construction, flat and nested input,
# multiple n-gram lengths, counting, accumulation across calls and the
# :auto_clear option.
class TestNGramsGenerator < Test::Unit::TestCase

  # A single number is wrapped into a one-element list of lengths.
  def test_initialization_with_number
    n = 2
    ngram_generator = NGramsGenerator.new(n)

    assert_kind_of Array, ngram_generator.ns
    assert_equal [n], ngram_generator.ns
  end

  # A list of lengths is accepted and kept as given.
  def test_initialization_with_array
    n = 3
    ngram_generator = nil

    assert_nothing_raised { ngram_generator = NGramsGenerator.new([n]) }
    assert_equal [n], ngram_generator.ns

    ns = [1, 2, 3]

    assert_nothing_raised { ngram_generator = NGramsGenerator.new(ns) }
    assert_equal ns, ngram_generator.ns
  end

  # Plain (non-nested) tokens produce plain n-grams.
  def test_flat_input
    bigram_generator = NGramsGenerator.new(2, :no_count => true)

    expected_output = [['Ala', 'ma'], ['ma', 'kota']]
    input = %w{Ala ma kota}
    ngrams = bigram_generator.ngrams(input)

    assert_kind_of Hash, ngrams
    assert_equal expected_output, ngrams[2]
  end

  # One-element nested tokens behave like plain tokens.
  def test_nested_input
    bigram_generator = NGramsGenerator.new(2, :no_count => true)

    expected_output = [[1, 2], [2, 3], [3, 4]]
    input = [[1], [2], [3], [4]]
    ngrams = bigram_generator.ngrams(input)

    assert_equal expected_output, ngrams[2]
  end

  # Nested and plain tokens may be mixed; each alternative yields an n-gram.
  def test_simple_and_nested_input
    bigram_generator = NGramsGenerator.new(2, :no_count => true)

    expected_output = { 2 => [['Ala', 'ma'], [:subst, 'ma']] }
    input = [['Ala', :subst], 'ma']
    ngrams = bigram_generator.ngrams(input)

    assert_equal expected_output, ngrams
  end

  # A length greater than the input size yields an empty count table.
  def test_n_number_is_too_high_for_input_length
    ngram_generator = NGramsGenerator.new(5)

    expected_output = { 5 => {} }
    input = [1, 2, 3]
    ngrams = ngram_generator.ngrams(input)

    assert_equal expected_output, ngrams
  end

  # Every requested length gets its own entry, in the requested order.
  def test_multiple_n_values
    ns = [1, 2, 3]
    ngrams_generator = NGramsGenerator.new(ns, :no_count => true)

    expected_output = { 1 => [[1], [2], [3]], 2 => [[1, 2], [2, 3]], 3 => [[1, 2, 3]] }
    input = [1, 2, 3]
    ngrams = ngrams_generator.ngrams(input)

    assert_equal ns, ngrams.keys
    assert_equal expected_output, ngrams
  end

  # Repeated n-grams are counted.
  def test_counting_ngrams
    n = 2
    input = [1, 2, 2, 2, 3, 3, 3, 3, 3]
    ngrams = NGramsGenerator.new(n).ngrams(input)

    assert_equal 1, ngrams[n][[1, 2]]
    assert_equal 2, ngrams[n][[2, 2]]
    assert_equal 1, ngrams[n][[2, 3]]
    assert_equal 4, ngrams[n][[3, 3]]
  end

  # Without :auto_clear the counts accumulate across calls.
  def test_adding_ngrams
    n = 3
    ngram_generator = NGramsGenerator.new(n)
    input = [1, 2, 3, 4]

    3.times do |i|
      expected_output = { n => { [1, 2, 3] => (i + 1), [2, 3, 4] => (i + 1) } }
      ngrams = ngram_generator.ngrams(input)
      assert_equal expected_output, ngrams
    end
  end

  # With :auto_clear every call starts from an empty store.
  def test_auto_clear_option
    trigram_generator = NGramsGenerator.new(3, :auto_clear => true)
    input = [:a, :b, :c, :d]

    3.times do
      expected_output = { 3 => { [:a, :b, :c] => 1, [:b, :c, :d] => 1 } }
      ngrams = trigram_generator.ngrams(input)
      assert_equal expected_output, ngrams
    end
  end

end
metadata ADDED
@@ -0,0 +1,49 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: n_grams_generator
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - snukky
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-09-22 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: N-grams generator for multi-level strings, for example words and their
15
+ morphosyntactic descriptions.
16
+ email:
17
+ - snk987@gmail.com
18
+ executables: []
19
+ extensions: []
20
+ extra_rdoc_files: []
21
+ files:
22
+ - lib/n_grams_generator.rb
23
+ - test/test_n_grams_generator.rb
24
+ homepage: https://github.com/snukky/n_grams_generator
25
+ licenses: []
26
+ post_install_message:
27
+ rdoc_options: []
28
+ require_paths:
29
+ - lib
30
+ required_ruby_version: !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ! '>='
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ required_rubygems_version: !ruby/object:Gem::Requirement
37
+ none: false
38
+ requirements:
39
+ - - ! '>='
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ requirements: []
43
+ rubyforge_project:
44
+ rubygems_version: 1.8.10
45
+ signing_key:
46
+ specification_version: 3
47
+ summary: N-grams generator for multi-level strings.
48
+ test_files:
49
+ - test/test_n_grams_generator.rb