n_grams_generator 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,124 @@
1
+ #encoding:utf-8
2
+
3
# Generates n-grams from a sequence of tokens. A token may be a single
# object or an Enumerable of alternative "levels" (for example a word and its
# part-of-speech tag); for multi-level tokens every combination of levels is
# produced.
class NGramsGenerator

  attr_reader :all_ngrams, :ns, :auto_clear, :no_count

  # Creates NGramsGenerator object.
  #
  # @param [Number, Array<Number>] ns the lengths of generated n-grams
  # @param [Hash] options the additional options
  # @option options [true, false] :no_count store every generated n-gram in
  #   a list instead of counting occurrences
  # @option options [true, false] :auto_clear forget previously generated
  #   n-grams at the start of every call to #ngrams
  def initialize(ns, options={})
    @ns = ns.kind_of?(Enumerable) ? ns : [ns]

    @auto_clear = options[:auto_clear] || false
    @no_count = options[:no_count] || false

    @all_ngrams = empty_store
  end

  # Generates n-grams according to lengths specified during construction and
  # adds them to the n-grams collected by earlier calls (unless :auto_clear
  # is set).
  #
  #   NGramsGenerator.new(3).ngrams(['Alice', 'has', 'a', 'cat'])
  #   # => {
  #   #   3 => { ['Alice', 'has', 'a'] => 1, ['has', 'a', 'cat'] => 1 }
  #   # }
  #   NGramsGenerator.new([2, 3], :no_count => true).ngrams(['Alice', 'has', 'a', 'cat'])
  #   # => {
  #   #   2 => [ ['Alice', 'has'], ['has', 'a'], ['a', 'cat'] ],
  #   #   3 => [ ['Alice', 'has', 'a'], ['has', 'a', 'cat'] ]
  #   # }
  #   NGramsGenerator.new(2).ngrams([['very', :adv], ['nice', :adj], ['job', :noun]])
  #   # => {
  #   #   2 => {
  #   #     ["very", "nice"] => 1,
  #   #     [:adv, "nice"] => 1,
  #   #     ["very", :adj] => 1,
  #   #     [:adv, :adj] => 1,
  #   #     ["nice", "job"] => 1,
  #   #     [:adj, "job"] => 1,
  #   #     ["nice", :noun] => 1,
  #   #     [:adj, :noun] => 1
  #   #   }
  #   # }
  #
  # @param [Array<Object>, Array<Array<Object>>] data
  # @return [Hash] the generated ngrams by n-number
  def ngrams(data)
    # Clear first so that, with :auto_clear, empty input does not hand back
    # the n-grams of the previous call.
    clear_ngrams if @auto_clear

    return @all_ngrams if data.nil? || data.empty?

    # Normalize every token to an Enumerable of levels.
    tokens = data.map { |item| item.kind_of?(Enumerable) ? item : [item] }

    new_ngrams = {}
    flat_ngrams(tokens).each do |n, windows|
      new_ngrams[n] = windows.flat_map { |window| multiply_ngram(window) }
    end

    @all_ngrams = add_new_ngrams(new_ngrams)
  end

  # Counts occurrences of n-grams for every length in ns.
  #
  # Lists of n-grams (as stored with :no_count) are tallied; entries that
  # already are count hashes are returned as copies instead of being tallied
  # a second time.
  #
  # @param [Hash] ngrams the n-grams by n-number, the stored ones by default
  # @return [Hash] the counts of n-grams by n-number
  def count_ngrams(ngrams=@all_ngrams)
    @ns.inject({}) do |counts, n|
      stored = ngrams[n] || []
      counts[n] = stored.kind_of?(Hash) ? stored.dup : count_array_elements(stored)
      counts
    end
  end

  # Forgets all n-grams collected so far.
  #
  # A fresh store is assigned instead of emptying the current one in place,
  # so results already returned by #ngrams keep their content.
  #
  # @return [Hash] the new, empty store
  def clear_ngrams
    @all_ngrams = empty_store
  end

  private

  # Builds an empty n-gram store; looking up an unknown length yields [].
  # The default array is never mutated by this class.
  def empty_store
    Hash.new([])
  end

  # Slides a window of every requested length over the tokens. Lengths for
  # which the input is too short get no entry.
  def flat_ngrams(data)
    by_length = {}

    @ns.each do |n|
      data.each_cons(n) { |window| (by_length[n] ||= []) << window }
    end

    by_length
  end

  # Expands a window of multi-level tokens into every combination that takes
  # one level from each token. Levels of earlier tokens vary fastest, e.g.
  # [['very', :adv], ['nice', :adj]] gives
  # [['very', 'nice'], [:adv, 'nice'], ['very', :adj], [:adv, :adj]].
  # A token without any level yields no combinations.
  def multiply_ngram(ngram)
    ngram.inject([[]]) do |partials, token|
      token.to_a.flat_map do |item|
        partials.map { |partial| partial + [item] }
      end
    end
  end

  # Merges freshly generated n-grams into the stored ones and returns the
  # merged store: lists are concatenated with :no_count, counts are summed
  # otherwise.
  def add_new_ngrams(ngrams)
    if @no_count
      @all_ngrams.merge(ngrams) { |_n, stored, fresh| stored + fresh }
    else
      @all_ngrams.merge(count_ngrams(ngrams)) do |_n, stored, fresh|
        stored.merge(fresh) { |_ngram, c1, c2| c1 + c2 }
      end
    end
  end

  # Tallies the elements of a list; unseen elements count as 0.
  def count_array_elements(ary)
    ary.each_with_object(Hash.new(0)) { |elem, counts| counts[elem] += 1 }
  end

end
@@ -0,0 +1,114 @@
1
+ require File.expand_path(File.dirname(__FILE__)) + '/../lib/n_grams_generator'
2
+ require 'test/unit'
3
+
4
# Unit tests for NGramsGenerator: construction, flat and multi-level input,
# counting, accumulation across calls and the :auto_clear option.
class TestNGramsGenerator < Test::Unit::TestCase

  # A single length is wrapped into an array.
  def test_initialization_with_number
    n = 2
    ngram_generator = NGramsGenerator.new(n)

    assert_kind_of Array, ngram_generator.ns
    assert_equal [n], ngram_generator.ns
  end

  # An array of lengths is accepted as given.
  def test_initialization_with_array
    n = 3
    ngram_generator = nil

    assert_nothing_raised { ngram_generator = NGramsGenerator.new([n]) }
    assert_equal [n], ngram_generator.ns

    ns = [1, 2, 3]

    assert_nothing_raised { ngram_generator = NGramsGenerator.new(ns) }
    assert_equal ns, ngram_generator.ns
  end

  # Plain (single-level) tokens.
  def test_flat_input
    bigram_generator = NGramsGenerator.new(2, :no_count => true)

    expected_output = [['Ala', 'ma'], ['ma', 'kota']]
    input = %w{Ala ma kota}
    ngrams = bigram_generator.ngrams(input)

    assert_kind_of Hash, ngrams
    assert_equal expected_output, ngrams[2]
  end

  # Tokens that are already wrapped in arrays.
  def test_nested_input
    bigram_generator = NGramsGenerator.new(2, :no_count => true)

    expected_output = [[1, 2], [2, 3], [3, 4]]
    input = [[1], [2], [3], [4]]
    ngrams = bigram_generator.ngrams(input)

    assert_equal expected_output, ngrams[2]
  end

  # A multi-level token next to a plain one.
  def test_simple_and_nested_input
    bigram_generator = NGramsGenerator.new(2, :no_count => true)

    expected_output = { 2 => [['Ala', 'ma'], [:subst, 'ma']] }
    input = [['Ala', :subst], 'ma']
    ngrams = bigram_generator.ngrams(input)

    assert_equal expected_output, ngrams
  end

  # Input shorter than n yields an empty count table for that n.
  def test_n_number_is_too_high_for_input_length
    ngram_generator = NGramsGenerator.new(5)

    expected_output = { 5 => {} }
    input = [1, 2, 3]
    ngrams = ngram_generator.ngrams(input)

    assert_equal expected_output, ngrams
  end

  # Several lengths are generated in one call.
  def test_multiple_n_values
    ns = [1, 2, 3]
    ngrams_generator = NGramsGenerator.new(ns, :no_count => true)

    expected_output = { 1 => [[1], [2], [3]], 2 => [[1, 2], [2, 3]], 3 => [[1, 2, 3]] }
    input = [1, 2, 3]
    ngrams = ngrams_generator.ngrams(input)

    assert_equal ns, ngrams.keys
    assert_equal expected_output, ngrams
  end

  # Repeated n-grams are counted.
  def test_counting_ngrams
    n = 2
    input = [1, 2, 2, 2, 3, 3, 3, 3, 3]
    ngrams = NGramsGenerator.new(n).ngrams(input)

    assert_equal 1, ngrams[n][[1, 2]]
    assert_equal 2, ngrams[n][[2, 2]]
    assert_equal 1, ngrams[n][[2, 3]]
    assert_equal 4, ngrams[n][[3, 3]]
  end

  # Counts accumulate across successive calls.
  def test_adding_ngrams
    n = 3
    ngram_generator = NGramsGenerator.new(n)
    input = [1, 2, 3, 4]

    3.times do |i|
      expected_output = { n => { [1, 2, 3] => (i + 1), [2, 3, 4] => (i + 1) } }
      ngrams = ngram_generator.ngrams(input)
      assert_equal expected_output, ngrams
    end
  end

  # With :auto_clear every call starts from an empty store.
  def test_auto_clear_option
    trigram_generator = NGramsGenerator.new(3, :auto_clear => true)
    input = [:a, :b, :c, :d]

    3.times do
      expected_output = { 3 => { [:a, :b, :c] => 1, [:b, :c, :d] => 1 } }
      ngrams = trigram_generator.ngrams(input)
      assert_equal expected_output, ngrams
    end
  end

end
metadata ADDED
@@ -0,0 +1,49 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: n_grams_generator
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - snukky
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-09-22 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: N-grams generator for multi-level strings, for example words and their
15
+ morphosyntactic descriptions.
16
+ email:
17
+ - snk987@gmail.com
18
+ executables: []
19
+ extensions: []
20
+ extra_rdoc_files: []
21
+ files:
22
+ - lib/n_grams_generator.rb
23
+ - test/test_n_grams_generator.rb
24
+ homepage: https://github.com/snukky/n_grams_generator
25
+ licenses: []
26
+ post_install_message:
27
+ rdoc_options: []
28
+ require_paths:
29
+ - lib
30
+ required_ruby_version: !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ! '>='
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ required_rubygems_version: !ruby/object:Gem::Requirement
37
+ none: false
38
+ requirements:
39
+ - - ! '>='
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ requirements: []
43
+ rubyforge_project:
44
+ rubygems_version: 1.8.10
45
+ signing_key:
46
+ specification_version: 3
47
+ summary: N-grams generator for multi-level strings.
48
+ test_files:
49
+ - test/test_n_grams_generator.rb