n_grams_generator 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/n_grams_generator.rb +124 -0
- data/test/test_n_grams_generator.rb +114 -0
- metadata +49 -0
@@ -0,0 +1,124 @@
|
|
1
|
+
#encoding:utf-8
|
2
|
+
|
3
|
+
# Generates n-grams from flat or multi-level token sequences and accumulates
# them across calls, either as occurrence counts (default) or as plain lists.
class NGramsGenerator

  attr_reader :all_ngrams, :ns, :auto_clear, :no_count

  # Creates NGramsGenerator object.
  #
  # @param [Number, Enumerable<Number>] ns the lengths of generated n-grams
  # @param [Hash] options the additional options
  # @option options [true, false] :no_count store every generated n-gram in
  #   an array instead of counting occurrences
  # @option options [true, false] :auto_clear discard previously stored
  #   n-grams at the start of every #ngrams call
  def initialize(ns, options={})
    @ns = ns.kind_of?(Enumerable) ? ns : [ns]

    @auto_clear = options[:auto_clear] || false
    @no_count = options[:no_count] || false

    @all_ngrams = empty_store
  end

  # Generates n-grams according to lengths specified during construction and
  # adds them to the n-grams stored so far.
  #
  # Every element of +data+ is a token. A token that is itself Enumerable is
  # treated as a list of alternatives (e.g. a word and its tag) and one n-gram
  # is produced for every combination of alternatives.
  #
  #   NGramsGenerator.new(3).ngrams(['Alice', 'has', 'a', 'cat'])
  #   # => {
  #     3 => { ['Alice', 'has', 'a'] => 1, ['has', 'a', 'cat'] => 1 }
  #   }
  #   NGramsGenerator.new([2, 3], :no_count => true).ngrams(['Alice', 'has', 'a', 'cat'])
  #   # => {
  #     2 => [ ['Alice', 'has'], ['has', 'a'], ['a', 'cat'] ]
  #     3 => [ ['Alice', 'has', 'a'], ['has', 'a', 'cat'] ]
  #   }
  #   NGramsGenerator.new(2).ngrams([['very', :adv], ['nice', :adj], ['job', :noun]])
  #   # => {
  #     2 => {
  #       ["very", "nice"] => 1,
  #       [:adv, "nice"] => 1,
  #       ["very", :adj] => 1,
  #       [:adv, :adj] => 1,
  #       ["nice", "job"] => 1,
  #       [:adj, "job"] => 1,
  #       ["nice", :noun] => 1,
  #       [:adj, :noun] => 1
  #     }
  #   }
  #
  # @param [Array<Object>, Array<Array<Object>>] data
  # @return [Hash] the generated ngrams by n-number
  def ngrams(data)
    # Clear before the empty-input check so that, with :auto_clear, an empty
    # call cannot hand back n-grams left over from the previous call.
    clear_ngrams if @auto_clear
    return @all_ngrams if data.nil? || data.empty?

    tokens = data.map { |item| item.kind_of?(Enumerable) ? item : [item] }

    # The [] default lets count_ngrams read lengths that produced no windows.
    # It is only ever read, never mutated in place.
    new_ngrams = Hash.new([])
    flat_ngrams(tokens).each do |n, windows|
      new_ngrams[n] = windows.flat_map { |window| multiply_ngram(window) }
    end

    @all_ngrams = add_new_ngrams(new_ngrams)
  end

  # Counts how many times each distinct n-gram occurs, for every configured
  # length.
  #
  # @param [Hash] ngrams arrays of n-grams keyed by n; defaults to the stored
  #   n-grams, which are arrays only when the generator was created with
  #   :no_count => true
  # @return [Hash] for every n in #ns, a hash mapping n-gram to its count
  def count_ngrams(ngrams=@all_ngrams)
    @ns.inject({}) do |counts, n|
      # A plain hash passed by the caller may lack some lengths.
      counts[n] = count_array_elements(ngrams[n] || [])
      counts
    end
  end

  # Forgets all stored n-grams.
  #
  # @return [Hash] the new, empty store
  def clear_ngrams
    # Replace instead of Hash#clear: #ngrams returns this very hash to the
    # caller, and clearing it in place would wipe results they still hold.
    @all_ngrams = empty_store
  end

  private

  # Builds an empty n-gram store. The shared [] default is safe because the
  # class only combines values with non-mutating operations (+, merge).
  def empty_store
    Hash.new([])
  end

  # Slides a window of each configured length over the tokens.
  # Lengths longer than the data yield no key at all.
  def flat_ngrams(data)
    windows_by_n = {}

    @ns.each do |n|
      data.each_cons(n) { |window| (windows_by_n[n] ||= []) << window }
    end

    windows_by_n
  end

  # Expands one window of multi-level tokens into every combination of their
  # alternatives. Earlier tokens vary fastest, e.g.
  # [[a, b], [x, y]] => [[a, x], [b, x], [a, y], [b, y]].
  # A token without alternatives yields no n-grams for the window.
  def multiply_ngram(ngram)
    ngram.inject([[]]) do |combos, token|
      token.flat_map { |item| combos.map { |combo| combo + [item] } }
    end
  end

  # Merges freshly generated n-grams into the stored ones: concatenates the
  # arrays in :no_count mode, otherwise sums the per-n-gram counts.
  def add_new_ngrams(ngrams)
    if @no_count
      @all_ngrams.merge(ngrams) { |_n, stored, fresh| stored + fresh }
    else
      @all_ngrams.merge(count_ngrams(ngrams)) do |_n, stored, fresh|
        stored.merge(fresh) { |_ngram, c1, c2| c1 + c2 }
      end
    end
  end

  # Tallies the elements of an array into a hash of element => occurrences.
  def count_array_elements(ary)
    ary.each_with_object(Hash.new(0)) { |elem, counts| counts[elem] += 1 }
  end

end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__)) + '/../lib/n_grams_generator'
|
2
|
+
require 'test/unit'
|
3
|
+
|
4
|
+
# Unit tests for NGramsGenerator: construction, flat and multi-level input,
# several n values at once, counting, accumulation across calls and the
# :auto_clear option.
#
# assert_equal is always called as (expected, actual) so that failure
# messages label the two values correctly.
class TestNGramsGenerator < Test::Unit::TestCase

  # A single number is wrapped into a one-element array of lengths.
  def test_initialization_with_number
    n = 2
    ngram_generator = NGramsGenerator.new(n)

    assert_kind_of Array, ngram_generator.ns
    assert_equal [n], ngram_generator.ns
  end

  # An array of lengths is accepted and stored unchanged.
  def test_initialization_with_array
    n = 3
    ngram_generator = nil

    assert_nothing_raised { ngram_generator = NGramsGenerator.new([n]) }
    assert_equal [n], ngram_generator.ns

    ns = [1, 2, 3]

    assert_nothing_raised { ngram_generator = NGramsGenerator.new(ns) }
    assert_equal ns, ngram_generator.ns
  end

  # Plain tokens produce plain sliding-window n-grams.
  def test_flat_input
    bigram_generator = NGramsGenerator.new(2, :no_count => true)

    expected_output = [['Ala', 'ma'], ['ma', 'kota']]
    input = %w{Ala ma kota}
    ngrams = bigram_generator.ngrams(input)

    assert_kind_of Hash, ngrams
    assert_equal expected_output, ngrams[2]
  end

  # Tokens wrapped in one-element arrays behave like plain tokens.
  def test_nested_input
    bigram_generator = NGramsGenerator.new(2, :no_count => true)

    expected_output = [[1, 2], [2, 3], [3, 4]]
    input = [[1], [2], [3], [4]]
    ngrams = bigram_generator.ngrams(input)

    assert_equal expected_output, ngrams[2]
  end

  # Multi-level and plain tokens can be mixed in one input.
  def test_simple_and_nested_input
    bigram_generator = NGramsGenerator.new(2, :no_count => true)

    expected_output = { 2 => [['Ala', 'ma'], [:subst, 'ma']] }
    input = [['Ala', :subst], 'ma']
    ngrams = bigram_generator.ngrams(input)

    assert_equal expected_output, ngrams
  end

  # A length greater than the input size yields an empty count table.
  def test_n_number_is_too_high_for_input_length
    ngram_generator = NGramsGenerator.new(5)

    expected_output = { 5 => {} }
    input = [1, 2, 3]
    ngrams = ngram_generator.ngrams(input)

    assert_equal expected_output, ngrams
  end

  # Every requested length gets its own entry in the result.
  def test_multiple_n_values
    ns = [1, 2, 3]
    ngrams_generator = NGramsGenerator.new(ns, :no_count => true)

    expected_output = { 1 => [[1], [2], [3]], 2 => [[1, 2], [2, 3]], 3 => [[1, 2, 3]] }
    input = [1, 2, 3]
    ngrams = ngrams_generator.ngrams(input)

    assert_equal ns, ngrams.keys
    assert_equal expected_output, ngrams
  end

  # Repeated n-grams within one input are counted.
  def test_counting_ngrams
    n = 2
    input = [1, 2, 2, 2, 3, 3, 3, 3, 3]
    ngrams = NGramsGenerator.new(n).ngrams(input)

    assert_equal 1, ngrams[n][[1, 2]]
    assert_equal 2, ngrams[n][[2, 2]]
    assert_equal 1, ngrams[n][[2, 3]]
    assert_equal 4, ngrams[n][[3, 3]]
  end

  # Counts accumulate across successive calls on the same generator.
  def test_adding_ngrams
    n = 3
    ngram_generator = NGramsGenerator.new(n)
    input = [1, 2, 3, 4]

    3.times do |i|
      expected_output = { n => { [1, 2, 3] => (i + 1), [2, 3, 4] => (i + 1) } }
      ngrams = ngram_generator.ngrams(input)
      assert_equal expected_output, ngrams
    end
  end

  # With :auto_clear every call starts from an empty store.
  def test_auto_clear_option
    trigram_generator = NGramsGenerator.new(3, :auto_clear => true)
    input = [:a, :b, :c, :d]

    3.times do
      expected_output = { 3 => { [:a, :b, :c] => 1, [:b, :c, :d] => 1 } }
      ngrams = trigram_generator.ngrams(input)
      assert_equal expected_output, ngrams
    end
  end

end
|
metadata
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: n_grams_generator
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- snukky
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-09-22 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: N-grams generator for multi-level strings, for example words and their
|
15
|
+
morphosyntactic descriptions.
|
16
|
+
email:
|
17
|
+
- snk987@gmail.com
|
18
|
+
executables: []
|
19
|
+
extensions: []
|
20
|
+
extra_rdoc_files: []
|
21
|
+
files:
|
22
|
+
- lib/n_grams_generator.rb
|
23
|
+
- test/test_n_grams_generator.rb
|
24
|
+
homepage: https://github.com/snukky/n_grams_generator
|
25
|
+
licenses: []
|
26
|
+
post_install_message:
|
27
|
+
rdoc_options: []
|
28
|
+
require_paths:
|
29
|
+
- lib
|
30
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
31
|
+
none: false
|
32
|
+
requirements:
|
33
|
+
- - ! '>='
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: '0'
|
36
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
37
|
+
none: false
|
38
|
+
requirements:
|
39
|
+
- - ! '>='
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '0'
|
42
|
+
requirements: []
|
43
|
+
rubyforge_project:
|
44
|
+
rubygems_version: 1.8.10
|
45
|
+
signing_key:
|
46
|
+
specification_version: 3
|
47
|
+
summary: N-grams generator for multi-level strings.
|
48
|
+
test_files:
|
49
|
+
- test/test_n_grams_generator.rb
|