n_grams_generator 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/n_grams_generator.rb +124 -0
- data/test/test_n_grams_generator.rb +114 -0
- metadata +49 -0
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
#encoding:utf-8
|
|
2
|
+
|
|
3
|
+
# Generates n-grams from a sequence of tokens and accumulates them across
# calls, either as occurrence counts (default) or as plain lists of n-grams.
#
# A token may be a single object or an Enumerable of alternative
# representations (e.g. a word together with its part-of-speech tag). For
# such multi-level tokens every combination of alternatives is generated.
class NGramsGenerator

  attr_reader :all_ngrams, :ns, :auto_clear, :no_count

  # Creates NGramsGenerator object.
  #
  # @param [Number, Enumerable<Number>] ns the lengths of generated n-grams
  # @param [Hash] options the additional options
  # @option options [true, false] :no_count store the generated n-grams as
  #   lists instead of counting their occurrences
  # @option options [true, false] :auto_clear forget the n-grams collected by
  #   previous calls before each call to #ngrams
  def initialize(ns, options={})
    @ns = ns.kind_of?(Enumerable) ? ns : [ns]

    @auto_clear = options[:auto_clear] || false
    @no_count = options[:no_count] || false

    @all_ngrams = empty_store
  end

  # Generates n-grams according to lengths specified during construction
  # and merges them into the n-grams collected so far.
  #
  #   NGramsGenerator.new(3).ngrams(['Alice', 'has', 'a', 'cat'])
  #   # => {
  #     3 => { ['Alice', 'has', 'a'] => 1, ['has', 'a', 'cat'] => 1 }
  #   }
  #   NGramsGenerator.new([2, 3], :no_count => true).ngrams(['Alice', 'has', 'a', 'cat'])
  #   # => {
  #     2 => [ ['Alice', 'has'], ['has', 'a'], ['a', 'cat'] ]
  #     3 => [ ['Alice', 'has', 'a'], ['has', 'a', 'cat'] ]
  #   }
  #   NGramsGenerator.new(2).ngrams([['very', :adv], ['nice', :adj], ['job', :noun]])
  #   # => {
  #     2 => {
  #       ["very", "nice"] => 1,
  #       [:adv, "nice"] => 1,
  #       ["very", :adj] => 1,
  #       [:adv, :adj] => 1,
  #       ["nice", "job"] => 1,
  #       [:adj, "job"] => 1,
  #       ["nice", :noun] => 1,
  #       [:adj, :noun] => 1
  #     }
  #   }
  #
  # When +data+ is nil or empty the currently stored n-grams are returned
  # untouched (they are not cleared, even with :auto_clear).
  #
  # @param [Array<Object>, Array<Array<Object>>] data
  # @return [Hash] the generated ngrams by n-number
  def ngrams(data)
    return @all_ngrams if data.nil? || data.empty?

    clear_ngrams if @auto_clear

    # Normalise every token to a collection of alternatives.
    tokens = data.map { |item| item.kind_of?(Enumerable) ? item : [item] }

    # The default value matters: count_ngrams looks up every n in @ns, also
    # those for which the input was too short to produce any window.
    new_ngrams = empty_store
    flat_ngrams(tokens).each do |n, windows|
      new_ngrams[n] = windows.flat_map { |window| multiply_ngram(window) }
    end

    @all_ngrams = add_new_ngrams(new_ngrams)
  end

  # Counts occurrences of n-grams for every configured length.
  #
  # @param [Hash] ngrams lists of n-grams keyed by n (as produced with
  #   :no_count); defaults to the n-grams stored in this generator
  # @return [Hash] for each n a hash mapping an n-gram to its number of
  #   occurrences (missing n-grams read as 0)
  def count_ngrams(ngrams=@all_ngrams)
    @ns.inject({}) do |counts, n|
      # A plain hash without a default value yields nil for a missing n.
      counts[n] = count_array_elements(ngrams[n] || [])
      counts
    end
  end

  # Forgets all n-grams collected so far.
  #
  # A fresh hash is assigned instead of emptying the current one in place,
  # so results previously returned by #ngrams stay intact.
  #
  # @return [Hash] the new, empty n-gram store
  def clear_ngrams
    @all_ngrams = empty_store
  end

  private

  # Builds an empty n-gram store. Missing keys read as an empty list; the
  # shared default is frozen so that it cannot be modified by accident.
  def empty_store
    Hash.new([].freeze)
  end

  # Cuts the token sequence into consecutive windows for each configured n.
  # Lengths for which the input is too short get no entry at all.
  def flat_ngrams(data)
    windows_by_n = {}

    @ns.each do |n|
      data.each_cons(n) { |window| (windows_by_n[n] ||= []) << window }
    end

    windows_by_n
  end

  # Expands a window of multi-level tokens into every combination that takes
  # one alternative per position (Cartesian product).
  #
  # The first position varies fastest, e.g. [[a1, a2], [b1, b2]] gives
  # [a1, b1], [a2, b1], [a1, b2], [a2, b2]. A window that contains a token
  # without any alternative yields no n-grams.
  def multiply_ngram(ngram)
    # Array#product varies its last operand fastest, hence the reversal of
    # the positions before and of each combination after the product.
    slowest_first = ngram.reverse.map { |token| token.to_a }
    head, *tail = slowest_first

    head.product(*tail).map { |combination| combination.reverse }
  end

  # Merges freshly generated n-grams into the stored ones: lists are
  # concatenated with :no_count, occurrence counts are summed otherwise.
  def add_new_ngrams(ngrams)
    if @no_count
      @all_ngrams.merge(ngrams) { |_n, stored, fresh| stored + fresh }
    else
      @all_ngrams.merge(count_ngrams(ngrams)) do |_n, stored, fresh|
        stored.merge(fresh) { |_ngram, c1, c2| c1 + c2 }
      end
    end
  end

  # Maps each distinct element of +ary+ to its number of occurrences.
  def count_array_elements(ary)
    ary.inject(Hash.new(0)) { |hsh, elem| hsh[elem] += 1; hsh }
  end

end
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__)) + '/../lib/n_grams_generator'
|
|
2
|
+
require 'test/unit'
|
|
3
|
+
|
|
4
|
+
# Unit tests for NGramsGenerator: construction, flat and multi-level input,
# counting, accumulation across calls and the :auto_clear option.
class TestNGramsGenerator < Test::Unit::TestCase

  # A single number is wrapped into a one-element list of lengths.
  def test_initialization_with_number
    n = 2
    ngram_generator = NGramsGenerator.new(n)

    assert_kind_of Array, ngram_generator.ns
    # assert_equal takes (expected, actual); the order matters for the
    # wording of failure messages.
    assert_equal [n], ngram_generator.ns
  end

  # A list of lengths is stored as given.
  def test_initialization_with_array
    n = 3
    ngram_generator = nil

    assert_nothing_raised { ngram_generator = NGramsGenerator.new([n]) }
    assert_equal [n], ngram_generator.ns

    ns = [1, 2, 3]

    assert_nothing_raised { ngram_generator = NGramsGenerator.new(ns) }
    assert_equal ns, ngram_generator.ns
  end

  # Plain (single-level) tokens produce consecutive windows.
  def test_flat_input
    bigram_generator = NGramsGenerator.new(2, :no_count => true)

    expected_output = [['Ala', 'ma'], ['ma', 'kota']]
    input = %w{Ala ma kota}
    ngrams = bigram_generator.ngrams(input)

    assert_kind_of Hash, ngrams
    assert_equal expected_output, ngrams[2]
  end

  # One-element nested tokens behave like plain tokens.
  def test_nested_input
    bigram_generator = NGramsGenerator.new(2, :no_count => true)

    expected_output = [[1, 2], [2, 3], [3, 4]]
    input = [[1], [2], [3], [4]]
    ngrams = bigram_generator.ngrams(input)

    assert_equal expected_output, ngrams[2]
  end

  # Plain and multi-level tokens can be mixed in one input.
  def test_simple_and_nested_input
    bigram_generator = NGramsGenerator.new(2, :no_count => true)

    expected_output = { 2 => [['Ala', 'ma'], [:subst, 'ma']] }
    input = [['Ala', :subst], 'ma']
    ngrams = bigram_generator.ngrams(input)

    assert_equal expected_output, ngrams
  end

  # An input shorter than n yields an empty count table for that n.
  def test_n_number_is_too_high_for_input_length
    ngram_generator = NGramsGenerator.new(5)

    expected_output = { 5 => {} }
    input = [1, 2, 3]
    ngrams = ngram_generator.ngrams(input)

    assert_equal expected_output, ngrams
  end

  # Every configured length gets its own entry, in the configured order.
  def test_multiple_n_values
    ns = [1, 2, 3]
    ngrams_generator = NGramsGenerator.new(ns, :no_count => true)

    expected_output = { 1 => [[1], [2], [3]], 2 => [[1, 2], [2, 3]], 3 => [[1, 2, 3]] }
    input = [1, 2, 3]
    ngrams = ngrams_generator.ngrams(input)

    assert_equal ns, ngrams.keys
    assert_equal expected_output, ngrams
  end

  # Repeated n-grams within one input are counted.
  def test_counting_ngrams
    n = 2
    input = [1, 2, 2, 2, 3, 3, 3, 3, 3]
    ngrams = NGramsGenerator.new(n).ngrams(input)

    assert_equal 1, ngrams[n][[1, 2]]
    assert_equal 2, ngrams[n][[2, 2]]
    assert_equal 1, ngrams[n][[2, 3]]
    assert_equal 4, ngrams[n][[3, 3]]
  end

  # Counts accumulate across consecutive calls on the same generator.
  def test_adding_ngrams
    n = 3
    ngram_generator = NGramsGenerator.new(n)
    input = [1, 2, 3, 4]

    3.times do |i|
      expected_output = { n => { [1, 2, 3] => (i + 1), [2, 3, 4] => (i + 1) } }
      ngrams = ngram_generator.ngrams(input)
      assert_equal expected_output, ngrams
    end
  end

  # With :auto_clear every call starts from an empty store.
  def test_auto_clear_option
    trigram_generator = NGramsGenerator.new(3, :auto_clear => true)
    input = [:a, :b, :c, :d]

    3.times do
      expected_output = { 3 => { [:a, :b, :c] => 1, [:b, :c, :d] => 1 } }
      ngrams = trigram_generator.ngrams(input)
      assert_equal expected_output, ngrams
    end
  end

end
|
metadata
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: n_grams_generator
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.0.1
|
|
5
|
+
prerelease:
|
|
6
|
+
platform: ruby
|
|
7
|
+
authors:
|
|
8
|
+
- snukky
|
|
9
|
+
autorequire:
|
|
10
|
+
bindir: bin
|
|
11
|
+
cert_chain: []
|
|
12
|
+
date: 2012-09-22 00:00:00.000000000 Z
|
|
13
|
+
dependencies: []
|
|
14
|
+
description: N-grams generator for multi-level strings, for example words and their
|
|
15
|
+
morphosyntactic descriptions.
|
|
16
|
+
email:
|
|
17
|
+
- snk987@gmail.com
|
|
18
|
+
executables: []
|
|
19
|
+
extensions: []
|
|
20
|
+
extra_rdoc_files: []
|
|
21
|
+
files:
|
|
22
|
+
- lib/n_grams_generator.rb
|
|
23
|
+
- test/test_n_grams_generator.rb
|
|
24
|
+
homepage: https://github.com/snukky/n_grams_generator
|
|
25
|
+
licenses: []
|
|
26
|
+
post_install_message:
|
|
27
|
+
rdoc_options: []
|
|
28
|
+
require_paths:
|
|
29
|
+
- lib
|
|
30
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
31
|
+
none: false
|
|
32
|
+
requirements:
|
|
33
|
+
- - ! '>='
|
|
34
|
+
- !ruby/object:Gem::Version
|
|
35
|
+
version: '0'
|
|
36
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
37
|
+
none: false
|
|
38
|
+
requirements:
|
|
39
|
+
- - ! '>='
|
|
40
|
+
- !ruby/object:Gem::Version
|
|
41
|
+
version: '0'
|
|
42
|
+
requirements: []
|
|
43
|
+
rubyforge_project:
|
|
44
|
+
rubygems_version: 1.8.10
|
|
45
|
+
signing_key:
|
|
46
|
+
specification_version: 3
|
|
47
|
+
summary: N-grams generator for multi-level strings.
|
|
48
|
+
test_files:
|
|
49
|
+
- test/test_n_grams_generator.rb
|