ngrams_parser 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +0 -1
- data/CHANGELOG.md +4 -0
- data/README.md +1 -1
- data/lib/ngrams_parser/ngram.rb +3 -1
- data/lib/ngrams_parser/ngrams.rb +2 -1
- data/lib/ngrams_parser/string.rb +3 -1
- data/lib/ngrams_parser/version.rb +2 -1
- data/lib/ngrams_parser.rb +4 -7
- data/spec/ngrams_parser/ngram_spec.rb +10 -10
- data/spec/ngrams_parser/ngrams_spec.rb +70 -67
- data/spec/ngrams_parser/string_spec.rb +10 -9
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6c3cdef6702ffbc5a36f42d454a3bb491abf72f0
|
4
|
+
data.tar.gz: 69e88fdec44a26144c90f668be1c80ed97c804e3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a1c60d03f3802948359c7e3993a315e62b481a15179b7ae2f9c5c0bc44066f1bf3919d59f8b15a7d35288414e074448ca0def7815fc58f034091f95f495d2f83
|
7
|
+
data.tar.gz: 5c936796efb35bddadfe3d34381c5cba344ddff8c2d86fe169a95bed70ed6086c785a1aee19509ba9f6204084e6c079ce4d1d1d4a475d131d9985391b9f9073e
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# NgramsParser [![Gem Version](https://badge.fury.io/rb/ngrams_parser.png)](http://badge.fury.io/rb/ngrams_parser) [![Build Status](https://travis-ci.org/fractalsoft/ngrams_parser.png)](https://travis-ci.org/fractalsoft/ngrams_parser) [![Dependency Status](https://gemnasium.com/fractalsoft/ngrams_parser.png)](https://gemnasium.com/fractalsoft/ngrams_parser) [![Coverage Status](https://coveralls.io/repos/fractalsoft/ngrams_parser/badge.png)](https://coveralls.io/r/fractalsoft/ngrams_parser)
|
1
|
+
# NgramsParser [![Gem Version](https://badge.fury.io/rb/ngrams_parser.png)](http://badge.fury.io/rb/ngrams_parser) [![Build Status](https://travis-ci.org/fractalsoft/ngrams_parser.png)](https://travis-ci.org/fractalsoft/ngrams_parser) [![Dependency Status](https://gemnasium.com/fractalsoft/ngrams_parser.png)](https://gemnasium.com/fractalsoft/ngrams_parser) [![Coverage Status](https://coveralls.io/repos/fractalsoft/ngrams_parser/badge.png)](https://coveralls.io/r/fractalsoft/ngrams_parser) [![Stories in Ready](https://badge.waffle.io/fractalsoft/ngrams_parser.png)](http://waffle.io/fractalsoft/ngrams_parser)
|
2
2
|
[![endorse](https://api.coderwall.com/torrocus/endorsecount.png)](https://coderwall.com/torrocus)
|
3
3
|
|
4
4
|
N-gram is a contiguous sequence of n items from a given sequence of text or speech. The items are letters, but can be phonemes, syllables, words or base pairs according to the application. The n-grams typically are collected from a text or speech corpus.
|
data/lib/ngrams_parser/ngram.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
|
3
|
+
# Parse word to ngrams
|
2
4
|
module NgramsParser
|
3
5
|
# Split word into ngrams
|
4
6
|
#
|
@@ -7,7 +9,7 @@ module NgramsParser
|
|
7
9
|
def self.ngram(word, size)
|
8
10
|
array = []
|
9
11
|
word.split('').each_index do |index|
|
10
|
-
text = word[index..index+size-1]
|
12
|
+
text = word[index..index + size - 1]
|
11
13
|
array << text.ljust(size, ' ')
|
12
14
|
end
|
13
15
|
array
|
data/lib/ngrams_parser/ngrams.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
require 'lexical_units'
|
3
3
|
|
4
|
+
# Parse given *text* into ngrams
|
4
5
|
module NgramsParser
|
5
6
|
def self.ngrams(text, size)
|
6
7
|
array = []
|
7
|
-
LexicalUnits
|
8
|
+
LexicalUnits.words_without_digits(text).each do |word|
|
8
9
|
array << ngram(word, size)
|
9
10
|
end
|
10
11
|
array.flatten
|
data/lib/ngrams_parser/string.rb
CHANGED
data/lib/ngrams_parser.rb
CHANGED
@@ -1,7 +1,4 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
4
|
-
require
|
5
|
-
|
6
|
-
module NgramsParser
|
7
|
-
end
|
1
|
+
require 'ngrams_parser/ngram'
|
2
|
+
require 'ngrams_parser/ngrams'
|
3
|
+
require 'ngrams_parser/string'
|
4
|
+
require 'ngrams_parser/version'
|
@@ -2,29 +2,29 @@
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe NgramsParser do
|
5
|
-
context
|
6
|
-
let(:klass) { NgramsParser }
|
7
|
-
|
5
|
+
context '.ngram' do
|
8
6
|
[
|
9
7
|
{
|
10
8
|
text: "Will's",
|
11
|
-
digrams: [
|
12
|
-
trigrams: [
|
9
|
+
digrams: ['Wi', 'il', 'll', "l'", "'s", 's '],
|
10
|
+
trigrams: ['Wil', 'ill', "ll'", "l's", "'s ", 's '],
|
13
11
|
},
|
14
12
|
{
|
15
|
-
text:
|
16
|
-
digrams: [
|
17
|
-
trigrams: [
|
13
|
+
text: 'činčilový',
|
14
|
+
digrams: ['či', 'in', 'nč', 'či', 'il', 'lo', 'ov', 'vý', 'ý '],
|
15
|
+
trigrams: [
|
16
|
+
'čin', 'inč', 'nči', 'čil', 'ilo', 'lov', 'ový', 'vý ', 'ý '
|
17
|
+
]
|
18
18
|
}
|
19
19
|
].each do |hash|
|
20
20
|
text, bigrams, trigrams = hash.values
|
21
21
|
|
22
22
|
it "split word '#{text}' into bigrams: #{bigrams}" do
|
23
|
-
|
23
|
+
subject.ngram(text, 2).should eq(bigrams)
|
24
24
|
end
|
25
25
|
|
26
26
|
it "split word '#{text}' into trigrams: #{trigrams}" do
|
27
|
-
|
27
|
+
subject.ngram(text, 3).should eq(trigrams)
|
28
28
|
end
|
29
29
|
end
|
30
30
|
end
|
@@ -2,121 +2,124 @@
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe NgramsParser do
|
5
|
-
context
|
5
|
+
context '.ngrams' do
|
6
6
|
let(:klass) { NgramsParser }
|
7
7
|
|
8
8
|
[
|
9
9
|
{
|
10
10
|
text: "Will will Will will Will's will to Will?",
|
11
11
|
bigrams: [
|
12
|
-
[
|
13
|
-
[
|
14
|
-
[
|
15
|
-
[
|
16
|
-
[
|
12
|
+
['Wi', 'il', 'll', 'l ', 'wi', 'il', 'll', 'l '],
|
13
|
+
['Wi', 'il', 'll', 'l ', 'wi', 'il', 'll', 'l '],
|
14
|
+
['Wi', 'il', 'll', "l'", "'s", 's ', 'wi', 'il', 'll', 'l '],
|
15
|
+
['to', 'o '],
|
16
|
+
['Wi', 'il', 'll', 'l ']
|
17
17
|
].flatten,
|
18
18
|
trigrams: [
|
19
|
-
[
|
20
|
-
[
|
21
|
-
[
|
22
|
-
[
|
23
|
-
[
|
24
|
-
[
|
25
|
-
[
|
19
|
+
['Wil', 'ill', 'll ', 'l '],
|
20
|
+
['wil', 'ill', 'll ', 'l '],
|
21
|
+
['Wil', 'ill', 'll ', 'l '],
|
22
|
+
['wil', 'ill', 'll ', 'l '],
|
23
|
+
['Wil', 'ill', "ll'", "l's", "'s ", 's '],
|
24
|
+
['wil', 'ill', 'll ', 'l '],
|
25
|
+
['to ', 'o ', 'Wil', 'ill', 'll ', 'l ']
|
26
26
|
].flatten
|
27
27
|
},
|
28
28
|
{
|
29
|
-
text:
|
29
|
+
text: 'Acht alte Ameisen aßen am Abend Ananas.',
|
30
30
|
bigrams: [
|
31
|
-
[
|
32
|
-
[
|
33
|
-
[
|
34
|
-
[
|
35
|
-
[
|
36
|
-
[
|
37
|
-
[
|
31
|
+
['Ac', 'ch', 'ht', 't '],
|
32
|
+
['al', 'lt', 'te', 'e '],
|
33
|
+
['Am', 'me', 'ei', 'is', 'se', 'en', 'n '],
|
34
|
+
['aß', 'ße', 'en', 'n '],
|
35
|
+
['am', 'm '],
|
36
|
+
['Ab', 'be', 'en', 'nd', 'd '],
|
37
|
+
['An', 'na', 'an', 'na', 'as', 's ']
|
38
38
|
].flatten,
|
39
39
|
trigrams: [
|
40
|
-
[
|
41
|
-
[
|
42
|
-
[
|
43
|
-
[
|
44
|
-
[
|
45
|
-
[
|
46
|
-
[
|
40
|
+
['Ach', 'cht', 'ht ', 't '],
|
41
|
+
['alt', 'lte', 'te ', 'e '],
|
42
|
+
['Ame', 'mei', 'eis', 'ise', 'sen', 'en ', 'n '],
|
43
|
+
['aße', 'ßen', 'en ', 'n '],
|
44
|
+
['am ', 'm '],
|
45
|
+
['Abe', 'ben', 'end', 'nd ', 'd '],
|
46
|
+
['Ana', 'nan', 'ana', 'nas', 'as ', 's ']
|
47
47
|
].flatten
|
48
48
|
},
|
49
49
|
{
|
50
|
-
text:
|
51
|
-
bigrams: [
|
50
|
+
text: 'Ödögidöggi',
|
51
|
+
bigrams: ['Öd', 'dö', 'ög', 'gi', 'id', 'dö', 'ög', 'gg', 'gi', 'i '],
|
52
52
|
trigrams: [
|
53
|
-
[
|
53
|
+
[
|
54
|
+
'Ödö', 'dög', 'ögi', 'gid', 'idö',
|
55
|
+
'dög', 'ögg', 'ggi', 'gi ', 'i '
|
56
|
+
]
|
54
57
|
].flatten
|
55
58
|
},
|
56
59
|
{
|
57
|
-
text:
|
60
|
+
text: 'Ćma ćmę ćmi.',
|
58
61
|
bigrams: [
|
59
|
-
[
|
62
|
+
['Ćm', 'ma', 'a ', 'ćm', 'mę', 'ę ', 'ćm', 'mi', 'i ']
|
60
63
|
].flatten,
|
61
64
|
trigrams: [
|
62
|
-
[
|
65
|
+
['Ćma', 'ma ', 'a ', 'ćmę', 'mę ', 'ę ', 'ćmi', 'mi ', 'i ']
|
63
66
|
].flatten
|
64
67
|
},
|
65
68
|
{
|
66
|
-
text:
|
69
|
+
text: 'Łzy złej zołzy',
|
67
70
|
bigrams: [
|
68
|
-
[
|
69
|
-
[
|
71
|
+
['Łz', 'zy', 'y ', 'zł', 'łe', 'ej', 'j '],
|
72
|
+
['zo', 'oł', 'łz', 'zy', 'y ']
|
70
73
|
].flatten,
|
71
74
|
trigrams: [
|
72
|
-
[
|
73
|
-
[
|
75
|
+
['Łzy', 'zy ', 'y ', 'złe', 'łej', 'ej ', 'j '],
|
76
|
+
['zoł', 'ołz', 'łzy', 'zy ', 'y ']
|
74
77
|
].flatten
|
75
78
|
},
|
76
79
|
{
|
77
|
-
text:
|
80
|
+
text: 'Żubr żuł żuchwą żurawinę.',
|
78
81
|
bigrams: [
|
79
|
-
[
|
80
|
-
[
|
81
|
-
[
|
82
|
+
['Żu', 'ub', 'br', 'r ', 'żu', 'uł', 'ł '],
|
83
|
+
['żu', 'uc', 'ch', 'hw', 'wą', 'ą '],
|
84
|
+
['żu', 'ur', 'ra', 'aw', 'wi', 'in', 'nę', 'ę ']
|
82
85
|
].flatten,
|
83
86
|
trigrams: [
|
84
|
-
[
|
85
|
-
[
|
86
|
-
[
|
87
|
+
['Żub', 'ubr', 'br ', 'r ', 'żuł', 'uł ', 'ł '],
|
88
|
+
['żuc', 'uch', 'chw', 'hwą', 'wą ', 'ą '],
|
89
|
+
['żur', 'ura', 'raw', 'awi', 'win', 'inę', 'nę ', 'ę ']
|
87
90
|
].flatten
|
88
91
|
},
|
89
92
|
{
|
90
|
-
text:
|
93
|
+
text: 'Čistý s Čistou čistili činčilový čepec.',
|
91
94
|
bigrams: [
|
92
|
-
[
|
93
|
-
[
|
94
|
-
[
|
95
|
+
['Či', 'is', 'st', 'tý', 'ý ', 's ', 'Či', 'is', 'st', 'to', 'ou'],
|
96
|
+
['u ', 'či', 'is', 'st', 'ti', 'il', 'li', 'i ', 'či', 'in', 'nč'],
|
97
|
+
['či', 'il', 'lo', 'ov', 'vý', 'ý ', 'če', 'ep', 'pe', 'ec', 'c ']
|
95
98
|
].flatten,
|
96
99
|
trigrams: [
|
97
|
-
[
|
98
|
-
[
|
99
|
-
[
|
100
|
-
[
|
100
|
+
['Čis', 'ist', 'stý', 'tý ', 'ý ', 's ', 'Čis', 'ist', 'sto'],
|
101
|
+
['tou', 'ou ', 'u ', 'čis', 'ist', 'sti', 'til', 'ili', 'li '],
|
102
|
+
['i ', 'čin', 'inč', 'nči', 'čil', 'ilo', 'lov', 'ový', 'vý '],
|
103
|
+
['ý ', 'čep', 'epe', 'pec', 'ec ', 'c ']
|
101
104
|
].flatten
|
102
105
|
},
|
103
106
|
{
|
104
|
-
text:
|
107
|
+
text: '99 bottles of beer on the wall,',
|
105
108
|
bigrams: [
|
106
|
-
[
|
107
|
-
[
|
108
|
-
[
|
109
|
-
[
|
110
|
-
[
|
111
|
-
[
|
109
|
+
['bo', 'ot', 'tt', 'tl', 'le', 'es', 's '],
|
110
|
+
['of', 'f '],
|
111
|
+
['be', 'ee', 'er', 'r '],
|
112
|
+
['on', 'n '],
|
113
|
+
['th', 'he', 'e '],
|
114
|
+
['wa', 'al', 'll', 'l ']
|
112
115
|
].flatten,
|
113
116
|
trigrams: [
|
114
|
-
[
|
115
|
-
[
|
116
|
-
[
|
117
|
-
[
|
118
|
-
[
|
119
|
-
[
|
117
|
+
['bot', 'ott', 'ttl', 'tle', 'les', 'es ', 's '],
|
118
|
+
['of ', 'f '],
|
119
|
+
['bee', 'eer', 'er ', 'r '],
|
120
|
+
['on ', 'n '],
|
121
|
+
['the', 'he ', 'e '],
|
122
|
+
['wal', 'all', 'll ', 'l ']
|
120
123
|
].flatten
|
121
124
|
}
|
122
125
|
].each do |hash|
|
@@ -2,24 +2,25 @@
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe NgramsParser::String do
|
5
|
+
# Testing class
|
5
6
|
class String
|
6
7
|
include NgramsParser::String
|
7
8
|
end
|
8
9
|
|
9
|
-
context
|
10
|
-
it
|
11
|
-
string =
|
10
|
+
context '#ngrams' do
|
11
|
+
it 'splits String into ngrams' do
|
12
|
+
string = 'Lorem ipsum'
|
12
13
|
bigrams = [
|
13
|
-
|
14
|
-
|
14
|
+
'Lo', 'or', 're', 'em', 'm ',
|
15
|
+
'ip', 'ps', 'su', 'um', 'm '
|
15
16
|
]
|
16
17
|
trigrams = [
|
17
|
-
|
18
|
-
|
18
|
+
'Lor', 'ore', 'rem', 'em ', 'm ',
|
19
|
+
'ips', 'psu', 'sum', 'um ', 'm '
|
19
20
|
]
|
20
21
|
quadgrams = [
|
21
|
-
|
22
|
-
|
22
|
+
'Lore', 'orem', 'rem ', 'em ', 'm ',
|
23
|
+
'ipsu', 'psum', 'sum ', 'um ', 'm '
|
23
24
|
]
|
24
25
|
|
25
26
|
string.ngrams(2).should eq(bigrams)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ngrams_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aleksander Malaszkiewicz
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-08-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|