ngrams_parser 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +0 -1
- data/CHANGELOG.md +4 -0
- data/README.md +1 -1
- data/lib/ngrams_parser/ngram.rb +3 -1
- data/lib/ngrams_parser/ngrams.rb +2 -1
- data/lib/ngrams_parser/string.rb +3 -1
- data/lib/ngrams_parser/version.rb +2 -1
- data/lib/ngrams_parser.rb +4 -7
- data/spec/ngrams_parser/ngram_spec.rb +10 -10
- data/spec/ngrams_parser/ngrams_spec.rb +70 -67
- data/spec/ngrams_parser/string_spec.rb +10 -9
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6c3cdef6702ffbc5a36f42d454a3bb491abf72f0
|
4
|
+
data.tar.gz: 69e88fdec44a26144c90f668be1c80ed97c804e3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a1c60d03f3802948359c7e3993a315e62b481a15179b7ae2f9c5c0bc44066f1bf3919d59f8b15a7d35288414e074448ca0def7815fc58f034091f95f495d2f83
|
7
|
+
data.tar.gz: 5c936796efb35bddadfe3d34381c5cba344ddff8c2d86fe169a95bed70ed6086c785a1aee19509ba9f6204084e6c079ce4d1d1d4a475d131d9985391b9f9073e
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# NgramsParser [](http://badge.fury.io/rb/ngrams_parser) [](https://travis-ci.org/fractalsoft/ngrams_parser) [](https://gemnasium.com/fractalsoft/ngrams_parser) [](https://coveralls.io/r/fractalsoft/ngrams_parser)
|
1
|
+
# NgramsParser [](http://badge.fury.io/rb/ngrams_parser) [](https://travis-ci.org/fractalsoft/ngrams_parser) [](https://gemnasium.com/fractalsoft/ngrams_parser) [](https://coveralls.io/r/fractalsoft/ngrams_parser) [](http://waffle.io/fractalsoft/ngrams_parser)
|
2
2
|
[](https://coderwall.com/torrocus)
|
3
3
|
|
4
4
|
An n-gram is a contiguous sequence of n items from a given sequence of text or speech. The items are letters, but can also be phonemes, syllables, words or base pairs, depending on the application. N-grams are typically collected from a text or speech corpus.
|
data/lib/ngrams_parser/ngram.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
|
3
|
+
# Parse word to ngrams
|
2
4
|
module NgramsParser
|
3
5
|
# Split word into ngrams
|
4
6
|
#
|
@@ -7,7 +9,7 @@ module NgramsParser
|
|
7
9
|
def self.ngram(word, size)
|
8
10
|
array = []
|
9
11
|
word.split('').each_index do |index|
|
10
|
-
text = word[index..index+size-1]
|
12
|
+
text = word[index..index + size - 1]
|
11
13
|
array << text.ljust(size, ' ')
|
12
14
|
end
|
13
15
|
array
|
data/lib/ngrams_parser/ngrams.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
require 'lexical_units'
|
3
3
|
|
4
|
+
# Parse given *text* into ngrams
|
4
5
|
module NgramsParser
|
5
6
|
def self.ngrams(text, size)
|
6
7
|
array = []
|
7
|
-
LexicalUnits
|
8
|
+
LexicalUnits.words_without_digits(text).each do |word|
|
8
9
|
array << ngram(word, size)
|
9
10
|
end
|
10
11
|
array.flatten
|
data/lib/ngrams_parser/string.rb
CHANGED
data/lib/ngrams_parser.rb
CHANGED
@@ -1,7 +1,4 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
4
|
-
require
|
5
|
-
|
6
|
-
module NgramsParser
|
7
|
-
end
|
1
|
+
require 'ngrams_parser/ngram'
|
2
|
+
require 'ngrams_parser/ngrams'
|
3
|
+
require 'ngrams_parser/string'
|
4
|
+
require 'ngrams_parser/version'
|
@@ -2,29 +2,29 @@
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe NgramsParser do
|
5
|
-
context
|
6
|
-
let(:klass) { NgramsParser }
|
7
|
-
|
5
|
+
context '.ngram' do
|
8
6
|
[
|
9
7
|
{
|
10
8
|
text: "Will's",
|
11
|
-
digrams: [
|
12
|
-
trigrams: [
|
9
|
+
digrams: ['Wi', 'il', 'll', "l'", "'s", 's '],
|
10
|
+
trigrams: ['Wil', 'ill', "ll'", "l's", "'s ", 's '],
|
13
11
|
},
|
14
12
|
{
|
15
|
-
text:
|
16
|
-
digrams: [
|
17
|
-
trigrams: [
|
13
|
+
text: 'činčilový',
|
14
|
+
digrams: ['či', 'in', 'nč', 'či', 'il', 'lo', 'ov', 'vý', 'ý '],
|
15
|
+
trigrams: [
|
16
|
+
'čin', 'inč', 'nči', 'čil', 'ilo', 'lov', 'ový', 'vý ', 'ý '
|
17
|
+
]
|
18
18
|
}
|
19
19
|
].each do |hash|
|
20
20
|
text, bigrams, trigrams = hash.values
|
21
21
|
|
22
22
|
it "split word '#{text}' into bigrams: #{bigrams}" do
|
23
|
-
|
23
|
+
subject.ngram(text, 2).should eq(bigrams)
|
24
24
|
end
|
25
25
|
|
26
26
|
it "split word '#{text}' into trigrams: #{trigrams}" do
|
27
|
-
|
27
|
+
subject.ngram(text, 3).should eq(trigrams)
|
28
28
|
end
|
29
29
|
end
|
30
30
|
end
|
@@ -2,121 +2,124 @@
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe NgramsParser do
|
5
|
-
context
|
5
|
+
context '.ngrams' do
|
6
6
|
let(:klass) { NgramsParser }
|
7
7
|
|
8
8
|
[
|
9
9
|
{
|
10
10
|
text: "Will will Will will Will's will to Will?",
|
11
11
|
bigrams: [
|
12
|
-
[
|
13
|
-
[
|
14
|
-
[
|
15
|
-
[
|
16
|
-
[
|
12
|
+
['Wi', 'il', 'll', 'l ', 'wi', 'il', 'll', 'l '],
|
13
|
+
['Wi', 'il', 'll', 'l ', 'wi', 'il', 'll', 'l '],
|
14
|
+
['Wi', 'il', 'll', "l'", "'s", 's ', 'wi', 'il', 'll', 'l '],
|
15
|
+
['to', 'o '],
|
16
|
+
['Wi', 'il', 'll', 'l ']
|
17
17
|
].flatten,
|
18
18
|
trigrams: [
|
19
|
-
[
|
20
|
-
[
|
21
|
-
[
|
22
|
-
[
|
23
|
-
[
|
24
|
-
[
|
25
|
-
[
|
19
|
+
['Wil', 'ill', 'll ', 'l '],
|
20
|
+
['wil', 'ill', 'll ', 'l '],
|
21
|
+
['Wil', 'ill', 'll ', 'l '],
|
22
|
+
['wil', 'ill', 'll ', 'l '],
|
23
|
+
['Wil', 'ill', "ll'", "l's", "'s ", 's '],
|
24
|
+
['wil', 'ill', 'll ', 'l '],
|
25
|
+
['to ', 'o ', 'Wil', 'ill', 'll ', 'l ']
|
26
26
|
].flatten
|
27
27
|
},
|
28
28
|
{
|
29
|
-
text:
|
29
|
+
text: 'Acht alte Ameisen aßen am Abend Ananas.',
|
30
30
|
bigrams: [
|
31
|
-
[
|
32
|
-
[
|
33
|
-
[
|
34
|
-
[
|
35
|
-
[
|
36
|
-
[
|
37
|
-
[
|
31
|
+
['Ac', 'ch', 'ht', 't '],
|
32
|
+
['al', 'lt', 'te', 'e '],
|
33
|
+
['Am', 'me', 'ei', 'is', 'se', 'en', 'n '],
|
34
|
+
['aß', 'ße', 'en', 'n '],
|
35
|
+
['am', 'm '],
|
36
|
+
['Ab', 'be', 'en', 'nd', 'd '],
|
37
|
+
['An', 'na', 'an', 'na', 'as', 's ']
|
38
38
|
].flatten,
|
39
39
|
trigrams: [
|
40
|
-
[
|
41
|
-
[
|
42
|
-
[
|
43
|
-
[
|
44
|
-
[
|
45
|
-
[
|
46
|
-
[
|
40
|
+
['Ach', 'cht', 'ht ', 't '],
|
41
|
+
['alt', 'lte', 'te ', 'e '],
|
42
|
+
['Ame', 'mei', 'eis', 'ise', 'sen', 'en ', 'n '],
|
43
|
+
['aße', 'ßen', 'en ', 'n '],
|
44
|
+
['am ', 'm '],
|
45
|
+
['Abe', 'ben', 'end', 'nd ', 'd '],
|
46
|
+
['Ana', 'nan', 'ana', 'nas', 'as ', 's ']
|
47
47
|
].flatten
|
48
48
|
},
|
49
49
|
{
|
50
|
-
text:
|
51
|
-
bigrams: [
|
50
|
+
text: 'Ödögidöggi',
|
51
|
+
bigrams: ['Öd', 'dö', 'ög', 'gi', 'id', 'dö', 'ög', 'gg', 'gi', 'i '],
|
52
52
|
trigrams: [
|
53
|
-
[
|
53
|
+
[
|
54
|
+
'Ödö', 'dög', 'ögi', 'gid', 'idö',
|
55
|
+
'dög', 'ögg', 'ggi', 'gi ', 'i '
|
56
|
+
]
|
54
57
|
].flatten
|
55
58
|
},
|
56
59
|
{
|
57
|
-
text:
|
60
|
+
text: 'Ćma ćmę ćmi.',
|
58
61
|
bigrams: [
|
59
|
-
[
|
62
|
+
['Ćm', 'ma', 'a ', 'ćm', 'mę', 'ę ', 'ćm', 'mi', 'i ']
|
60
63
|
].flatten,
|
61
64
|
trigrams: [
|
62
|
-
[
|
65
|
+
['Ćma', 'ma ', 'a ', 'ćmę', 'mę ', 'ę ', 'ćmi', 'mi ', 'i ']
|
63
66
|
].flatten
|
64
67
|
},
|
65
68
|
{
|
66
|
-
text:
|
69
|
+
text: 'Łzy złej zołzy',
|
67
70
|
bigrams: [
|
68
|
-
[
|
69
|
-
[
|
71
|
+
['Łz', 'zy', 'y ', 'zł', 'łe', 'ej', 'j '],
|
72
|
+
['zo', 'oł', 'łz', 'zy', 'y ']
|
70
73
|
].flatten,
|
71
74
|
trigrams: [
|
72
|
-
[
|
73
|
-
[
|
75
|
+
['Łzy', 'zy ', 'y ', 'złe', 'łej', 'ej ', 'j '],
|
76
|
+
['zoł', 'ołz', 'łzy', 'zy ', 'y ']
|
74
77
|
].flatten
|
75
78
|
},
|
76
79
|
{
|
77
|
-
text:
|
80
|
+
text: 'Żubr żuł żuchwą żurawinę.',
|
78
81
|
bigrams: [
|
79
|
-
[
|
80
|
-
[
|
81
|
-
[
|
82
|
+
['Żu', 'ub', 'br', 'r ', 'żu', 'uł', 'ł '],
|
83
|
+
['żu', 'uc', 'ch', 'hw', 'wą', 'ą '],
|
84
|
+
['żu', 'ur', 'ra', 'aw', 'wi', 'in', 'nę', 'ę ']
|
82
85
|
].flatten,
|
83
86
|
trigrams: [
|
84
|
-
[
|
85
|
-
[
|
86
|
-
[
|
87
|
+
['Żub', 'ubr', 'br ', 'r ', 'żuł', 'uł ', 'ł '],
|
88
|
+
['żuc', 'uch', 'chw', 'hwą', 'wą ', 'ą '],
|
89
|
+
['żur', 'ura', 'raw', 'awi', 'win', 'inę', 'nę ', 'ę ']
|
87
90
|
].flatten
|
88
91
|
},
|
89
92
|
{
|
90
|
-
text:
|
93
|
+
text: 'Čistý s Čistou čistili činčilový čepec.',
|
91
94
|
bigrams: [
|
92
|
-
[
|
93
|
-
[
|
94
|
-
[
|
95
|
+
['Či', 'is', 'st', 'tý', 'ý ', 's ', 'Či', 'is', 'st', 'to', 'ou'],
|
96
|
+
['u ', 'či', 'is', 'st', 'ti', 'il', 'li', 'i ', 'či', 'in', 'nč'],
|
97
|
+
['či', 'il', 'lo', 'ov', 'vý', 'ý ', 'če', 'ep', 'pe', 'ec', 'c ']
|
95
98
|
].flatten,
|
96
99
|
trigrams: [
|
97
|
-
[
|
98
|
-
[
|
99
|
-
[
|
100
|
-
[
|
100
|
+
['Čis', 'ist', 'stý', 'tý ', 'ý ', 's ', 'Čis', 'ist', 'sto'],
|
101
|
+
['tou', 'ou ', 'u ', 'čis', 'ist', 'sti', 'til', 'ili', 'li '],
|
102
|
+
['i ', 'čin', 'inč', 'nči', 'čil', 'ilo', 'lov', 'ový', 'vý '],
|
103
|
+
['ý ', 'čep', 'epe', 'pec', 'ec ', 'c ']
|
101
104
|
].flatten
|
102
105
|
},
|
103
106
|
{
|
104
|
-
text:
|
107
|
+
text: '99 bottles of beer on the wall,',
|
105
108
|
bigrams: [
|
106
|
-
[
|
107
|
-
[
|
108
|
-
[
|
109
|
-
[
|
110
|
-
[
|
111
|
-
[
|
109
|
+
['bo', 'ot', 'tt', 'tl', 'le', 'es', 's '],
|
110
|
+
['of', 'f '],
|
111
|
+
['be', 'ee', 'er', 'r '],
|
112
|
+
['on', 'n '],
|
113
|
+
['th', 'he', 'e '],
|
114
|
+
['wa', 'al', 'll', 'l ']
|
112
115
|
].flatten,
|
113
116
|
trigrams: [
|
114
|
-
[
|
115
|
-
[
|
116
|
-
[
|
117
|
-
[
|
118
|
-
[
|
119
|
-
[
|
117
|
+
['bot', 'ott', 'ttl', 'tle', 'les', 'es ', 's '],
|
118
|
+
['of ', 'f '],
|
119
|
+
['bee', 'eer', 'er ', 'r '],
|
120
|
+
['on ', 'n '],
|
121
|
+
['the', 'he ', 'e '],
|
122
|
+
['wal', 'all', 'll ', 'l ']
|
120
123
|
].flatten
|
121
124
|
}
|
122
125
|
].each do |hash|
|
@@ -2,24 +2,25 @@
|
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
4
|
describe NgramsParser::String do
|
5
|
+
# Testing class
|
5
6
|
class String
|
6
7
|
include NgramsParser::String
|
7
8
|
end
|
8
9
|
|
9
|
-
context
|
10
|
-
it
|
11
|
-
string =
|
10
|
+
context '#ngrams' do
|
11
|
+
it 'splits String into ngrams' do
|
12
|
+
string = 'Lorem ipsum'
|
12
13
|
bigrams = [
|
13
|
-
|
14
|
-
|
14
|
+
'Lo', 'or', 're', 'em', 'm ',
|
15
|
+
'ip', 'ps', 'su', 'um', 'm '
|
15
16
|
]
|
16
17
|
trigrams = [
|
17
|
-
|
18
|
-
|
18
|
+
'Lor', 'ore', 'rem', 'em ', 'm ',
|
19
|
+
'ips', 'psu', 'sum', 'um ', 'm '
|
19
20
|
]
|
20
21
|
quadgrams = [
|
21
|
-
|
22
|
-
|
22
|
+
'Lore', 'orem', 'rem ', 'em ', 'm ',
|
23
|
+
'ipsu', 'psum', 'sum ', 'um ', 'm '
|
23
24
|
]
|
24
25
|
|
25
26
|
string.ngrams(2).should eq(bigrams)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ngrams_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aleksander Malaszkiewicz
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-08-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|