ngrams_parser 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4050a66b4d50b418ea1c4deffbb033a4b31f84ed
4
- data.tar.gz: 54acdd2b29a1a17efc1e933fc3e938355629ae62
3
+ metadata.gz: 6c3cdef6702ffbc5a36f42d454a3bb491abf72f0
4
+ data.tar.gz: 69e88fdec44a26144c90f668be1c80ed97c804e3
5
5
  SHA512:
6
- metadata.gz: cb385713b8d33dc66af608534b74eebb9c60d73f6813b98d6f6d83e3fc1401e88cd530dd5d72e7d0e96745f1036b64df1be0ca21593bdd945a310d0019874564
7
- data.tar.gz: d3af55e33005e70ce2efc638262789e0ba3e772107fb60203ea697ea936b8f84f038708596d4aa5a2e8c1728cf009b0cde645b213840ad41c2aba4629c1dd14f
6
+ metadata.gz: a1c60d03f3802948359c7e3993a315e62b481a15179b7ae2f9c5c0bc44066f1bf3919d59f8b15a7d35288414e074448ca0def7815fc58f034091f95f495d2f83
7
+ data.tar.gz: 5c936796efb35bddadfe3d34381c5cba344ddff8c2d86fe169a95bed70ed6086c785a1aee19509ba9f6204084e6c079ce4d1d1d4a475d131d9985391b9f9073e
data/.travis.yml CHANGED
@@ -2,4 +2,3 @@ language: ruby
2
2
  rvm:
3
3
  - 1.9.3
4
4
  - 2.0.0
5
- - jruby-head
data/CHANGELOG.md CHANGED
@@ -9,3 +9,7 @@
9
9
  ## v0.0.3
10
10
 
11
11
  * ngrams without digits
12
+
13
+ ## v0.0.4
14
+
15
+ * code clean up
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # NgramsParser [![Gem Version](https://badge.fury.io/rb/ngrams_parser.png)](http://badge.fury.io/rb/ngrams_parser) [![Build Status](https://travis-ci.org/fractalsoft/ngrams_parser.png)](https://travis-ci.org/fractalsoft/ngrams_parser) [![Dependency Status](https://gemnasium.com/fractalsoft/ngrams_parser.png)](https://gemnasium.com/fractalsoft/ngrams_parser) [![Coverage Status](https://coveralls.io/repos/fractalsoft/ngrams_parser/badge.png)](https://coveralls.io/r/fractalsoft/ngrams_parser)
1
+ # NgramsParser [![Gem Version](https://badge.fury.io/rb/ngrams_parser.png)](http://badge.fury.io/rb/ngrams_parser) [![Build Status](https://travis-ci.org/fractalsoft/ngrams_parser.png)](https://travis-ci.org/fractalsoft/ngrams_parser) [![Dependency Status](https://gemnasium.com/fractalsoft/ngrams_parser.png)](https://gemnasium.com/fractalsoft/ngrams_parser) [![Coverage Status](https://coveralls.io/repos/fractalsoft/ngrams_parser/badge.png)](https://coveralls.io/r/fractalsoft/ngrams_parser) [![Stories in Ready](https://badge.waffle.io/fractalsoft/ngrams_parser.png)](http://waffle.io/fractalsoft/ngrams_parser)
2
2
  [![endorse](https://api.coderwall.com/torrocus/endorsecount.png)](https://coderwall.com/torrocus)
3
3
 
4
4
  N-gram is a contiguous sequence of n items from a given sequence of text or speech. The items are letters, but can be phonemes, syllables, words or base pairs according to the application. The n-grams typically are collected from a text or speech corpus.
@@ -1,4 +1,6 @@
1
1
  # coding: utf-8
2
+
3
+ # Parse word to ngrams
2
4
  module NgramsParser
3
5
  # Split word into ngrams
4
6
  #
@@ -7,7 +9,7 @@ module NgramsParser
7
9
  def self.ngram(word, size)
8
10
  array = []
9
11
  word.split('').each_index do |index|
10
- text = word[index..index+size-1]
12
+ text = word[index..index + size - 1]
11
13
  array << text.ljust(size, ' ')
12
14
  end
13
15
  array
@@ -1,10 +1,11 @@
1
1
  # coding: utf-8
2
2
  require 'lexical_units'
3
3
 
4
+ # Parse given *text* into ngrams
4
5
  module NgramsParser
5
6
  def self.ngrams(text, size)
6
7
  array = []
7
- LexicalUnits::words_without_digits(text).each do |word|
8
+ LexicalUnits.words_without_digits(text).each do |word|
8
9
  array << ngram(word, size)
9
10
  end
10
11
  array.flatten
@@ -1,8 +1,10 @@
1
1
  # coding: utf-8
2
+
2
3
  module NgramsParser
4
+ # Use ngrams inside String class
3
5
  module String
4
6
  def ngrams(size)
5
- NgramsParser::ngrams(self, size)
7
+ NgramsParser.ngrams(self, size)
6
8
  end
7
9
  end
8
10
  end
@@ -1,3 +1,4 @@
1
+ # Gem version
1
2
  module NgramsParser
2
- VERSION = "0.0.3"
3
+ VERSION = '0.0.4'
3
4
  end
data/lib/ngrams_parser.rb CHANGED
@@ -1,7 +1,4 @@
1
- require "ngrams_parser/ngram"
2
- require "ngrams_parser/ngrams"
3
- require "ngrams_parser/string"
4
- require "ngrams_parser/version"
5
-
6
- module NgramsParser
7
- end
1
+ require 'ngrams_parser/ngram'
2
+ require 'ngrams_parser/ngrams'
3
+ require 'ngrams_parser/string'
4
+ require 'ngrams_parser/version'
@@ -2,29 +2,29 @@
2
2
  require 'spec_helper'
3
3
 
4
4
  describe NgramsParser do
5
- context ".ngram" do
6
- let(:klass) { NgramsParser }
7
-
5
+ context '.ngram' do
8
6
  [
9
7
  {
10
8
  text: "Will's",
11
- digrams: ["Wi", "il", "ll", "l'", "'s", "s "],
12
- trigrams: ["Wil", "ill", "ll'", "l's", "'s ", "s "],
9
+ digrams: ['Wi', 'il', 'll', "l'", "'s", 's '],
10
+ trigrams: ['Wil', 'ill', "ll'", "l's", "'s ", 's '],
13
11
  },
14
12
  {
15
- text: "činčilový",
16
- digrams: ["či", "in", "nč", "či", "il", "lo", "ov", "vý", "ý "],
17
- trigrams: ["čin", "inč", "nči", "čil", "ilo", "lov", "ový", "vý ", "ý "]
13
+ text: 'činčilový',
14
+ digrams: ['či', 'in', 'nč', 'či', 'il', 'lo', 'ov', 'vý', 'ý '],
15
+ trigrams: [
16
+ 'čin', 'inč', 'nči', 'čil', 'ilo', 'lov', 'ový', 'vý ', 'ý '
17
+ ]
18
18
  }
19
19
  ].each do |hash|
20
20
  text, bigrams, trigrams = hash.values
21
21
 
22
22
  it "split word '#{text}' into bigrams: #{bigrams}" do
23
- klass::ngram(text, 2).should eq(bigrams)
23
+ subject.ngram(text, 2).should eq(bigrams)
24
24
  end
25
25
 
26
26
  it "split word '#{text}' into trigrams: #{trigrams}" do
27
- klass::ngram(text, 3).should eq(trigrams)
27
+ subject.ngram(text, 3).should eq(trigrams)
28
28
  end
29
29
  end
30
30
  end
@@ -2,121 +2,124 @@
2
2
  require 'spec_helper'
3
3
 
4
4
  describe NgramsParser do
5
- context ".ngrams" do
5
+ context '.ngrams' do
6
6
  let(:klass) { NgramsParser }
7
7
 
8
8
  [
9
9
  {
10
10
  text: "Will will Will will Will's will to Will?",
11
11
  bigrams: [
12
- ["Wi", "il", "ll", "l ", "wi", "il", "ll", "l "],
13
- ["Wi", "il", "ll", "l ", "wi", "il", "ll", "l "],
14
- ["Wi", "il", "ll", "l'", "'s", "s ", "wi", "il", "ll", "l "],
15
- ["to", "o "],
16
- ["Wi", "il", "ll", "l "]
12
+ ['Wi', 'il', 'll', 'l ', 'wi', 'il', 'll', 'l '],
13
+ ['Wi', 'il', 'll', 'l ', 'wi', 'il', 'll', 'l '],
14
+ ['Wi', 'il', 'll', "l'", "'s", 's ', 'wi', 'il', 'll', 'l '],
15
+ ['to', 'o '],
16
+ ['Wi', 'il', 'll', 'l ']
17
17
  ].flatten,
18
18
  trigrams: [
19
- ["Wil", "ill", "ll ","l "],
20
- ["wil", "ill", "ll ", "l "],
21
- ["Wil", "ill", "ll ","l "],
22
- ["wil", "ill", "ll ", "l "],
23
- ["Wil", "ill", "ll'","l's", "'s ", "s "],
24
- ["wil", "ill", "ll ", "l "],
25
- ["to ", "o ", "Wil", "ill", "ll ", "l "]
19
+ ['Wil', 'ill', 'll ', 'l '],
20
+ ['wil', 'ill', 'll ', 'l '],
21
+ ['Wil', 'ill', 'll ', 'l '],
22
+ ['wil', 'ill', 'll ', 'l '],
23
+ ['Wil', 'ill', "ll'", "l's", "'s ", 's '],
24
+ ['wil', 'ill', 'll ', 'l '],
25
+ ['to ', 'o ', 'Wil', 'ill', 'll ', 'l ']
26
26
  ].flatten
27
27
  },
28
28
  {
29
- text: "Acht alte Ameisen aßen am Abend Ananas.",
29
+ text: 'Acht alte Ameisen aßen am Abend Ananas.',
30
30
  bigrams: [
31
- ["Ac", "ch", "ht", "t "],
32
- ["al", "lt", "te", "e "],
33
- ["Am", "me", "ei", "is", "se", "en", "n "],
34
- ["aß", "ße", "en", "n "],
35
- ["am", "m "],
36
- ["Ab", "be", "en", "nd", "d "],
37
- ["An", "na", "an", "na", "as", "s "]
31
+ ['Ac', 'ch', 'ht', 't '],
32
+ ['al', 'lt', 'te', 'e '],
33
+ ['Am', 'me', 'ei', 'is', 'se', 'en', 'n '],
34
+ ['aß', 'ße', 'en', 'n '],
35
+ ['am', 'm '],
36
+ ['Ab', 'be', 'en', 'nd', 'd '],
37
+ ['An', 'na', 'an', 'na', 'as', 's ']
38
38
  ].flatten,
39
39
  trigrams: [
40
- ["Ach", "cht", "ht ", "t "],
41
- ["alt", "lte", "te ", "e "],
42
- ["Ame", "mei", "eis", "ise", "sen", "en ", "n "],
43
- ["aße", "ßen", "en ", "n "],
44
- ["am ", "m "],
45
- ["Abe", "ben", "end", "nd ", "d "],
46
- ["Ana", "nan", "ana", "nas", "as ", "s "]
40
+ ['Ach', 'cht', 'ht ', 't '],
41
+ ['alt', 'lte', 'te ', 'e '],
42
+ ['Ame', 'mei', 'eis', 'ise', 'sen', 'en ', 'n '],
43
+ ['aße', 'ßen', 'en ', 'n '],
44
+ ['am ', 'm '],
45
+ ['Abe', 'ben', 'end', 'nd ', 'd '],
46
+ ['Ana', 'nan', 'ana', 'nas', 'as ', 's ']
47
47
  ].flatten
48
48
  },
49
49
  {
50
- text: "Ödögidöggi",
51
- bigrams: ["Öd", "dö", "ög", "gi", "id", "dö", "ög", "gg", "gi", "i "],
50
+ text: 'Ödögidöggi',
51
+ bigrams: ['Öd', 'dö', 'ög', 'gi', 'id', 'dö', 'ög', 'gg', 'gi', 'i '],
52
52
  trigrams: [
53
- ["Ödö", "dög", "ögi", "gid", "idö", "dög", "ögg", "ggi", "gi ", "i "]
53
+ [
54
+ 'Ödö', 'dög', 'ögi', 'gid', 'idö',
55
+ 'dög', 'ögg', 'ggi', 'gi ', 'i '
56
+ ]
54
57
  ].flatten
55
58
  },
56
59
  {
57
- text: "Ćma ćmę ćmi.",
60
+ text: 'Ćma ćmę ćmi.',
58
61
  bigrams: [
59
- ["Ćm", "ma", "a ", "ćm", "mę", "ę ", "ćm", "mi", "i "]
62
+ ['Ćm', 'ma', 'a ', 'ćm', 'mę', 'ę ', 'ćm', 'mi', 'i ']
60
63
  ].flatten,
61
64
  trigrams: [
62
- ["Ćma", "ma ", "a ", "ćmę", "mę ", "ę ", "ćmi", "mi ", "i "]
65
+ ['Ćma', 'ma ', 'a ', 'ćmę', 'mę ', 'ę ', 'ćmi', 'mi ', 'i ']
63
66
  ].flatten
64
67
  },
65
68
  {
66
- text: "Łzy złej zołzy",
69
+ text: 'Łzy złej zołzy',
67
70
  bigrams: [
68
- ["Łz", "zy", "y ", "zł", "łe", "ej", "j "],
69
- ["zo", "oł", "łz", "zy", "y "]
71
+ ['Łz', 'zy', 'y ', 'zł', 'łe', 'ej', 'j '],
72
+ ['zo', 'oł', 'łz', 'zy', 'y ']
70
73
  ].flatten,
71
74
  trigrams: [
72
- ["Łzy", "zy ", "y ", "złe", "łej", "ej ", "j "],
73
- ["zoł", "ołz", "łzy", "zy ", "y "]
75
+ ['Łzy', 'zy ', 'y ', 'złe', 'łej', 'ej ', 'j '],
76
+ ['zoł', 'ołz', 'łzy', 'zy ', 'y ']
74
77
  ].flatten
75
78
  },
76
79
  {
77
- text: "Żubr żuł żuchwą żurawinę.",
80
+ text: 'Żubr żuł żuchwą żurawinę.',
78
81
  bigrams: [
79
- ["Żu", "ub", "br", "r ", "żu", "uł", "ł "],
80
- ["żu", "uc", "ch", "hw", "wą", "ą "],
81
- ["żu", "ur", "ra", "aw", "wi", "in", "nę", "ę "]
82
+ ['Żu', 'ub', 'br', 'r ', 'żu', 'uł', 'ł '],
83
+ ['żu', 'uc', 'ch', 'hw', 'wą', 'ą '],
84
+ ['żu', 'ur', 'ra', 'aw', 'wi', 'in', 'nę', 'ę ']
82
85
  ].flatten,
83
86
  trigrams: [
84
- ["Żub", "ubr", "br ", "r ", "żuł", "uł ", "ł "],
85
- ["żuc", "uch", "chw", "hwą", "wą ", "ą "],
86
- ["żur", "ura", "raw", "awi", "win", "inę", "nę ", "ę "]
87
+ ['Żub', 'ubr', 'br ', 'r ', 'żuł', 'uł ', 'ł '],
88
+ ['żuc', 'uch', 'chw', 'hwą', 'wą ', 'ą '],
89
+ ['żur', 'ura', 'raw', 'awi', 'win', 'inę', 'nę ', 'ę ']
87
90
  ].flatten
88
91
  },
89
92
  {
90
- text: "Čistý s Čistou čistili činčilový čepec.",
93
+ text: 'Čistý s Čistou čistili činčilový čepec.',
91
94
  bigrams: [
92
- ["Či", "is", "st", "tý", "ý ", "s ", "Či", "is", "st", "to", "ou"],
93
- ["u ", "či", "is", "st", "ti", "il", "li", "i ", "či", "in", "nč"],
94
- ["či", "il", "lo", "ov", "vý", "ý ", "če", "ep", "pe", "ec", "c "]
95
+ ['Či', 'is', 'st', 'tý', 'ý ', 's ', 'Či', 'is', 'st', 'to', 'ou'],
96
+ ['u ', 'či', 'is', 'st', 'ti', 'il', 'li', 'i ', 'či', 'in', 'nč'],
97
+ ['či', 'il', 'lo', 'ov', 'vý', 'ý ', 'če', 'ep', 'pe', 'ec', 'c ']
95
98
  ].flatten,
96
99
  trigrams: [
97
- ["Čis", "ist", "stý", "tý ", "ý ", "s ", "Čis", "ist", "sto"],
98
- ["tou", "ou ", "u ", "čis", "ist", "sti", "til", "ili", "li "],
99
- ["i ", "čin", "inč", "nči", "čil", "ilo", "lov", "ový", "vý "],
100
- ["ý ", "čep", "epe", "pec", "ec ", "c "]
100
+ ['Čis', 'ist', 'stý', 'tý ', 'ý ', 's ', 'Čis', 'ist', 'sto'],
101
+ ['tou', 'ou ', 'u ', 'čis', 'ist', 'sti', 'til', 'ili', 'li '],
102
+ ['i ', 'čin', 'inč', 'nči', 'čil', 'ilo', 'lov', 'ový', 'vý '],
103
+ ['ý ', 'čep', 'epe', 'pec', 'ec ', 'c ']
101
104
  ].flatten
102
105
  },
103
106
  {
104
- text: "99 bottles of beer on the wall,",
107
+ text: '99 bottles of beer on the wall,',
105
108
  bigrams: [
106
- ["bo", "ot", "tt", "tl", "le", "es", "s "],
107
- ["of", "f "],
108
- ["be", "ee", "er", "r "],
109
- ["on", "n "],
110
- ["th", "he", "e "],
111
- ["wa", "al", "ll", "l "]
109
+ ['bo', 'ot', 'tt', 'tl', 'le', 'es', 's '],
110
+ ['of', 'f '],
111
+ ['be', 'ee', 'er', 'r '],
112
+ ['on', 'n '],
113
+ ['th', 'he', 'e '],
114
+ ['wa', 'al', 'll', 'l ']
112
115
  ].flatten,
113
116
  trigrams: [
114
- ["bot", "ott", "ttl", "tle", "les", "es ", "s "],
115
- ["of ", "f "],
116
- ["bee", "eer", "er ", "r "],
117
- ["on ", "n "],
118
- ["the", "he ", "e "],
119
- ["wal", "all", "ll ", "l "]
117
+ ['bot', 'ott', 'ttl', 'tle', 'les', 'es ', 's '],
118
+ ['of ', 'f '],
119
+ ['bee', 'eer', 'er ', 'r '],
120
+ ['on ', 'n '],
121
+ ['the', 'he ', 'e '],
122
+ ['wal', 'all', 'll ', 'l ']
120
123
  ].flatten
121
124
  }
122
125
  ].each do |hash|
@@ -2,24 +2,25 @@
2
2
  require 'spec_helper'
3
3
 
4
4
  describe NgramsParser::String do
5
+ # Testing class
5
6
  class String
6
7
  include NgramsParser::String
7
8
  end
8
9
 
9
- context "#ngrams" do
10
- it "splits String into ngrams" do
11
- string = "Lorem ipsum"
10
+ context '#ngrams' do
11
+ it 'splits String into ngrams' do
12
+ string = 'Lorem ipsum'
12
13
  bigrams = [
13
- "Lo", "or", "re", "em", "m ",
14
- "ip", "ps", "su", "um", "m "
14
+ 'Lo', 'or', 're', 'em', 'm ',
15
+ 'ip', 'ps', 'su', 'um', 'm '
15
16
  ]
16
17
  trigrams = [
17
- "Lor", "ore", "rem", "em ", "m ",
18
- "ips", "psu", "sum", "um ", "m "
18
+ 'Lor', 'ore', 'rem', 'em ', 'm ',
19
+ 'ips', 'psu', 'sum', 'um ', 'm '
19
20
  ]
20
21
  quadgrams = [
21
- "Lore", "orem", "rem ", "em ", "m ",
22
- "ipsu", "psum", "sum ", "um ", "m "
22
+ 'Lore', 'orem', 'rem ', 'em ', 'm ',
23
+ 'ipsu', 'psum', 'sum ', 'um ', 'm '
23
24
  ]
24
25
 
25
26
  string.ngrams(2).should eq(bigrams)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ngrams_parser
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aleksander Malaszkiewicz
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-07-16 00:00:00.000000000 Z
11
+ date: 2013-08-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler