ngrams_parser 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: fed54ec83869838172748c8260f8c48eeeed5be0
4
+ data.tar.gz: af4d624af9d708cfcd8624d32a6a605350ff554d
5
+ SHA512:
6
+ metadata.gz: 633367af35abcb5298894de7464513fc271290ebe9fa21ccb198aadacdae3f1bb91aa8fd769088d2d4330f2d354dca3fd5057b1e70b0a133a0990feb4cea24b6
7
+ data.tar.gz: e359a7dc70271d7ee7c87715bdfc59de9c1c17c6ecc7f4018c86cb7311ca1883f41904aa60d32c694c5cded703ec90b13eed0864d031bfa5ec5f1b1fea9e49ab
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format documentation
data/.ruby-gemset ADDED
@@ -0,0 +1 @@
1
+ ngrams_parser
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 2.0.0
data/CHANGELOG.md ADDED
@@ -0,0 +1,3 @@
1
+ ## v0.0.1
2
+
3
+ * initial release
data/Gemfile ADDED
@@ -0,0 +1,13 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in ngrams_parser.gemspec
4
+ gemspec
5
+
6
+ group :development, :test do
7
+ gem 'coveralls', require: false
8
+ gem 'rspec'
9
+ gem 'reek'
10
+ gem 'guard'
11
+ gem 'guard-bundler'
12
+ gem 'guard-rspec'
13
+ end
data/Guardfile ADDED
@@ -0,0 +1,13 @@
1
+ guard 'bundler' do
2
+ watch('Gemfile')
3
+ watch(/^.+\.gemspec/)
4
+ end
5
+
6
+ guard :rspec do
7
+ watch(%r{^spec/.+_spec\.rb$})
8
+ watch(%r{^lib/(.+)\.rb$}) do |array|
9
+ name = array.last
10
+ "spec/#{name}_spec.rb"
11
+ end
12
+ watch('spec/spec_helper.rb') { "spec" }
13
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Fractal Soft
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,43 @@
1
+ # NgramsParser [![Gem Version](https://badge.fury.io/rb/ngrams_parser.png)](http://badge.fury.io/rb/ngrams_parser) [![Build Status](https://travis-ci.org/fractalsoft/ngrams_parser.png)](https://travis-ci.org/fractalsoft/ngrams_parser) [![Dependency Status](https://gemnasium.com/fractalsoft/ngrams_parser.png)](https://gemnasium.com/fractalsoft/ngrams_parser) [![Coverage Status](https://coveralls.io/repos/fractalsoft/ngrams_parser/badge.png)](https://coveralls.io/r/fractalsoft/ngrams_parser)
2
+
3
+ An n-gram is a contiguous sequence of n items from a given sequence of text or speech. The items here are letters, but depending on the application they can also be phonemes, syllables, words or base pairs. N-grams are typically collected from a text or speech corpus.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'ngrams_parser'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install ngrams_parser
18
+
19
+ ## Usage
20
+
21
+ ```ruby
22
+ # splits single word into ngrams
23
+ NgramsParser::ngram(word, size)
24
+
25
+ # splits given text into ngrams
26
+ NgramsParser::ngrams(text, size)
27
+ ```
28
+
29
+ You can also mix the `ngrams` method into the `String` class:
30
+
31
+ ```ruby
32
+ class String
33
+ include NgramsParser::String
34
+ end
35
+ ```
36
+
37
+ ## Contributing
38
+
39
+ 1. Fork it
40
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
41
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
42
+ 4. Push to the branch (`git push origin my-new-feature`)
43
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
+ desc 'Default: run specs.'
5
+ task default: :spec
6
+
7
+ desc "Run specs"
8
+ RSpec::Core::RakeTask.new do |t|
9
+ t.rspec_opts = "--tag fast"
10
+ end
@@ -0,0 +1,7 @@
1
+ require "ngrams_parser/ngram"
2
+ require "ngrams_parser/ngrams"
3
+ require "ngrams_parser/string"
4
+ require "ngrams_parser/version"
5
+
6
+ module NgramsParser
7
+ end
@@ -0,0 +1,15 @@
1
+ # coding: utf-8
2
module NgramsParser
  # Split a single word into character n-grams.
  #
  # One n-gram is produced for every character of +word+, starting at that
  # character. N-grams that would run past the end of the word are padded on
  # the right with spaces, so every element is exactly +size+ characters long.
  #
  #   NgramsParser.ngram("lorem", 2) #=> ["lo", "or", "re", "em", "m "]
  #   NgramsParser.ngram("ipsum", 3) #=> ["ips", "psu", "sum", "um ", "m  "]
  #   NgramsParser.ngram("", 2)      #=> []
  #
  # word:: String to split; it is indexed by character, not by byte.
  # size:: Length of each n-gram; must be at least 1.
  #
  # Returns an Array of Strings with one entry per character of +word+.
  # Raises ArgumentError when +size+ is smaller than 1.
  def self.ngram(word, size)
    # A non-positive size has no meaningful n-gram; fail loudly instead of
    # returning empty or arbitrarily sliced strings.
    raise ArgumentError, "size must be at least 1, got #{size.inspect}" if size < 1

    # word[index, size] is truncated at the end of the word; ljust pads the
    # short tail n-grams with spaces up to the requested width.
    Array.new(word.length) { |index| word[index, size].ljust(size) }
  end
end
@@ -0,0 +1,12 @@
1
+ # coding: utf-8
2
+ require 'lexical_units'
3
+
4
module NgramsParser
  # Split a text into n-grams, word by word.
  #
  # The text is first broken into words by LexicalUnits.words; each word is
  # then split with NgramsParser.ngram and the per-word results are
  # concatenated, in order, into one flat list.
  #
  # text:: String to process.
  # size:: Length of each n-gram, passed through to NgramsParser.ngram.
  #
  # Returns a flat Array of Strings.
  def self.ngrams(text, size)
    LexicalUnits.words(text).flat_map { |word| ngram(word, size) }
  end
end
@@ -0,0 +1,8 @@
1
+ # coding: utf-8
2
module NgramsParser
  # Mixin that gives the including object an +ngrams+ instance method.
  #
  # It is meant to be included into ::String:
  #
  #   class String
  #     include NgramsParser::String
  #   end
  #
  #   "Lorem ipsum".ngrams(2)
  module String
    # Split the receiver into n-grams of the given +size+ by delegating to
    # NgramsParser.ngrams with +self+ as the text.
    #
    # size:: Length of each n-gram.
    #
    # Returns whatever NgramsParser.ngrams returns for this text and size.
    def ngrams(size)
      NgramsParser.ngrams(self, size)
    end
  end
end
@@ -0,0 +1,3 @@
1
module NgramsParser
  # Release number of the gem; ngrams_parser.gemspec reads it as spec.version.
  VERSION = '0.0.1'
end
@@ -0,0 +1,24 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'ngrams_parser/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "ngrams_parser"
8
+ spec.version = NgramsParser::VERSION
9
+ spec.authors = ["Aleksander Malaszkiewicz"]
10
+ spec.email = ["info@fractalsoft.org"]
11
+ spec.summary = %q{Split text into ngrams}
12
+ spec.homepage = ""
13
+ spec.license = "MIT"
14
+
15
+ spec.files = `git ls-files`.split($/)
16
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
17
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
+ spec.require_paths = ["lib"]
19
+
20
+ spec.add_development_dependency "bundler", "~> 1.3"
21
+ spec.add_development_dependency "rake"
22
+
23
+ spec.add_dependency "lexical_units"
24
+ end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe NgramsParser do
4
+ context ".ngram" do
5
+ let(:klass) { NgramsParser }
6
+
7
+ [
8
+ {
9
+ text: "Will's",
10
+ digrams: ["Wi", "il", "ll", "l'", "'s", "s "],
11
+ trigrams: ["Wil", "ill", "ll'", "l's", "'s ", "s "],
12
+ },
13
+ {
14
+ text: "činčilový",
15
+ digrams: ["či", "in", "nč", "či", "il", "lo", "ov", "vý", "ý "],
16
+ trigrams: ["čin", "inč", "nči", "čil", "ilo", "lov", "ový", "vý ", "ý "]
17
+ }
18
+ ].each do |hash|
19
+ text, bigrams, trigrams = hash.values
20
+
21
+ it "split word '#{text}' into bigrams: #{bigrams}" do
22
+ klass::ngram(text, 2).should eq(bigrams)
23
+ end
24
+
25
+ it "split word '#{text}' into trigrams: #{trigrams}" do
26
+ klass::ngram(text, 3).should eq(trigrams)
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,114 @@
1
+ require 'spec_helper'
2
+
3
+ describe NgramsParser do
4
+ context ".ngrams" do
5
+ let(:klass) { NgramsParser }
6
+
7
+ [
8
+ {
9
+ text: "Will will Will will Will's will to Will?",
10
+ bigrams: [
11
+ ["Wi", "il", "ll", "l ", "wi", "il", "ll", "l "],
12
+ ["Wi", "il", "ll", "l ", "wi", "il", "ll", "l "],
13
+ ["Wi", "il", "ll", "l'", "'s", "s ", "wi", "il", "ll", "l "],
14
+ ["to", "o "],
15
+ ["Wi", "il", "ll", "l "]
16
+ ].flatten,
17
+ trigrams: [
18
+ ["Wil", "ill", "ll ","l "],
19
+ ["wil", "ill", "ll ", "l "],
20
+ ["Wil", "ill", "ll ","l "],
21
+ ["wil", "ill", "ll ", "l "],
22
+ ["Wil", "ill", "ll'","l's", "'s ", "s "],
23
+ ["wil", "ill", "ll ", "l "],
24
+ ["to ", "o ", "Wil", "ill", "ll ", "l "]
25
+ ].flatten
26
+ },
27
+ {
28
+ text: "Acht alte Ameisen aßen am Abend Ananas.",
29
+ bigrams: [
30
+ ["Ac", "ch", "ht", "t "],
31
+ ["al", "lt", "te", "e "],
32
+ ["Am", "me", "ei", "is", "se", "en", "n "],
33
+ ["aß", "ße", "en", "n "],
34
+ ["am", "m "],
35
+ ["Ab", "be", "en", "nd", "d "],
36
+ ["An", "na", "an", "na", "as", "s "]
37
+ ].flatten,
38
+ trigrams: [
39
+ ["Ach", "cht", "ht ", "t "],
40
+ ["alt", "lte", "te ", "e "],
41
+ ["Ame", "mei", "eis", "ise", "sen", "en ", "n "],
42
+ ["aße", "ßen", "en ", "n "],
43
+ ["am ", "m "],
44
+ ["Abe", "ben", "end", "nd ", "d "],
45
+ ["Ana", "nan", "ana", "nas", "as ", "s "]
46
+ ].flatten
47
+ },
48
+ {
49
+ text: "Ödögidöggi",
50
+ bigrams: ["Öd", "dö", "ög", "gi", "id", "dö", "ög", "gg", "gi", "i "],
51
+ trigrams: [
52
+ ["Ödö", "dög", "ögi", "gid", "idö", "dög", "ögg", "ggi", "gi ", "i "]
53
+ ].flatten
54
+ },
55
+ {
56
+ text: "Ćma ćmę ćmi.",
57
+ bigrams: [
58
+ ["Ćm", "ma", "a ", "ćm", "mę", "ę ", "ćm", "mi", "i "]
59
+ ].flatten,
60
+ trigrams: [
61
+ ["Ćma", "ma ", "a ", "ćmę", "mę ", "ę ", "ćmi", "mi ", "i "]
62
+ ].flatten
63
+ },
64
+ {
65
+ text: "Łzy złej zołzy",
66
+ bigrams: [
67
+ ["Łz", "zy", "y ", "zł", "łe", "ej", "j "],
68
+ ["zo", "oł", "łz", "zy", "y "]
69
+ ].flatten,
70
+ trigrams: [
71
+ ["Łzy", "zy ", "y ", "złe", "łej", "ej ", "j "],
72
+ ["zoł", "ołz", "łzy", "zy ", "y "]
73
+ ].flatten
74
+ },
75
+ {
76
+ text: "Żubr żuł żuchwą żurawinę.",
77
+ bigrams: [
78
+ ["Żu", "ub", "br", "r ", "żu", "uł", "ł "],
79
+ ["żu", "uc", "ch", "hw", "wą", "ą "],
80
+ ["żu", "ur", "ra", "aw", "wi", "in", "nę", "ę "]
81
+ ].flatten,
82
+ trigrams: [
83
+ ["Żub", "ubr", "br ", "r ", "żuł", "uł ", "ł "],
84
+ ["żuc", "uch", "chw", "hwą", "wą ", "ą "],
85
+ ["żur", "ura", "raw", "awi", "win", "inę", "nę ", "ę "]
86
+ ].flatten
87
+ },
88
+ {
89
+ text: "Čistý s Čistou čistili činčilový čepec.",
90
+ bigrams: [
91
+ ["Či", "is", "st", "tý", "ý ", "s ", "Či", "is", "st", "to", "ou"],
92
+ ["u ", "či", "is", "st", "ti", "il", "li", "i ", "či", "in", "nč"],
93
+ ["či", "il", "lo", "ov", "vý", "ý ", "če", "ep", "pe", "ec", "c "]
94
+ ].flatten,
95
+ trigrams: [
96
+ ["Čis", "ist", "stý", "tý ", "ý ", "s ", "Čis", "ist", "sto"],
97
+ ["tou", "ou ", "u ", "čis", "ist", "sti", "til", "ili", "li "],
98
+ ["i ", "čin", "inč", "nči", "čil", "ilo", "lov", "ový", "vý "],
99
+ ["ý ", "čep", "epe", "pec", "ec ", "c "]
100
+ ].flatten
101
+ }
102
+ ].each do |hash|
103
+ text, bigrams, trigrams = hash.values
104
+
105
+ it "split text '#{text}' into bigrams" do
106
+ klass::ngrams(text, 2).should eq(bigrams)
107
+ end
108
+
109
+ it "split text '#{text}' into trigrams" do
110
+ klass::ngrams(text, 3).should eq(trigrams)
111
+ end
112
+ end
113
+ end
114
+ end
@@ -0,0 +1,29 @@
1
+ require 'spec_helper'
2
+
3
+ describe NgramsParser::String do
4
+ class String
5
+ include NgramsParser::String
6
+ end
7
+
8
+ context "#ngrams" do
9
+ it "splits String into ngrams" do
10
+ string = "Lorem ipsum"
11
+ bigrams = [
12
+ "Lo", "or", "re", "em", "m ",
13
+ "ip", "ps", "su", "um", "m "
14
+ ]
15
+ trigrams = [
16
+ "Lor", "ore", "rem", "em ", "m ",
17
+ "ips", "psu", "sum", "um ", "m "
18
+ ]
19
+ quadgrams = [
20
+ "Lore", "orem", "rem ", "em ", "m ",
21
+ "ipsu", "psum", "sum ", "um ", "m "
22
+ ]
23
+
24
+ string.ngrams(2).should eq(bigrams)
25
+ string.ngrams(3).should eq(trigrams)
26
+ string.ngrams(4).should eq(quadgrams)
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,9 @@
1
+ require 'ngrams_parser'
2
+ require 'coveralls'
3
+ Coveralls.wear!
4
+
5
+ RSpec.configure do |config|
6
+ config.treat_symbols_as_metadata_keys_with_true_values = true
7
+ config.run_all_when_everything_filtered = true
8
+ config.filter_run :focus
9
+ end
metadata ADDED
@@ -0,0 +1,110 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ngrams_parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Aleksander Malaszkiewicz
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-07-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: lexical_units
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description:
56
+ email:
57
+ - info@fractalsoft.org
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - .gitignore
63
+ - .rspec
64
+ - .ruby-gemset
65
+ - .ruby-version
66
+ - CHANGELOG.md
67
+ - Gemfile
68
+ - Guardfile
69
+ - LICENSE.txt
70
+ - README.md
71
+ - Rakefile
72
+ - lib/ngrams_parser.rb
73
+ - lib/ngrams_parser/ngram.rb
74
+ - lib/ngrams_parser/ngrams.rb
75
+ - lib/ngrams_parser/string.rb
76
+ - lib/ngrams_parser/version.rb
77
+ - ngrams_parser.gemspec
78
+ - spec/ngrams_parser/ngram_spec.rb
79
+ - spec/ngrams_parser/ngrams_spec.rb
80
+ - spec/ngrams_parser/string_spec.rb
81
+ - spec/spec_helper.rb
82
+ homepage: ''
83
+ licenses:
84
+ - MIT
85
+ metadata: {}
86
+ post_install_message:
87
+ rdoc_options: []
88
+ require_paths:
89
+ - lib
90
+ required_ruby_version: !ruby/object:Gem::Requirement
91
+ requirements:
92
+ - - '>='
93
+ - !ruby/object:Gem::Version
94
+ version: '0'
95
+ required_rubygems_version: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - '>='
98
+ - !ruby/object:Gem::Version
99
+ version: '0'
100
+ requirements: []
101
+ rubyforge_project:
102
+ rubygems_version: 2.0.3
103
+ signing_key:
104
+ specification_version: 4
105
+ summary: Split text into ngrams
106
+ test_files:
107
+ - spec/ngrams_parser/ngram_spec.rb
108
+ - spec/ngrams_parser/ngrams_spec.rb
109
+ - spec/ngrams_parser/string_spec.rb
110
+ - spec/spec_helper.rb