ngrams_parser 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: fed54ec83869838172748c8260f8c48eeeed5be0
4
+ data.tar.gz: af4d624af9d708cfcd8624d32a6a605350ff554d
5
+ SHA512:
6
+ metadata.gz: 633367af35abcb5298894de7464513fc271290ebe9fa21ccb198aadacdae3f1bb91aa8fd769088d2d4330f2d354dca3fd5057b1e70b0a133a0990feb4cea24b6
7
+ data.tar.gz: e359a7dc70271d7ee7c87715bdfc59de9c1c17c6ecc7f4018c86cb7311ca1883f41904aa60d32c694c5cded703ec90b13eed0864d031bfa5ec5f1b1fea9e49ab
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format documentation
data/.ruby-gemset ADDED
@@ -0,0 +1 @@
1
+ ngrams_parser
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 2.0.0
data/CHANGELOG.md ADDED
@@ -0,0 +1,3 @@
1
+ ## v0.0.1
2
+
3
+ * initial release
data/Gemfile ADDED
@@ -0,0 +1,13 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in ngrams_parser.gemspec
4
+ gemspec
5
+
6
+ group :development, :test do
7
+ gem 'coveralls', require: false
8
+ gem 'rspec'
9
+ gem 'reek'
10
+ gem 'guard'
11
+ gem 'guard-bundler'
12
+ gem 'guard-rspec'
13
+ end
data/Guardfile ADDED
@@ -0,0 +1,13 @@
1
+ guard 'bundler' do
2
+ watch('Gemfile')
3
+ watch(/^.+\.gemspec/)
4
+ end
5
+
6
+ guard :rspec do
7
+ watch(%r{^spec/.+_spec\.rb$})
8
+ watch(%r{^lib/(.+)\.rb$}) do |array|
9
+ name = array.last
10
+ "spec/#{name}_spec.rb"
11
+ end
12
+ watch('spec/spec_helper.rb') { "spec" }
13
+ end
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Fractal Soft
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,43 @@
1
+ # NgramsParser [![Gem Version](https://badge.fury.io/rb/ngrams_parser.png)](http://badge.fury.io/rb/ngrams_parser) [![Build Status](https://travis-ci.org/fractalsoft/ngrams_parser.png)](https://travis-ci.org/fractalsoft/ngrams_parser) [![Dependency Status](https://gemnasium.com/fractalsoft/ngrams_parser.png)](https://gemnasium.com/fractalsoft/ngrams_parser) [![Coverage Status](https://coveralls.io/repos/fractalsoft/ngrams_parser/badge.png)](https://coveralls.io/r/fractalsoft/ngrams_parser)
2
+
3
+ An n-gram is a contiguous sequence of n items from a given sequence of text or speech. In this gem the items are letters, but in general they can be phonemes, syllables, words or base pairs, depending on the application. N-grams are typically collected from a text or speech corpus.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'ngrams_parser'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install ngrams_parser
18
+
19
+ ## Usage
20
+
21
+ ```ruby
22
+ # splits single word into ngrams
23
+ NgramsParser::ngram(word, size)
24
+
25
+ # splits given text into ngrams
26
+ NgramsParser::ngrams(text, size)
27
+ ```
28
+
29
+ You can also mix the `ngrams` method into the `String` class:
30
+
31
+ ```ruby
32
+ class String
33
+ include NgramsParser::String
34
+ end
35
+ ```
36
+
37
+ ## Contributing
38
+
39
+ 1. Fork it
40
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
41
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
42
+ 4. Push to the branch (`git push origin my-new-feature`)
43
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
+ desc 'Default: run specs.'
5
+ task default: :spec
6
+
7
+ desc "Run specs"
8
+ RSpec::Core::RakeTask.new do |t|
9
+ t.rspec_opts = "--tag fast"
10
+ end
@@ -0,0 +1,7 @@
1
+ require "ngrams_parser/ngram"
2
+ require "ngrams_parser/ngrams"
3
+ require "ngrams_parser/string"
4
+ require "ngrams_parser/version"
5
+
6
+ module NgramsParser
7
+ end
@@ -0,0 +1,15 @@
1
+ # coding: utf-8
2
module NgramsParser
  # Split a single word into character n-grams.
  #
  # One n-gram is produced for every character position of +word+. N-grams
  # that would run past the end of the word are right-padded with spaces, so
  # every element is exactly +size+ characters long.
  #
  #   NgramsParser.ngram("lorem", 2) #=> ["lo", "or", "re", "em", "m "]
  #   NgramsParser.ngram("ipsum", 3) #=> ["ips", "psu", "sum", "um ", "m  "]
  #   NgramsParser.ngram("", 2)      #=> []
  #
  # word - the String to split.
  # size - the Integer length of each n-gram; must be at least 1.
  #
  # Returns an Array of Strings, one per character of +word+.
  # Raises ArgumentError when +size+ is smaller than 1.
  def self.ngram(word, size)
    # A non-positive size has no meaningful n-grams; fail loudly instead of
    # returning empty strings (size 0) or arbitrary slices (negative size).
    raise ArgumentError, "size must be a positive integer, got #{size.inspect}" if size < 1

    # String#[start, length] is character-based, so multi-byte characters
    # count as one item each.
    Array.new(word.length) { |index| word[index, size].ljust(size) }
  end
end
@@ -0,0 +1,12 @@
1
+ # coding: utf-8
2
+ require 'lexical_units'
3
+
4
module NgramsParser
  # Split a text into character n-grams, word by word.
  #
  # The text is tokenized with LexicalUnits.words, every resulting word is
  # passed to NgramsParser.ngram, and the per-word results are concatenated
  # in the order the words appear. N-grams never span two words.
  #
  # text - the String to split.
  # size - the Integer length of each n-gram.
  #
  # Returns a flat Array of Strings.
  #
  # NOTE(review): assumes LexicalUnits.words returns an Enumerable of word
  # Strings (the gem's specs suggest punctuation is dropped) - confirm
  # against the lexical_units documentation.
  def self.ngrams(text, size)
    LexicalUnits.words(text).flat_map { |word| ngram(word, size) }
  end
end
@@ -0,0 +1,8 @@
1
+ # coding: utf-8
2
module NgramsParser
  # Mixin that exposes NgramsParser.ngrams as an instance method.
  #
  # Include it into ::String (or extend a single object with it) to call
  # +ngrams+ directly on the text.
  module String
    # Split the receiver into n-grams of +size+ characters.
    #
    # size - the Integer length of each n-gram.
    #
    # Returns whatever NgramsParser.ngrams returns for the receiver.
    def ngrams(size)
      NgramsParser.ngrams(self, size)
    end
  end
end
@@ -0,0 +1,3 @@
1
module NgramsParser
  # Version of the gem, read by ngrams_parser.gemspec.
  # Frozen so the shared constant cannot be mutated in place.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,24 @@
1
+ # coding: utf-8
2
# Make lib/ loadable so the version constant can be read without installing
# the gem first.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'ngrams_parser/version'

Gem::Specification.new do |spec|
  spec.name          = 'ngrams_parser'
  spec.version       = NgramsParser::VERSION
  spec.authors       = ['Aleksander Malaszkiewicz']
  spec.email         = ['info@fractalsoft.org']
  spec.summary       = %q{Split text into ngrams}
  # RubyGems warns at build time when description or homepage is blank.
  spec.description   = %q{Split single words or whole texts into character n-grams, padding trailing n-grams with spaces}
  # NOTE(review): repository URL inferred from the README badges
  # (travis-ci.org/fractalsoft/ngrams_parser) - confirm before release.
  spec.homepage      = 'https://github.com/fractalsoft/ngrams_parser'
  spec.license       = 'MIT'

  # Package exactly the files tracked by git.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  spec.add_development_dependency 'bundler', '~> 1.3'
  spec.add_development_dependency 'rake'

  # Runtime dependency: provides LexicalUnits.words used by NgramsParser.ngrams.
  spec.add_dependency 'lexical_units'
end
@@ -0,0 +1,30 @@
1
+ require 'spec_helper'
2
+
3
+ describe NgramsParser do
4
+ context ".ngram" do
5
+ let(:klass) { NgramsParser }
6
+
7
+ [
8
+ {
9
+ text: "Will's",
10
+ digrams: ["Wi", "il", "ll", "l'", "'s", "s "],
11
+ trigrams: ["Wil", "ill", "ll'", "l's", "'s ", "s "],
12
+ },
13
+ {
14
+ text: "činčilový",
15
+ digrams: ["či", "in", "nč", "či", "il", "lo", "ov", "vý", "ý "],
16
+ trigrams: ["čin", "inč", "nči", "čil", "ilo", "lov", "ový", "vý ", "ý "]
17
+ }
18
+ ].each do |hash|
19
+ text, bigrams, trigrams = hash.values
20
+
21
+ it "split word '#{text}' into bigrams: #{bigrams}" do
22
+ klass::ngram(text, 2).should eq(bigrams)
23
+ end
24
+
25
+ it "split word '#{text}' into trigrams: #{trigrams}" do
26
+ klass::ngram(text, 3).should eq(trigrams)
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,114 @@
1
+ require 'spec_helper'
2
+
3
+ describe NgramsParser do
4
+ context ".ngrams" do
5
+ let(:klass) { NgramsParser }
6
+
7
+ [
8
+ {
9
+ text: "Will will Will will Will's will to Will?",
10
+ bigrams: [
11
+ ["Wi", "il", "ll", "l ", "wi", "il", "ll", "l "],
12
+ ["Wi", "il", "ll", "l ", "wi", "il", "ll", "l "],
13
+ ["Wi", "il", "ll", "l'", "'s", "s ", "wi", "il", "ll", "l "],
14
+ ["to", "o "],
15
+ ["Wi", "il", "ll", "l "]
16
+ ].flatten,
17
+ trigrams: [
18
+ ["Wil", "ill", "ll ","l "],
19
+ ["wil", "ill", "ll ", "l "],
20
+ ["Wil", "ill", "ll ","l "],
21
+ ["wil", "ill", "ll ", "l "],
22
+ ["Wil", "ill", "ll'","l's", "'s ", "s "],
23
+ ["wil", "ill", "ll ", "l "],
24
+ ["to ", "o ", "Wil", "ill", "ll ", "l "]
25
+ ].flatten
26
+ },
27
+ {
28
+ text: "Acht alte Ameisen aßen am Abend Ananas.",
29
+ bigrams: [
30
+ ["Ac", "ch", "ht", "t "],
31
+ ["al", "lt", "te", "e "],
32
+ ["Am", "me", "ei", "is", "se", "en", "n "],
33
+ ["aß", "ße", "en", "n "],
34
+ ["am", "m "],
35
+ ["Ab", "be", "en", "nd", "d "],
36
+ ["An", "na", "an", "na", "as", "s "]
37
+ ].flatten,
38
+ trigrams: [
39
+ ["Ach", "cht", "ht ", "t "],
40
+ ["alt", "lte", "te ", "e "],
41
+ ["Ame", "mei", "eis", "ise", "sen", "en ", "n "],
42
+ ["aße", "ßen", "en ", "n "],
43
+ ["am ", "m "],
44
+ ["Abe", "ben", "end", "nd ", "d "],
45
+ ["Ana", "nan", "ana", "nas", "as ", "s "]
46
+ ].flatten
47
+ },
48
+ {
49
+ text: "Ödögidöggi",
50
+ bigrams: ["Öd", "dö", "ög", "gi", "id", "dö", "ög", "gg", "gi", "i "],
51
+ trigrams: [
52
+ ["Ödö", "dög", "ögi", "gid", "idö", "dög", "ögg", "ggi", "gi ", "i "]
53
+ ].flatten
54
+ },
55
+ {
56
+ text: "Ćma ćmę ćmi.",
57
+ bigrams: [
58
+ ["Ćm", "ma", "a ", "ćm", "mę", "ę ", "ćm", "mi", "i "]
59
+ ].flatten,
60
+ trigrams: [
61
+ ["Ćma", "ma ", "a ", "ćmę", "mę ", "ę ", "ćmi", "mi ", "i "]
62
+ ].flatten
63
+ },
64
+ {
65
+ text: "Łzy złej zołzy",
66
+ bigrams: [
67
+ ["Łz", "zy", "y ", "zł", "łe", "ej", "j "],
68
+ ["zo", "oł", "łz", "zy", "y "]
69
+ ].flatten,
70
+ trigrams: [
71
+ ["Łzy", "zy ", "y ", "złe", "łej", "ej ", "j "],
72
+ ["zoł", "ołz", "łzy", "zy ", "y "]
73
+ ].flatten
74
+ },
75
+ {
76
+ text: "Żubr żuł żuchwą żurawinę.",
77
+ bigrams: [
78
+ ["Żu", "ub", "br", "r ", "żu", "uł", "ł "],
79
+ ["żu", "uc", "ch", "hw", "wą", "ą "],
80
+ ["żu", "ur", "ra", "aw", "wi", "in", "nę", "ę "]
81
+ ].flatten,
82
+ trigrams: [
83
+ ["Żub", "ubr", "br ", "r ", "żuł", "uł ", "ł "],
84
+ ["żuc", "uch", "chw", "hwą", "wą ", "ą "],
85
+ ["żur", "ura", "raw", "awi", "win", "inę", "nę ", "ę "]
86
+ ].flatten
87
+ },
88
+ {
89
+ text: "Čistý s Čistou čistili činčilový čepec.",
90
+ bigrams: [
91
+ ["Či", "is", "st", "tý", "ý ", "s ", "Či", "is", "st", "to", "ou"],
92
+ ["u ", "či", "is", "st", "ti", "il", "li", "i ", "či", "in", "nč"],
93
+ ["či", "il", "lo", "ov", "vý", "ý ", "če", "ep", "pe", "ec", "c "]
94
+ ].flatten,
95
+ trigrams: [
96
+ ["Čis", "ist", "stý", "tý ", "ý ", "s ", "Čis", "ist", "sto"],
97
+ ["tou", "ou ", "u ", "čis", "ist", "sti", "til", "ili", "li "],
98
+ ["i ", "čin", "inč", "nči", "čil", "ilo", "lov", "ový", "vý "],
99
+ ["ý ", "čep", "epe", "pec", "ec ", "c "]
100
+ ].flatten
101
+ }
102
+ ].each do |hash|
103
+ text, bigrams, trigrams = hash.values
104
+
105
+ it "split text '#{text}' into bigrams" do
106
+ klass::ngrams(text, 2).should eq(bigrams)
107
+ end
108
+
109
+ it "split text '#{text}' into trigrams" do
110
+ klass::ngrams(text, 3).should eq(trigrams)
111
+ end
112
+ end
113
+ end
114
+ end
@@ -0,0 +1,29 @@
1
+ require 'spec_helper'
2
+
3
+ describe NgramsParser::String do
4
+ class String
5
+ include NgramsParser::String
6
+ end
7
+
8
+ context "#ngrams" do
9
+ it "splits String into ngrams" do
10
+ string = "Lorem ipsum"
11
+ bigrams = [
12
+ "Lo", "or", "re", "em", "m ",
13
+ "ip", "ps", "su", "um", "m "
14
+ ]
15
+ trigrams = [
16
+ "Lor", "ore", "rem", "em ", "m ",
17
+ "ips", "psu", "sum", "um ", "m "
18
+ ]
19
+ quadgrams = [
20
+ "Lore", "orem", "rem ", "em ", "m ",
21
+ "ipsu", "psum", "sum ", "um ", "m "
22
+ ]
23
+
24
+ string.ngrams(2).should eq(bigrams)
25
+ string.ngrams(3).should eq(trigrams)
26
+ string.ngrams(4).should eq(quadgrams)
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,9 @@
1
+ require 'ngrams_parser'
2
+ require 'coveralls'
3
+ Coveralls.wear!
4
+
5
+ RSpec.configure do |config|
6
+ config.treat_symbols_as_metadata_keys_with_true_values = true
7
+ config.run_all_when_everything_filtered = true
8
+ config.filter_run :focus
9
+ end
metadata ADDED
@@ -0,0 +1,110 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ngrams_parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Aleksander Malaszkiewicz
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-07-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: lexical_units
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description:
56
+ email:
57
+ - info@fractalsoft.org
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - .gitignore
63
+ - .rspec
64
+ - .ruby-gemset
65
+ - .ruby-version
66
+ - CHANGELOG.md
67
+ - Gemfile
68
+ - Guardfile
69
+ - LICENSE.txt
70
+ - README.md
71
+ - Rakefile
72
+ - lib/ngrams_parser.rb
73
+ - lib/ngrams_parser/ngram.rb
74
+ - lib/ngrams_parser/ngrams.rb
75
+ - lib/ngrams_parser/string.rb
76
+ - lib/ngrams_parser/version.rb
77
+ - ngrams_parser.gemspec
78
+ - spec/ngrams_parser/ngram_spec.rb
79
+ - spec/ngrams_parser/ngrams_spec.rb
80
+ - spec/ngrams_parser/string_spec.rb
81
+ - spec/spec_helper.rb
82
+ homepage: ''
83
+ licenses:
84
+ - MIT
85
+ metadata: {}
86
+ post_install_message:
87
+ rdoc_options: []
88
+ require_paths:
89
+ - lib
90
+ required_ruby_version: !ruby/object:Gem::Requirement
91
+ requirements:
92
+ - - '>='
93
+ - !ruby/object:Gem::Version
94
+ version: '0'
95
+ required_rubygems_version: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - '>='
98
+ - !ruby/object:Gem::Version
99
+ version: '0'
100
+ requirements: []
101
+ rubyforge_project:
102
+ rubygems_version: 2.0.3
103
+ signing_key:
104
+ specification_version: 4
105
+ summary: Split text into ngrams
106
+ test_files:
107
+ - spec/ngrams_parser/ngram_spec.rb
108
+ - spec/ngrams_parser/ngrams_spec.rb
109
+ - spec/ngrams_parser/string_spec.rb
110
+ - spec/spec_helper.rb