ngrams-enabler 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +6 -0
- data/.rspec +2 -0
- data/.travis.yml +8 -0
- data/Gemfile +8 -0
- data/README.md +41 -0
- data/Rakefile +8 -0
- data/lib/ngrams/core_extensions/string.rb +22 -0
- data/lib/ngrams_enabler.rb +3 -0
- data/ngrams-enabler.gemspec +18 -0
- data/spec/ngrams_enabler/ngrams_enabler_spec.rb +39 -0
- data/spec/spec_helper.rb +12 -0
- metadata +86 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
MjEyNjc3YTM2MTA4YmVjMjQ3OGNiMzM1NmRjNzI5YWFmYTE0YWVlNQ==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
ODYwYTMyNTNkZWJjNDI5NTQzMDZkYmYzNTM2ZDdlYWE0ZWFlNzhjMg==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
OTkzNmYxYTYxZjUzNmQ3ODViNzEzMzRlN2NhZDIyMzQ4NGNjY2VlZmZhYTQw
|
10
|
+
YjQ1NzcyZWNiMTBhZTMyMTZjNmY4ZWYxMWUzZTVhZWZjOWU1OGM2YWYwYmMz
|
11
|
+
ZWJjYmMwMWE5MTcwODI1NjFkMDA1YmE5OWNlY2VjYjJiMDMwMGI=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
YjBmODI1YjkxYjk2ZTE2ZTNkZWJjZmUyMzM4NWMwYjE2NDFjMDU2ODVlYThh
|
14
|
+
YzQ4MjcyMGExZDVjMzNhNTc5NjYyNTkxNjhhZDJlMzMyZWFjYzg2OTdhYjdk
|
15
|
+
NGY3ODkyZGJlMzAzM2RiNTY2OTc2MzU5YjFjYzg3Yjg3OTQ4MzY=
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
ngrams-enabler [![Build Status](https://travis-ci.org/lloydmeta/ngrams-enabler.png?branch=master)](https://travis-ci.org/lloydmeta/ngrams-enabler) [![Code Climate](https://codeclimate.com/github/lloydmeta/ngrams-enabler.png)](https://codeclimate.com/github/lloydmeta/ngrams-enabler)
|
2
|
+
-------------
|
3
|
+
|
4
|
+
A simple way of getting n-grams out of any given String object. Supports CJK (Chinese, Japanese, Korean) as well as alphabet-based languages.
|
5
|
+
|
6
|
+
Example Usage
|
7
|
+
=========
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
require 'ngrams_enabler'
|
11
|
+
|
12
|
+
"This is just a test".ngrams
|
13
|
+
|
14
|
+
"This is just a test".ngrams(2)
|
15
|
+
|
16
|
+
"こんにちは".ngrams
|
17
|
+
|
18
|
+
"こんにちは".ngrams(2)
|
19
|
+
```
|
20
|
+
|
21
|
+
## License
|
22
|
+
|
23
|
+
Copyright (c) 2013 by Lloyd Chan
|
24
|
+
|
25
|
+
Permission is hereby granted, free of charge, to any person obtaining a
|
26
|
+
copy of this software and associated documentation files (the
|
27
|
+
"Software"), to deal in the Software without restriction, including
|
28
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
29
|
+
distribute, and to permit persons to whom the Software is furnished to do so, subject to
|
30
|
+
the following conditions:
|
31
|
+
|
32
|
+
The above copyright notice and this permission notice shall be included
|
33
|
+
in all copies or substantial portions of the Software.
|
34
|
+
|
35
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
36
|
+
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
37
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
38
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
39
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
40
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
41
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# encoding: utf-8

# Core extension that adds n-gram extraction to every String.
class String

  # Splits the string into n-grams.
  #
  # If the string contains at least one CJK character (Han, Katakana,
  # Hiragana or Hangul) it is tokenized character by character and each gram
  # is the plain concatenation of its characters. Otherwise it is tokenized
  # with String#split on +split_char+ and the tokens of each gram are joined
  # with +split_char+ again.
  #
  # @param n [Integer] number of tokens per gram (defaults to 1)
  # @param options [Hash] +:split_char+ is the token separator used for
  #   non-CJK strings (defaults to a single space)
  # @return [Array<String>] the n-grams in order of appearance. The result is
  #   always an Array: empty when the string yields no tokens, and holding
  #   only the string itself when it has fewer than +n+ tokens.
  # @raise [ArgumentError] from Enumerable#each_cons when +n+ is not positive
  #   and the string has at least one token
  def ngrams(n = 1, options = {})
    split_char = options.fetch(:split_char, ' ')
    cjk = contains_cjk?
    tokens = cjk ? split(//u) : split(split_char)

    return [] if tokens.empty?
    # Too short to form even one gram: fall back to the whole string, but
    # wrapped in an Array so callers can always iterate over the result.
    return [self] if tokens.size < n

    joiner = cjk ? '' : split_char
    tokens.each_cons(n).map { |gram| gram.join(joiner) }
  end

  private

  # @return [Boolean] true if any Han, Katakana, Hiragana or Hangul character
  #   occurs anywhere in the string
  def contains_cjk?
    !!(self =~ /\p{Han}|\p{Katakana}|\p{Hiragana}|\p{Hangul}/)
  end

end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# Gem specification for ngrams-enabler. The file lists are taken from what
# git tracks, so the gem has to be built from inside a git checkout.
Gem::Specification.new do |spec|
  blurb = "A simple way of getting ngrams out of any given String object. Supports CJK (Chinese, Japanese, Korean) as well as alphabet based languages."

  spec.name        = "ngrams-enabler"
  spec.version     = "0.0.1"
  spec.date        = "2013-05-16"
  spec.authors     = ["Lloyd Meta"]
  spec.email       = ["lloydmeta@gmail.com"]
  spec.homepage    = "http://github.com/lloydmeta/ngrams-enabler"
  spec.description = blurb
  # The summary deliberately reuses the description text.
  spec.summary     = spec.description

  # Executables are listed by base name only, without the bin/ prefix.
  tracked_bins = `git ls-files -- bin/*`.split("\n")
  spec.executables = tracked_bins.map { |path| File.basename(path) }

  tracked_files = `git ls-files`.split("\n")
  spec.files = tracked_files

  tracked_tests = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.test_files = tracked_tests

  spec.require_paths = ["lib"]

  # Development-only tooling; there are no runtime dependencies.
  %w[rake rspec].each do |dev_gem|
    spec.add_development_dependency dev_gem
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# encoding: UTF-8
require 'spec_helper'

# Specs for the String#ngrams core extension, covering whitespace-delimited
# (non-CJK) text and per-character tokenized CJK text.
#
# Expectations use the expect(...).to form rather than the monkey-patched
# `should`, which is deprecated in RSpec 3.
describe String do

  let(:cjk_string) { "これは日本語" }
  let(:english_string) { "this string is in english" }

  describe "#ngrams" do

    context "not CJK" do

      it "should return the proper default ngrams" do
        # With the default n of 1, every whitespace-separated word is a gram.
        expect(english_string.ngrams).to eq(english_string.split(" "))
      end

      it "should allow me to set higher n" do
        expect(english_string.ngrams(2)).to eq(['this string', 'string is', 'is in', 'in english'])
      end

    end

    context "CJK string" do

      it "should return the proper default ngrams" do
        # With the default n of 1, every character is a gram.
        expect(cjk_string.ngrams).to eq(%w[こ れ は 日 本 語])
      end

      it "should allow me to set higher n" do
        expect(cjk_string.ngrams(2)).to eq(%w[これ れは は日 日本 本語])
      end

    end

  end

end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ngrams-enabler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Lloyd Meta
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-05-16 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rake
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ! '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ! '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rspec
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ! '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ! '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: A simple way of getting ngrams out of any given String object. Supports
|
42
|
+
CJK (Chinese, Japanese, Korean) as well as alphabet based languages.
|
43
|
+
email:
|
44
|
+
- lloydmeta@gmail.com
|
45
|
+
executables: []
|
46
|
+
extensions: []
|
47
|
+
extra_rdoc_files: []
|
48
|
+
files:
|
49
|
+
- .gitignore
|
50
|
+
- .rspec
|
51
|
+
- .travis.yml
|
52
|
+
- Gemfile
|
53
|
+
- README.md
|
54
|
+
- Rakefile
|
55
|
+
- lib/ngrams/core_extensions/string.rb
|
56
|
+
- lib/ngrams_enabler.rb
|
57
|
+
- ngrams-enabler.gemspec
|
58
|
+
- spec/ngrams_enabler/ngrams_enabler_spec.rb
|
59
|
+
- spec/spec_helper.rb
|
60
|
+
homepage: http://github.com/lloydmeta/ngrams-enabler
|
61
|
+
licenses: []
|
62
|
+
metadata: {}
|
63
|
+
post_install_message:
|
64
|
+
rdoc_options: []
|
65
|
+
require_paths:
|
66
|
+
- lib
|
67
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
68
|
+
requirements:
|
69
|
+
- - ! '>='
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - ! '>='
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0'
|
77
|
+
requirements: []
|
78
|
+
rubyforge_project:
|
79
|
+
rubygems_version: 2.0.3
|
80
|
+
signing_key:
|
81
|
+
specification_version: 4
|
82
|
+
summary: A simple way of getting ngrams out of any given String object. Supports CJK
|
83
|
+
(Chinese, Japanese, Korean) as well as alphabet based languages.
|
84
|
+
test_files:
|
85
|
+
- spec/ngrams_enabler/ngrams_enabler_spec.rb
|
86
|
+
- spec/spec_helper.rb
|