TJNGram 1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/example/example.rb +15 -0
- data/lib/tjngram.rb +31 -0
- metadata +48 -0
data/example/example.rb
ADDED
data/lib/tjngram.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
module TJNGram
  # Counts the n-grams found in +content+.
  #
  # The text is tokenized with a regex whose first capture is a run of word
  # characters (<tt>\w+</tt>, referred to below as an "English" token) and
  # whose second capture is a single character in U+3041..U+9FFF. Anything
  # matching neither alternative is skipped.
  #
  # Every window of +n+ consecutive tokens becomes one key. Inside a key,
  # neighbouring tokens are separated by a single space when at least one of
  # the two is an English token; two neighbouring single-character tokens are
  # joined with no separator.
  #
  # Only complete windows are counted: input with fewer than +n+ tokens yields
  # an empty result, and no shorter trailing fragment is reported.
  #
  # NOTE(review): the pattern's range stops at U+9FFF, so precomposed Hangul
  # syllables (U+AC00..U+D7A3) do not match either alternative. Confirm
  # whether Korean text is meant to be tokenized before widening the range.
  #
  # @param n [Integer] number of tokens per gram
  # @param content [String] text to analyse
  # @return [Hash{String => Integer}] occurrence count per gram; missing keys
  #   read as 0
  def self.process(n, content)
    pattern = %r((\w+)|([\u3041-\u9FFF]))

    # Remembers, per token, whether it came from the \w+ alternative. The two
    # alternatives cannot produce the same string, so one flag per token
    # string is enough.
    is_english = {}
    tokens = []
    content.scan(pattern) do |word, char|
      token = word || char
      is_english[token] = !word.nil?
      tokens << token
    end

    result = Hash.new(0)
    # each_cons rejects a non-positive size; there is nothing to count then.
    return result unless n > 0

    tokens.each_cons(n) do |gram|
      pieces = gram.each_with_index.map do |token, index|
        # A space is needed when either side of the boundary is English.
        spaced = index > 0 && (is_english[gram[index - 1]] || is_english[token])
        spaced ? " #{token}" : token
      end
      result[pieces.join] += 1
    end

    result
  end
end
|
metadata
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: TJNGram
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '1.0'
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Tony Jian
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-06-06 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: It's common to see Chinese, Japanese and Korean articles contain some
|
15
|
+
English, but it's not common to see an n-gram library which can parse this sort
|
16
|
+
of articles. TJNGram was made for solving this problem.
|
17
|
+
email: tonytonyjan@gmail.com
|
18
|
+
executables: []
|
19
|
+
extensions: []
|
20
|
+
extra_rdoc_files: []
|
21
|
+
files:
|
22
|
+
- example/example.rb
|
23
|
+
- lib/tjngram.rb
|
24
|
+
homepage: http://tonytonyjan.github.com
|
25
|
+
licenses: []
|
26
|
+
post_install_message:
|
27
|
+
rdoc_options: []
|
28
|
+
require_paths:
|
29
|
+
- lib
|
30
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
31
|
+
none: false
|
32
|
+
requirements:
|
33
|
+
- - ! '>='
|
34
|
+
- !ruby/object:Gem::Version
|
35
|
+
version: '0'
|
36
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
37
|
+
none: false
|
38
|
+
requirements:
|
39
|
+
- - ! '>='
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '0'
|
42
|
+
requirements: []
|
43
|
+
rubyforge_project:
|
44
|
+
rubygems_version: 1.8.23
|
45
|
+
signing_key:
|
46
|
+
specification_version: 3
|
47
|
+
summary: N-Gram generator in Ruby, supporting English, Chinese, Japanese and Korean.
|
48
|
+
test_files: []
|