TJNGram 1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/example/example.rb +15 -0
  2. data/lib/tjngram.rb +31 -0
  3. metadata +48 -0
@@ -0,0 +1,15 @@
1
# Example: count bigrams in text that mixes Chinese, English and Japanese.
#
# The library is loaded relative to this file (not the current working
# directory), so the example can be run from anywhere.
require_relative '../lib/tjngram'

text = <<eos
這是一個範例。
This is an example.
これは例です。

這裡有一個蘋果。
There is an apple.
これはリンゴです。
eos

# Show the input, then a separator line.
puts text, "=========="

# Print the hash of bigram => occurrence count.
p TJNGram.process(2, text)
data/lib/tjngram.rb ADDED
@@ -0,0 +1,31 @@
1
# N-gram counter for text that mixes English words with CJK characters.
module TJNGram
  # Counts the n-grams found in +content+.
  #
  # A token is either a run of word characters (+\w++, e.g. an English word)
  # or one single character in the range U+3041..U+9FFF (hiragana, katakana,
  # CJK ideographs). Everything else (punctuation, whitespace) is skipped.
  # Inside a gram, two neighbouring tokens are separated by a space when at
  # least one of them is a word token; two single-character tokens are joined
  # directly.
  #
  # Only complete windows of exactly +n+ consecutive tokens are counted; when
  # there are fewer than +n+ tokens, or +n+ is below 1, the result is empty.
  #
  # NOTE(review): Hangul syllables (U+AC00..U+D7AF) lie outside the character
  # range below, so Korean syllables are not tokenized - confirm whether Korean
  # support is intended.
  #
  # @param n [Integer] number of tokens per gram
  # @param content [String] text to analyse
  # @return [Hash{String => Integer}] gram => occurrence count; unknown keys read as 0
  def self.process(n, content)
    # matches word-character runs (group 1) or one character in U+3041..U+9FFF (group 2)
    pattern = %r((\w+)|([\u3041-\u9FFF]))
    # records whether each token came from the \w+ branch
    is_english = {}

    # tokenize
    tokens = []
    content.scan(pattern) do |word, char|
      token = word || char
      is_english[token] = !word.nil?
      tokens << token
    end

    result = Hash.new(0)
    # each_cons rejects sizes below 1; such sizes simply yield no grams.
    return result if n < 1

    # Slide a window of n tokens over the stream. Unlike each_slice, each_cons
    # never yields a shorter trailing group, so partial grams are not counted.
    tokens.each_cons(n) do |gram|
      key = gram.each_with_index.map do |token, index|
        spaced = index > 0 && (is_english[gram[index - 1]] || is_english[token])
        spaced ? " #{token}" : token
      end.join
      result[key] += 1
    end
    result
  end
end
metadata ADDED
@@ -0,0 +1,48 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: TJNGram
3
+ version: !ruby/object:Gem::Version
4
+ version: '1.0'
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Tony Jian
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-06-06 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: It's common to see Chinese, Japanese and Korean articles contain some
15
+ English, but it's not common to see an n-gram library which can parse this sort
16
+ of article. TJNGram was made to solve this problem.
17
+ email: tonytonyjan@gmail.com
18
+ executables: []
19
+ extensions: []
20
+ extra_rdoc_files: []
21
+ files:
22
+ - example/example.rb
23
+ - lib/tjngram.rb
24
+ homepage: http://tonytonyjan.github.com
25
+ licenses: []
26
+ post_install_message:
27
+ rdoc_options: []
28
+ require_paths:
29
+ - lib
30
+ required_ruby_version: !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ! '>='
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ required_rubygems_version: !ruby/object:Gem::Requirement
37
+ none: false
38
+ requirements:
39
+ - - ! '>='
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ requirements: []
43
+ rubyforge_project:
44
+ rubygems_version: 1.8.23
45
+ signing_key:
46
+ specification_version: 3
47
+ summary: N-Gram generator in Ruby, supporting English, Chinese, Japanese and Korean.
48
+ test_files: []