TJNGram 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. data/example/example.rb +15 -0
  2. data/lib/tjngram.rb +31 -0
  3. metadata +48 -0
@@ -0,0 +1,15 @@
1
# Example script: prints a short text that mixes Chinese, English and
# Japanese, then prints the bigram counts TJNGram computes for it.

# require_relative resolves the path against this file's directory, so the
# script works regardless of the directory it is launched from (a plain
# require of a "../" path is resolved against the current working directory).
require_relative '../lib/tjngram'

text = <<eos
這是一個範例。
This is an example.
これは例です。

這裡有一個蘋果。
There is an apple.
これはリンゴです。
eos

puts text, "=========="

# Inspect the hash that maps each bigram to its number of occurrences.
p TJNGram.process(2, text)
data/lib/tjngram.rb ADDED
@@ -0,0 +1,31 @@
1
# N-gram counter for text that mixes \w-style words (e.g. English) with
# characters in the U+3041..U+9FFF range (kana and CJK ideographs).
module TJNGram
  # Counts every run of +n+ consecutive tokens in +content+.
  #
  # A token is either a run of \w characters or a single character in
  # U+3041..U+9FFF; everything else (whitespace, punctuation) only separates
  # tokens. In an n-gram's key, a space is placed between two neighbouring
  # tokens when at least one of them is a \w word, while two neighbouring
  # single-character tokens are joined directly.
  #
  # Only complete n-grams are counted: text with fewer than +n+ tokens yields
  # an empty result, and no shorter "leftover" grams are emitted.
  #
  # @param n [Integer] number of tokens per gram; values <= 0 yield no grams
  # @param content [String] text to analyse
  # @return [Hash{String => Integer}] gram => occurrence count; the hash has a
  #   default value of 0 for grams that were not seen
  def self.process(n, content)
    # Capture 1: a word made of \w characters. Capture 2: one character in
    # U+3041..U+9FFF.
    # NOTE(review): Hangul syllables (U+AC00 and above) lie outside this
    # range, so they are not tokenized -- confirm whether that is intended.
    pattern = %r((\w+)|([\u3041-\u9FFF]))

    # Pair every token with a flag telling whether it came from the \w+ branch.
    tokens = content.scan(pattern).map { |word, char| [word || char, !word.nil?] }

    counts = Hash.new(0)
    # each_cons rejects non-positive sizes; such an n simply has no n-grams.
    return counts unless n > 0

    # each_cons yields exactly the windows of n consecutive tokens, so no
    # partial window at the end of the text is ever counted.
    tokens.each_cons(n) do |gram|
      pieces = []
      prev_is_word = nil
      gram.each do |text, is_word|
        # Separate with a space unless both neighbours are single characters.
        pieces << " " if !prev_is_word.nil? && (prev_is_word || is_word)
        pieces << text
        prev_is_word = is_word
      end
      counts[pieces.join] += 1
    end

    counts
  end
end
metadata ADDED
@@ -0,0 +1,48 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: TJNGram
3
+ version: !ruby/object:Gem::Version
4
+ version: '1.0'
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Tony Jian
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-06-06 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: It's common to see Chinese, Japanese and Korean articles contain some
15
+ English, but it's not common to see an n-gram library which can parse this sort
16
+ of articles. TJNGram was made for solving this problem.
17
+ email: tonytonyjan@gmail.com
18
+ executables: []
19
+ extensions: []
20
+ extra_rdoc_files: []
21
+ files:
22
+ - example/example.rb
23
+ - lib/tjngram.rb
24
+ homepage: http://tonytonyjan.github.com
25
+ licenses: []
26
+ post_install_message:
27
+ rdoc_options: []
28
+ require_paths:
29
+ - lib
30
+ required_ruby_version: !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ! '>='
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ required_rubygems_version: !ruby/object:Gem::Requirement
37
+ none: false
38
+ requirements:
39
+ - - ! '>='
40
+ - !ruby/object:Gem::Version
41
+ version: '0'
42
+ requirements: []
43
+ rubyforge_project:
44
+ rubygems_version: 1.8.23
45
+ signing_key:
46
+ specification_version: 3
47
+ summary: N-Gram generator in Ruby, supporting English, Chinese, Japanese and Korean.
48
+ test_files: []