cseg 0.0.5 → 0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/README.md +13 -16
- data/cseg.gemspec +3 -2
- data/lib/cseg.rb +19 -19
- data/lib/cseg/version.rb +1 -1
- metadata +14 -15
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 2e72d6d7a3b9559d28b26af6b3b8e65749e4b3a3
|
4
|
+
data.tar.gz: d4c8bec5d8e91cb20e39df81dd0e69eef7b690f7
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: bdf2ab1fea49f6773a5098983f6e11fe2b7afc416ab62df0bf01cdccce08e4e8fdcaf5688721dc66be5155ffae600f92866244ebc809418c6aa1838a73aac3fa
|
7
|
+
data.tar.gz: 3306478b36f552272388f4be4c64dbcea6f2a8d786faf47aadcbc99cc1b4ba181dce1557105fe015a6e9d2f9af4fdd5e3e11e8b7f0187bcafe84618b6eb6120f
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -1,9 +1,10 @@
|
|
1
|
-
#
|
1
|
+
# Kurumi
|
2
2
|
|
3
3
|
Use MIRA to train a large number of features.
|
4
4
|
|
5
|
-
Segment chinese sentences into words in high speed and correctly.
|
5
|
+
Segment Chinese (both Traditional and Simplified) sentences into words quickly and accurately.
|
6
6
|
|
7
|
+
Note that the name of the gem is different from the repo name!
|
7
8
|
## Installation
|
8
9
|
|
9
10
|
Add this line to your application's Gemfile:
|
@@ -18,9 +19,7 @@ Or install it yourself as:
|
|
18
19
|
|
19
20
|
$ gem install cseg
|
20
21
|
|
21
|
-
you need to install CRF++ first and set the environment variables.
|
22
|
-
|
23
|
-
Here is the site of CRF++<http://crfpp.googlecode.com/svn/trunk/doc/index.html> and you should follow the manual
|
22
|
+
You need to install [CRF++](http://crfpp.googlecode.com/svn/trunk/doc/index.html?source=navbar) first and set the environment variables.
|
24
23
|
|
25
24
|
On GitHub the dictionary file was deleted since it is quite large, though you can get everything from RubyGems.
|
26
25
|
|
@@ -34,17 +33,15 @@ Recall: 92.86%
|
|
34
33
|
|
35
34
|
## Usage
|
36
35
|
|
36
|
+
```
|
37
|
+
The default is Simplified Chinese
|
37
38
|
require "cseg"
|
38
|
-
a=Kurumi.segment("
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
39
|
+
a=Kurumi.segment("屌丝是一种自我讽刺。")
|
40
|
+
=>["屌丝", "是", "一", "种", "自我", "讽刺", "。"]
|
41
|
+
Use parameter "tr" to specify Traditional Chinese
|
42
|
+
a=Kurumi.segment("台妹真是正點。","tr")
|
43
|
+
=>["台妹", "真", "是", "正點", "。"]
|
43
44
|
|
44
|
-
|
45
|
+
```
|
45
46
|
|
46
|
-
|
47
|
-
2. Create your feature branch (`git checkout -b my-new-feature`)
|
48
|
-
3. Commit your changes (`git commit -am 'Add some feature'`)
|
49
|
-
4. Push to the branch (`git push origin my-new-feature`)
|
50
|
-
5. Create new Pull Request
|
47
|
+
## Contributing
|
data/cseg.gemspec
CHANGED
@@ -8,8 +8,8 @@ Gem::Specification.new do |gem|
|
|
8
8
|
gem.version = Cseg::VERSION
|
9
9
|
gem.authors = ["gyorou"]
|
10
10
|
gem.email = ["gyorou@tjjtds.com"]
|
11
|
-
gem.description = %q{"a chinese segmentation tool using CRF
|
12
|
-
gem.summary = %q{"
|
11
|
+
gem.description = %q{"a chinese segmentation tool using CRF"}
|
12
|
+
gem.summary = %q{""}
|
13
13
|
gem.homepage = ""
|
14
14
|
|
15
15
|
gem.files = [".gitignore",
|
@@ -17,6 +17,7 @@ Gem::Specification.new do |gem|
|
|
17
17
|
"README.md",
|
18
18
|
"Gemfile",
|
19
19
|
"data/pkumodle.data",
|
20
|
+
"data/as_training_less.data",
|
20
21
|
"lib/cseg/version.rb",
|
21
22
|
"lib/cseg.rb",
|
22
23
|
"cseg.gemspec",
|
data/lib/cseg.rb
CHANGED
@@ -1,23 +1,26 @@
|
|
1
1
|
# encoding:utf-8
|
2
2
|
require "cseg/version"
|
3
|
-
require "tempfile"
|
4
3
|
class Kurumi
|
5
4
|
# since crf++ can only read from file
|
6
|
-
@
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
5
|
+
@modle_sp=File.expand_path("../../data/as_training.data", __FILE__)
|
6
|
+
@modle_tr=File.expand_path("../../data/as_training_less.data", __FILE__)
|
7
|
+
def self.segment(str, mode="sp")
|
8
|
+
@result=Array.new
|
9
|
+
case mode
|
10
|
+
when "sp"
|
11
|
+
@modle=@modle_sp
|
12
|
+
when "tr"
|
13
|
+
@modle=@modle_tr
|
14
|
+
else
|
15
|
+
raise "no such parameter, please use sp or tr"
|
11
16
|
end
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
system("crf_test -m #{@modle} #{@tmp.path}>#{@resultfile.path}")
|
18
|
-
@resultfile.rewind
|
17
|
+
result_data = IO.popen "crf_test -m #{@modle}", 'r+' do |io|
|
18
|
+
io.puts *str.chars
|
19
|
+
io.close_write
|
20
|
+
io.read
|
21
|
+
end
|
19
22
|
word=""
|
20
|
-
|
23
|
+
result_data.each_line{|line|
|
21
24
|
token=line.chomp.split(" ")
|
22
25
|
if token[1]=="B"or token[1]=="O"
|
23
26
|
if word!=""
|
@@ -27,6 +30,7 @@ class Kurumi
|
|
27
30
|
elsif token[1]=="I"
|
28
31
|
word+=token[0]
|
29
32
|
else
|
33
|
+
#nil
|
30
34
|
if word!=""
|
31
35
|
@result.push(word)
|
32
36
|
word=""
|
@@ -34,12 +38,8 @@ class Kurumi
|
|
34
38
|
end
|
35
39
|
|
36
40
|
}
|
37
|
-
|
38
|
-
@resultfile.close(true)
|
39
|
-
@tmp.close(true)
|
40
41
|
return @result
|
41
42
|
end
|
42
43
|
end
|
43
44
|
|
44
|
-
|
45
|
-
# print result
|
45
|
+
print Kurumi.segment("屌丝是一种自我讽刺。")
|
data/lib/cseg/version.rb
CHANGED
metadata
CHANGED
@@ -1,17 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cseg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
5
|
-
prerelease:
|
4
|
+
version: '0.1'
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- gyorou
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date: 2014-
|
11
|
+
date: 2014-09-08 00:00:00.000000000 Z
|
13
12
|
dependencies: []
|
14
|
-
description:
|
13
|
+
description: '"a chinese segmentation tool using CRF"'
|
15
14
|
email:
|
16
15
|
- gyorou@tjjtds.com
|
17
16
|
executables: []
|
@@ -19,37 +18,37 @@ extensions: []
|
|
19
18
|
extra_rdoc_files: []
|
20
19
|
files:
|
21
20
|
- .gitignore
|
21
|
+
- Gemfile
|
22
22
|
- LICENSE.txt
|
23
23
|
- README.md
|
24
|
-
-
|
24
|
+
- Rakefile
|
25
|
+
- cseg.gemspec
|
26
|
+
- data/as_training_less.data
|
25
27
|
- data/pkumodle.data
|
26
|
-
- lib/cseg/version.rb
|
27
28
|
- lib/cseg.rb
|
28
|
-
- cseg.
|
29
|
-
- Rakefile
|
29
|
+
- lib/cseg/version.rb
|
30
30
|
homepage: ''
|
31
31
|
licenses: []
|
32
|
+
metadata: {}
|
32
33
|
post_install_message:
|
33
34
|
rdoc_options: []
|
34
35
|
require_paths:
|
35
36
|
- lib
|
36
37
|
- data
|
37
38
|
required_ruby_version: !ruby/object:Gem::Requirement
|
38
|
-
none: false
|
39
39
|
requirements:
|
40
|
-
- -
|
40
|
+
- - '>='
|
41
41
|
- !ruby/object:Gem::Version
|
42
42
|
version: '0'
|
43
43
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
44
|
-
none: false
|
45
44
|
requirements:
|
46
|
-
- -
|
45
|
+
- - '>='
|
47
46
|
- !ruby/object:Gem::Version
|
48
47
|
version: '0'
|
49
48
|
requirements: []
|
50
49
|
rubyforge_project:
|
51
|
-
rubygems_version:
|
50
|
+
rubygems_version: 2.4.1
|
52
51
|
signing_key:
|
53
|
-
specification_version:
|
54
|
-
summary:
|
52
|
+
specification_version: 4
|
53
|
+
summary: '""'
|
55
54
|
test_files: []
|