cseg 0.0.5 → 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/README.md +13 -16
- data/cseg.gemspec +3 -2
- data/lib/cseg.rb +19 -19
- data/lib/cseg/version.rb +1 -1
- metadata +14 -15
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 2e72d6d7a3b9559d28b26af6b3b8e65749e4b3a3
|
4
|
+
data.tar.gz: d4c8bec5d8e91cb20e39df81dd0e69eef7b690f7
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: bdf2ab1fea49f6773a5098983f6e11fe2b7afc416ab62df0bf01cdccce08e4e8fdcaf5688721dc66be5155ffae600f92866244ebc809418c6aa1838a73aac3fa
|
7
|
+
data.tar.gz: 3306478b36f552272388f4be4c64dbcea6f2a8d786faf47aadcbc99cc1b4ba181dce1557105fe015a6e9d2f9af4fdd5e3e11e8b7f0187bcafe84618b6eb6120f
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -1,9 +1,10 @@
|
|
1
|
-
#
|
1
|
+
# Kurumi
|
2
2
|
|
3
3
|
Use MIRA to train a large amount of features.
|
4
4
|
|
5
|
-
Segment chinese sentences into words in high speed and correctly.
|
5
|
+
Segment Chinese (both Traditional and Simplified) sentences into words quickly and accurately.
|
6
6
|
|
7
|
+
Take care: the name of the gem is different from the repo name!
|
7
8
|
## Installation
|
8
9
|
|
9
10
|
Add this line to your application's Gemfile:
|
@@ -18,9 +19,7 @@ Or install it yourself as:
|
|
18
19
|
|
19
20
|
$ gem install cseg
|
20
21
|
|
21
|
-
you need to install CRF++ first and set the environment variables.
|
22
|
-
|
23
|
-
Here is the site of CRF++<http://crfpp.googlecode.com/svn/trunk/doc/index.html> and you should follow the manual
|
22
|
+
you need to install [CRF++](http://crfpp.googlecode.com/svn/trunk/doc/index.html?source=navbar) first and set the environment variables.
|
24
23
|
|
25
24
|
On GitHub the dictionary file was removed because it is quite large, but you can get everything from RubyGems.
|
26
25
|
|
@@ -34,17 +33,15 @@ Recall: 92.86%
|
|
34
33
|
|
35
34
|
## Usage
|
36
35
|
|
36
|
+
```
|
37
|
+
The default is Simplified Chinese
|
37
38
|
require "cseg"
|
38
|
-
a=Kurumi.segment("
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
39
|
+
a=Kurumi.segment("屌丝是一种自我讽刺。")
|
40
|
+
=>["屌丝", "是", "一", "种", "自我", "讽刺", "。"]
|
41
|
+
Use parameter "tr" to specify Traditional Chinese
|
42
|
+
a=Kurumi.segment("台妹真是正點。","tr")
|
43
|
+
=>["台妹", "真", "是", "正點", "。"]
|
43
44
|
|
44
|
-
|
45
|
+
```
|
45
46
|
|
46
|
-
|
47
|
-
2. Create your feature branch (`git checkout -b my-new-feature`)
|
48
|
-
3. Commit your changes (`git commit -am 'Add some feature'`)
|
49
|
-
4. Push to the branch (`git push origin my-new-feature`)
|
50
|
-
5. Create new Pull Request
|
47
|
+
## Contributing
|
data/cseg.gemspec
CHANGED
@@ -8,8 +8,8 @@ Gem::Specification.new do |gem|
|
|
8
8
|
gem.version = Cseg::VERSION
|
9
9
|
gem.authors = ["gyorou"]
|
10
10
|
gem.email = ["gyorou@tjjtds.com"]
|
11
|
-
gem.description = %q{"a chinese segmentation tool using CRF
|
12
|
-
gem.summary = %q{"
|
11
|
+
gem.description = %q{"a chinese segmentation tool using CRF"}
|
12
|
+
gem.summary = %q{""}
|
13
13
|
gem.homepage = ""
|
14
14
|
|
15
15
|
gem.files = [".gitignore",
|
@@ -17,6 +17,7 @@ Gem::Specification.new do |gem|
|
|
17
17
|
"README.md",
|
18
18
|
"Gemfile",
|
19
19
|
"data/pkumodle.data",
|
20
|
+
"data/as_training_less.data",
|
20
21
|
"lib/cseg/version.rb",
|
21
22
|
"lib/cseg.rb",
|
22
23
|
"cseg.gemspec",
|
data/lib/cseg.rb
CHANGED
@@ -1,23 +1,26 @@
|
|
1
1
|
# encoding:utf-8
|
2
2
|
require "cseg/version"
|
3
|
-
require "tempfile"
|
4
3
|
class Kurumi
|
5
4
|
# since crf++ can only read from file
|
6
|
-
@
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
5
|
+
@modle_sp=File.expand_path("../../data/as_training.data", __FILE__)
|
6
|
+
@modle_tr=File.expand_path("../../data/as_training_less.data", __FILE__)
|
7
|
+
def self.segment(str, mode="sp")
|
8
|
+
@result=Array.new
|
9
|
+
case mode
|
10
|
+
when "sp"
|
11
|
+
@modle=@modle_sp
|
12
|
+
when "tr"
|
13
|
+
@modle=@modle_tr
|
14
|
+
else
|
15
|
+
raise "no such parameter, please use sp or tr"
|
11
16
|
end
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
system("crf_test -m #{@modle} #{@tmp.path}>#{@resultfile.path}")
|
18
|
-
@resultfile.rewind
|
17
|
+
result_data = IO.popen "crf_test -m #{@modle}", 'r+' do |io|
|
18
|
+
io.puts *str.chars
|
19
|
+
io.close_write
|
20
|
+
io.read
|
21
|
+
end
|
19
22
|
word=""
|
20
|
-
|
23
|
+
result_data.each_line{|line|
|
21
24
|
token=line.chomp.split(" ")
|
22
25
|
if token[1]=="B"or token[1]=="O"
|
23
26
|
if word!=""
|
@@ -27,6 +30,7 @@ class Kurumi
|
|
27
30
|
elsif token[1]=="I"
|
28
31
|
word+=token[0]
|
29
32
|
else
|
33
|
+
#nil
|
30
34
|
if word!=""
|
31
35
|
@result.push(word)
|
32
36
|
word=""
|
@@ -34,12 +38,8 @@ class Kurumi
|
|
34
38
|
end
|
35
39
|
|
36
40
|
}
|
37
|
-
|
38
|
-
@resultfile.close(true)
|
39
|
-
@tmp.close(true)
|
40
41
|
return @result
|
41
42
|
end
|
42
43
|
end
|
43
44
|
|
44
|
-
|
45
|
-
# print result
|
45
|
+
print Kurumi.segment("屌丝是一种自我讽刺。")
|
data/lib/cseg/version.rb
CHANGED
metadata
CHANGED
@@ -1,17 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cseg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
5
|
-
prerelease:
|
4
|
+
version: '0.1'
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- gyorou
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date: 2014-
|
11
|
+
date: 2014-09-08 00:00:00.000000000 Z
|
13
12
|
dependencies: []
|
14
|
-
description:
|
13
|
+
description: '"a chinese segmentation tool using CRF"'
|
15
14
|
email:
|
16
15
|
- gyorou@tjjtds.com
|
17
16
|
executables: []
|
@@ -19,37 +18,37 @@ extensions: []
|
|
19
18
|
extra_rdoc_files: []
|
20
19
|
files:
|
21
20
|
- .gitignore
|
21
|
+
- Gemfile
|
22
22
|
- LICENSE.txt
|
23
23
|
- README.md
|
24
|
-
-
|
24
|
+
- Rakefile
|
25
|
+
- cseg.gemspec
|
26
|
+
- data/as_training_less.data
|
25
27
|
- data/pkumodle.data
|
26
|
-
- lib/cseg/version.rb
|
27
28
|
- lib/cseg.rb
|
28
|
-
- cseg.
|
29
|
-
- Rakefile
|
29
|
+
- lib/cseg/version.rb
|
30
30
|
homepage: ''
|
31
31
|
licenses: []
|
32
|
+
metadata: {}
|
32
33
|
post_install_message:
|
33
34
|
rdoc_options: []
|
34
35
|
require_paths:
|
35
36
|
- lib
|
36
37
|
- data
|
37
38
|
required_ruby_version: !ruby/object:Gem::Requirement
|
38
|
-
none: false
|
39
39
|
requirements:
|
40
|
-
- -
|
40
|
+
- - '>='
|
41
41
|
- !ruby/object:Gem::Version
|
42
42
|
version: '0'
|
43
43
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
44
|
-
none: false
|
45
44
|
requirements:
|
46
|
-
- -
|
45
|
+
- - '>='
|
47
46
|
- !ruby/object:Gem::Version
|
48
47
|
version: '0'
|
49
48
|
requirements: []
|
50
49
|
rubyforge_project:
|
51
|
-
rubygems_version:
|
50
|
+
rubygems_version: 2.4.1
|
52
51
|
signing_key:
|
53
|
-
specification_version:
|
54
|
-
summary:
|
52
|
+
specification_version: 4
|
53
|
+
summary: '""'
|
55
54
|
test_files: []
|