cseg 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +12 -12
- data/cseg.gemspec +1 -1
- data/lib/cseg.rb +1 -1
- data/lib/cseg/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 50e095f936a44f0a92394260ad6f82b6e0b3391d
|
4
|
+
data.tar.gz: ce17fc8ecaa4d4f0b15566ebf5225b2c312d8940
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 284d67a876fa1d382801e1801dd0f2813f39ecf9a5753da7308e026646bb3bf036d24d811c75e69df1234b52b31c8d16c316eea11b2c60d95f85c51e79dc4962
|
7
|
+
data.tar.gz: d336a86d2239e2126360d8f5ee27328892c7659d2822c58d135b736d2f38ebdfc0ad7e0abff37aae23fc7dbba85136d6afdcb628f62070dba7b61259f6f99ac4
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -1,10 +1,11 @@
|
|
1
|
-
# Kurumi
|
1
|
+
# Cseg(Kurumi)
|
2
2
|
|
3
3
|
Use MIRA to train a large amount of features.
|
4
4
|
|
5
|
-
Segment chinese(both traditional and
|
5
|
+
Segment chinese(both traditional and simplified) sentences into words in high speed and correctly.
|
6
6
|
|
7
7
|
take care the name of the gem is different from the repo name!
|
8
|
+
|
8
9
|
## Installation
|
9
10
|
|
10
11
|
Add this line to your application's Gemfile:
|
@@ -23,7 +24,7 @@ you need to install [CRF++](http://crfpp.googlecode.com/svn/trunk/doc/index.html
|
|
23
24
|
|
24
25
|
On github the dictionary file was deleted since it is quite large, though you can get all from rubygems.
|
25
26
|
|
26
|
-
##
|
27
|
+
## Recall and Precision
|
27
28
|
|
28
29
|
Tested on seghanbakeoff pku test set
|
29
30
|
|
@@ -33,15 +34,14 @@ Recall: 92.86%
|
|
33
34
|
|
34
35
|
## Usage
|
35
36
|
|
36
|
-
```
|
37
|
-
The default is Simplified Chinese
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
Use parameter "tr" to specify Traditional Chinese
|
42
|
-
|
43
|
-
|
37
|
+
```ruby
|
38
|
+
#The default is Simplified Chinese
|
39
|
+
require "cseg"
|
40
|
+
Kurumi.segment("屌丝是一种自我讽刺。")
|
41
|
+
#=>["屌丝", "是", "一", "种", "自我", "讽刺", "。"]
|
42
|
+
#Use parameter "tr" to specify Traditional Chinese
|
43
|
+
Kurumi.segment("台妹真是正點。","tr")
|
44
|
+
#=>["台妹", "真", "是", "正點", "。"]
|
44
45
|
|
45
46
|
```
|
46
47
|
|
47
|
-
## Contributing
|
data/cseg.gemspec
CHANGED
data/lib/cseg.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
require "cseg/version"
|
3
3
|
class Kurumi
|
4
4
|
# since crf++ can only read from file
|
5
|
-
@modle_sp=File.expand_path("../../data/
|
5
|
+
@modle_sp=File.expand_path("../../data/pku_training.data", __FILE__)
|
6
6
|
@modle_tr=File.expand_path("../../data/as_training_less.data", __FILE__)
|
7
7
|
def self.segment(str, mode="sp")
|
8
8
|
@result=Array.new
|
data/lib/cseg/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cseg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- gyorou
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-11-21 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: '"a chinese segmentation tool using CRF"'
|
14
14
|
email:
|
@@ -24,7 +24,7 @@ files:
|
|
24
24
|
- Rakefile
|
25
25
|
- cseg.gemspec
|
26
26
|
- data/as_training_less.data
|
27
|
-
- data/
|
27
|
+
- data/pku_training.data
|
28
28
|
- lib/cseg.rb
|
29
29
|
- lib/cseg/version.rb
|
30
30
|
homepage: ''
|