cseg 0.0.5 → 0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 2e72d6d7a3b9559d28b26af6b3b8e65749e4b3a3
4
+ data.tar.gz: d4c8bec5d8e91cb20e39df81dd0e69eef7b690f7
5
+ SHA512:
6
+ metadata.gz: bdf2ab1fea49f6773a5098983f6e11fe2b7afc416ab62df0bf01cdccce08e4e8fdcaf5688721dc66be5155ffae600f92866244ebc809418c6aa1838a73aac3fa
7
+ data.tar.gz: 3306478b36f552272388f4be4c64dbcea6f2a8d786faf47aadcbc99cc1b4ba181dce1557105fe015a6e9d2f9af4fdd5e3e11e8b7f0187bcafe84618b6eb6120f
data/.gitignore CHANGED
@@ -15,3 +15,4 @@ spec/reports
15
15
  test/tmp
16
16
  test/version_tmp
17
17
  tmp
18
+ data/
data/README.md CHANGED
@@ -1,9 +1,10 @@
1
- # Cseg
1
+ # Kurumi
2
2
 
3
3
  Use MIRA to train a large number of features.
4
4
 
5
- Segment chinese sentences into words in high speed and correctly.
5
+ Segment Chinese (both Traditional and Simplified) sentences into words quickly and accurately.
6
6
 
7
+ Note that the name of the gem is different from the repo name!
7
8
  ## Installation
8
9
 
9
10
  Add this line to your application's Gemfile:
@@ -18,9 +19,7 @@ Or install it yourself as:
18
19
 
19
20
  $ gem install cseg
20
21
 
21
- you need to install CRF++ first and set the environment variables.
22
-
23
- Here is the site of CRF++<http://crfpp.googlecode.com/svn/trunk/doc/index.html> and you should follow the manual
22
+ You need to install [CRF++](http://crfpp.googlecode.com/svn/trunk/doc/index.html?source=navbar) first and set the environment variables so that the `crf_test` command can be found.
24
23
 
25
24
  On GitHub the dictionary file was removed because it is quite large, but you can get everything from RubyGems.
26
25
 
@@ -34,17 +33,15 @@ Recall: 92.86%
34
33
 
35
34
  ## Usage
36
35
 
36
+ ```
37
+ The default is Simplified Chinese
37
38
  require "cseg"
38
- a=Kurumi.segment("屌丝是一种自我讽刺")
39
- =>屌丝/是/一/种/自我/讽刺
40
-
41
- the result will be an array.
42
-
39
+ a=Kurumi.segment("屌丝是一种自我讽刺。")
40
+ =>["屌丝", "是", "一", "种", "自我", "讽刺", "。"]
41
+ Use parameter "tr" to specify Traditional Chinese
42
+ a=Kurumi.segment("台妹真是正點。","tr")
43
+ =>["台妹", "真", "是", "正點", "。"]
43
44
 
44
- ## Contributing
45
+ ```
45
46
 
46
- 1. Fork it
47
- 2. Create your feature branch (`git checkout -b my-new-feature`)
48
- 3. Commit your changes (`git commit -am 'Add some feature'`)
49
- 4. Push to the branch (`git push origin my-new-feature`)
50
- 5. Create new Pull Request
47
+ ## Contributing
@@ -8,8 +8,8 @@ Gem::Specification.new do |gem|
8
8
  gem.version = Cseg::VERSION
9
9
  gem.authors = ["gyorou"]
10
10
  gem.email = ["gyorou@tjjtds.com"]
11
- gem.description = %q{"a chinese segmentation tool using CRF++"}
12
- gem.summary = %q{"CRF++ should be installed and set in the environment variables"}
11
+ gem.description = %q{"a chinese segmentation tool using CRF"}
12
+ gem.summary = %q{""}
13
13
  gem.homepage = ""
14
14
 
15
15
  gem.files = [".gitignore",
@@ -17,6 +17,7 @@ Gem::Specification.new do |gem|
17
17
  "README.md",
18
18
  "Gemfile",
19
19
  "data/pkumodle.data",
20
+ "data/as_training_less.data",
20
21
  "lib/cseg/version.rb",
21
22
  "lib/cseg.rb",
22
23
  "cseg.gemspec",
@@ -1,23 +1,26 @@
1
1
  # encoding:utf-8
2
2
  require "cseg/version"
3
- require "tempfile"
4
3
  class Kurumi
5
4
  # since crf++ can only read from file
6
- @modle=File.expand_path("../../data/pkumodle.data", __FILE__)
7
- def self.segment(str)
8
- tmpstr=""
9
- for i in 0..str.length-1
10
- tmpstr+=str[i]+"\n"
5
+ @modle_sp=File.expand_path("../../data/as_training.data", __FILE__)
6
+ @modle_tr=File.expand_path("../../data/as_training_less.data", __FILE__)
7
+ def self.segment(str, mode="sp")
8
+ @result=Array.new
9
+ case mode
10
+ when "sp"
11
+ @modle=@modle_sp
12
+ when "tr"
13
+ @modle=@modle_tr
14
+ else
15
+ raise "no such parameter, please use sp or tr"
11
16
  end
12
- @tmp=Tempfile::new("tmp")
13
- @resultfile=Tempfile::new("result")
14
- @tmp.write(tmpstr)
15
- @tmp.rewind
16
- @result=Array.new
17
- system("crf_test -m #{@modle} #{@tmp.path}>#{@resultfile.path}")
18
- @resultfile.rewind
17
+ result_data = IO.popen "crf_test -m #{@modle}", 'r+' do |io|
18
+ io.puts *str.chars
19
+ io.close_write
20
+ io.read
21
+ end
19
22
  word=""
20
- @resultfile.read.each_line{|line|
23
+ result_data.each_line{|line|
21
24
  token=line.chomp.split(" ")
22
25
  if token[1]=="B"or token[1]=="O"
23
26
  if word!=""
@@ -27,6 +30,7 @@ class Kurumi
27
30
  elsif token[1]=="I"
28
31
  word+=token[0]
29
32
  else
33
+ #nil
30
34
  if word!=""
31
35
  @result.push(word)
32
36
  word=""
@@ -34,12 +38,8 @@ class Kurumi
34
38
  end
35
39
 
36
40
  }
37
-
38
- @resultfile.close(true)
39
- @tmp.close(true)
40
41
  return @result
41
42
  end
42
43
  end
43
44
 
44
- # result=Kurumi.segment("屌丝是一种生活态度")
45
- # print result
45
+ print Kurumi.segment("屌丝是一种自我讽刺。")
@@ -1,3 +1,3 @@
1
1
  module Cseg
2
- VERSION = "0.0.5"
2
+ VERSION = "0.1"
3
3
  end
metadata CHANGED
@@ -1,17 +1,16 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cseg
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
5
- prerelease:
4
+ version: '0.1'
6
5
  platform: ruby
7
6
  authors:
8
7
  - gyorou
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2014-02-25 00:00:00.000000000 Z
11
+ date: 2014-09-08 00:00:00.000000000 Z
13
12
  dependencies: []
14
- description: ! '"a chinese segmentation tool using CRF++"'
13
+ description: '"a chinese segmentation tool using CRF"'
15
14
  email:
16
15
  - gyorou@tjjtds.com
17
16
  executables: []
@@ -19,37 +18,37 @@ extensions: []
19
18
  extra_rdoc_files: []
20
19
  files:
21
20
  - .gitignore
21
+ - Gemfile
22
22
  - LICENSE.txt
23
23
  - README.md
24
- - Gemfile
24
+ - Rakefile
25
+ - cseg.gemspec
26
+ - data/as_training_less.data
25
27
  - data/pkumodle.data
26
- - lib/cseg/version.rb
27
28
  - lib/cseg.rb
28
- - cseg.gemspec
29
- - Rakefile
29
+ - lib/cseg/version.rb
30
30
  homepage: ''
31
31
  licenses: []
32
+ metadata: {}
32
33
  post_install_message:
33
34
  rdoc_options: []
34
35
  require_paths:
35
36
  - lib
36
37
  - data
37
38
  required_ruby_version: !ruby/object:Gem::Requirement
38
- none: false
39
39
  requirements:
40
- - - ! '>='
40
+ - - '>='
41
41
  - !ruby/object:Gem::Version
42
42
  version: '0'
43
43
  required_rubygems_version: !ruby/object:Gem::Requirement
44
- none: false
45
44
  requirements:
46
- - - ! '>='
45
+ - - '>='
47
46
  - !ruby/object:Gem::Version
48
47
  version: '0'
49
48
  requirements: []
50
49
  rubyforge_project:
51
- rubygems_version: 1.8.28
50
+ rubygems_version: 2.4.1
52
51
  signing_key:
53
- specification_version: 3
54
- summary: ! '"CRF++ should be installed and set in the environment variables"'
52
+ specification_version: 4
53
+ summary: '""'
55
54
  test_files: []