jieba_rb 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 122f7b0e8353ea96f9eecc620c894d8dc27b3402
4
- data.tar.gz: 8ceb3c44c9957f8a080653f091580d6ea24a7e48
3
+ metadata.gz: cf9ad1c59145d326f4edd1c639066e107b52c13a
4
+ data.tar.gz: 4732514648268711284a8198a4a324889636112d
5
5
  SHA512:
6
- metadata.gz: 39f49d5d34221863aab6864465a8671a662e4d94fdb93ffe0439fd2f89d7b63d983002c031abb54e651a5a2f99d220f4ec27b24778774f93497dd9ecf9e788df
7
- data.tar.gz: 9232b90f160726d296d64c95bbb39545ebb1ed362fef0463fdd30a9d7777dedfe37cbca162cdf31618cf3cc1502c7db5de815866fba80956dc97325bbe881372
6
+ metadata.gz: 8fdc938f12a9506eb46baef952fd6bd5a261803934673273ba33f8e57cccd614ea31725dc58262b1e6c24828db9f8787027f8e95f4e614504eb943cf75dadf04
7
+ data.tar.gz: cd5d7df0f5f46cf4a7f8553d9f09cf05d0aeceb2f7b211a9238b9596aa43c3bd395425d2d278fc971bc3f224ac764b55cdfe88506c1b77a0ab8bcb927b206b48
data/.travis.yml CHANGED
@@ -2,5 +2,13 @@ language: ruby
2
2
  rvm:
3
3
  - 1.9.3
4
4
  - 2.0.0
5
- - 2.1.2
5
+ - 2.1.5
6
+ - ruby-head
7
+ - rbx-2
8
+
6
9
  before_script: rake compile
10
+
11
+ matrix:
12
+ allow_failures:
13
+ - rvm: ruby-head
14
+ - rvm: rbx-2
data/README.md CHANGED
@@ -23,24 +23,42 @@ Or install it yourself as:
23
23
 
24
24
  ## Word segment Usage
25
25
 
26
- Mix Segment (HMM with Max Prob, default):
26
+ Mix Segment mode (HMM combined with Max Probability; this is the default):
27
27
 
28
28
  require 'jieba_rb'
29
- seg = JiebaRb::Segment.new type: :mix
29
+ seg = JiebaRb::Segment.new # equivalent to "JiebaRb::Segment.new mode: :mix"
30
30
  words = seg.cut "令狐冲是云计算行业的专家"
31
31
  # 令狐冲 是 云 计算 行业 的 专家
32
32
 
33
- Mix Segment with user-defined dictionary:
33
+ Mix Segment mode with user-defined dictionary:
34
34
 
35
- seg = JiebaRb::Segment.new type: :mix, user_dict: "ext/cppjieba/dict/user.dict.utf8"
35
+ seg = JiebaRb::Segment.new mode: :mix, user_dict: "ext/cppjieba/dict/user.dict.utf8"
36
36
  words = seg.cut "令狐冲是云计算行业的专家"
37
37
  # 令狐冲 是 云计算 行业 的 专家
38
38
 
39
- HMM or Max probability (mp) Segment:
39
+ HMM or Max probability (mp) Segment mode:
40
40
 
41
- seg = JiebaRb::Segment.new type: :hmm # or type: :mp
41
+ seg = JiebaRb::Segment.new mode: :hmm # or mode: :mp
42
42
  words = seg.cut "令狐冲是云计算行业的专家"
43
43
 
44
+ ## Keyword Extractor Usage
45
+
46
+ * Only TF-IDF is supported currently
47
+
48
+ ```
49
+ keyword = JiebaRb::Keyword.new
50
+ keywords_weights = keyword.extract "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", 5
51
+
52
+ [
53
+ ["CEO", 11.739204307083542],
54
+ ["升职", 10.8561552143],
55
+ ["加薪", 10.642581114],
56
+ ["手扶拖拉机", 10.0088573539],
57
+ ["巅峰", 9.49395840471]
58
+ ]
59
+ ```
60
+
61
+
44
62
 
45
63
  ## Contributing
46
64
 
data/Rakefile CHANGED
@@ -7,5 +7,9 @@ Rake::ExtensionTask.new "jieba"
7
7
  Rake::TestTask.new do |t|
8
8
  t.libs << 'test'
9
9
  end
10
-
10
+ desc "clean compile files"
11
+ task :clean_compile do
12
+ system "rm -r tmp"
13
+ system "rm lib/*.bundle"
14
+ end
11
15
  task :default => :test
data/ext/jieba/jieba.c CHANGED
@@ -6,4 +6,5 @@ void Init_jieba()
6
6
  mJieba = rb_define_module("JiebaRb");
7
7
 
8
8
  Init_segment();
9
+ Init_keyword();
9
10
  }
data/ext/jieba/jieba.h CHANGED
@@ -3,6 +3,7 @@
3
3
 
4
4
  #include <ruby.h>
5
5
  #include <segment.h>
6
+ #include <keyword.h>
6
7
 
7
8
  extern VALUE mJieba;
8
9
 
@@ -0,0 +1,98 @@
1
+ #include "segment.h"
2
+ #include <ruby/encoding.h>
3
+ #include <KeywordExtractor.hpp>
4
+
5
+ static rb_encoding* u8_enc;
6
+
7
+ struct Keyword{
8
+ CppJieba::KeywordExtractor * p;
9
+ };
10
+
11
+ static void keyword_free(void *p){
12
+ delete ((Keyword*) p) -> p;
13
+ delete (Keyword*)p;
14
+ }
15
+
16
+ static VALUE allocate(VALUE klass)
17
+ {
18
+ Keyword * keyword = new Keyword();
19
+ return Data_Wrap_Struct(klass, NULL, keyword_free, keyword);
20
+ }
21
+
22
+ static void init(VALUE self,
23
+ VALUE mode_rb_sym,
24
+ VALUE jieba_dict_rbs,
25
+ VALUE hmm_dict_rbs,
26
+ VALUE idf_rbs,
27
+ VALUE stop_words_rbs,
28
+ VALUE user_dict_rbs)
29
+ {
30
+ Keyword * keyword;
31
+ Data_Get_Struct(self, Keyword, keyword);
32
+
33
+ Check_Type(jieba_dict_rbs, T_STRING);
34
+ Check_Type(hmm_dict_rbs, T_STRING);
35
+ Check_Type(user_dict_rbs, T_STRING);
36
+ Check_Type(idf_rbs, T_STRING);
37
+ Check_Type(stop_words_rbs, T_STRING);
38
+
39
+ std::string jieba_dict = StringValueCStr(jieba_dict_rbs);
40
+ std::string hmm_dict = StringValueCStr(hmm_dict_rbs);
41
+ std::string idf = StringValueCStr(idf_rbs);
42
+ std::string stop_words = StringValueCStr(stop_words_rbs);
43
+ std::string user_dict = StringValueCStr(user_dict_rbs);
44
+
45
+ ID mode = SYM2ID(mode_rb_sym);
46
+ if ( mode == rb_intern("tf_idf") )
47
+ {
48
+ keyword->p = new CppJieba::KeywordExtractor(jieba_dict, hmm_dict, idf, stop_words);
49
+ }
50
+ }
51
+
52
+ static VALUE extract(VALUE self, VALUE text_rbs, VALUE topN)
53
+ {
54
+ Check_Type(text_rbs, T_STRING);
55
+ std::string text = StringValueCStr(text_rbs);
56
+
57
+ Check_Type(topN, T_FIXNUM);
58
+ int top_n = NUM2INT(topN);
59
+
60
+ Keyword * keyword;
61
+ Data_Get_Struct(self, Keyword, keyword);
62
+
63
+ std::vector<std::pair<std::string, double> > top_words;
64
+
65
+ if (keyword->p->extract(text, top_words, top_n))
66
+ {
67
+ volatile VALUE arr = rb_ary_new();
68
+ for(size_t i = 0; i < top_words.size(); i++)
69
+ {
70
+ volatile VALUE inner_arr = rb_ary_new();
71
+ std::string & word = top_words[i].first;
72
+ rb_ary_push(inner_arr, rb_enc_str_new(word.c_str(), word.length(), u8_enc));
73
+ rb_ary_push(inner_arr, rb_float_new(top_words[i].second));
74
+
75
+ rb_ary_push(arr, inner_arr);
76
+
77
+ }
78
+ return arr;
79
+ }
80
+ else
81
+ {
82
+ return Qfalse;
83
+ }
84
+ }
85
+
86
+ #define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
87
+
88
+ extern "C" {
89
+ void Init_keyword()
90
+ {
91
+ VALUE cKeyword = rb_define_class_under(mJieba, "Keyword", rb_cObject);
92
+ u8_enc = rb_utf8_encoding();
93
+ rb_define_alloc_func(cKeyword, allocate);
94
+ DEF(cKeyword, "_init", init, 6);
95
+ DEF(cKeyword, "extract",extract,2);
96
+ }
97
+
98
+ }
@@ -0,0 +1,17 @@
1
+ #ifndef RUBY_JIEBA_KEYWORD
2
+ #define RUBY_JIEBA_KEYWORD
3
+
4
+ #include <jieba.h>
5
+
6
+ #ifdef __cplusplus
7
+ extern "C" {
8
+ #endif
9
+
10
+ void Init_keyword();
11
+
12
+ #ifdef __cplusplus
13
+ }
14
+ #endif
15
+
16
+
17
+ #endif
@@ -1,3 +1,3 @@
1
1
  module JiebaRb
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
data/lib/jieba_rb.rb CHANGED
@@ -1,28 +1,55 @@
1
1
  require "jieba_rb/version"
2
2
  require "jieba"
3
3
  module JiebaRb
4
+ abs = File.expand_path File.dirname(__FILE__)
5
+ EXT_BASE = "#{abs}/../ext/cppjieba/"
6
+ DEFAULT_JIEBA_DICT = EXT_BASE + "dict/jieba.dict.utf8";
7
+ DEFAULT_HMM_DICT = EXT_BASE + "dict/hmm_model.utf8";
8
+ DEFAULT_USER_DICT = EXT_BASE + "dict/user.dict.utf8";
9
+
4
10
  class Segment
5
- abs = File.expand_path File.dirname(__FILE__)
6
- EXT_BASE = "#{abs}/../ext/cppjieba/"
7
- JIEBA_DICT_FILE = EXT_BASE + "dict/jieba.dict.utf8";
8
- HMM_DICT_FILE = EXT_BASE + "dict/hmm_model.utf8";
9
- USER_DICT_FILE = EXT_BASE + "dict/user.dict.utf8";
11
+ private :_init
12
+ def initialize opts = {}
13
+ valid_modes = [:mix, :hmm, :mp]
14
+ if mode = opts[:mode]
15
+ raise "Mode must be one of :mix :hmm :mp" unless valid_modes.include? mode
16
+ else
17
+ mode = :mix #default
18
+ end
19
+
20
+ jieba_dict = opts[:jieba_dict] || DEFAULT_JIEBA_DICT
21
+ hmm_dict = opts[:hmm_dict] || DEFAULT_HMM_DICT
22
+ user_dict = opts[:user_dict] || ""
23
+ user_dict = DEFAULT_USER_DICT if user_dict == :default
24
+
25
+ _init mode, jieba_dict, hmm_dict, user_dict
26
+ end
27
+ end
28
+
29
+ class Keyword
30
+ DEFAULT_IDF = EXT_BASE + "dict/idf.utf8"
31
+ DEFAULT_STOP_WORDS = EXT_BASE + "dict/stop_words.utf8"
10
32
 
11
33
  private :_init
34
+
12
35
  def initialize opts = {}
13
- valid_seg_types = [:mix, :hmm, :mp]
14
- if type = opts[:type]
15
- raise "Type must be one of :mix :hmm :mp" unless valid_seg_types.include? type
36
+ valid_modes = [:tf_idf]
37
+ if mode = opts[:mode]
38
+ raise "Mode must be one of :tf_idf" unless valid_modes.include? mode
16
39
  else
17
- type = :mix #default
40
+ mode = :tf_idf #default
18
41
  end
19
42
 
20
- jieba_dict = opts[:jieba_dict] || JIEBA_DICT_FILE
21
- hmm_dict = opts[:hmm_dict] || HMM_DICT_FILE
43
+ jieba_dict = opts[:jieba_dict] || DEFAULT_JIEBA_DICT
44
+ hmm_dict = opts[:hmm_dict] || DEFAULT_HMM_DICT
45
+ idf_path = opts[:idf] || DEFAULT_IDF
46
+ stop_words_path = opts[:stop_words] || DEFAULT_STOP_WORDS
47
+
22
48
  user_dict = opts[:user_dict] || ""
23
49
  user_dict = USER_DICT_FILE if user_dict == :default
24
50
 
25
- _init type, jieba_dict, hmm_dict, user_dict
51
+ _init mode, jieba_dict, hmm_dict, idf_path, stop_words_path, user_dict
26
52
  end
27
53
  end
54
+
28
55
  end
@@ -0,0 +1,17 @@
1
+ # coding: utf-8
2
+ require 'minitest/autorun'
3
+ require 'jieba_rb'
4
+ class JiebaTest < Minitest::Test
5
+ def test_keywords
6
+ keyword = JiebaRb::Keyword.new
7
+ keywords_weights = keyword.extract "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", 5
8
+
9
+ assert_equal [["CEO",
10
+ 11.739204307083542],
11
+ ["升职", 10.8561552143],
12
+ ["加薪", 10.642581114],
13
+ ["手扶拖拉机", 10.0088573539],
14
+ ["巅峰", 9.49395840471]], keywords_weights
15
+
16
+ end
17
+ end
data/test/test_segment.rb CHANGED
@@ -18,13 +18,13 @@ class JiebaTest < Minitest::Test
18
18
  end
19
19
 
20
20
  def test_hmm_segment
21
- seg = JiebaRb::Segment.new type: :hmm
21
+ seg = JiebaRb::Segment.new mode: :hmm
22
22
  words = seg.cut "令狐冲是云计算行业的专家"
23
23
  assert_equal %w(令狐冲 是 云计算 行业 的 专家), words
24
24
  end
25
25
 
26
26
  def test_max_prob_segment
27
- seg = JiebaRb::Segment.new type: :mp
27
+ seg = JiebaRb::Segment.new mode: :mp
28
28
  words = seg.cut "令狐冲是云计算行业的专家"
29
29
  assert_equal %w(令狐冲 是 云 计算 行业 的 专家), words
30
30
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jieba_rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Li
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-12-21 00:00:00.000000000 Z
11
+ date: 2014-12-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -81,15 +81,6 @@ files:
81
81
  - LICENSE.txt
82
82
  - README.md
83
83
  - Rakefile
84
- - ext/jieba/extconf.rb
85
- - ext/jieba/jieba.c
86
- - ext/jieba/jieba.h
87
- - ext/jieba/segment.cc
88
- - ext/jieba/segment.h
89
- - jieba_rb.gemspec
90
- - lib/jieba_rb.rb
91
- - lib/jieba_rb/version.rb
92
- - test/test_segment.rb
93
84
  - ext/cppjieba/.gitignore
94
85
  - ext/cppjieba/.travis.yml
95
86
  - ext/cppjieba/CMakeLists.txt
@@ -217,6 +208,18 @@ files:
217
208
  - ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest.cc
218
209
  - ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest_main.cc
219
210
  - ext/cppjieba/test/unittest/gtest_main.cpp
211
+ - ext/jieba/extconf.rb
212
+ - ext/jieba/jieba.c
213
+ - ext/jieba/jieba.h
214
+ - ext/jieba/keyword.cc
215
+ - ext/jieba/keyword.h
216
+ - ext/jieba/segment.cc
217
+ - ext/jieba/segment.h
218
+ - jieba_rb.gemspec
219
+ - lib/jieba_rb.rb
220
+ - lib/jieba_rb/version.rb
221
+ - test/test_keyword.rb
222
+ - test/test_segment.rb
220
223
  homepage: https://github.com/altkatz/jieba_rb
221
224
  licenses:
222
225
  - MIT
@@ -237,10 +240,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
237
240
  version: '0'
238
241
  requirements: []
239
242
  rubyforge_project:
240
- rubygems_version: 2.1.11
243
+ rubygems_version: 2.4.5
241
244
  signing_key:
242
245
  specification_version: 4
243
246
  summary: cppjieba binding for ruby
244
247
  test_files:
248
+ - test/test_keyword.rb
245
249
  - test/test_segment.rb
246
250
  has_rdoc: