jieba_rb 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 122f7b0e8353ea96f9eecc620c894d8dc27b3402
4
- data.tar.gz: 8ceb3c44c9957f8a080653f091580d6ea24a7e48
3
+ metadata.gz: cf9ad1c59145d326f4edd1c639066e107b52c13a
4
+ data.tar.gz: 4732514648268711284a8198a4a324889636112d
5
5
  SHA512:
6
- metadata.gz: 39f49d5d34221863aab6864465a8671a662e4d94fdb93ffe0439fd2f89d7b63d983002c031abb54e651a5a2f99d220f4ec27b24778774f93497dd9ecf9e788df
7
- data.tar.gz: 9232b90f160726d296d64c95bbb39545ebb1ed362fef0463fdd30a9d7777dedfe37cbca162cdf31618cf3cc1502c7db5de815866fba80956dc97325bbe881372
6
+ metadata.gz: 8fdc938f12a9506eb46baef952fd6bd5a261803934673273ba33f8e57cccd614ea31725dc58262b1e6c24828db9f8787027f8e95f4e614504eb943cf75dadf04
7
+ data.tar.gz: cd5d7df0f5f46cf4a7f8553d9f09cf05d0aeceb2f7b211a9238b9596aa43c3bd395425d2d278fc971bc3f224ac764b55cdfe88506c1b77a0ab8bcb927b206b48
data/.travis.yml CHANGED
@@ -2,5 +2,13 @@ language: ruby
2
2
  rvm:
3
3
  - 1.9.3
4
4
  - 2.0.0
5
- - 2.1.2
5
+ - 2.1.5
6
+ - ruby-head
7
+ - rbx-2
8
+
6
9
  before_script: rake compile
10
+
11
+ matrix:
12
+ allow_failures:
13
+ - rvm: ruby-head
14
+ - rvm: rbx-2
data/README.md CHANGED
@@ -23,24 +23,42 @@ Or install it yourself as:
23
23
 
24
24
  ## Word segment Usage
25
25
 
26
- Mix Segment (HMM with Max Prob, default):
26
+ Mix Segment mode (HMM with Max Prob, default):
27
27
 
28
28
  require 'jieba_rb'
29
- seg = JiebaRb::Segment.new type: :mix
29
+ seg = JiebaRb::Segment.new # equivalent to "JiebaRb::Segment.new mode: :mix"
30
30
  words = seg.cut "令狐冲是云计算行业的专家"
31
31
  # 令狐冲 是 云 计算 行业 的 专家
32
32
 
33
- Mix Segment with user-defined dictionary:
33
+ Mix Segment mode with user-defined dictionary:
34
34
 
35
- seg = JiebaRb::Segment.new type: :mix, user_dict: "ext/cppjieba/dict/user.dict.utf8"
35
+ seg = JiebaRb::Segment.new mode: :mix, user_dict: "ext/cppjieba/dict/user.dict.utf8"
36
36
  words = seg.cut "令狐冲是云计算行业的专家"
37
37
  # 令狐冲 是 云计算 行业 的 专家
38
38
 
39
- HMM or Max probability (mp) Segment:
39
+ HMM or Max probability (mp) Segment mode:
40
40
 
41
- seg = JiebaRb::Segment.new type: :hmm # or type: :mp
41
+ seg = JiebaRb::Segment.new mode: :hmm # or mode: :mp
42
42
  words = seg.cut "令狐冲是云计算行业的专家"
43
43
 
44
+ ## Keyword Extractor Usage
45
+
46
+ * Only TF-IDF is supported currently
47
+
48
+ ```
49
+ keyword = JiebaRb::Keyword.new
50
+ keywords_weights = keyword.extract "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", 5
51
+
52
+ [
53
+ ["CEO", 11.739204307083542],
54
+ ["升职", 10.8561552143],
55
+ ["加薪", 10.642581114],
56
+ ["手扶拖拉机", 10.0088573539],
57
+ ["巅峰", 9.49395840471]
58
+ ]
59
+ ```
60
+
61
+
44
62
 
45
63
  ## Contributing
46
64
 
data/Rakefile CHANGED
@@ -7,5 +7,9 @@ Rake::ExtensionTask.new "jieba"
7
7
  Rake::TestTask.new do |t|
8
8
  t.libs << 'test'
9
9
  end
10
-
10
+ desc "clean compile files"
11
+ task :clean_compile do
12
+ system "rm -r tmp"
13
+ system "rm lib/*.bundle"
14
+ end
11
15
  task :default => :test
data/ext/jieba/jieba.c CHANGED
@@ -6,4 +6,5 @@ void Init_jieba()
6
6
  mJieba = rb_define_module("JiebaRb");
7
7
 
8
8
  Init_segment();
9
+ Init_keyword();
9
10
  }
data/ext/jieba/jieba.h CHANGED
@@ -3,6 +3,7 @@
3
3
 
4
4
  #include <ruby.h>
5
5
  #include <segment.h>
6
+ #include <keyword.h>
6
7
 
7
8
  extern VALUE mJieba;
8
9
 
@@ -0,0 +1,98 @@
1
+ #include "segment.h"
2
+ #include <ruby/encoding.h>
3
+ #include <KeywordExtractor.hpp>
4
+
5
+ static rb_encoding* u8_enc;
6
+
7
+ struct Keyword{
8
+ CppJieba::KeywordExtractor * p;
9
+ };
10
+
11
+ static void keyword_free(void *p){
12
+ delete ((Keyword*) p) -> p;
13
+ delete (Keyword*)p;
14
+ }
15
+
16
+ static VALUE allocate(VALUE klass)
17
+ {
18
+ Keyword * keyword = new Keyword();
19
+ return Data_Wrap_Struct(klass, NULL, keyword_free, keyword);
20
+ }
21
+
22
+ static void init(VALUE self,
23
+ VALUE mode_rb_sym,
24
+ VALUE jieba_dict_rbs,
25
+ VALUE hmm_dict_rbs,
26
+ VALUE idf_rbs,
27
+ VALUE stop_words_rbs,
28
+ VALUE user_dict_rbs)
29
+ {
30
+ Keyword * keyword;
31
+ Data_Get_Struct(self, Keyword, keyword);
32
+
33
+ Check_Type(jieba_dict_rbs, T_STRING);
34
+ Check_Type(hmm_dict_rbs, T_STRING);
35
+ Check_Type(user_dict_rbs, T_STRING);
36
+ Check_Type(idf_rbs, T_STRING);
37
+ Check_Type(stop_words_rbs, T_STRING);
38
+
39
+ std::string jieba_dict = StringValueCStr(jieba_dict_rbs);
40
+ std::string hmm_dict = StringValueCStr(hmm_dict_rbs);
41
+ std::string idf = StringValueCStr(idf_rbs);
42
+ std::string stop_words = StringValueCStr(stop_words_rbs);
43
+ std::string user_dict = StringValueCStr(user_dict_rbs);
44
+
45
+ ID mode = SYM2ID(mode_rb_sym);
46
+ if ( mode == rb_intern("tf_idf") )
47
+ {
48
+ keyword->p = new CppJieba::KeywordExtractor(jieba_dict, hmm_dict, idf, stop_words);
49
+ }
50
+ }
51
+
52
+ static VALUE extract(VALUE self, VALUE text_rbs, VALUE topN)
53
+ {
54
+ Check_Type(text_rbs, T_STRING);
55
+ std::string text = StringValueCStr(text_rbs);
56
+
57
+ Check_Type(topN, T_FIXNUM);
58
+ int top_n = NUM2INT(topN);
59
+
60
+ Keyword * keyword;
61
+ Data_Get_Struct(self, Keyword, keyword);
62
+
63
+ std::vector<std::pair<std::string, double> > top_words;
64
+
65
+ if (keyword->p->extract(text, top_words, top_n))
66
+ {
67
+ volatile VALUE arr = rb_ary_new();
68
+ for(size_t i = 0; i < top_words.size(); i++)
69
+ {
70
+ volatile VALUE inner_arr = rb_ary_new();
71
+ std::string & word = top_words[i].first;
72
+ rb_ary_push(inner_arr, rb_enc_str_new(word.c_str(), word.length(), u8_enc));
73
+ rb_ary_push(inner_arr, rb_float_new(top_words[i].second));
74
+
75
+ rb_ary_push(arr, inner_arr);
76
+
77
+ }
78
+ return arr;
79
+ }
80
+ else
81
+ {
82
+ return Qfalse;
83
+ }
84
+ }
85
+
86
+ #define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
87
+
88
+ extern "C" {
89
+ void Init_keyword()
90
+ {
91
+ VALUE cKeyword = rb_define_class_under(mJieba, "Keyword", rb_cObject);
92
+ u8_enc = rb_utf8_encoding();
93
+ rb_define_alloc_func(cKeyword, allocate);
94
+ DEF(cKeyword, "_init", init, 6);
95
+ DEF(cKeyword, "extract",extract,2);
96
+ }
97
+
98
+ }
@@ -0,0 +1,17 @@
1
+ #ifndef RUBY_JIEBA_KEYWORD
2
+ #define RUBY_JIEBA_KEYWORD
3
+
4
+ #include <jieba.h>
5
+
6
+ #ifdef __cplusplus
7
+ extern "C" {
8
+ #endif
9
+
10
+ void Init_keyword();
11
+
12
+ #ifdef __cplusplus
13
+ }
14
+ #endif
15
+
16
+
17
+ #endif
@@ -1,3 +1,3 @@
1
1
  module JiebaRb
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
data/lib/jieba_rb.rb CHANGED
@@ -1,28 +1,55 @@
1
1
  require "jieba_rb/version"
2
2
  require "jieba"
3
3
  module JiebaRb
4
+ abs = File.expand_path File.dirname(__FILE__)
5
+ EXT_BASE = "#{abs}/../ext/cppjieba/"
6
+ DEFAULT_JIEBA_DICT = EXT_BASE + "dict/jieba.dict.utf8";
7
+ DEFAULT_HMM_DICT = EXT_BASE + "dict/hmm_model.utf8";
8
+ DEFAULT_USER_DICT = EXT_BASE + "dict/user.dict.utf8";
9
+
4
10
  class Segment
5
- abs = File.expand_path File.dirname(__FILE__)
6
- EXT_BASE = "#{abs}/../ext/cppjieba/"
7
- JIEBA_DICT_FILE = EXT_BASE + "dict/jieba.dict.utf8";
8
- HMM_DICT_FILE = EXT_BASE + "dict/hmm_model.utf8";
9
- USER_DICT_FILE = EXT_BASE + "dict/user.dict.utf8";
11
+ private :_init
12
+ def initialize opts = {}
13
+ valid_modes = [:mix, :hmm, :mp]
14
+ if mode = opts[:mode]
15
+ raise "Mode must be one of :mix :hmm :mp" unless valid_modes.include? mode
16
+ else
17
+ mode = :mix #default
18
+ end
19
+
20
+ jieba_dict = opts[:jieba_dict] || DEFAULT_JIEBA_DICT
21
+ hmm_dict = opts[:hmm_dict] || DEFAULT_HMM_DICT
22
+ user_dict = opts[:user_dict] || ""
23
+ user_dict = DEFAULT_USER_DICT if user_dict == :default
24
+
25
+ _init mode, jieba_dict, hmm_dict, user_dict
26
+ end
27
+ end
28
+
29
+ class Keyword
30
+ DEFAULT_IDF = EXT_BASE + "dict/idf.utf8"
31
+ DEFAULT_STOP_WORDS = EXT_BASE + "dict/stop_words.utf8"
10
32
 
11
33
  private :_init
34
+
12
35
  def initialize opts = {}
13
- valid_seg_types = [:mix, :hmm, :mp]
14
- if type = opts[:type]
15
- raise "Type must be one of :mix :hmm :mp" unless valid_seg_types.include? type
36
+ valid_modes = [:tf_idf]
37
+ if mode = opts[:mode]
38
+ raise "Mode must be one of :tf_idf" unless valid_modes.include? mode
16
39
  else
17
- type = :mix #default
40
+ mode = :tf_idf #default
18
41
  end
19
42
 
20
- jieba_dict = opts[:jieba_dict] || JIEBA_DICT_FILE
21
- hmm_dict = opts[:hmm_dict] || HMM_DICT_FILE
43
+ jieba_dict = opts[:jieba_dict] || DEFAULT_JIEBA_DICT
44
+ hmm_dict = opts[:hmm_dict] || DEFAULT_HMM_DICT
45
+ idf_path = opts[:idf] || DEFAULT_IDF
46
+ stop_words_path = opts[:stop_words] || DEFAULT_STOP_WORDS
47
+
22
48
  user_dict = opts[:user_dict] || ""
23
49
  user_dict = USER_DICT_FILE if user_dict == :default
24
50
 
25
- _init type, jieba_dict, hmm_dict, user_dict
51
+ _init mode, jieba_dict, hmm_dict, idf_path, stop_words_path, user_dict
26
52
  end
27
53
  end
54
+
28
55
  end
@@ -0,0 +1,17 @@
1
+ # coding: utf-8
2
+ require 'minitest/autorun'
3
+ require 'jieba_rb'
4
+ class JiebaTest < Minitest::Test
5
+ def test_keywords
6
+ keyword = JiebaRb::Keyword.new
7
+ keywords_weights = keyword.extract "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", 5
8
+
9
+ assert_equal [["CEO",
10
+ 11.739204307083542],
11
+ ["升职", 10.8561552143],
12
+ ["加薪", 10.642581114],
13
+ ["手扶拖拉机", 10.0088573539],
14
+ ["巅峰", 9.49395840471]], keywords_weights
15
+
16
+ end
17
+ end
data/test/test_segment.rb CHANGED
@@ -18,13 +18,13 @@ class JiebaTest < Minitest::Test
18
18
  end
19
19
 
20
20
  def test_hmm_segment
21
- seg = JiebaRb::Segment.new type: :hmm
21
+ seg = JiebaRb::Segment.new mode: :hmm
22
22
  words = seg.cut "令狐冲是云计算行业的专家"
23
23
  assert_equal %w(令狐冲 是 云计算 行业 的 专家), words
24
24
  end
25
25
 
26
26
  def test_max_prob_segment
27
- seg = JiebaRb::Segment.new type: :mp
27
+ seg = JiebaRb::Segment.new mode: :mp
28
28
  words = seg.cut "令狐冲是云计算行业的专家"
29
29
  assert_equal %w(令狐冲 是 云 计算 行业 的 专家), words
30
30
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jieba_rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Li
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-12-21 00:00:00.000000000 Z
11
+ date: 2014-12-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -81,15 +81,6 @@ files:
81
81
  - LICENSE.txt
82
82
  - README.md
83
83
  - Rakefile
84
- - ext/jieba/extconf.rb
85
- - ext/jieba/jieba.c
86
- - ext/jieba/jieba.h
87
- - ext/jieba/segment.cc
88
- - ext/jieba/segment.h
89
- - jieba_rb.gemspec
90
- - lib/jieba_rb.rb
91
- - lib/jieba_rb/version.rb
92
- - test/test_segment.rb
93
84
  - ext/cppjieba/.gitignore
94
85
  - ext/cppjieba/.travis.yml
95
86
  - ext/cppjieba/CMakeLists.txt
@@ -217,6 +208,18 @@ files:
217
208
  - ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest.cc
218
209
  - ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest_main.cc
219
210
  - ext/cppjieba/test/unittest/gtest_main.cpp
211
+ - ext/jieba/extconf.rb
212
+ - ext/jieba/jieba.c
213
+ - ext/jieba/jieba.h
214
+ - ext/jieba/keyword.cc
215
+ - ext/jieba/keyword.h
216
+ - ext/jieba/segment.cc
217
+ - ext/jieba/segment.h
218
+ - jieba_rb.gemspec
219
+ - lib/jieba_rb.rb
220
+ - lib/jieba_rb/version.rb
221
+ - test/test_keyword.rb
222
+ - test/test_segment.rb
220
223
  homepage: https://github.com/altkatz/jieba_rb
221
224
  licenses:
222
225
  - MIT
@@ -237,10 +240,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
237
240
  version: '0'
238
241
  requirements: []
239
242
  rubyforge_project:
240
- rubygems_version: 2.1.11
243
+ rubygems_version: 2.4.5
241
244
  signing_key:
242
245
  specification_version: 4
243
246
  summary: cppjieba binding for ruby
244
247
  test_files:
248
+ - test/test_keyword.rb
245
249
  - test/test_segment.rb
246
250
  has_rdoc: