jieba_rb 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +9 -1
- data/README.md +24 -6
- data/Rakefile +5 -1
- data/ext/jieba/jieba.c +1 -0
- data/ext/jieba/jieba.h +1 -0
- data/ext/jieba/keyword.cc +98 -0
- data/ext/jieba/keyword.h +17 -0
- data/lib/jieba_rb/version.rb +1 -1
- data/lib/jieba_rb.rb +39 -12
- data/test/test_keyword.rb +17 -0
- data/test/test_segment.rb +2 -2
- metadata +16 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cf9ad1c59145d326f4edd1c639066e107b52c13a
|
4
|
+
data.tar.gz: 4732514648268711284a8198a4a324889636112d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8fdc938f12a9506eb46baef952fd6bd5a261803934673273ba33f8e57cccd614ea31725dc58262b1e6c24828db9f8787027f8e95f4e614504eb943cf75dadf04
|
7
|
+
data.tar.gz: cd5d7df0f5f46cf4a7f8553d9f09cf05d0aeceb2f7b211a9238b9596aa43c3bd395425d2d278fc971bc3f224ac764b55cdfe88506c1b77a0ab8bcb927b206b48
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -23,24 +23,42 @@ Or install it yourself as:
|
|
23
23
|
|
24
24
|
## Word segment Usage
|
25
25
|
|
26
|
-
Mix Segment (HMM with Max Prob, default):
|
26
|
+
Mix Segment mode (HMM with Max Prob, default):
|
27
27
|
|
28
28
|
require 'jieba_rb'
|
29
|
-
seg = JiebaRb::Segment.new
|
29
|
+
seg = JiebaRb::Segment.new # equivalent to "JiebaRb::Segment.new mode: :mix"
|
30
30
|
words = seg.cut "令狐冲是云计算行业的专家"
|
31
31
|
# 令狐冲 是 云 计算 行业 的 专家
|
32
32
|
|
33
|
-
Mix Segment with user-defined dictionary:
|
33
|
+
Mix Segment mode with user-defined dictionary:
|
34
34
|
|
35
|
-
seg = JiebaRb::Segment.new
|
35
|
+
seg = JiebaRb::Segment.new mode: :mix, user_dict: "ext/cppjieba/dict/user.dict.utf8"
|
36
36
|
words = seg.cut "令狐冲是云计算行业的专家"
|
37
37
|
# 令狐冲 是 云计算 行业 的 专家
|
38
38
|
|
39
|
-
HMM or Max probability (mp) Segment:
|
39
|
+
HMM or Max probability (mp) Segment mode:
|
40
40
|
|
41
|
-
seg = JiebaRb::Segment.new
|
41
|
+
seg = JiebaRb::Segment.new mode: :hmm # or mode: :mp
|
42
42
|
words = seg.cut "令狐冲是云计算行业的专家"
|
43
43
|
|
44
|
+
## Keyword Extractor Usage
|
45
|
+
|
46
|
+
* Only TF-IDF is supported currently
|
47
|
+
|
48
|
+
```
|
49
|
+
keyword = JiebaRb::Keyword.new
|
50
|
+
keywords_weights = keyword.extract "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", 5
|
51
|
+
|
52
|
+
[
|
53
|
+
["CEO", 11.739204307083542],
|
54
|
+
["升职", 10.8561552143],
|
55
|
+
["加薪", 10.642581114],
|
56
|
+
["手扶拖拉机", 10.0088573539],
|
57
|
+
["巅峰", 9.49395840471]
|
58
|
+
]
|
59
|
+
```
|
60
|
+
|
61
|
+
|
44
62
|
|
45
63
|
## Contributing
|
46
64
|
|
data/Rakefile
CHANGED
data/ext/jieba/jieba.c
CHANGED
data/ext/jieba/jieba.h
CHANGED
@@ -0,0 +1,98 @@
|
|
1
|
+
#include "segment.h"
|
2
|
+
#include <ruby/encoding.h>
|
3
|
+
#include <KeywordExtractor.hpp>
|
4
|
+
|
5
|
+
static rb_encoding* u8_enc;
|
6
|
+
|
7
|
+
struct Keyword{
|
8
|
+
CppJieba::KeywordExtractor * p;
|
9
|
+
};
|
10
|
+
|
11
|
+
static void keyword_free(void *p){
|
12
|
+
delete ((Keyword*) p) -> p;
|
13
|
+
delete (Keyword*)p;
|
14
|
+
}
|
15
|
+
|
16
|
+
// Allocation function for the Keyword class: creates a new Keyword holder and
// wraps it in a Ruby object of `klass`. No mark function is supplied (NULL);
// keyword_free is registered to release the holder.
static VALUE allocate(VALUE klass)
{
    Keyword *holder = new Keyword();
    VALUE wrapped = Data_Wrap_Struct(klass, NULL, keyword_free, holder);
    return wrapped;
}
|
21
|
+
|
22
|
+
static void init(VALUE self,
|
23
|
+
VALUE mode_rb_sym,
|
24
|
+
VALUE jieba_dict_rbs,
|
25
|
+
VALUE hmm_dict_rbs,
|
26
|
+
VALUE idf_rbs,
|
27
|
+
VALUE stop_words_rbs,
|
28
|
+
VALUE user_dict_rbs)
|
29
|
+
{
|
30
|
+
Keyword * keyword;
|
31
|
+
Data_Get_Struct(self, Keyword, keyword);
|
32
|
+
|
33
|
+
Check_Type(jieba_dict_rbs, T_STRING);
|
34
|
+
Check_Type(hmm_dict_rbs, T_STRING);
|
35
|
+
Check_Type(user_dict_rbs, T_STRING);
|
36
|
+
Check_Type(idf_rbs, T_STRING);
|
37
|
+
Check_Type(stop_words_rbs, T_STRING);
|
38
|
+
|
39
|
+
std::string jieba_dict = StringValueCStr(jieba_dict_rbs);
|
40
|
+
std::string hmm_dict = StringValueCStr(hmm_dict_rbs);
|
41
|
+
std::string idf = StringValueCStr(idf_rbs);
|
42
|
+
std::string stop_words = StringValueCStr(stop_words_rbs);
|
43
|
+
std::string user_dict = StringValueCStr(user_dict_rbs);
|
44
|
+
|
45
|
+
ID mode = SYM2ID(mode_rb_sym);
|
46
|
+
if ( mode == rb_intern("tf_idf") )
|
47
|
+
{
|
48
|
+
keyword->p = new CppJieba::KeywordExtractor(jieba_dict, hmm_dict, idf, stop_words);
|
49
|
+
}
|
50
|
+
}
|
51
|
+
|
52
|
+
// Implementation of Keyword#extract.
//
//   text_rbs  String to extract keywords from.
//   topN      Fixnum forwarded to the extractor as the keyword count.
//
// Returns an Array of [word, weight] pairs (word as a UTF-8 String, weight as
// a Float) when the extractor reports success, or false otherwise.
// Raises TypeError for arguments of the wrong type and RuntimeError when the
// object has no extractor yet (_init has not run).
static VALUE extract(VALUE self, VALUE text_rbs, VALUE topN)
{
    // Argument validation can raise, so it runs before any C++ object with a
    // destructor is constructed.
    Check_Type(text_rbs, T_STRING);
    const char * text_c = StringValueCStr(text_rbs);

    Check_Type(topN, T_FIXNUM);
    int top_n = NUM2INT(topN);

    Keyword * keyword;
    Data_Get_Struct(self, Keyword, keyword);

    // allocate() leaves the extractor pointer NULL until init() sets it;
    // fail with a Ruby exception instead of dereferencing NULL.
    if (keyword->p == NULL)
    {
        rb_raise(rb_eRuntimeError, "Keyword extractor is not initialized");
    }

    std::string text(text_c);
    std::vector<std::pair<std::string, double> > top_words;

    if (!keyword->p->extract(text, top_words, top_n))
    {
        return Qfalse;
    }

    volatile VALUE arr = rb_ary_new();
    for (size_t i = 0; i < top_words.size(); i++)
    {
        volatile VALUE inner_arr = rb_ary_new();
        std::string & word = top_words[i].first;
        rb_ary_push(inner_arr, rb_enc_str_new(word.c_str(), word.length(), u8_enc));
        rb_ary_push(inner_arr, rb_float_new(top_words[i].second));

        rb_ary_push(arr, inner_arr);
    }
    return arr;
}
|
85
|
+
|
86
|
+
#define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
|
87
|
+
|
88
|
+
extern "C" {
|
89
|
+
void Init_keyword()
|
90
|
+
{
|
91
|
+
VALUE cKeyword = rb_define_class_under(mJieba, "Keyword", rb_cObject);
|
92
|
+
u8_enc = rb_utf8_encoding();
|
93
|
+
rb_define_alloc_func(cKeyword, allocate);
|
94
|
+
DEF(cKeyword, "_init", init, 6);
|
95
|
+
DEF(cKeyword, "extract",extract,2);
|
96
|
+
}
|
97
|
+
|
98
|
+
}
|
data/ext/jieba/keyword.h
ADDED
data/lib/jieba_rb/version.rb
CHANGED
data/lib/jieba_rb.rb
CHANGED
@@ -1,28 +1,55 @@
|
|
1
1
|
require "jieba_rb/version"
|
2
2
|
require "jieba"
|
3
3
|
module JiebaRb
|
4
|
+
abs = File.expand_path File.dirname(__FILE__)
|
5
|
+
EXT_BASE = "#{abs}/../ext/cppjieba/"
|
6
|
+
DEFAULT_JIEBA_DICT = EXT_BASE + "dict/jieba.dict.utf8";
|
7
|
+
DEFAULT_HMM_DICT = EXT_BASE + "dict/hmm_model.utf8";
|
8
|
+
DEFAULT_USER_DICT = EXT_BASE + "dict/user.dict.utf8";
|
9
|
+
|
4
10
|
class Segment
  private :_init

  # Creates a segmenter.
  #
  # opts - Hash of options, all optional:
  #   :mode       - one of :mix, :hmm, :mp (default :mix).
  #   :jieba_dict - passed to _init; defaults to DEFAULT_JIEBA_DICT.
  #   :hmm_dict   - passed to _init; defaults to DEFAULT_HMM_DICT.
  #   :user_dict  - passed to _init; :default selects DEFAULT_USER_DICT,
  #                 otherwise defaults to "".
  #
  # Raises RuntimeError when :mode is given but is not one of the valid modes.
  def initialize opts = {}
    mode = opts[:mode] || :mix # :mix is the default mode
    unless [:mix, :hmm, :mp].include? mode
      raise "Mode must be one of :mix :hmm :mp"
    end

    dict_path = opts[:jieba_dict] || DEFAULT_JIEBA_DICT
    hmm_path = opts[:hmm_dict] || DEFAULT_HMM_DICT

    requested_user_dict = opts[:user_dict] || ""
    user_dict = requested_user_dict == :default ? DEFAULT_USER_DICT : requested_user_dict

    _init mode, dict_path, hmm_path, user_dict
  end
end
|
28
|
+
|
29
|
+
class Keyword
  DEFAULT_IDF = EXT_BASE + "dict/idf.utf8"
  DEFAULT_STOP_WORDS = EXT_BASE + "dict/stop_words.utf8"

  private :_init

  # Creates a keyword extractor.
  #
  # opts - Hash of options, all optional:
  #   :mode       - extraction mode; only :tf_idf is accepted (the default).
  #   :jieba_dict - passed to _init; defaults to DEFAULT_JIEBA_DICT.
  #   :hmm_dict   - passed to _init; defaults to DEFAULT_HMM_DICT.
  #   :idf        - passed to _init; defaults to DEFAULT_IDF.
  #   :stop_words - passed to _init; defaults to DEFAULT_STOP_WORDS.
  #   :user_dict  - passed to _init; :default selects DEFAULT_USER_DICT,
  #                 otherwise defaults to "".
  #
  # Raises RuntimeError when :mode is given but is not a supported mode.
  def initialize opts = {}
    valid_modes = [:tf_idf]
    if mode = opts[:mode]
      raise "Mode must be one of :tf_idf" unless valid_modes.include? mode
    else
      mode = :tf_idf #default
    end

    jieba_dict = opts[:jieba_dict] || DEFAULT_JIEBA_DICT
    hmm_dict = opts[:hmm_dict] || DEFAULT_HMM_DICT
    idf_path = opts[:idf] || DEFAULT_IDF
    stop_words_path = opts[:stop_words] || DEFAULT_STOP_WORDS

    user_dict = opts[:user_dict] || ""
    # Use the module's DEFAULT_USER_DICT constant, as Segment does;
    # USER_DICT_FILE is not defined in this module.
    user_dict = DEFAULT_USER_DICT if user_dict == :default

    _init mode, jieba_dict, hmm_dict, idf_path, stop_words_path, user_dict
  end
end
|
54
|
+
|
28
55
|
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require 'minitest/autorun'
|
3
|
+
require 'jieba_rb'
|
4
|
+
class JiebaTest < Minitest::Test
  # Extracting the top 5 keywords with a default Keyword instance yields the
  # expected [word, weight] pairs in order.
  def test_keywords
    text = "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。"
    expected = [
      ["CEO", 11.739204307083542],
      ["升职", 10.8561552143],
      ["加薪", 10.642581114],
      ["手扶拖拉机", 10.0088573539],
      ["巅峰", 9.49395840471]
    ]

    extractor = JiebaRb::Keyword.new
    actual = extractor.extract text, 5

    assert_equal expected, actual
  end
end
|
data/test/test_segment.rb
CHANGED
@@ -18,13 +18,13 @@ class JiebaTest < Minitest::Test
|
|
18
18
|
end
|
19
19
|
|
20
20
|
# Segmenting in :hmm mode yields the expected word list.
def test_hmm_segment
  segmenter = JiebaRb::Segment.new mode: :hmm
  expected = %w(令狐冲 是 云计算 行业 的 专家)
  actual = segmenter.cut "令狐冲是云计算行业的专家"
  assert_equal expected, actual
end
|
25
25
|
|
26
26
|
# Segmenting in :mp mode yields the expected word list.
def test_max_prob_segment
  segmenter = JiebaRb::Segment.new mode: :mp
  expected = %w(令狐冲 是 云 计算 行业 的 专家)
  actual = segmenter.cut "令狐冲是云计算行业的专家"
  assert_equal expected, actual
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jieba_rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Li
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-12-
|
11
|
+
date: 2014-12-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -81,15 +81,6 @@ files:
|
|
81
81
|
- LICENSE.txt
|
82
82
|
- README.md
|
83
83
|
- Rakefile
|
84
|
-
- ext/jieba/extconf.rb
|
85
|
-
- ext/jieba/jieba.c
|
86
|
-
- ext/jieba/jieba.h
|
87
|
-
- ext/jieba/segment.cc
|
88
|
-
- ext/jieba/segment.h
|
89
|
-
- jieba_rb.gemspec
|
90
|
-
- lib/jieba_rb.rb
|
91
|
-
- lib/jieba_rb/version.rb
|
92
|
-
- test/test_segment.rb
|
93
84
|
- ext/cppjieba/.gitignore
|
94
85
|
- ext/cppjieba/.travis.yml
|
95
86
|
- ext/cppjieba/CMakeLists.txt
|
@@ -217,6 +208,18 @@ files:
|
|
217
208
|
- ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest.cc
|
218
209
|
- ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest_main.cc
|
219
210
|
- ext/cppjieba/test/unittest/gtest_main.cpp
|
211
|
+
- ext/jieba/extconf.rb
|
212
|
+
- ext/jieba/jieba.c
|
213
|
+
- ext/jieba/jieba.h
|
214
|
+
- ext/jieba/keyword.cc
|
215
|
+
- ext/jieba/keyword.h
|
216
|
+
- ext/jieba/segment.cc
|
217
|
+
- ext/jieba/segment.h
|
218
|
+
- jieba_rb.gemspec
|
219
|
+
- lib/jieba_rb.rb
|
220
|
+
- lib/jieba_rb/version.rb
|
221
|
+
- test/test_keyword.rb
|
222
|
+
- test/test_segment.rb
|
220
223
|
homepage: https://github.com/altkatz/jieba_rb
|
221
224
|
licenses:
|
222
225
|
- MIT
|
@@ -237,10 +240,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
237
240
|
version: '0'
|
238
241
|
requirements: []
|
239
242
|
rubyforge_project:
|
240
|
-
rubygems_version: 2.
|
243
|
+
rubygems_version: 2.4.5
|
241
244
|
signing_key:
|
242
245
|
specification_version: 4
|
243
246
|
summary: cppjieba binding for ruby
|
244
247
|
test_files:
|
248
|
+
- test/test_keyword.rb
|
245
249
|
- test/test_segment.rb
|
246
250
|
has_rdoc:
|