jieba_rb 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +9 -1
- data/README.md +24 -6
- data/Rakefile +5 -1
- data/ext/jieba/jieba.c +1 -0
- data/ext/jieba/jieba.h +1 -0
- data/ext/jieba/keyword.cc +98 -0
- data/ext/jieba/keyword.h +17 -0
- data/lib/jieba_rb/version.rb +1 -1
- data/lib/jieba_rb.rb +39 -12
- data/test/test_keyword.rb +17 -0
- data/test/test_segment.rb +2 -2
- metadata +16 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cf9ad1c59145d326f4edd1c639066e107b52c13a
|
4
|
+
data.tar.gz: 4732514648268711284a8198a4a324889636112d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8fdc938f12a9506eb46baef952fd6bd5a261803934673273ba33f8e57cccd614ea31725dc58262b1e6c24828db9f8787027f8e95f4e614504eb943cf75dadf04
|
7
|
+
data.tar.gz: cd5d7df0f5f46cf4a7f8553d9f09cf05d0aeceb2f7b211a9238b9596aa43c3bd395425d2d278fc971bc3f224ac764b55cdfe88506c1b77a0ab8bcb927b206b48
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -23,24 +23,42 @@ Or install it yourself as:
|
|
23
23
|
|
24
24
|
## Word segment Usage
|
25
25
|
|
26
|
-
Mix Segment (HMM with Max Prob, default):
|
26
|
+
Mix Segment mode (HMM with Max Prob, default):
|
27
27
|
|
28
28
|
require 'jieba_rb'
|
29
|
-
seg = JiebaRb::Segment.new
|
29
|
+
seg = JiebaRb::Segment.new # equivalent to "JiebaRb::Segment.new mode: :mix"
|
30
30
|
words = seg.cut "令狐冲是云计算行业的专家"
|
31
31
|
# 令狐冲 是 云 计算 行业 的 专家
|
32
32
|
|
33
|
-
Mix Segment with user-defined dictionary:
|
33
|
+
Mix Segment mode with user-defined dictionary:
|
34
34
|
|
35
|
-
seg = JiebaRb::Segment.new
|
35
|
+
seg = JiebaRb::Segment.new mode: :mix, user_dict: "ext/cppjieba/dict/user.dict.utf8"
|
36
36
|
words = seg.cut "令狐冲是云计算行业的专家"
|
37
37
|
# 令狐冲 是 云计算 行业 的 专家
|
38
38
|
|
39
|
-
HMM or Max probability (mp) Segment:
|
39
|
+
HMM or Max probability (mp) Segment mode:
|
40
40
|
|
41
|
-
seg = JiebaRb::Segment.new
|
41
|
+
seg = JiebaRb::Segment.new mode: :hmm # or mode: :mp
|
42
42
|
words = seg.cut "令狐冲是云计算行业的专家"
|
43
43
|
|
44
|
+
## Keyword Extractor Usage
|
45
|
+
|
46
|
+
* only support TF-IDF currently
|
47
|
+
|
48
|
+
```
|
49
|
+
keyword = JiebaRb::Keyword.new
|
50
|
+
keywords_weights = keyword.extract "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", 5
|
51
|
+
|
52
|
+
[
|
53
|
+
["CEO", 11.739204307083542],
|
54
|
+
["升职", 10.8561552143],
|
55
|
+
["加薪", 10.642581114],
|
56
|
+
["手扶拖拉机", 10.0088573539],
|
57
|
+
["巅峰", 9.49395840471]
|
58
|
+
]
|
59
|
+
```
|
60
|
+
|
61
|
+
|
44
62
|
|
45
63
|
## Contributing
|
46
64
|
|
data/Rakefile
CHANGED
data/ext/jieba/jieba.c
CHANGED
data/ext/jieba/jieba.h
CHANGED
data/ext/jieba/keyword.cc
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
#include "segment.h"
|
2
|
+
#include <ruby/encoding.h>
|
3
|
+
#include <KeywordExtractor.hpp>
|
4
|
+
|
5
|
+
static rb_encoding* u8_enc;
|
6
|
+
|
7
|
+
struct Keyword{
|
8
|
+
CppJieba::KeywordExtractor * p;
|
9
|
+
};
|
10
|
+
|
11
|
+
static void keyword_free(void *p){
|
12
|
+
delete ((Keyword*) p) -> p;
|
13
|
+
delete (Keyword*)p;
|
14
|
+
}
|
15
|
+
|
16
|
+
static VALUE allocate(VALUE klass)
|
17
|
+
{
|
18
|
+
Keyword * keyword = new Keyword();
|
19
|
+
return Data_Wrap_Struct(klass, NULL, keyword_free, keyword);
|
20
|
+
}
|
21
|
+
|
22
|
+
static void init(VALUE self,
|
23
|
+
VALUE mode_rb_sym,
|
24
|
+
VALUE jieba_dict_rbs,
|
25
|
+
VALUE hmm_dict_rbs,
|
26
|
+
VALUE idf_rbs,
|
27
|
+
VALUE stop_words_rbs,
|
28
|
+
VALUE user_dict_rbs)
|
29
|
+
{
|
30
|
+
Keyword * keyword;
|
31
|
+
Data_Get_Struct(self, Keyword, keyword);
|
32
|
+
|
33
|
+
Check_Type(jieba_dict_rbs, T_STRING);
|
34
|
+
Check_Type(hmm_dict_rbs, T_STRING);
|
35
|
+
Check_Type(user_dict_rbs, T_STRING);
|
36
|
+
Check_Type(idf_rbs, T_STRING);
|
37
|
+
Check_Type(stop_words_rbs, T_STRING);
|
38
|
+
|
39
|
+
std::string jieba_dict = StringValueCStr(jieba_dict_rbs);
|
40
|
+
std::string hmm_dict = StringValueCStr(hmm_dict_rbs);
|
41
|
+
std::string idf = StringValueCStr(idf_rbs);
|
42
|
+
std::string stop_words = StringValueCStr(stop_words_rbs);
|
43
|
+
std::string user_dict = StringValueCStr(user_dict_rbs);
|
44
|
+
|
45
|
+
ID mode = SYM2ID(mode_rb_sym);
|
46
|
+
if ( mode == rb_intern("tf_idf") )
|
47
|
+
{
|
48
|
+
keyword->p = new CppJieba::KeywordExtractor(jieba_dict, hmm_dict, idf, stop_words);
|
49
|
+
}
|
50
|
+
}
|
51
|
+
|
52
|
+
static VALUE extract(VALUE self, VALUE text_rbs, VALUE topN)
|
53
|
+
{
|
54
|
+
Check_Type(text_rbs, T_STRING);
|
55
|
+
std::string text = StringValueCStr(text_rbs);
|
56
|
+
|
57
|
+
Check_Type(topN, T_FIXNUM);
|
58
|
+
int top_n = NUM2INT(topN);
|
59
|
+
|
60
|
+
Keyword * keyword;
|
61
|
+
Data_Get_Struct(self, Keyword, keyword);
|
62
|
+
|
63
|
+
std::vector<std::pair<std::string, double> > top_words;
|
64
|
+
|
65
|
+
if (keyword->p->extract(text, top_words, top_n))
|
66
|
+
{
|
67
|
+
volatile VALUE arr = rb_ary_new();
|
68
|
+
for(size_t i = 0; i < top_words.size(); i++)
|
69
|
+
{
|
70
|
+
volatile VALUE inner_arr = rb_ary_new();
|
71
|
+
std::string & word = top_words[i].first;
|
72
|
+
rb_ary_push(inner_arr, rb_enc_str_new(word.c_str(), word.length(), u8_enc));
|
73
|
+
rb_ary_push(inner_arr, rb_float_new(top_words[i].second));
|
74
|
+
|
75
|
+
rb_ary_push(arr, inner_arr);
|
76
|
+
|
77
|
+
}
|
78
|
+
return arr;
|
79
|
+
}
|
80
|
+
else
|
81
|
+
{
|
82
|
+
return Qfalse;
|
83
|
+
}
|
84
|
+
}
|
85
|
+
|
86
|
+
#define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
|
87
|
+
|
88
|
+
extern "C" {
|
89
|
+
void Init_keyword()
|
90
|
+
{
|
91
|
+
VALUE cKeyword = rb_define_class_under(mJieba, "Keyword", rb_cObject);
|
92
|
+
u8_enc = rb_utf8_encoding();
|
93
|
+
rb_define_alloc_func(cKeyword, allocate);
|
94
|
+
DEF(cKeyword, "_init", init, 6);
|
95
|
+
DEF(cKeyword, "extract",extract,2);
|
96
|
+
}
|
97
|
+
|
98
|
+
}
|
data/ext/jieba/keyword.h
ADDED
data/lib/jieba_rb/version.rb
CHANGED
data/lib/jieba_rb.rb
CHANGED
@@ -1,28 +1,55 @@
|
|
1
1
|
require "jieba_rb/version"
|
2
2
|
require "jieba"
|
3
3
|
module JiebaRb
|
4
|
+
abs = File.expand_path File.dirname(__FILE__)
|
5
|
+
EXT_BASE = "#{abs}/../ext/cppjieba/"
|
6
|
+
DEFAULT_JIEBA_DICT = EXT_BASE + "dict/jieba.dict.utf8";
|
7
|
+
DEFAULT_HMM_DICT = EXT_BASE + "dict/hmm_model.utf8";
|
8
|
+
DEFAULT_USER_DICT = EXT_BASE + "dict/user.dict.utf8";
|
9
|
+
|
4
10
|
class Segment
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
11
|
+
private :_init
|
12
|
+
def initialize opts = {}
|
13
|
+
valid_modes = [:mix, :hmm, :mp]
|
14
|
+
if mode = opts[:mode]
|
15
|
+
raise "Mode must be one of :mix :hmm :mp" unless valid_modes.include? mode
|
16
|
+
else
|
17
|
+
mode = :mix #default
|
18
|
+
end
|
19
|
+
|
20
|
+
jieba_dict = opts[:jieba_dict] || DEFAULT_JIEBA_DICT
|
21
|
+
hmm_dict = opts[:hmm_dict] || DEFAULT_HMM_DICT
|
22
|
+
user_dict = opts[:user_dict] || ""
|
23
|
+
user_dict = DEFAULT_USER_DICT if user_dict == :default
|
24
|
+
|
25
|
+
_init mode, jieba_dict, hmm_dict, user_dict
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
class Keyword
|
30
|
+
DEFAULT_IDF = EXT_BASE + "dict/idf.utf8"
|
31
|
+
DEFAULT_STOP_WORDS = EXT_BASE + "dict/stop_words.utf8"
|
10
32
|
|
11
33
|
private :_init
|
34
|
+
|
12
35
|
def initialize opts = {}
|
13
|
-
|
14
|
-
if
|
15
|
-
raise "
|
36
|
+
valid_modes = [:tf_idf]
|
37
|
+
if mode = opts[:mode]
|
38
|
+
raise "Mode must be one of :tf_idf" unless valid_modes.include? mode
|
16
39
|
else
|
17
|
-
|
40
|
+
mode = :tf_idf #default
|
18
41
|
end
|
19
42
|
|
20
|
-
jieba_dict = opts[:jieba_dict] ||
|
21
|
-
hmm_dict = opts[:hmm_dict] ||
|
43
|
+
jieba_dict = opts[:jieba_dict] || DEFAULT_JIEBA_DICT
|
44
|
+
hmm_dict = opts[:hmm_dict] || DEFAULT_HMM_DICT
|
45
|
+
idf_path = opts[:idf] || DEFAULT_IDF
|
46
|
+
stop_words_path = opts[:stop_words] || DEFAULT_STOP_WORDS
|
47
|
+
|
22
48
|
user_dict = opts[:user_dict] || ""
|
23
49
|
user_dict = USER_DICT_FILE if user_dict == :default
|
24
50
|
|
25
|
-
_init
|
51
|
+
_init mode, jieba_dict, hmm_dict, idf_path, stop_words_path, user_dict
|
26
52
|
end
|
27
53
|
end
|
54
|
+
|
28
55
|
end
|
data/test/test_keyword.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require 'minitest/autorun'
|
3
|
+
require 'jieba_rb'
|
4
|
+
class JiebaTest < Minitest::Test
|
5
|
+
def test_keywords
|
6
|
+
keyword = JiebaRb::Keyword.new
|
7
|
+
keywords_weights = keyword.extract "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", 5
|
8
|
+
|
9
|
+
assert_equal [["CEO",
|
10
|
+
11.739204307083542],
|
11
|
+
["升职", 10.8561552143],
|
12
|
+
["加薪", 10.642581114],
|
13
|
+
["手扶拖拉机", 10.0088573539],
|
14
|
+
["巅峰", 9.49395840471]], keywords_weights
|
15
|
+
|
16
|
+
end
|
17
|
+
end
|
data/test/test_segment.rb
CHANGED
@@ -18,13 +18,13 @@ class JiebaTest < Minitest::Test
|
|
18
18
|
end
|
19
19
|
|
20
20
|
def test_hmm_segment
|
21
|
-
seg = JiebaRb::Segment.new
|
21
|
+
seg = JiebaRb::Segment.new mode: :hmm
|
22
22
|
words = seg.cut "令狐冲是云计算行业的专家"
|
23
23
|
assert_equal %w(令狐冲 是 云计算 行业 的 专家), words
|
24
24
|
end
|
25
25
|
|
26
26
|
def test_max_prob_segment
|
27
|
-
seg = JiebaRb::Segment.new
|
27
|
+
seg = JiebaRb::Segment.new mode: :mp
|
28
28
|
words = seg.cut "令狐冲是云计算行业的专家"
|
29
29
|
assert_equal %w(令狐冲 是 云 计算 行业 的 专家), words
|
30
30
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jieba_rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Li
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-12-
|
11
|
+
date: 2014-12-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -81,15 +81,6 @@ files:
|
|
81
81
|
- LICENSE.txt
|
82
82
|
- README.md
|
83
83
|
- Rakefile
|
84
|
-
- ext/jieba/extconf.rb
|
85
|
-
- ext/jieba/jieba.c
|
86
|
-
- ext/jieba/jieba.h
|
87
|
-
- ext/jieba/segment.cc
|
88
|
-
- ext/jieba/segment.h
|
89
|
-
- jieba_rb.gemspec
|
90
|
-
- lib/jieba_rb.rb
|
91
|
-
- lib/jieba_rb/version.rb
|
92
|
-
- test/test_segment.rb
|
93
84
|
- ext/cppjieba/.gitignore
|
94
85
|
- ext/cppjieba/.travis.yml
|
95
86
|
- ext/cppjieba/CMakeLists.txt
|
@@ -217,6 +208,18 @@ files:
|
|
217
208
|
- ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest.cc
|
218
209
|
- ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest_main.cc
|
219
210
|
- ext/cppjieba/test/unittest/gtest_main.cpp
|
211
|
+
- ext/jieba/extconf.rb
|
212
|
+
- ext/jieba/jieba.c
|
213
|
+
- ext/jieba/jieba.h
|
214
|
+
- ext/jieba/keyword.cc
|
215
|
+
- ext/jieba/keyword.h
|
216
|
+
- ext/jieba/segment.cc
|
217
|
+
- ext/jieba/segment.h
|
218
|
+
- jieba_rb.gemspec
|
219
|
+
- lib/jieba_rb.rb
|
220
|
+
- lib/jieba_rb/version.rb
|
221
|
+
- test/test_keyword.rb
|
222
|
+
- test/test_segment.rb
|
220
223
|
homepage: https://github.com/altkatz/jieba_rb
|
221
224
|
licenses:
|
222
225
|
- MIT
|
@@ -237,10 +240,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
237
240
|
version: '0'
|
238
241
|
requirements: []
|
239
242
|
rubyforge_project:
|
240
|
-
rubygems_version: 2.
|
243
|
+
rubygems_version: 2.4.5
|
241
244
|
signing_key:
|
242
245
|
specification_version: 4
|
243
246
|
summary: cppjieba binding for ruby
|
244
247
|
test_files:
|
248
|
+
- test/test_keyword.rb
|
245
249
|
- test/test_segment.rb
|
246
250
|
has_rdoc:
|