jieba-rb 5.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.gitmodules +3 -0
- data/.travis.yml +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +15 -0
- data/ext/cppjieba/.gitignore +17 -0
- data/ext/cppjieba/.travis.yml +22 -0
- data/ext/cppjieba/CMakeLists.txt +28 -0
- data/ext/cppjieba/ChangeLog.md +236 -0
- data/ext/cppjieba/README.md +285 -0
- data/ext/cppjieba/README_EN.md +111 -0
- data/ext/cppjieba/appveyor.yml +32 -0
- data/ext/cppjieba/deps/CMakeLists.txt +1 -0
- data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
- data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
- data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
- data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
- data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
- data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
- data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
- data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
- data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
- data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
- data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
- data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
- data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
- data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
- data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
- data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
- data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
- data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
- data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
- data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
- data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
- data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
- data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
- data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
- data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
- data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
- data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
- data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
- data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
- data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
- data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
- data/ext/cppjieba/dict/README.md +31 -0
- data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
- data/ext/cppjieba/dict/idf.utf8 +258826 -0
- data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
- data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
- data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
- data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
- data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
- data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
- data/ext/cppjieba/dict/user.dict.utf8 +4 -0
- data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
- data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
- data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
- data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
- data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
- data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
- data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
- data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
- data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
- data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
- data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
- data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
- data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +24 -0
- data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
- data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
- data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
- data/ext/jieba/extconf.rb +28 -0
- data/ext/jieba/jieba.c +11 -0
- data/ext/jieba/jieba.h +11 -0
- data/ext/jieba/keyword.cc +92 -0
- data/ext/jieba/keyword.h +17 -0
- data/ext/jieba/segment.cc +107 -0
- data/ext/jieba/segment.h +17 -0
- data/ext/jieba/tagging.cc +76 -0
- data/ext/jieba/tagging.h +17 -0
- data/jieba_rb.gemspec +51 -0
- data/lib/jieba-rb.rb +66 -0
- data/lib/jieba_rb/version.rb +3 -0
- data/test/test_keyword.rb +17 -0
- data/test/test_segment.rb +32 -0
- data/test/test_tagging.rb +22 -0
- data/test/user.dict.utf8 +23 -0
- metadata +219 -0
@@ -0,0 +1,28 @@
|
|
1
|
+
require "mkmf"
|
2
|
+
# Build configuration for the native "jieba" extension: sets header/library
# search paths for the bundled cppjieba sources, adds the C++ flags, writes
# the Makefile and then makes object files depend on this directory's headers.
abs = File.expand_path File.dirname(__FILE__)

LIBDIR = RbConfig::CONFIG['libdir']
INCLUDEDIR = RbConfig::CONFIG['includedir']

HEADER_DIRS = [
  INCLUDEDIR,
  "#{abs}/../cppjieba/src",
  "#{abs}/../cppjieba/include/cppjieba",
  "#{abs}/../cppjieba/deps"
]

LIB_DIRS = [
  LIBDIR
]

dir_config('cppjieba_src', HEADER_DIRS, LIB_DIRS)

# The flags are applied to both CONFIG["CXXFLAGS"] and $CXXFLAGS, as the
# original script did. String interpolation (instead of +=) keeps this from
# raising when CONFIG["CXXFLAGS"] is nil.
extra_cxxflags = "-std=c++0x -O3"
CONFIG["CXXFLAGS"] = "#{CONFIG['CXXFLAGS']} #{extra_cxxflags}"
$CXXFLAGS = "#{$CXXFLAGS} #{extra_cxxflags}"
create_makefile 'jieba'

# respect header changes
# The headers live next to this script, which is not necessarily the current
# directory (out-of-tree builds), so list them from `abs` and refer to them
# through the Makefile's $(srcdir) variable.
headers = Dir.chdir(abs) { Dir.glob('*.{hpp,h}') }.map { |name| "$(srcdir)/#{name}" }
unless headers.empty?
  File.open 'Makefile', 'a' do |f|
    f.puts "\n$(OBJS): #{headers.join(' ')}"
  end
end
|
data/ext/jieba/jieba.c
ADDED
data/ext/jieba/jieba.h
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
#include "segment.h"
|
2
|
+
#include <ruby/encoding.h>
|
3
|
+
#include <KeywordExtractor.hpp>
|
4
|
+
|
5
|
+
static rb_encoding* u8_enc;
|
6
|
+
|
7
|
+
struct Keyword{
|
8
|
+
cppjieba::KeywordExtractor * p;
|
9
|
+
};
|
10
|
+
|
11
|
+
static void keyword_free(void *p){
|
12
|
+
delete ((Keyword*) p) -> p;
|
13
|
+
delete (Keyword*)p;
|
14
|
+
}
|
15
|
+
|
16
|
+
// Allocation function for the Keyword class: creates an empty native wrapper
// and ties its lifetime to a new Ruby object of `klass`, with keyword_free as
// the free function and no mark function.
static VALUE allocate(VALUE klass)
{
    Keyword * wrapper = new Keyword();
    VALUE obj = Data_Wrap_Struct(klass, NULL, keyword_free, wrapper);
    return obj;
}
|
21
|
+
|
22
|
+
static void init(VALUE self,
|
23
|
+
VALUE mode_rb_sym,
|
24
|
+
VALUE jieba_dict_rbs,
|
25
|
+
VALUE hmm_dict_rbs,
|
26
|
+
VALUE idf_rbs,
|
27
|
+
VALUE stop_words_rbs,
|
28
|
+
VALUE user_dict_rbs)
|
29
|
+
{
|
30
|
+
Keyword * keyword;
|
31
|
+
Data_Get_Struct(self, Keyword, keyword);
|
32
|
+
|
33
|
+
Check_Type(jieba_dict_rbs, T_STRING);
|
34
|
+
Check_Type(hmm_dict_rbs, T_STRING);
|
35
|
+
Check_Type(user_dict_rbs, T_STRING);
|
36
|
+
Check_Type(idf_rbs, T_STRING);
|
37
|
+
Check_Type(stop_words_rbs, T_STRING);
|
38
|
+
|
39
|
+
std::string jieba_dict = StringValueCStr(jieba_dict_rbs);
|
40
|
+
std::string hmm_dict = StringValueCStr(hmm_dict_rbs);
|
41
|
+
std::string idf = StringValueCStr(idf_rbs);
|
42
|
+
std::string stop_words = StringValueCStr(stop_words_rbs);
|
43
|
+
std::string user_dict = StringValueCStr(user_dict_rbs);
|
44
|
+
|
45
|
+
ID mode = SYM2ID(mode_rb_sym);
|
46
|
+
if ( mode == rb_intern("tf_idf") )
|
47
|
+
{
|
48
|
+
keyword->p = new cppjieba::KeywordExtractor(jieba_dict, hmm_dict, idf, stop_words, user_dict);
|
49
|
+
}
|
50
|
+
}
|
51
|
+
|
52
|
+
// Implements the Ruby method Keyword#extract(text, topN).
//
// text_rbs - String to extract keywords from.
// topN     - Fixnum passed to the extractor as the number of keywords wanted.
//
// Returns an Array of [word, weight] pairs, where word is a UTF-8 String and
// weight is a Float. Raises TypeError for wrongly typed arguments and
// RuntimeError when the object has no extractor yet (previously a null
// pointer dereference).
static VALUE extract(VALUE self, VALUE text_rbs, VALUE topN)
{
    Check_Type(text_rbs, T_STRING);
    Check_Type(topN, T_FIXNUM);

    Keyword * keyword;
    Data_Get_Struct(self, Keyword, keyword);
    if (keyword->p == NULL)
    {
        rb_raise(rb_eRuntimeError, "keyword extractor has not been initialized");
    }

    // Conversions that may raise happen before any C++ object is constructed.
    const char * text_c = StringValueCStr(text_rbs);
    int top_n = NUM2INT(topN);

    std::string text = text_c;
    std::vector<std::pair<std::string, double> > top_words;

    keyword->p->Extract(text, top_words, top_n);

    volatile VALUE arr = rb_ary_new();
    for(size_t i = 0; i < top_words.size(); i++)
    {
        const std::string & word = top_words[i].first;

        volatile VALUE inner_arr = rb_ary_new();
        rb_ary_push(inner_arr, rb_enc_str_new(word.c_str(), word.length(), u8_enc));
        rb_ary_push(inner_arr, rb_float_new(top_words[i].second));

        rb_ary_push(arr, inner_arr);
    }
    return arr;
}
|
79
|
+
|
80
|
+
#define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
|
81
|
+
|
82
|
+
extern "C" {
|
83
|
+
void Init_keyword()
|
84
|
+
{
|
85
|
+
VALUE cKeyword = rb_define_class_under(mJieba, "Keyword", rb_cObject);
|
86
|
+
u8_enc = rb_utf8_encoding();
|
87
|
+
rb_define_alloc_func(cKeyword, allocate);
|
88
|
+
DEF(cKeyword, "_init", init, 6);
|
89
|
+
DEF(cKeyword, "extract",extract,2);
|
90
|
+
}
|
91
|
+
|
92
|
+
}
|
data/ext/jieba/keyword.h
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
#include "segment.h"
|
2
|
+
#include <ruby/encoding.h>
|
3
|
+
#include <MPSegment.hpp>
|
4
|
+
#include <HMMSegment.hpp>
|
5
|
+
#include <MixSegment.hpp>
|
6
|
+
|
7
|
+
static rb_encoding* u8_enc;
|
8
|
+
|
9
|
+
struct SegWrapper{
|
10
|
+
cppjieba::MixSegment *mixp;
|
11
|
+
cppjieba::HMMSegment *hmmp;
|
12
|
+
cppjieba::MPSegment *mpsp;
|
13
|
+
SegWrapper(): mixp(nullptr), hmmp(nullptr), mpsp(nullptr) {}
|
14
|
+
};
|
15
|
+
|
16
|
+
static void seg_free(void *p){
|
17
|
+
auto seg = reinterpret_cast<SegWrapper *>(p);
|
18
|
+
if (seg->mixp)
|
19
|
+
delete seg->mixp;
|
20
|
+
else if (seg->hmmp)
|
21
|
+
delete seg->hmmp;
|
22
|
+
else
|
23
|
+
delete seg->mpsp;
|
24
|
+
delete seg;
|
25
|
+
}
|
26
|
+
|
27
|
+
// Allocation function for the Segment class: creates an empty SegWrapper and
// ties its lifetime to a new Ruby object of `klass`, with seg_free as the free
// function and no mark function.
static VALUE allocate(VALUE klass)
{
    SegWrapper * wrapper = new SegWrapper();
    VALUE obj = Data_Wrap_Struct(klass, NULL, seg_free, wrapper);
    return obj;
}
|
32
|
+
|
33
|
+
static void seg_init(VALUE self,
|
34
|
+
VALUE type_rb_sym,
|
35
|
+
VALUE jieba_dict_rbs,
|
36
|
+
VALUE hmm_dict_rbs,
|
37
|
+
VALUE user_dict_rbs)
|
38
|
+
{
|
39
|
+
SegWrapper* seg_wrapper;
|
40
|
+
Data_Get_Struct(self, SegWrapper, seg_wrapper);
|
41
|
+
|
42
|
+
Check_Type(jieba_dict_rbs, T_STRING);
|
43
|
+
Check_Type(hmm_dict_rbs, T_STRING);
|
44
|
+
Check_Type(user_dict_rbs, T_STRING);
|
45
|
+
|
46
|
+
std::string jieba_dict = StringValueCStr(jieba_dict_rbs);
|
47
|
+
std::string hmm_dict = StringValueCStr(hmm_dict_rbs);
|
48
|
+
std::string user_dict = StringValueCStr(user_dict_rbs);
|
49
|
+
|
50
|
+
ID type = SYM2ID(type_rb_sym);
|
51
|
+
if ( type == rb_intern("mix") )
|
52
|
+
{
|
53
|
+
seg_wrapper->mixp = new cppjieba::MixSegment(jieba_dict, hmm_dict, user_dict);
|
54
|
+
}
|
55
|
+
else if ( type == rb_intern("hmm") )
|
56
|
+
{
|
57
|
+
seg_wrapper->hmmp = new cppjieba::HMMSegment(hmm_dict);
|
58
|
+
}
|
59
|
+
else if ( type == rb_intern("mp"))
|
60
|
+
{
|
61
|
+
seg_wrapper->mpsp = new cppjieba::MPSegment(jieba_dict);
|
62
|
+
}
|
63
|
+
}
|
64
|
+
|
65
|
+
// Implements the Ruby method Segment#cut(text).
//
// text_rbs - String to segment.
//
// Returns an Array of UTF-8 Strings, one per word produced by whichever
// segmenter seg_init() created. Raises TypeError if text is not a String and
// RuntimeError when no segmenter exists yet (previously this fell through to
// a null mpsp dereference).
static VALUE seg_cut(VALUE self, VALUE text_rbs)
{
    Check_Type(text_rbs, T_STRING);

    SegWrapper* seg_wrapper;
    Data_Get_Struct(self, SegWrapper, seg_wrapper);
    if (!seg_wrapper->mixp && !seg_wrapper->hmmp && !seg_wrapper->mpsp)
    {
        rb_raise(rb_eRuntimeError, "segmenter has not been initialized");
    }

    // Conversion that may raise happens before any C++ object is constructed.
    const char * text_c = StringValueCStr(text_rbs);

    std::string text = text_c;
    std::vector<std::string> words;

    if (seg_wrapper->mixp) {
        seg_wrapper->mixp->Cut(text, words);
    }
    else if (seg_wrapper->hmmp) {
        seg_wrapper->hmmp->Cut(text, words);
    }
    else {
        seg_wrapper->mpsp->Cut(text, words);
    }

    volatile VALUE arr = rb_ary_new();
    for (std::vector<std::string>::const_iterator j = words.begin(); j != words.end(); ++j)
    {
        rb_ary_push(arr, rb_enc_str_new(j->c_str(), j->length(), u8_enc));
    }
    return arr;
}
|
94
|
+
|
95
|
+
#define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
|
96
|
+
|
97
|
+
extern "C" {
|
98
|
+
void Init_segment()
|
99
|
+
{
|
100
|
+
VALUE cSegment = rb_define_class_under(mJieba, "Segment", rb_cObject);
|
101
|
+
u8_enc = rb_utf8_encoding();
|
102
|
+
rb_define_alloc_func(cSegment, allocate);
|
103
|
+
DEF(cSegment, "_init",seg_init,4);
|
104
|
+
DEF(cSegment, "cut",seg_cut,1);
|
105
|
+
}
|
106
|
+
|
107
|
+
}
|
data/ext/jieba/segment.h
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
#include "tagging.h"
|
2
|
+
#include <ruby/encoding.h>
|
3
|
+
#include <PosTagger.hpp>
|
4
|
+
#include <MixSegment.hpp>
|
5
|
+
|
6
|
+
static rb_encoding* u8_enc;
|
7
|
+
|
8
|
+
struct Tagging{
|
9
|
+
cppjieba::MixSegment *p;
|
10
|
+
};
|
11
|
+
|
12
|
+
static void tagger_free(void *p){
|
13
|
+
delete reinterpret_cast<Tagging *>(p)->p;
|
14
|
+
delete reinterpret_cast<Tagging *>(p);
|
15
|
+
}
|
16
|
+
|
17
|
+
// Allocation function for the Tagging class: creates an empty native wrapper
// and ties its lifetime to a new Ruby object of `klass`, with tagger_free as
// the free function and no mark function.
static VALUE alloc(VALUE klass)
{
    Tagging * wrapper = new Tagging();
    VALUE obj = Data_Wrap_Struct(klass, NULL, tagger_free, wrapper);
    return obj;
}
|
22
|
+
|
23
|
+
static void init(VALUE self,
|
24
|
+
VALUE jieba_dict_rbs,
|
25
|
+
VALUE hmm_dict_rbs,
|
26
|
+
VALUE user_dict_rbs)
|
27
|
+
{
|
28
|
+
Tagging *tagging;
|
29
|
+
Data_Get_Struct(self, Tagging, tagging);
|
30
|
+
|
31
|
+
Check_Type(jieba_dict_rbs, T_STRING);
|
32
|
+
Check_Type(hmm_dict_rbs, T_STRING);
|
33
|
+
Check_Type(user_dict_rbs, T_STRING);
|
34
|
+
|
35
|
+
std::string jieba_dict = StringValueCStr(jieba_dict_rbs);
|
36
|
+
std::string hmm_dict = StringValueCStr(hmm_dict_rbs);
|
37
|
+
std::string user_dict = StringValueCStr(user_dict_rbs);
|
38
|
+
|
39
|
+
tagging->p = new cppjieba::MixSegment(jieba_dict, hmm_dict, user_dict);
|
40
|
+
}
|
41
|
+
|
42
|
+
// Implements the Ruby method Tagging#tag(text).
//
// text_rbs - String to tag.
//
// Returns an Array of single-entry Hashes, each mapping the first element of
// a pair filled in by Tag() to the second, both as UTF-8 Strings. Raises
// TypeError if text is not a String and RuntimeError when the object has no
// segmenter yet (previously a null pointer dereference).
static VALUE tag(VALUE self, VALUE text_rbs)
{
    Check_Type(text_rbs, T_STRING);

    Tagging *tagging;
    Data_Get_Struct(self, Tagging, tagging);
    if (!tagging->p)
    {
        rb_raise(rb_eRuntimeError, "tagger has not been initialized");
    }

    // Conversion that may raise happens before any C++ object is constructed.
    const char * text_c = StringValueCStr(text_rbs);

    std::string text = text_c;
    std::vector<std::pair<std::string, std::string>> pairs;
    tagging->p->Tag(text, pairs);

    volatile VALUE arr = rb_ary_new();
    for (std::vector<std::pair<std::string, std::string>>::const_iterator j = pairs.begin(); j != pairs.end(); ++j)
    {
        VALUE pair = rb_hash_new();
        rb_hash_aset(pair,
                     rb_enc_str_new(j->first.c_str(), j->first.length(), u8_enc),
                     rb_enc_str_new(j->second.c_str(), j->second.length(), u8_enc));
        rb_ary_push(arr, pair);
    }
    return arr;
}
|
63
|
+
|
64
|
+
#define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
|
65
|
+
|
66
|
+
extern "C" {
|
67
|
+
void Init_tagging()
|
68
|
+
{
|
69
|
+
VALUE cTagging = rb_define_class_under(mJieba, "Tagging", rb_cObject);
|
70
|
+
u8_enc = rb_utf8_encoding();
|
71
|
+
rb_define_alloc_func(cTagging, alloc);
|
72
|
+
DEF(cTagging, "_init",init,3);
|
73
|
+
DEF(cTagging, "tag",tag,1);
|
74
|
+
}
|
75
|
+
|
76
|
+
}
|
data/ext/jieba/tagging.h
ADDED
data/jieba_rb.gemspec
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'jieba_rb/version'
|
5
|
+
|
6
|
+
# Gem specification. The file list is everything tracked by git in this
# repository plus everything tracked inside each git submodule, except the
# submodule's own test/ directory.
Gem::Specification.new do |spec|
  spec.name          = "jieba-rb"
  spec.version       = JiebaRb::VERSION
  spec.authors       = ["Chris Li"]
  spec.email         = ["liqi8822@gmail.com"]
  spec.summary       = "cppjieba binding for ruby"
  spec.description   = "cppjieba binding for ruby"
  spec.homepage      = "https://github.com/Xu-Zhiqing/jieba_rb"
  spec.required_ruby_version = ">=1.9.2"
  spec.license       = "MIT"
  spec.extensions    = ["ext/jieba/extconf.rb"]

  spec.files         = `git ls-files -z`.split("\x0")

  # Absolute gem root with a trailing slash. It is escaped before use in a
  # regex so metacharacters in the path are matched literally, and anchored so
  # only the leading prefix is stripped.
  gem_root = File.expand_path("../", __FILE__) + '/'
  gem_root_prefix = /\A#{Regexp.escape(gem_root)}/i

  # Split command output on line breaks. The previous `split($\)` used the
  # output record separator, which is nil by default and so split on any
  # whitespace, breaking paths that contain spaces.
  submodule_dirs = `git submodule --quiet foreach pwd`.each_line.map(&:chomp).reject(&:empty?)

  submodule_dirs.each do |submodule_path|
    if (ENV['OS'] == 'Windows_NT') && submodule_path[0] == '/'
      # Detect if cygwin path is being used by git
      submodule_path = submodule_path[1..-1]
      submodule_path.insert(1, ':')
    end

    # for each submodule, change working directory to that submodule
    Dir.chdir(submodule_path) do
      # Make the submodule path relative
      relative_submodule = submodule_path.sub(gem_root_prefix, '')

      # issue git ls-files in submodule's directory
      submodule_files = `git ls-files`.each_line.map(&:chomp).reject do |name|
        name.empty? || name.start_with?('test/')
      end

      # prepend the submodule path to create relative file paths and
      # add them to gem.files
      spec.files += submodule_files.map { |filename| File.join(relative_submodule, filename) }
    end
  end

  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  spec.add_development_dependency "bundler"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rake-compiler"
  spec.add_development_dependency "minitest"
end
|
data/lib/jieba-rb.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
require "jieba_rb/version"
|
2
|
+
require "jieba"
|
3
|
+
# Ruby-side constructors for the Segment, Keyword and Tagging classes. Each
# `initialize` resolves dictionary paths from an options hash, falling back to
# the dictionaries under EXT_BASE, and passes them to the private `_init`.
# NOTE(review): `_init` is not defined in this file; presumably it is provided
# by the "jieba" extension required above - confirm.
module JiebaRb
  lib_dir = File.expand_path(File.dirname(__FILE__))
  EXT_BASE = "#{lib_dir}/../ext/cppjieba/"
  DEFAULT_JIEBA_DICT = "#{EXT_BASE}dict/jieba.dict.utf8"
  DEFAULT_HMM_DICT = "#{EXT_BASE}dict/hmm_model.utf8"
  DEFAULT_USER_DICT = "#{EXT_BASE}dict/user.dict.utf8"

  class Segment
    private :_init

    # opts - Hash with optional keys:
    #        :mode       - :mix (default), :hmm or :mp
    #        :jieba_dict - path, defaults to DEFAULT_JIEBA_DICT
    #        :hmm_dict   - path, defaults to DEFAULT_HMM_DICT
    #        :user_dict  - path, :default for DEFAULT_USER_DICT, or omitted for ""
    #
    # Raises RuntimeError when :mode is given but is not one of the valid modes.
    def initialize(opts = {})
      mode = opts[:mode] || :mix
      unless [:mix, :hmm, :mp].include?(mode)
        raise "Mode must be one of :mix :hmm :mp"
      end

      main_dict_path = opts[:jieba_dict] || DEFAULT_JIEBA_DICT
      hmm_model_path = opts[:hmm_dict] || DEFAULT_HMM_DICT
      user_dict_path = opts[:user_dict] || ""
      user_dict_path = DEFAULT_USER_DICT if user_dict_path == :default

      _init(mode, main_dict_path, hmm_model_path, user_dict_path)
    end
  end

  class Keyword
    DEFAULT_IDF = "#{EXT_BASE}dict/idf.utf8"
    DEFAULT_STOP_WORDS = "#{EXT_BASE}dict/stop_words.utf8"

    private :_init

    # opts - Hash with optional keys:
    #        :mode       - :tf_idf (default and only valid mode)
    #        :jieba_dict - path, defaults to DEFAULT_JIEBA_DICT
    #        :hmm_dict   - path, defaults to DEFAULT_HMM_DICT
    #        :idf        - path, defaults to DEFAULT_IDF
    #        :stop_words - path, defaults to DEFAULT_STOP_WORDS
    #        :user_dict  - path, :default for DEFAULT_USER_DICT, or omitted for ""
    #
    # Raises RuntimeError when :mode is given but is not :tf_idf.
    def initialize(opts = {})
      mode = opts[:mode] || :tf_idf
      unless [:tf_idf].include?(mode)
        raise "Mode must be one of :tf_idf"
      end

      main_dict_path = opts[:jieba_dict] || DEFAULT_JIEBA_DICT
      hmm_model_path = opts[:hmm_dict] || DEFAULT_HMM_DICT
      idf_path = opts[:idf] || DEFAULT_IDF
      stop_words_path = opts[:stop_words] || DEFAULT_STOP_WORDS
      user_dict_path = opts[:user_dict] || ""
      user_dict_path = DEFAULT_USER_DICT if user_dict_path == :default

      _init(mode, main_dict_path, hmm_model_path, idf_path, stop_words_path, user_dict_path)
    end
  end

  class Tagging
    private :_init

    # opts - Hash with optional keys:
    #        :jieba_dict - path, defaults to DEFAULT_JIEBA_DICT
    #        :hmm_dict   - path, defaults to DEFAULT_HMM_DICT
    #        :user_dict  - path, :default for DEFAULT_USER_DICT, or omitted for ""
    def initialize(opts = {})
      main_dict_path = opts[:jieba_dict] || DEFAULT_JIEBA_DICT
      hmm_model_path = opts[:hmm_dict] || DEFAULT_HMM_DICT
      user_dict_path = opts[:user_dict] || ""
      user_dict_path = DEFAULT_USER_DICT if user_dict_path == :default

      _init(main_dict_path, hmm_model_path, user_dict_path)
    end
  end
end
|