jieba-rb 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.gitmodules +3 -0
- data/.travis.yml +19 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +85 -0
- data/Rakefile +15 -0
- data/ext/cppjieba/.gitignore +17 -0
- data/ext/cppjieba/.travis.yml +22 -0
- data/ext/cppjieba/CMakeLists.txt +28 -0
- data/ext/cppjieba/ChangeLog.md +236 -0
- data/ext/cppjieba/README.md +285 -0
- data/ext/cppjieba/README_EN.md +111 -0
- data/ext/cppjieba/appveyor.yml +32 -0
- data/ext/cppjieba/deps/CMakeLists.txt +1 -0
- data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
- data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
- data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
- data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
- data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
- data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
- data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
- data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
- data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
- data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
- data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
- data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
- data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
- data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
- data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
- data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
- data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
- data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
- data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
- data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
- data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
- data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
- data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
- data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
- data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
- data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
- data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
- data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
- data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
- data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
- data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
- data/ext/cppjieba/dict/README.md +31 -0
- data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
- data/ext/cppjieba/dict/idf.utf8 +258826 -0
- data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
- data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
- data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
- data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
- data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
- data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
- data/ext/cppjieba/dict/user.dict.utf8 +4 -0
- data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
- data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
- data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
- data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
- data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
- data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
- data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
- data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
- data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
- data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
- data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
- data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
- data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +24 -0
- data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
- data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
- data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
- data/ext/jieba/extconf.rb +28 -0
- data/ext/jieba/jieba.c +11 -0
- data/ext/jieba/jieba.h +11 -0
- data/ext/jieba/keyword.cc +92 -0
- data/ext/jieba/keyword.h +17 -0
- data/ext/jieba/segment.cc +107 -0
- data/ext/jieba/segment.h +17 -0
- data/ext/jieba/tagging.cc +76 -0
- data/ext/jieba/tagging.h +17 -0
- data/jieba_rb.gemspec +51 -0
- data/lib/jieba-rb.rb +66 -0
- data/lib/jieba_rb/version.rb +3 -0
- data/test/test_keyword.rb +17 -0
- data/test/test_segment.rb +32 -0
- data/test/test_tagging.rb +22 -0
- data/test/user.dict.utf8 +23 -0
- metadata +219 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# mkmf build script for the `jieba` native extension: configures header and
# library search paths for the bundled cppjieba sources, adds the C++ compiler
# flags, writes the Makefile, then appends a header-dependency rule to it.
require "mkmf"

# Absolute path of the directory containing this extconf.rb.
abs = File.expand_path File.dirname(__FILE__)

LIBDIR     = RbConfig::CONFIG['libdir']
INCLUDEDIR = RbConfig::CONFIG['includedir']

HEADER_DIRS = [
  INCLUDEDIR,
  "#{abs}/../cppjieba/src",
  "#{abs}/../cppjieba/include/cppjieba",
  "#{abs}/../cppjieba/deps"
]

LIB_DIRS = [

  LIBDIR
]

dir_config('cppjieba_src', HEADER_DIRS, LIB_DIRS)

# Interpolation tolerates a missing (nil) CXXFLAGS entry, where `+=` would raise.
CONFIG["CXXFLAGS"] = "#{CONFIG['CXXFLAGS']} -std=c++0x -O3"
$CXXFLAGS = "#{$CXXFLAGS} -std=c++0x -O3"
create_makefile 'jieba'

# respect header changes
# Look for headers next to this file instead of in the current directory, so
# the rule is still populated when extconf.rb is run from a separate build
# directory. The entries are written relative to the Makefile's $(srcdir).
header_names = Dir.chdir(abs) { Dir.glob('*.{hpp,h}') }.sort
headers = header_names.map { |name| "$(srcdir)/#{name}" }.join ' '
File.open 'Makefile', 'a' do |f|
  f.puts "\n$(OBJS): #{headers}"
end
|
data/ext/jieba/jieba.c
ADDED
data/ext/jieba/jieba.h
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
#include "segment.h"
|
|
2
|
+
#include <ruby/encoding.h>
|
|
3
|
+
#include <KeywordExtractor.hpp>
|
|
4
|
+
|
|
5
|
+
static rb_encoding* u8_enc;
|
|
6
|
+
|
|
7
|
+
struct Keyword{
|
|
8
|
+
cppjieba::KeywordExtractor * p;
|
|
9
|
+
};
|
|
10
|
+
|
|
11
|
+
static void keyword_free(void *p){
|
|
12
|
+
delete ((Keyword*) p) -> p;
|
|
13
|
+
delete (Keyword*)p;
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
// Allocation function for the Keyword class: wraps a fresh Keyword struct in
// a Ruby object of `klass`, with no mark function and keyword_free as the
// free function.
static VALUE allocate(VALUE klass)
{
    Keyword *wrapper = new Keyword();
    VALUE obj = Data_Wrap_Struct(klass, NULL, keyword_free, wrapper);
    return obj;
}
|
|
21
|
+
|
|
22
|
+
static void init(VALUE self,
|
|
23
|
+
VALUE mode_rb_sym,
|
|
24
|
+
VALUE jieba_dict_rbs,
|
|
25
|
+
VALUE hmm_dict_rbs,
|
|
26
|
+
VALUE idf_rbs,
|
|
27
|
+
VALUE stop_words_rbs,
|
|
28
|
+
VALUE user_dict_rbs)
|
|
29
|
+
{
|
|
30
|
+
Keyword * keyword;
|
|
31
|
+
Data_Get_Struct(self, Keyword, keyword);
|
|
32
|
+
|
|
33
|
+
Check_Type(jieba_dict_rbs, T_STRING);
|
|
34
|
+
Check_Type(hmm_dict_rbs, T_STRING);
|
|
35
|
+
Check_Type(user_dict_rbs, T_STRING);
|
|
36
|
+
Check_Type(idf_rbs, T_STRING);
|
|
37
|
+
Check_Type(stop_words_rbs, T_STRING);
|
|
38
|
+
|
|
39
|
+
std::string jieba_dict = StringValueCStr(jieba_dict_rbs);
|
|
40
|
+
std::string hmm_dict = StringValueCStr(hmm_dict_rbs);
|
|
41
|
+
std::string idf = StringValueCStr(idf_rbs);
|
|
42
|
+
std::string stop_words = StringValueCStr(stop_words_rbs);
|
|
43
|
+
std::string user_dict = StringValueCStr(user_dict_rbs);
|
|
44
|
+
|
|
45
|
+
ID mode = SYM2ID(mode_rb_sym);
|
|
46
|
+
if ( mode == rb_intern("tf_idf") )
|
|
47
|
+
{
|
|
48
|
+
keyword->p = new cppjieba::KeywordExtractor(jieba_dict, hmm_dict, idf, stop_words, user_dict);
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
/*
 * Implementation of the Ruby method `extract(text, topN)` on the Keyword class.
 *
 * text_rbs - String to analyse.
 * topN     - Fixnum forwarded to KeywordExtractor::Extract as the result limit.
 *
 * Returns an Array of two-element Arrays: a UTF-8 String word and the Float
 * that cppjieba paired with it.
 * Raises TypeError on wrongly typed arguments, and RuntimeError when the
 * object has no extractor (previously a null-pointer dereference).
 */
static VALUE extract(VALUE self, VALUE text_rbs, VALUE topN)
{
    // All argument validation happens before any C++ object is constructed,
    // because these calls may raise and longjmp out of this frame.
    Check_Type(text_rbs, T_STRING);
    const char *text_c = StringValueCStr(text_rbs);

    Check_Type(topN, T_FIXNUM);
    int top_n = NUM2INT(topN);

    Keyword * keyword;
    Data_Get_Struct(self, Keyword, keyword);
    if (keyword->p == NULL)
        rb_raise(rb_eRuntimeError, "keyword extractor has not been initialized");

    std::string text(text_c);
    std::vector<std::pair<std::string, double> > top_words;

    keyword->p->Extract(text, top_words, top_n);

    volatile VALUE arr = rb_ary_new();
    for(size_t i = 0; i < top_words.size(); i++)
    {
        volatile VALUE inner_arr = rb_ary_new();
        const std::string & word = top_words[i].first;
        rb_ary_push(inner_arr, rb_enc_str_new(word.c_str(), word.length(), u8_enc));
        rb_ary_push(inner_arr, rb_float_new(top_words[i].second));

        rb_ary_push(arr, inner_arr);
    }
    return arr;
}
|
|
79
|
+
|
|
80
|
+
#define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
|
|
81
|
+
|
|
82
|
+
extern "C" {
|
|
83
|
+
void Init_keyword()
|
|
84
|
+
{
|
|
85
|
+
VALUE cKeyword = rb_define_class_under(mJieba, "Keyword", rb_cObject);
|
|
86
|
+
u8_enc = rb_utf8_encoding();
|
|
87
|
+
rb_define_alloc_func(cKeyword, allocate);
|
|
88
|
+
DEF(cKeyword, "_init", init, 6);
|
|
89
|
+
DEF(cKeyword, "extract",extract,2);
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
}
|
data/ext/jieba/keyword.h
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
#include "segment.h"
|
|
2
|
+
#include <ruby/encoding.h>
|
|
3
|
+
#include <MPSegment.hpp>
|
|
4
|
+
#include <HMMSegment.hpp>
|
|
5
|
+
#include <MixSegment.hpp>
|
|
6
|
+
|
|
7
|
+
static rb_encoding* u8_enc;
|
|
8
|
+
|
|
9
|
+
struct SegWrapper{
|
|
10
|
+
cppjieba::MixSegment *mixp;
|
|
11
|
+
cppjieba::HMMSegment *hmmp;
|
|
12
|
+
cppjieba::MPSegment *mpsp;
|
|
13
|
+
SegWrapper(): mixp(nullptr), hmmp(nullptr), mpsp(nullptr) {}
|
|
14
|
+
};
|
|
15
|
+
|
|
16
|
+
static void seg_free(void *p){
|
|
17
|
+
auto seg = reinterpret_cast<SegWrapper *>(p);
|
|
18
|
+
if (seg->mixp)
|
|
19
|
+
delete seg->mixp;
|
|
20
|
+
else if (seg->hmmp)
|
|
21
|
+
delete seg->hmmp;
|
|
22
|
+
else
|
|
23
|
+
delete seg->mpsp;
|
|
24
|
+
delete seg;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
// Allocation function for the Segment class: wraps a fresh, empty SegWrapper
// in a Ruby object of `klass`, with no mark function and seg_free as the
// free function.
static VALUE allocate(VALUE klass)
{
    SegWrapper *fresh = new SegWrapper();
    VALUE obj = Data_Wrap_Struct(klass, NULL, seg_free, fresh);
    return obj;
}
|
|
32
|
+
|
|
33
|
+
static void seg_init(VALUE self,
|
|
34
|
+
VALUE type_rb_sym,
|
|
35
|
+
VALUE jieba_dict_rbs,
|
|
36
|
+
VALUE hmm_dict_rbs,
|
|
37
|
+
VALUE user_dict_rbs)
|
|
38
|
+
{
|
|
39
|
+
SegWrapper* seg_wrapper;
|
|
40
|
+
Data_Get_Struct(self, SegWrapper, seg_wrapper);
|
|
41
|
+
|
|
42
|
+
Check_Type(jieba_dict_rbs, T_STRING);
|
|
43
|
+
Check_Type(hmm_dict_rbs, T_STRING);
|
|
44
|
+
Check_Type(user_dict_rbs, T_STRING);
|
|
45
|
+
|
|
46
|
+
std::string jieba_dict = StringValueCStr(jieba_dict_rbs);
|
|
47
|
+
std::string hmm_dict = StringValueCStr(hmm_dict_rbs);
|
|
48
|
+
std::string user_dict = StringValueCStr(user_dict_rbs);
|
|
49
|
+
|
|
50
|
+
ID type = SYM2ID(type_rb_sym);
|
|
51
|
+
if ( type == rb_intern("mix") )
|
|
52
|
+
{
|
|
53
|
+
seg_wrapper->mixp = new cppjieba::MixSegment(jieba_dict, hmm_dict, user_dict);
|
|
54
|
+
}
|
|
55
|
+
else if ( type == rb_intern("hmm") )
|
|
56
|
+
{
|
|
57
|
+
seg_wrapper->hmmp = new cppjieba::HMMSegment(hmm_dict);
|
|
58
|
+
}
|
|
59
|
+
else if ( type == rb_intern("mp"))
|
|
60
|
+
{
|
|
61
|
+
seg_wrapper->mpsp = new cppjieba::MPSegment(jieba_dict);
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
/*
 * Implementation of the Ruby method `cut(text)` on the Segment class.
 *
 * text_rbs - String to segment.
 *
 * Returns an Array of UTF-8 Strings, one per word produced by whichever
 * segmenter seg_init() created.
 * Raises TypeError when text is not a String, and RuntimeError when no
 * segmenter exists (previously a null-pointer dereference).
 */
static VALUE seg_cut(VALUE self, VALUE text_rbs)
{
    // Validation that may raise happens before any C++ object is constructed.
    Check_Type(text_rbs, T_STRING);
    const char *text_c = StringValueCStr(text_rbs);

    SegWrapper* seg_wrapper;
    Data_Get_Struct(self, SegWrapper, seg_wrapper);

    if (!seg_wrapper->mixp && !seg_wrapper->hmmp && !seg_wrapper->mpsp)
        rb_raise(rb_eRuntimeError, "segmenter has not been initialized");

    std::string text(text_c);
    std::vector<std::string> words;

    if (seg_wrapper->mixp) {
        seg_wrapper->mixp->Cut(text, words);
    }
    else if (seg_wrapper->hmmp) {
        seg_wrapper->hmmp->Cut(text, words);
    }
    else {
        seg_wrapper->mpsp->Cut(text, words);
    }

    volatile VALUE arr = rb_ary_new();
    for (std::vector<std::string>::const_iterator j = words.begin(); j != words.end(); j++)
    {
        rb_ary_push(arr, rb_enc_str_new(j->c_str(), j->length(), u8_enc));
    }
    return arr;
}
|
|
94
|
+
|
|
95
|
+
#define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
|
|
96
|
+
|
|
97
|
+
extern "C" {
|
|
98
|
+
void Init_segment()
|
|
99
|
+
{
|
|
100
|
+
VALUE cSegment = rb_define_class_under(mJieba, "Segment", rb_cObject);
|
|
101
|
+
u8_enc = rb_utf8_encoding();
|
|
102
|
+
rb_define_alloc_func(cSegment, allocate);
|
|
103
|
+
DEF(cSegment, "_init",seg_init,4);
|
|
104
|
+
DEF(cSegment, "cut",seg_cut,1);
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
}
|
data/ext/jieba/segment.h
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
#include "tagging.h"
|
|
2
|
+
#include <ruby/encoding.h>
|
|
3
|
+
#include <PosTagger.hpp>
|
|
4
|
+
#include <MixSegment.hpp>
|
|
5
|
+
|
|
6
|
+
static rb_encoding* u8_enc;
|
|
7
|
+
|
|
8
|
+
struct Tagging{
|
|
9
|
+
cppjieba::MixSegment *p;
|
|
10
|
+
};
|
|
11
|
+
|
|
12
|
+
static void tagger_free(void *p){
|
|
13
|
+
delete reinterpret_cast<Tagging *>(p)->p;
|
|
14
|
+
delete reinterpret_cast<Tagging *>(p);
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
// Allocation function for the Tagging class: wraps a fresh Tagging struct in
// a Ruby object of `klass`, with no mark function and tagger_free as the
// free function.
static VALUE alloc(VALUE klass)
{
    Tagging *wrapper = new Tagging();
    VALUE obj = Data_Wrap_Struct(klass, NULL, tagger_free, wrapper);
    return obj;
}
|
|
22
|
+
|
|
23
|
+
static void init(VALUE self,
|
|
24
|
+
VALUE jieba_dict_rbs,
|
|
25
|
+
VALUE hmm_dict_rbs,
|
|
26
|
+
VALUE user_dict_rbs)
|
|
27
|
+
{
|
|
28
|
+
Tagging *tagging;
|
|
29
|
+
Data_Get_Struct(self, Tagging, tagging);
|
|
30
|
+
|
|
31
|
+
Check_Type(jieba_dict_rbs, T_STRING);
|
|
32
|
+
Check_Type(hmm_dict_rbs, T_STRING);
|
|
33
|
+
Check_Type(user_dict_rbs, T_STRING);
|
|
34
|
+
|
|
35
|
+
std::string jieba_dict = StringValueCStr(jieba_dict_rbs);
|
|
36
|
+
std::string hmm_dict = StringValueCStr(hmm_dict_rbs);
|
|
37
|
+
std::string user_dict = StringValueCStr(user_dict_rbs);
|
|
38
|
+
|
|
39
|
+
tagging->p = new cppjieba::MixSegment(jieba_dict, hmm_dict, user_dict);
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
/*
 * Implementation of the Ruby method `tag(text)` on the Tagging class.
 *
 * text_rbs - String to tag.
 *
 * Returns an Array of single-entry Hashes, each mapping the first string of a
 * pair produced by MixSegment::Tag to the second (both as UTF-8 Strings).
 * Raises TypeError when text is not a String, and RuntimeError when the
 * object has no segmenter (previously a null-pointer dereference).
 */
static VALUE tag(VALUE self, VALUE text_rbs)
{
    // Validation that may raise happens before any C++ object is constructed.
    Check_Type(text_rbs, T_STRING);
    const char *text_c = StringValueCStr(text_rbs);

    Tagging *tagging;
    Data_Get_Struct(self, Tagging, tagging);
    if (tagging->p == NULL)
        rb_raise(rb_eRuntimeError, "tagger has not been initialized");

    std::string text(text_c);
    std::vector<std::pair<std::string, std::string> > pairs;
    tagging->p->Tag(text, pairs);

    volatile VALUE arr = rb_ary_new();
    for (std::vector<std::pair<std::string, std::string> >::const_iterator j = pairs.begin(); j != pairs.end(); j++)
    {
        VALUE pair = rb_hash_new();
        VALUE word = rb_enc_str_new(j->first.c_str(), j->first.length(), u8_enc);
        VALUE label = rb_enc_str_new(j->second.c_str(), j->second.length(), u8_enc);
        rb_hash_aset(pair, word, label);
        rb_ary_push(arr, pair);
    }
    return arr;
}
|
|
63
|
+
|
|
64
|
+
#define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
|
|
65
|
+
|
|
66
|
+
extern "C" {
|
|
67
|
+
void Init_tagging()
|
|
68
|
+
{
|
|
69
|
+
VALUE cTagging = rb_define_class_under(mJieba, "Tagging", rb_cObject);
|
|
70
|
+
u8_enc = rb_utf8_encoding();
|
|
71
|
+
rb_define_alloc_func(cTagging, alloc);
|
|
72
|
+
DEF(cTagging, "_init",init,3);
|
|
73
|
+
DEF(cTagging, "tag",tag,1);
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
}
|
data/ext/jieba/tagging.h
ADDED
data/jieba_rb.gemspec
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# coding: utf-8
# Gem specification for jieba-rb. The packaged file list is the repository's
# own `git ls-files` output plus the tracked files of every git submodule
# (minus each submodule's test/ directory).
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'jieba_rb/version'

Gem::Specification.new do |spec|
  spec.name          = "jieba-rb"
  spec.version       = JiebaRb::VERSION
  spec.authors       = ["Chris Li"]
  spec.email         = ["liqi8822@gmail.com"]
  spec.summary       = "cppjieba binding for ruby"
  spec.description   = "cppjieba binding for ruby"
  spec.homepage      = "https://github.com/Xu-Zhiqing/jieba_rb"
  spec.required_ruby_version = ">=1.9.2"
  spec.license       = "MIT"
  spec.extensions    = ["ext/jieba/extconf.rb"]

  spec.files         = `git ls-files -z`.split("\x0")
  relative_path = File.expand_path("../", __FILE__) + '/'
  # Escape the directory so regex metacharacters in the checkout path
  # (".", "+", "(" ...) are matched literally.
  relative_path_pattern = /#{Regexp.escape(relative_path)}/i
  # Split on line breaks only. `$\` (the output record separator) is nil by
  # default, which made String#split break on any whitespace and mangle paths
  # containing spaces.
  line_break = /\r?\n/
  `git submodule --quiet foreach pwd`.split(line_break).reject(&:empty?).each do |submodule_path|
    if (ENV['OS'] == 'Windows_NT') && submodule_path[0] == '/'
      # Detect if cygwin path is being used by git
      submodule_path = submodule_path[1..-1]
      submodule_path.insert(1, ':')
    end
    # for each submodule, change working directory to that submodule
    Dir.chdir(submodule_path) do
      # Make the submodule path relative
      submodule_path = submodule_path.gsub(relative_path_pattern, '')
      # issue git ls-files in submodule's directory
      submodule_files = `git ls-files`.split(line_break).reject { |i| i.empty? || i.start_with?('test/') }

      # prepend the submodule path to create relative file paths
      submodule_files_paths = submodule_files.map do |filename|
        File.join(submodule_path, filename)
      end

      # add relative paths to gem.files
      spec.files += submodule_files_paths
    end
  end

  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  spec.add_development_dependency "bundler"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rake-compiler"
  spec.add_development_dependency "minitest"
end
|
data/lib/jieba-rb.rb
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
require "jieba_rb/version"
|
|
2
|
+
require "jieba"
|
|
3
|
+
# Ruby-side constructors for the Segment, Keyword and Tagging classes. Each
# `initialize` resolves dictionary paths (falling back to the files bundled
# under ext/cppjieba/dict) and forwards them to the private `_init` method,
# which is expected to come from the native extension loaded by `require "jieba"`.
module JiebaRb
  abs = File.expand_path File.dirname(__FILE__)
  EXT_BASE = "#{abs}/../ext/cppjieba/"
  DEFAULT_JIEBA_DICT = EXT_BASE + "dict/jieba.dict.utf8"
  DEFAULT_HMM_DICT = EXT_BASE + "dict/hmm_model.utf8"
  DEFAULT_USER_DICT = EXT_BASE + "dict/user.dict.utf8"

  class Segment
    private :_init

    # opts - Hash with optional keys:
    #        :mode       - :mix (default), :hmm or :mp
    #        :jieba_dict - path, defaults to DEFAULT_JIEBA_DICT
    #        :hmm_dict   - path, defaults to DEFAULT_HMM_DICT
    #        :user_dict  - path, :default for DEFAULT_USER_DICT, or omitted for ""
    #
    # Raises RuntimeError when :mode is given but is not a supported mode.
    def initialize(opts = {})
      mode = opts[:mode] || :mix #default
      unless [:mix, :hmm, :mp].include?(mode)
        raise "Mode must be one of :mix :hmm :mp"
      end

      jieba_dict = opts[:jieba_dict] || DEFAULT_JIEBA_DICT
      hmm_dict = opts[:hmm_dict] || DEFAULT_HMM_DICT
      requested = opts[:user_dict]
      user_dict = requested == :default ? DEFAULT_USER_DICT : (requested || "")

      _init mode, jieba_dict, hmm_dict, user_dict
    end
  end

  class Keyword
    DEFAULT_IDF = EXT_BASE + "dict/idf.utf8"
    DEFAULT_STOP_WORDS = EXT_BASE + "dict/stop_words.utf8"

    private :_init

    # opts - Hash with optional keys:
    #        :mode       - :tf_idf (default and only supported value)
    #        :jieba_dict - path, defaults to DEFAULT_JIEBA_DICT
    #        :hmm_dict   - path, defaults to DEFAULT_HMM_DICT
    #        :idf        - path, defaults to DEFAULT_IDF
    #        :stop_words - path, defaults to DEFAULT_STOP_WORDS
    #        :user_dict  - path, :default for DEFAULT_USER_DICT, or omitted for ""
    #
    # Raises RuntimeError when :mode is given but is not :tf_idf.
    def initialize(opts = {})
      mode = opts[:mode] || :tf_idf #default
      unless [:tf_idf].include?(mode)
        raise "Mode must be one of :tf_idf"
      end

      jieba_dict = opts[:jieba_dict] || DEFAULT_JIEBA_DICT
      hmm_dict = opts[:hmm_dict] || DEFAULT_HMM_DICT
      idf_path = opts[:idf] || DEFAULT_IDF
      stop_words_path = opts[:stop_words] || DEFAULT_STOP_WORDS

      requested = opts[:user_dict]
      user_dict = requested == :default ? DEFAULT_USER_DICT : (requested || "")

      _init mode, jieba_dict, hmm_dict, idf_path, stop_words_path, user_dict
    end
  end

  class Tagging
    private :_init

    # opts - Hash with optional keys:
    #        :jieba_dict - path, defaults to DEFAULT_JIEBA_DICT
    #        :hmm_dict   - path, defaults to DEFAULT_HMM_DICT
    #        :user_dict  - path, :default for DEFAULT_USER_DICT, or omitted for ""
    def initialize(opts = {})
      jieba_dict = opts[:jieba_dict] || DEFAULT_JIEBA_DICT
      hmm_dict = opts[:hmm_dict] || DEFAULT_HMM_DICT
      requested = opts[:user_dict]
      user_dict = requested == :default ? DEFAULT_USER_DICT : (requested || "")

      _init jieba_dict, hmm_dict, user_dict
    end
  end
end
|