jieba-rb 5.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (117)
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.gitmodules +3 -0
  4. data/.travis.yml +19 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +85 -0
  8. data/Rakefile +15 -0
  9. data/ext/cppjieba/.gitignore +17 -0
  10. data/ext/cppjieba/.travis.yml +22 -0
  11. data/ext/cppjieba/CMakeLists.txt +28 -0
  12. data/ext/cppjieba/ChangeLog.md +236 -0
  13. data/ext/cppjieba/README.md +285 -0
  14. data/ext/cppjieba/README_EN.md +111 -0
  15. data/ext/cppjieba/appveyor.yml +32 -0
  16. data/ext/cppjieba/deps/CMakeLists.txt +1 -0
  17. data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
  18. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
  19. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
  20. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
  21. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
  22. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
  23. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
  24. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
  25. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
  26. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
  27. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
  28. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
  29. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
  30. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
  31. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
  32. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
  33. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
  34. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
  35. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
  36. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
  37. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
  38. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
  39. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
  40. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
  41. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
  42. data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
  43. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
  44. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
  45. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  46. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
  47. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
  48. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
  49. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
  50. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
  51. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
  52. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
  53. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
  54. data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
  55. data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
  56. data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
  57. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
  58. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
  59. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
  60. data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
  61. data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
  62. data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
  63. data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
  64. data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
  65. data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
  66. data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
  67. data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
  68. data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
  69. data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
  70. data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
  71. data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
  72. data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
  73. data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
  74. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
  75. data/ext/cppjieba/dict/README.md +31 -0
  76. data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
  77. data/ext/cppjieba/dict/idf.utf8 +258826 -0
  78. data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
  79. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
  80. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
  81. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
  82. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
  83. data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
  84. data/ext/cppjieba/dict/user.dict.utf8 +4 -0
  85. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
  86. data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
  87. data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
  88. data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
  89. data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
  90. data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
  91. data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
  92. data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
  93. data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
  94. data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
  95. data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
  96. data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
  97. data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +24 -0
  98. data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
  99. data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
  100. data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
  101. data/ext/jieba/extconf.rb +28 -0
  102. data/ext/jieba/jieba.c +11 -0
  103. data/ext/jieba/jieba.h +11 -0
  104. data/ext/jieba/keyword.cc +92 -0
  105. data/ext/jieba/keyword.h +17 -0
  106. data/ext/jieba/segment.cc +107 -0
  107. data/ext/jieba/segment.h +17 -0
  108. data/ext/jieba/tagging.cc +76 -0
  109. data/ext/jieba/tagging.h +17 -0
  110. data/jieba_rb.gemspec +51 -0
  111. data/lib/jieba-rb.rb +66 -0
  112. data/lib/jieba_rb/version.rb +3 -0
  113. data/test/test_keyword.rb +17 -0
  114. data/test/test_segment.rb +32 -0
  115. data/test/test_tagging.rb +22 -0
  116. data/test/user.dict.utf8 +23 -0
  117. metadata +219 -0
@@ -0,0 +1,28 @@
1
require "mkmf"

# Build configuration for the `jieba` native extension: adds the bundled
# cppjieba header directories to the search path, appends the C++0x / -O3
# flags, generates the Makefile and finally appends a rule that makes every
# object file depend on the headers found in the current directory.

abs = File.expand_path(File.dirname(__FILE__))

LIBDIR = RbConfig::CONFIG['libdir']
INCLUDEDIR = RbConfig::CONFIG['includedir']

# Ruby's include directory followed by the cppjieba directories next to this
# extension.
HEADER_DIRS = [
  INCLUDEDIR,
  "#{abs}/../cppjieba/src",
  "#{abs}/../cppjieba/include/cppjieba",
  "#{abs}/../cppjieba/deps"
]

LIB_DIRS = [LIBDIR]

dir_config('cppjieba_src', HEADER_DIRS, LIB_DIRS)

# The same flags are appended to both the mkmf CONFIG entry and $CXXFLAGS.
extra_cxxflags = " -std=c++0x -O3"
CONFIG["CXXFLAGS"] += extra_cxxflags
$CXXFLAGS = "#{$CXXFLAGS}#{extra_cxxflags}"

create_makefile 'jieba'

# respect header changes
header_list = Dir.glob('*.{hpp,h}').join(' ')
File.open('Makefile', 'a') { |makefile| makefile.puts "\n$(OBJS): #{header_list}" }
@@ -0,0 +1,11 @@
1
+ #include <jieba.h>
2
+ VALUE mJieba;
3
+
4
+ void Init_jieba()
5
+ {
6
+ mJieba = rb_define_module("JiebaRb");
7
+
8
+ Init_segment();
9
+ Init_keyword();
10
+ Init_tagging();
11
+ }
@@ -0,0 +1,11 @@
1
+ #ifndef RUBY_JIEBA
2
+ #define RUBY_JIEBA
3
+
4
+ #include <ruby.h>
5
+ #include <segment.h>
6
+ #include <keyword.h>
7
+ #include <tagging.h>
8
+
9
+ extern VALUE mJieba;
10
+
11
+ #endif
@@ -0,0 +1,92 @@
1
+ #include "segment.h"
2
+ #include <ruby/encoding.h>
3
+ #include <KeywordExtractor.hpp>
4
+
5
+ static rb_encoding* u8_enc;
6
+
7
+ struct Keyword{
8
+ cppjieba::KeywordExtractor * p;
9
+ };
10
+
11
+ static void keyword_free(void *p){
12
+ delete ((Keyword*) p) -> p;
13
+ delete (Keyword*)p;
14
+ }
15
+
16
+ static VALUE allocate(VALUE klass)
17
+ {
18
+ Keyword * keyword = new Keyword();
19
+ return Data_Wrap_Struct(klass, NULL, keyword_free, keyword);
20
+ }
21
+
22
+ static void init(VALUE self,
23
+ VALUE mode_rb_sym,
24
+ VALUE jieba_dict_rbs,
25
+ VALUE hmm_dict_rbs,
26
+ VALUE idf_rbs,
27
+ VALUE stop_words_rbs,
28
+ VALUE user_dict_rbs)
29
+ {
30
+ Keyword * keyword;
31
+ Data_Get_Struct(self, Keyword, keyword);
32
+
33
+ Check_Type(jieba_dict_rbs, T_STRING);
34
+ Check_Type(hmm_dict_rbs, T_STRING);
35
+ Check_Type(user_dict_rbs, T_STRING);
36
+ Check_Type(idf_rbs, T_STRING);
37
+ Check_Type(stop_words_rbs, T_STRING);
38
+
39
+ std::string jieba_dict = StringValueCStr(jieba_dict_rbs);
40
+ std::string hmm_dict = StringValueCStr(hmm_dict_rbs);
41
+ std::string idf = StringValueCStr(idf_rbs);
42
+ std::string stop_words = StringValueCStr(stop_words_rbs);
43
+ std::string user_dict = StringValueCStr(user_dict_rbs);
44
+
45
+ ID mode = SYM2ID(mode_rb_sym);
46
+ if ( mode == rb_intern("tf_idf") )
47
+ {
48
+ keyword->p = new cppjieba::KeywordExtractor(jieba_dict, hmm_dict, idf, stop_words, user_dict);
49
+ }
50
+ }
51
+
52
+ static VALUE extract(VALUE self, VALUE text_rbs, VALUE topN)
53
+ {
54
+ Check_Type(text_rbs, T_STRING);
55
+ std::string text = StringValueCStr(text_rbs);
56
+
57
+ Check_Type(topN, T_FIXNUM);
58
+ int top_n = NUM2INT(topN);
59
+
60
+ Keyword * keyword;
61
+ Data_Get_Struct(self, Keyword, keyword);
62
+
63
+ std::vector<std::pair<std::string, double> > top_words;
64
+
65
+ keyword->p->Extract(text, top_words, top_n);
66
+ volatile VALUE arr = rb_ary_new();
67
+ for(size_t i = 0; i < top_words.size(); i++)
68
+ {
69
+ volatile VALUE inner_arr = rb_ary_new();
70
+ std::string & word = top_words[i].first;
71
+ rb_ary_push(inner_arr, rb_enc_str_new(word.c_str(), word.length(), u8_enc));
72
+ rb_ary_push(inner_arr, rb_float_new(top_words[i].second));
73
+
74
+ rb_ary_push(arr, inner_arr);
75
+
76
+ }
77
+ return arr;
78
+ }
79
+
80
+ #define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
81
+
82
+ extern "C" {
83
+ void Init_keyword()
84
+ {
85
+ VALUE cKeyword = rb_define_class_under(mJieba, "Keyword", rb_cObject);
86
+ u8_enc = rb_utf8_encoding();
87
+ rb_define_alloc_func(cKeyword, allocate);
88
+ DEF(cKeyword, "_init", init, 6);
89
+ DEF(cKeyword, "extract",extract,2);
90
+ }
91
+
92
+ }
@@ -0,0 +1,17 @@
1
+ #ifndef RUBY_JIEBA_KEYWORD
2
+ #define RUBY_JIEBA_KEYWORD
3
+
4
+ #include <jieba.h>
5
+
6
+ #ifdef __cplusplus
7
+ extern "C" {
8
+ #endif
9
+
10
+ void Init_keyword();
11
+
12
+ #ifdef __cplusplus
13
+ }
14
+ #endif
15
+
16
+
17
+ #endif
@@ -0,0 +1,107 @@
1
+ #include "segment.h"
2
+ #include <ruby/encoding.h>
3
+ #include <MPSegment.hpp>
4
+ #include <HMMSegment.hpp>
5
+ #include <MixSegment.hpp>
6
+
7
+ static rb_encoding* u8_enc;
8
+
9
+ struct SegWrapper{
10
+ cppjieba::MixSegment *mixp;
11
+ cppjieba::HMMSegment *hmmp;
12
+ cppjieba::MPSegment *mpsp;
13
+ SegWrapper(): mixp(nullptr), hmmp(nullptr), mpsp(nullptr) {}
14
+ };
15
+
16
+ static void seg_free(void *p){
17
+ auto seg = reinterpret_cast<SegWrapper *>(p);
18
+ if (seg->mixp)
19
+ delete seg->mixp;
20
+ else if (seg->hmmp)
21
+ delete seg->hmmp;
22
+ else
23
+ delete seg->mpsp;
24
+ delete seg;
25
+ }
26
+
27
+ static VALUE allocate(VALUE klass)
28
+ {
29
+ SegWrapper* seg_wrapper = new SegWrapper();
30
+ return Data_Wrap_Struct(klass, NULL, seg_free, seg_wrapper);
31
+ }
32
+
33
+ static void seg_init(VALUE self,
34
+ VALUE type_rb_sym,
35
+ VALUE jieba_dict_rbs,
36
+ VALUE hmm_dict_rbs,
37
+ VALUE user_dict_rbs)
38
+ {
39
+ SegWrapper* seg_wrapper;
40
+ Data_Get_Struct(self, SegWrapper, seg_wrapper);
41
+
42
+ Check_Type(jieba_dict_rbs, T_STRING);
43
+ Check_Type(hmm_dict_rbs, T_STRING);
44
+ Check_Type(user_dict_rbs, T_STRING);
45
+
46
+ std::string jieba_dict = StringValueCStr(jieba_dict_rbs);
47
+ std::string hmm_dict = StringValueCStr(hmm_dict_rbs);
48
+ std::string user_dict = StringValueCStr(user_dict_rbs);
49
+
50
+ ID type = SYM2ID(type_rb_sym);
51
+ if ( type == rb_intern("mix") )
52
+ {
53
+ seg_wrapper->mixp = new cppjieba::MixSegment(jieba_dict, hmm_dict, user_dict);
54
+ }
55
+ else if ( type == rb_intern("hmm") )
56
+ {
57
+ seg_wrapper->hmmp = new cppjieba::HMMSegment(hmm_dict);
58
+ }
59
+ else if ( type == rb_intern("mp"))
60
+ {
61
+ seg_wrapper->mpsp = new cppjieba::MPSegment(jieba_dict);
62
+ }
63
+ }
64
+
65
+ static VALUE seg_cut(VALUE self, VALUE text_rbs)
66
+ {
67
+ Check_Type(text_rbs, T_STRING);
68
+ std::string text = StringValueCStr(text_rbs);
69
+
70
+ SegWrapper* seg_wrapper;
71
+ Data_Get_Struct(self, SegWrapper, seg_wrapper);
72
+
73
+ std::vector<std::string> words;
74
+
75
+ if (seg_wrapper->mixp) {
76
+ seg_wrapper->mixp->Cut(text, words);
77
+ }
78
+ else if (seg_wrapper->hmmp) {
79
+ seg_wrapper->hmmp->Cut(text, words);
80
+ }
81
+ else {
82
+ seg_wrapper->mpsp->Cut(text, words);
83
+ }
84
+
85
+ volatile VALUE arr = rb_ary_new();
86
+ for (std::vector<std::string>::const_iterator j = words.begin(); j != words.end(); j++)
87
+ {
88
+
89
+ rb_ary_push(arr, rb_enc_str_new((*j).c_str(), (*j).length(), u8_enc));
90
+
91
+ }
92
+ return arr;
93
+ }
94
+
95
+ #define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
96
+
97
+ extern "C" {
98
+ void Init_segment()
99
+ {
100
+ VALUE cSegment = rb_define_class_under(mJieba, "Segment", rb_cObject);
101
+ u8_enc = rb_utf8_encoding();
102
+ rb_define_alloc_func(cSegment, allocate);
103
+ DEF(cSegment, "_init",seg_init,4);
104
+ DEF(cSegment, "cut",seg_cut,1);
105
+ }
106
+
107
+ }
@@ -0,0 +1,17 @@
1
+ #ifndef RUBY_JIEBA_SEGMENT
2
+ #define RUBY_JIEBA_SEGMENT
3
+
4
+ #include <jieba.h>
5
+
6
+ #ifdef __cplusplus
7
+ extern "C" {
8
+ #endif
9
+
10
+ void Init_segment();
11
+
12
+ #ifdef __cplusplus
13
+ }
14
+ #endif
15
+
16
+
17
+ #endif
@@ -0,0 +1,76 @@
1
+ #include "tagging.h"
2
+ #include <ruby/encoding.h>
3
+ #include <PosTagger.hpp>
4
+ #include <MixSegment.hpp>
5
+
6
+ static rb_encoding* u8_enc;
7
+
8
+ struct Tagging{
9
+ cppjieba::MixSegment *p;
10
+ };
11
+
12
+ static void tagger_free(void *p){
13
+ delete reinterpret_cast<Tagging *>(p)->p;
14
+ delete reinterpret_cast<Tagging *>(p);
15
+ }
16
+
17
+ static VALUE alloc(VALUE klass)
18
+ {
19
+ Tagging * tagging = new Tagging();
20
+ return Data_Wrap_Struct(klass, NULL, tagger_free, tagging);
21
+ }
22
+
23
+ static void init(VALUE self,
24
+ VALUE jieba_dict_rbs,
25
+ VALUE hmm_dict_rbs,
26
+ VALUE user_dict_rbs)
27
+ {
28
+ Tagging *tagging;
29
+ Data_Get_Struct(self, Tagging, tagging);
30
+
31
+ Check_Type(jieba_dict_rbs, T_STRING);
32
+ Check_Type(hmm_dict_rbs, T_STRING);
33
+ Check_Type(user_dict_rbs, T_STRING);
34
+
35
+ std::string jieba_dict = StringValueCStr(jieba_dict_rbs);
36
+ std::string hmm_dict = StringValueCStr(hmm_dict_rbs);
37
+ std::string user_dict = StringValueCStr(user_dict_rbs);
38
+
39
+ tagging->p = new cppjieba::MixSegment(jieba_dict, hmm_dict, user_dict);
40
+ }
41
+
42
+ static VALUE tag(VALUE self, VALUE text_rbs)
43
+ {
44
+ Check_Type(text_rbs, T_STRING);
45
+ std::string text = StringValueCStr(text_rbs);
46
+
47
+ Tagging *tagging;
48
+ Data_Get_Struct(self, Tagging, tagging);
49
+
50
+ std::vector<std::pair<std::string, std::string>> pairs;
51
+ tagging->p->Tag(text, pairs);
52
+
53
+ volatile VALUE arr = rb_ary_new();
54
+ for (std::vector<std::pair<std::string, std::string>>::const_iterator j = pairs.begin(); j != pairs.end(); j++)
55
+ {
56
+ VALUE pair = rb_hash_new();
57
+ rb_hash_aset(pair, rb_enc_str_new(std::get<0>(*j).c_str(), std::get<0>(*j).length(), u8_enc), rb_enc_str_new(std::get<1>(*j).c_str(), std::get<1>(*j).length(), u8_enc));
58
+ rb_ary_push(arr, pair);
59
+
60
+ }
61
+ return arr;
62
+ }
63
+
64
+ #define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
65
+
66
+ extern "C" {
67
+ void Init_tagging()
68
+ {
69
+ VALUE cTagging = rb_define_class_under(mJieba, "Tagging", rb_cObject);
70
+ u8_enc = rb_utf8_encoding();
71
+ rb_define_alloc_func(cTagging, alloc);
72
+ DEF(cTagging, "_init",init,3);
73
+ DEF(cTagging, "tag",tag,1);
74
+ }
75
+
76
+ }
@@ -0,0 +1,17 @@
1
+ #ifndef RUBY_JIEBA_TAGGING
2
+ #define RUBY_JIEBA_TAGGING
3
+
4
+ #include <jieba.h>
5
+
6
+ #ifdef __cplusplus
7
+ extern "C" {
8
+ #endif
9
+
10
+ void Init_tagging();
11
+
12
+ #ifdef __cplusplus
13
+ }
14
+ #endif
15
+
16
+
17
+ #endif
@@ -0,0 +1,51 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'jieba_rb/version'
5
+
6
# Gem specification for jieba-rb. The file list is everything tracked by git
# in this repository plus everything tracked in each git submodule (minus the
# submodules' own test/ directories).
Gem::Specification.new do |spec|
  spec.name = "jieba-rb"
  spec.version = JiebaRb::VERSION
  spec.authors = ["Chris Li"]
  spec.email = ["liqi8822@gmail.com"]
  spec.summary = "cppjieba binding for ruby"
  spec.description = "cppjieba binding for ruby"
  spec.homepage = "https://github.com/Xu-Zhiqing/jieba_rb"
  spec.required_ruby_version = ">=1.9.2"
  spec.license = "MIT"
  spec.extensions = ["ext/jieba/extconf.rb"]

  spec.files = `git ls-files -z`.split("\x0")
  relative_path = File.expand_path("../", __FILE__) + '/'
  # The path is matched literally (case-insensitively); escaping keeps regex
  # metacharacters in directory names such as "." or "+" from being
  # interpreted as pattern syntax.
  relative_path_pattern = /#{Regexp.escape(relative_path)}/i

  # One submodule path per output line. Splitting on line ends, rather than on
  # `$\` (nil by default, which makes String#split break on any whitespace),
  # keeps paths that contain spaces intact.
  submodule_paths = `git submodule --quiet foreach pwd`.lines.map(&:chomp)
  submodule_paths.each do |submodule_path|
    if (ENV['OS'] == 'Windows_NT') && submodule_path[0] == '/'
      # Detect if cygwin path is being used by git
      submodule_path = submodule_path[1..-1]
      submodule_path.insert(1, ':')
    end
    # for each submodule, change working directory to that submodule
    Dir.chdir(submodule_path) do
      # Make the submodule path relative
      relative_submodule = submodule_path.gsub(relative_path_pattern, '')
      # issue git ls-files in submodule's directory
      submodule_files = `git ls-files`.lines.map(&:chomp).reject { |i| i.start_with? 'test/' }

      # prepend the submodule path to create relative file paths
      submodule_files_paths = submodule_files.map do |filename|
        File.join(relative_submodule, filename)
      end

      # add relative paths to gem.files
      spec.files += submodule_files_paths
    end
  end

  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  spec.add_development_dependency "bundler"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rake-compiler"
  spec.add_development_dependency "minitest"
end
@@ -0,0 +1,66 @@
1
+ require "jieba_rb/version"
2
+ require "jieba"
3
# Ruby-side half of the JiebaRb classes: default dictionary locations and the
# option handling each #initialize performs before handing over to the
# private `_init` method.
module JiebaRb
  # Directory of the bundled cppjieba sources, relative to this file.
  EXT_BASE = "#{File.expand_path(File.dirname(__FILE__))}/../ext/cppjieba/"
  DEFAULT_JIEBA_DICT = EXT_BASE + "dict/jieba.dict.utf8"
  DEFAULT_HMM_DICT = EXT_BASE + "dict/hmm_model.utf8"
  DEFAULT_USER_DICT = EXT_BASE + "dict/user.dict.utf8"

  class Segment
    private :_init

    # opts keys:
    #   :mode       - :mix, :hmm or :mp (:mix when absent)
    #   :jieba_dict - dictionary path (DEFAULT_JIEBA_DICT when absent)
    #   :hmm_dict   - HMM model path (DEFAULT_HMM_DICT when absent)
    #   :user_dict  - user dictionary path, or :default for DEFAULT_USER_DICT
    #                 ("" when absent)
    # Raises RuntimeError for any other mode.
    def initialize(opts = {})
      mode = opts[:mode] || :mix # default
      raise "Mode must be one of :mix :hmm :mp" unless [:mix, :hmm, :mp].include?(mode)

      dict_path = opts[:jieba_dict] || DEFAULT_JIEBA_DICT
      hmm_path = opts[:hmm_dict] || DEFAULT_HMM_DICT
      requested = opts[:user_dict] || ""
      user_path = requested == :default ? DEFAULT_USER_DICT : requested

      _init(mode, dict_path, hmm_path, user_path)
    end
  end

  class Keyword
    DEFAULT_IDF = EXT_BASE + "dict/idf.utf8"
    DEFAULT_STOP_WORDS = EXT_BASE + "dict/stop_words.utf8"

    private :_init

    # opts keys:
    #   :mode       - only :tf_idf is accepted (:tf_idf when absent)
    #   :jieba_dict - dictionary path (DEFAULT_JIEBA_DICT when absent)
    #   :hmm_dict   - HMM model path (DEFAULT_HMM_DICT when absent)
    #   :idf        - IDF table path (DEFAULT_IDF when absent)
    #   :stop_words - stop-word list path (DEFAULT_STOP_WORDS when absent)
    #   :user_dict  - user dictionary path, or :default for DEFAULT_USER_DICT
    #                 ("" when absent)
    # Raises RuntimeError for any other mode.
    def initialize(opts = {})
      mode = opts[:mode] || :tf_idf # default
      raise "Mode must be one of :tf_idf" unless [:tf_idf].include?(mode)

      dict_path = opts[:jieba_dict] || DEFAULT_JIEBA_DICT
      hmm_path = opts[:hmm_dict] || DEFAULT_HMM_DICT
      idf_path = opts[:idf] || DEFAULT_IDF
      stop_words_path = opts[:stop_words] || DEFAULT_STOP_WORDS
      requested = opts[:user_dict] || ""
      user_path = requested == :default ? DEFAULT_USER_DICT : requested

      _init(mode, dict_path, hmm_path, idf_path, stop_words_path, user_path)
    end
  end

  class Tagging
    private :_init

    # opts keys:
    #   :jieba_dict - dictionary path (DEFAULT_JIEBA_DICT when absent)
    #   :hmm_dict   - HMM model path (DEFAULT_HMM_DICT when absent)
    #   :user_dict  - user dictionary path, or :default for DEFAULT_USER_DICT
    #                 ("" when absent)
    def initialize(opts = {})
      dict_path = opts[:jieba_dict] || DEFAULT_JIEBA_DICT
      hmm_path = opts[:hmm_dict] || DEFAULT_HMM_DICT
      requested = opts[:user_dict] || ""
      user_path = requested == :default ? DEFAULT_USER_DICT : requested

      _init(dict_path, hmm_path, user_path)
    end
  end
end