jieba-rb 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.gitmodules +3 -0
  4. data/.travis.yml +19 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +85 -0
  8. data/Rakefile +15 -0
  9. data/ext/cppjieba/.gitignore +17 -0
  10. data/ext/cppjieba/.travis.yml +22 -0
  11. data/ext/cppjieba/CMakeLists.txt +28 -0
  12. data/ext/cppjieba/ChangeLog.md +236 -0
  13. data/ext/cppjieba/README.md +285 -0
  14. data/ext/cppjieba/README_EN.md +111 -0
  15. data/ext/cppjieba/appveyor.yml +32 -0
  16. data/ext/cppjieba/deps/CMakeLists.txt +1 -0
  17. data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
  18. data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
  19. data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
  20. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
  21. data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
  22. data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
  23. data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
  24. data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
  25. data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
  26. data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
  27. data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
  28. data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
  29. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
  30. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
  31. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
  32. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
  33. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
  34. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
  35. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
  36. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
  37. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
  38. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
  39. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
  40. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
  41. data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
  42. data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
  43. data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
  44. data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
  45. data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
  46. data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
  47. data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
  48. data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
  49. data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
  50. data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
  51. data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
  52. data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
  53. data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
  54. data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
  55. data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
  56. data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
  57. data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
  58. data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
  59. data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
  60. data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
  61. data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
  62. data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
  63. data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
  64. data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
  65. data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
  66. data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
  67. data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
  68. data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
  69. data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
  70. data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
  71. data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
  72. data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
  73. data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
  74. data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
  75. data/ext/cppjieba/dict/README.md +31 -0
  76. data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
  77. data/ext/cppjieba/dict/idf.utf8 +258826 -0
  78. data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
  79. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
  80. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
  81. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
  82. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
  83. data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
  84. data/ext/cppjieba/dict/user.dict.utf8 +4 -0
  85. data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
  86. data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
  87. data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
  88. data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
  89. data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
  90. data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
  91. data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
  92. data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
  93. data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
  94. data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
  95. data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
  96. data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
  97. data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +24 -0
  98. data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
  99. data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
  100. data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
  101. data/ext/jieba/extconf.rb +28 -0
  102. data/ext/jieba/jieba.c +11 -0
  103. data/ext/jieba/jieba.h +11 -0
  104. data/ext/jieba/keyword.cc +92 -0
  105. data/ext/jieba/keyword.h +17 -0
  106. data/ext/jieba/segment.cc +107 -0
  107. data/ext/jieba/segment.h +17 -0
  108. data/ext/jieba/tagging.cc +76 -0
  109. data/ext/jieba/tagging.h +17 -0
  110. data/jieba_rb.gemspec +51 -0
  111. data/lib/jieba-rb.rb +66 -0
  112. data/lib/jieba_rb/version.rb +3 -0
  113. data/test/test_keyword.rb +17 -0
  114. data/test/test_segment.rb +32 -0
  115. data/test/test_tagging.rb +22 -0
  116. data/test/user.dict.utf8 +23 -0
  117. metadata +219 -0
@@ -0,0 +1,28 @@
1
+ require "mkmf"
2
# Build configuration for the `jieba` native extension: points the compiler at
# the bundled cppjieba headers, adds the C++ flags, and writes the Makefile.
ext_dir = File.expand_path(File.dirname(__FILE__))

LIBDIR     = RbConfig::CONFIG['libdir']
INCLUDEDIR = RbConfig::CONFIG['includedir']

# Directories searched for headers (Ruby's own include dir plus cppjieba).
HEADER_DIRS = [
  INCLUDEDIR,
  "#{ext_dir}/../cppjieba/src",
  "#{ext_dir}/../cppjieba/include/cppjieba",
  "#{ext_dir}/../cppjieba/deps"
]

# Directories searched for libraries.
LIB_DIRS = [
  LIBDIR
]

dir_config('cppjieba_src', HEADER_DIRS, LIB_DIRS)

# Interpolation instead of `+=` so a missing (nil) CXXFLAGS entry does not
# raise NoMethodError; the resulting value is unchanged when one is present.
extra_cxxflags = "-std=c++0x -O3"
CONFIG["CXXFLAGS"] = "#{CONFIG['CXXFLAGS']} #{extra_cxxflags}"
$CXXFLAGS = "#{$CXXFLAGS} #{extra_cxxflags}"
create_makefile 'jieba'

# respect header changes: make every object depend on the headers found in the
# current directory so that editing one triggers a rebuild.
headers = Dir.glob('*.{hpp,h}').join(' ')
File.open('Makefile', 'a') do |makefile|
  makefile.puts "\n$(OBJS): #{headers}"
end
@@ -0,0 +1,11 @@
1
+ #include <jieba.h>
2
+ VALUE mJieba;
3
+
4
+ void Init_jieba()
5
+ {
6
+ mJieba = rb_define_module("JiebaRb");
7
+
8
+ Init_segment();
9
+ Init_keyword();
10
+ Init_tagging();
11
+ }
@@ -0,0 +1,11 @@
1
+ #ifndef RUBY_JIEBA
2
+ #define RUBY_JIEBA
3
+
4
+ #include <ruby.h>
5
+ #include <segment.h>
6
+ #include <keyword.h>
7
+ #include <tagging.h>
8
+
9
+ extern VALUE mJieba;
10
+
11
+ #endif
@@ -0,0 +1,92 @@
1
+ #include "segment.h"
2
+ #include <ruby/encoding.h>
3
+ #include <KeywordExtractor.hpp>
4
+
5
+ static rb_encoding* u8_enc;
6
+
7
+ struct Keyword{
8
+ cppjieba::KeywordExtractor * p;
9
+ };
10
+
11
+ static void keyword_free(void *p){
12
+ delete ((Keyword*) p) -> p;
13
+ delete (Keyword*)p;
14
+ }
15
+
16
+ static VALUE allocate(VALUE klass)
17
+ {
18
+ Keyword * keyword = new Keyword();
19
+ return Data_Wrap_Struct(klass, NULL, keyword_free, keyword);
20
+ }
21
+
22
+ static void init(VALUE self,
23
+ VALUE mode_rb_sym,
24
+ VALUE jieba_dict_rbs,
25
+ VALUE hmm_dict_rbs,
26
+ VALUE idf_rbs,
27
+ VALUE stop_words_rbs,
28
+ VALUE user_dict_rbs)
29
+ {
30
+ Keyword * keyword;
31
+ Data_Get_Struct(self, Keyword, keyword);
32
+
33
+ Check_Type(jieba_dict_rbs, T_STRING);
34
+ Check_Type(hmm_dict_rbs, T_STRING);
35
+ Check_Type(user_dict_rbs, T_STRING);
36
+ Check_Type(idf_rbs, T_STRING);
37
+ Check_Type(stop_words_rbs, T_STRING);
38
+
39
+ std::string jieba_dict = StringValueCStr(jieba_dict_rbs);
40
+ std::string hmm_dict = StringValueCStr(hmm_dict_rbs);
41
+ std::string idf = StringValueCStr(idf_rbs);
42
+ std::string stop_words = StringValueCStr(stop_words_rbs);
43
+ std::string user_dict = StringValueCStr(user_dict_rbs);
44
+
45
+ ID mode = SYM2ID(mode_rb_sym);
46
+ if ( mode == rb_intern("tf_idf") )
47
+ {
48
+ keyword->p = new cppjieba::KeywordExtractor(jieba_dict, hmm_dict, idf, stop_words, user_dict);
49
+ }
50
+ }
51
+
52
+ static VALUE extract(VALUE self, VALUE text_rbs, VALUE topN)
53
+ {
54
+ Check_Type(text_rbs, T_STRING);
55
+ std::string text = StringValueCStr(text_rbs);
56
+
57
+ Check_Type(topN, T_FIXNUM);
58
+ int top_n = NUM2INT(topN);
59
+
60
+ Keyword * keyword;
61
+ Data_Get_Struct(self, Keyword, keyword);
62
+
63
+ std::vector<std::pair<std::string, double> > top_words;
64
+
65
+ keyword->p->Extract(text, top_words, top_n);
66
+ volatile VALUE arr = rb_ary_new();
67
+ for(size_t i = 0; i < top_words.size(); i++)
68
+ {
69
+ volatile VALUE inner_arr = rb_ary_new();
70
+ std::string & word = top_words[i].first;
71
+ rb_ary_push(inner_arr, rb_enc_str_new(word.c_str(), word.length(), u8_enc));
72
+ rb_ary_push(inner_arr, rb_float_new(top_words[i].second));
73
+
74
+ rb_ary_push(arr, inner_arr);
75
+
76
+ }
77
+ return arr;
78
+ }
79
+
80
+ #define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
81
+
82
+ extern "C" {
83
+ void Init_keyword()
84
+ {
85
+ VALUE cKeyword = rb_define_class_under(mJieba, "Keyword", rb_cObject);
86
+ u8_enc = rb_utf8_encoding();
87
+ rb_define_alloc_func(cKeyword, allocate);
88
+ DEF(cKeyword, "_init", init, 6);
89
+ DEF(cKeyword, "extract",extract,2);
90
+ }
91
+
92
+ }
@@ -0,0 +1,17 @@
1
+ #ifndef RUBY_JIEBA_KEYWORD
2
+ #define RUBY_JIEBA_KEYWORD
3
+
4
+ #include <jieba.h>
5
+
6
+ #ifdef __cplusplus
7
+ extern "C" {
8
+ #endif
9
+
10
+ void Init_keyword();
11
+
12
+ #ifdef __cplusplus
13
+ }
14
+ #endif
15
+
16
+
17
+ #endif
@@ -0,0 +1,107 @@
1
+ #include "segment.h"
2
+ #include <ruby/encoding.h>
3
+ #include <MPSegment.hpp>
4
+ #include <HMMSegment.hpp>
5
+ #include <MixSegment.hpp>
6
+
7
+ static rb_encoding* u8_enc;
8
+
9
+ struct SegWrapper{
10
+ cppjieba::MixSegment *mixp;
11
+ cppjieba::HMMSegment *hmmp;
12
+ cppjieba::MPSegment *mpsp;
13
+ SegWrapper(): mixp(nullptr), hmmp(nullptr), mpsp(nullptr) {}
14
+ };
15
+
16
+ static void seg_free(void *p){
17
+ auto seg = reinterpret_cast<SegWrapper *>(p);
18
+ if (seg->mixp)
19
+ delete seg->mixp;
20
+ else if (seg->hmmp)
21
+ delete seg->hmmp;
22
+ else
23
+ delete seg->mpsp;
24
+ delete seg;
25
+ }
26
+
27
+ static VALUE allocate(VALUE klass)
28
+ {
29
+ SegWrapper* seg_wrapper = new SegWrapper();
30
+ return Data_Wrap_Struct(klass, NULL, seg_free, seg_wrapper);
31
+ }
32
+
33
+ static void seg_init(VALUE self,
34
+ VALUE type_rb_sym,
35
+ VALUE jieba_dict_rbs,
36
+ VALUE hmm_dict_rbs,
37
+ VALUE user_dict_rbs)
38
+ {
39
+ SegWrapper* seg_wrapper;
40
+ Data_Get_Struct(self, SegWrapper, seg_wrapper);
41
+
42
+ Check_Type(jieba_dict_rbs, T_STRING);
43
+ Check_Type(hmm_dict_rbs, T_STRING);
44
+ Check_Type(user_dict_rbs, T_STRING);
45
+
46
+ std::string jieba_dict = StringValueCStr(jieba_dict_rbs);
47
+ std::string hmm_dict = StringValueCStr(hmm_dict_rbs);
48
+ std::string user_dict = StringValueCStr(user_dict_rbs);
49
+
50
+ ID type = SYM2ID(type_rb_sym);
51
+ if ( type == rb_intern("mix") )
52
+ {
53
+ seg_wrapper->mixp = new cppjieba::MixSegment(jieba_dict, hmm_dict, user_dict);
54
+ }
55
+ else if ( type == rb_intern("hmm") )
56
+ {
57
+ seg_wrapper->hmmp = new cppjieba::HMMSegment(hmm_dict);
58
+ }
59
+ else if ( type == rb_intern("mp"))
60
+ {
61
+ seg_wrapper->mpsp = new cppjieba::MPSegment(jieba_dict);
62
+ }
63
+ }
64
+
65
+ static VALUE seg_cut(VALUE self, VALUE text_rbs)
66
+ {
67
+ Check_Type(text_rbs, T_STRING);
68
+ std::string text = StringValueCStr(text_rbs);
69
+
70
+ SegWrapper* seg_wrapper;
71
+ Data_Get_Struct(self, SegWrapper, seg_wrapper);
72
+
73
+ std::vector<std::string> words;
74
+
75
+ if (seg_wrapper->mixp) {
76
+ seg_wrapper->mixp->Cut(text, words);
77
+ }
78
+ else if (seg_wrapper->hmmp) {
79
+ seg_wrapper->hmmp->Cut(text, words);
80
+ }
81
+ else {
82
+ seg_wrapper->mpsp->Cut(text, words);
83
+ }
84
+
85
+ volatile VALUE arr = rb_ary_new();
86
+ for (std::vector<std::string>::const_iterator j = words.begin(); j != words.end(); j++)
87
+ {
88
+
89
+ rb_ary_push(arr, rb_enc_str_new((*j).c_str(), (*j).length(), u8_enc));
90
+
91
+ }
92
+ return arr;
93
+ }
94
+
95
+ #define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
96
+
97
+ extern "C" {
98
+ void Init_segment()
99
+ {
100
+ VALUE cSegment = rb_define_class_under(mJieba, "Segment", rb_cObject);
101
+ u8_enc = rb_utf8_encoding();
102
+ rb_define_alloc_func(cSegment, allocate);
103
+ DEF(cSegment, "_init",seg_init,4);
104
+ DEF(cSegment, "cut",seg_cut,1);
105
+ }
106
+
107
+ }
@@ -0,0 +1,17 @@
1
+ #ifndef RUBY_JIEBA_SEGMENT
2
+ #define RUBY_JIEBA_SEGMENT
3
+
4
+ #include <jieba.h>
5
+
6
+ #ifdef __cplusplus
7
+ extern "C" {
8
+ #endif
9
+
10
+ void Init_segment();
11
+
12
+ #ifdef __cplusplus
13
+ }
14
+ #endif
15
+
16
+
17
+ #endif
@@ -0,0 +1,76 @@
1
+ #include "tagging.h"
2
+ #include <ruby/encoding.h>
3
+ #include <PosTagger.hpp>
4
+ #include <MixSegment.hpp>
5
+
6
+ static rb_encoding* u8_enc;
7
+
8
+ struct Tagging{
9
+ cppjieba::MixSegment *p;
10
+ };
11
+
12
+ static void tagger_free(void *p){
13
+ delete reinterpret_cast<Tagging *>(p)->p;
14
+ delete reinterpret_cast<Tagging *>(p);
15
+ }
16
+
17
+ static VALUE alloc(VALUE klass)
18
+ {
19
+ Tagging * tagging = new Tagging();
20
+ return Data_Wrap_Struct(klass, NULL, tagger_free, tagging);
21
+ }
22
+
23
+ static void init(VALUE self,
24
+ VALUE jieba_dict_rbs,
25
+ VALUE hmm_dict_rbs,
26
+ VALUE user_dict_rbs)
27
+ {
28
+ Tagging *tagging;
29
+ Data_Get_Struct(self, Tagging, tagging);
30
+
31
+ Check_Type(jieba_dict_rbs, T_STRING);
32
+ Check_Type(hmm_dict_rbs, T_STRING);
33
+ Check_Type(user_dict_rbs, T_STRING);
34
+
35
+ std::string jieba_dict = StringValueCStr(jieba_dict_rbs);
36
+ std::string hmm_dict = StringValueCStr(hmm_dict_rbs);
37
+ std::string user_dict = StringValueCStr(user_dict_rbs);
38
+
39
+ tagging->p = new cppjieba::MixSegment(jieba_dict, hmm_dict, user_dict);
40
+ }
41
+
42
+ static VALUE tag(VALUE self, VALUE text_rbs)
43
+ {
44
+ Check_Type(text_rbs, T_STRING);
45
+ std::string text = StringValueCStr(text_rbs);
46
+
47
+ Tagging *tagging;
48
+ Data_Get_Struct(self, Tagging, tagging);
49
+
50
+ std::vector<std::pair<std::string, std::string>> pairs;
51
+ tagging->p->Tag(text, pairs);
52
+
53
+ volatile VALUE arr = rb_ary_new();
54
+ for (std::vector<std::pair<std::string, std::string>>::const_iterator j = pairs.begin(); j != pairs.end(); j++)
55
+ {
56
+ VALUE pair = rb_hash_new();
57
+ rb_hash_aset(pair, rb_enc_str_new(std::get<0>(*j).c_str(), std::get<0>(*j).length(), u8_enc), rb_enc_str_new(std::get<1>(*j).c_str(), std::get<1>(*j).length(), u8_enc));
58
+ rb_ary_push(arr, pair);
59
+
60
+ }
61
+ return arr;
62
+ }
63
+
64
+ #define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
65
+
66
+ extern "C" {
67
+ void Init_tagging()
68
+ {
69
+ VALUE cTagging = rb_define_class_under(mJieba, "Tagging", rb_cObject);
70
+ u8_enc = rb_utf8_encoding();
71
+ rb_define_alloc_func(cTagging, alloc);
72
+ DEF(cTagging, "_init",init,3);
73
+ DEF(cTagging, "tag",tag,1);
74
+ }
75
+
76
+ }
@@ -0,0 +1,17 @@
1
+ #ifndef RUBY_JIEBA_TAGGING
2
+ #define RUBY_JIEBA_TAGGING
3
+
4
+ #include <jieba.h>
5
+
6
+ #ifdef __cplusplus
7
+ extern "C" {
8
+ #endif
9
+
10
+ void Init_tagging();
11
+
12
+ #ifdef __cplusplus
13
+ }
14
+ #endif
15
+
16
+
17
+ #endif
@@ -0,0 +1,51 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'jieba_rb/version'
5
+
6
# Gem specification for jieba-rb. The file list is the repository's tracked
# files plus the tracked files of every git submodule (minus each submodule's
# own test/ directory), with submodule paths made relative to the repo root.
Gem::Specification.new do |spec|
  spec.name          = "jieba-rb"
  spec.version       = JiebaRb::VERSION
  spec.authors       = ["Chris Li"]
  spec.email         = ["liqi8822@gmail.com"]
  spec.summary       = "cppjieba binding for ruby"
  spec.description   = "cppjieba binding for ruby"
  spec.homepage      = "https://github.com/Xu-Zhiqing/jieba_rb"
  spec.required_ruby_version = ">=1.9.2"
  spec.license       = "MIT"
  spec.extensions    = ["ext/jieba/extconf.rb"]

  spec.files         = `git ls-files -z`.split("\x0")

  relative_path = File.expand_path("../", __FILE__) + '/'
  # Escaped so regex metacharacters in the checkout path are taken literally,
  # and anchored so only the leading repo-root prefix is stripped.
  repo_root_prefix = /\A#{Regexp.escape(relative_path)}/i

  # Split git output on line boundaries only; splitting on arbitrary
  # whitespace would break paths that contain spaces.
  submodule_dirs = `git submodule --quiet foreach pwd`.lines.map(&:chomp).reject(&:empty?)

  submodule_dirs.each do |submodule_path|
    if (ENV['OS'] == 'Windows_NT') && submodule_path[0] == '/'
      # Detect if cygwin path is being used by git: "/c/foo" becomes "c:/foo"
      submodule_path = submodule_path[1..-1]
      submodule_path.insert(1, ':')
    end

    # for each submodule, change working directory to that submodule
    Dir.chdir(submodule_path) do
      # Make the submodule path relative
      relative_submodule = submodule_path.sub(repo_root_prefix, '')

      # issue git ls-files in submodule's directory, skipping its tests
      submodule_files = `git ls-files`.lines.map(&:chomp).reject do |name|
        name.empty? || name.start_with?('test/')
      end

      # prepend the submodule path to create relative file paths and add
      # them to gem.files
      spec.files += submodule_files.map { |name| File.join(relative_submodule, name) }
    end
  end

  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  spec.add_development_dependency "bundler"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rake-compiler"
  spec.add_development_dependency "minitest"
end
@@ -0,0 +1,66 @@
1
+ require "jieba_rb/version"
2
+ require "jieba"
3
# Ruby-side half of the binding. The native extension defines
# JiebaRb::Segment, JiebaRb::Keyword and JiebaRb::Tagging together with their
# `_init` methods; this file reopens them to add `initialize`, which resolves
# default dictionary paths before handing over to `_init`.
module JiebaRb
  lib_dir = File.expand_path(File.dirname(__FILE__))

  # Location of the bundled cppjieba sources and dictionaries.
  EXT_BASE = "#{lib_dir}/../ext/cppjieba/"
  DEFAULT_JIEBA_DICT = EXT_BASE + "dict/jieba.dict.utf8"
  DEFAULT_HMM_DICT = EXT_BASE + "dict/hmm_model.utf8"
  DEFAULT_USER_DICT = EXT_BASE + "dict/user.dict.utf8"

  class Segment
    private :_init

    # opts:
    #   :mode       - :mix (default), :hmm or :mp
    #   :jieba_dict - dictionary path (default DEFAULT_JIEBA_DICT)
    #   :hmm_dict   - HMM model path (default DEFAULT_HMM_DICT)
    #   :user_dict  - user dictionary path, or :default for the bundled one;
    #                 an empty string is passed when omitted
    #
    # Raises RuntimeError for an unknown mode.
    def initialize(opts = {})
      mode = opts[:mode] || :mix
      raise "Mode must be one of :mix :hmm :mp" unless [:mix, :hmm, :mp].include?(mode)

      user_dict = opts[:user_dict] || ""
      user_dict = DEFAULT_USER_DICT if user_dict == :default

      _init(mode,
            opts[:jieba_dict] || DEFAULT_JIEBA_DICT,
            opts[:hmm_dict] || DEFAULT_HMM_DICT,
            user_dict)
    end
  end

  class Keyword
    DEFAULT_IDF = EXT_BASE + "dict/idf.utf8"
    DEFAULT_STOP_WORDS = EXT_BASE + "dict/stop_words.utf8"

    private :_init

    # opts:
    #   :mode       - :tf_idf (default and only accepted value)
    #   :jieba_dict - dictionary path (default DEFAULT_JIEBA_DICT)
    #   :hmm_dict   - HMM model path (default DEFAULT_HMM_DICT)
    #   :idf        - IDF file path (default DEFAULT_IDF)
    #   :stop_words - stop-word file path (default DEFAULT_STOP_WORDS)
    #   :user_dict  - user dictionary path, or :default for the bundled one;
    #                 an empty string is passed when omitted
    #
    # Raises RuntimeError for an unknown mode.
    def initialize(opts = {})
      mode = opts[:mode] || :tf_idf
      raise "Mode must be one of :tf_idf" unless [:tf_idf].include?(mode)

      user_dict = opts[:user_dict] || ""
      user_dict = DEFAULT_USER_DICT if user_dict == :default

      _init(mode,
            opts[:jieba_dict] || DEFAULT_JIEBA_DICT,
            opts[:hmm_dict] || DEFAULT_HMM_DICT,
            opts[:idf] || DEFAULT_IDF,
            opts[:stop_words] || DEFAULT_STOP_WORDS,
            user_dict)
    end
  end

  class Tagging
    private :_init

    # opts:
    #   :jieba_dict - dictionary path (default DEFAULT_JIEBA_DICT)
    #   :hmm_dict   - HMM model path (default DEFAULT_HMM_DICT)
    #   :user_dict  - user dictionary path, or :default for the bundled one;
    #                 an empty string is passed when omitted
    def initialize(opts = {})
      user_dict = opts[:user_dict] || ""
      user_dict = DEFAULT_USER_DICT if user_dict == :default

      _init(opts[:jieba_dict] || DEFAULT_JIEBA_DICT,
            opts[:hmm_dict] || DEFAULT_HMM_DICT,
            user_dict)
    end
  end
end