jieba_rb 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (145) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +18 -0
  3. data/.gitmodules +3 -0
  4. data/.travis.yml +6 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +51 -0
  8. data/Rakefile +11 -0
  9. data/ext/cppjieba/.gitignore +17 -0
  10. data/ext/cppjieba/.travis.yml +22 -0
  11. data/ext/cppjieba/CMakeLists.txt +27 -0
  12. data/ext/cppjieba/ChangeLog.md +81 -0
  13. data/ext/cppjieba/Dockerfile +11 -0
  14. data/ext/cppjieba/LICENSE +20 -0
  15. data/ext/cppjieba/README.md +359 -0
  16. data/ext/cppjieba/conf/CMakeLists.txt +1 -0
  17. data/ext/cppjieba/conf/server.conf +16 -0
  18. data/ext/cppjieba/dict/CMakeLists.txt +1 -0
  19. data/ext/cppjieba/dict/README.md +31 -0
  20. data/ext/cppjieba/dict/extra_dict/jieba.dict.small.utf8 +109750 -0
  21. data/ext/cppjieba/dict/gbk_dict/hmm_model.gbk +34 -0
  22. data/ext/cppjieba/dict/gbk_dict/jieba.dict.gbk +348982 -0
  23. data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
  24. data/ext/cppjieba/dict/idf.utf8 +258826 -0
  25. data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
  26. data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
  27. data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
  28. data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
  29. data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
  30. data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
  31. data/ext/cppjieba/dict/user.dict.utf8 +3 -0
  32. data/ext/cppjieba/script/CMakeLists.txt +1 -0
  33. data/ext/cppjieba/script/cjserver.start +12 -0
  34. data/ext/cppjieba/script/cjserver.stop +13 -0
  35. data/ext/cppjieba/server/CMakeLists.txt +9 -0
  36. data/ext/cppjieba/server/Husky/HttpReqInfo.hpp +294 -0
  37. data/ext/cppjieba/server/Husky/IRequestHandler.hpp +18 -0
  38. data/ext/cppjieba/server/Husky/ThreadPoolServer.hpp +108 -0
  39. data/ext/cppjieba/server/Husky/WorkerThread.hpp +133 -0
  40. data/ext/cppjieba/server/server.cpp +91 -0
  41. data/ext/cppjieba/src/DictTrie.hpp +211 -0
  42. data/ext/cppjieba/src/FullSegment.hpp +153 -0
  43. data/ext/cppjieba/src/HMMSegment.hpp +394 -0
  44. data/ext/cppjieba/src/ISegment.hpp +17 -0
  45. data/ext/cppjieba/src/KeywordExtractor.hpp +173 -0
  46. data/ext/cppjieba/src/Limonp/ArgvContext.hpp +84 -0
  47. data/ext/cppjieba/src/Limonp/BlockingQueue.hpp +128 -0
  48. data/ext/cppjieba/src/Limonp/BoundedQueue.hpp +73 -0
  49. data/ext/cppjieba/src/Limonp/CastFloat.hpp +90 -0
  50. data/ext/cppjieba/src/Limonp/Condition.hpp +48 -0
  51. data/ext/cppjieba/src/Limonp/Config.hpp +118 -0
  52. data/ext/cppjieba/src/Limonp/HandyMacro.hpp +31 -0
  53. data/ext/cppjieba/src/Limonp/InitOnOff.hpp +21 -0
  54. data/ext/cppjieba/src/Limonp/LocalVector.hpp +171 -0
  55. data/ext/cppjieba/src/Limonp/Logger.hpp +74 -0
  56. data/ext/cppjieba/src/Limonp/Md5.hpp +432 -0
  57. data/ext/cppjieba/src/Limonp/MutexLock.hpp +57 -0
  58. data/ext/cppjieba/src/Limonp/MysqlClient.hpp +125 -0
  59. data/ext/cppjieba/src/Limonp/NonCopyable.hpp +22 -0
  60. data/ext/cppjieba/src/Limonp/StdExtension.hpp +139 -0
  61. data/ext/cppjieba/src/Limonp/StringUtil.hpp +349 -0
  62. data/ext/cppjieba/src/Limonp/Thread.hpp +50 -0
  63. data/ext/cppjieba/src/Limonp/ThreadPool.hpp +105 -0
  64. data/ext/cppjieba/src/MPSegment.hpp +148 -0
  65. data/ext/cppjieba/src/MixSegment.hpp +121 -0
  66. data/ext/cppjieba/src/PosTagger.hpp +109 -0
  67. data/ext/cppjieba/src/QuerySegment.hpp +123 -0
  68. data/ext/cppjieba/src/SegmentBase.hpp +78 -0
  69. data/ext/cppjieba/src/TransCode.hpp +63 -0
  70. data/ext/cppjieba/src/Trie.hpp +298 -0
  71. data/ext/cppjieba/test/CMakeLists.txt +7 -0
  72. data/ext/cppjieba/test/keyword_demo.cpp +16 -0
  73. data/ext/cppjieba/test/load_test.cpp +56 -0
  74. data/ext/cppjieba/test/segment_demo.cpp +59 -0
  75. data/ext/cppjieba/test/servertest/go_load_test.sh +2 -0
  76. data/ext/cppjieba/test/servertest/load_test.py +91 -0
  77. data/ext/cppjieba/test/servertest/run_curl.sh +11 -0
  78. data/ext/cppjieba/test/tagging_demo.cpp +12 -0
  79. data/ext/cppjieba/test/testdata/curl.res +1 -0
  80. data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
  81. data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
  82. data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
  83. data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
  84. data/ext/cppjieba/test/testdata/load_test.urls +2 -0
  85. data/ext/cppjieba/test/testdata/review.100 +100 -0
  86. data/ext/cppjieba/test/testdata/review.100.res +200 -0
  87. data/ext/cppjieba/test/testdata/server.conf +13 -0
  88. data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
  89. data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
  90. data/ext/cppjieba/test/testdata/userdict.utf8 +6 -0
  91. data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
  92. data/ext/cppjieba/test/unittest/CMakeLists.txt +28 -0
  93. data/ext/cppjieba/test/unittest/TKeywordExtractor.cpp +18 -0
  94. data/ext/cppjieba/test/unittest/TPosTagger.cpp +43 -0
  95. data/ext/cppjieba/test/unittest/TSegments.cpp +187 -0
  96. data/ext/cppjieba/test/unittest/TTrie.cpp +80 -0
  97. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-death-test.h +283 -0
  98. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-message.h +230 -0
  99. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-param-test.h +1421 -0
  100. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-param-test.h.pump +487 -0
  101. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-printers.h +796 -0
  102. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-spi.h +232 -0
  103. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-test-part.h +176 -0
  104. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-typed-test.h +259 -0
  105. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest.h +2155 -0
  106. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest_pred_impl.h +358 -0
  107. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest_prod.h +58 -0
  108. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-death-test-internal.h +308 -0
  109. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-filepath.h +210 -0
  110. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-internal.h +1226 -0
  111. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-linked_ptr.h +233 -0
  112. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util-generated.h +4822 -0
  113. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
  114. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util.h +619 -0
  115. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-port.h +1788 -0
  116. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-string.h +350 -0
  117. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-tuple.h +968 -0
  118. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-tuple.h.pump +336 -0
  119. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-type-util.h +3330 -0
  120. data/ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-type-util.h.pump +296 -0
  121. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/.dirstamp +0 -0
  122. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/gtest-all.Plo +681 -0
  123. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/gtest_main.Plo +509 -0
  124. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/.dirstamp +0 -0
  125. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-all.cc +48 -0
  126. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-death-test.cc +1234 -0
  127. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-filepath.cc +380 -0
  128. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-internal-inl.h +1038 -0
  129. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-port.cc +746 -0
  130. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-printers.cc +356 -0
  131. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-test-part.cc +110 -0
  132. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-typed-test.cc +110 -0
  133. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest.cc +4898 -0
  134. data/ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest_main.cc +39 -0
  135. data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
  136. data/ext/jieba/extconf.rb +26 -0
  137. data/ext/jieba/jieba.c +9 -0
  138. data/ext/jieba/jieba.h +9 -0
  139. data/ext/jieba/segment.cc +88 -0
  140. data/ext/jieba/segment.h +17 -0
  141. data/jieba_rb.gemspec +51 -0
  142. data/lib/jieba_rb/version.rb +3 -0
  143. data/lib/jieba_rb.rb +28 -0
  144. data/test/test_segment.rb +32 -0
  145. metadata +246 -0
@@ -0,0 +1,39 @@
1
+ // Copyright 2006, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ #include <iostream>
31
+
32
+ #include "gtest/gtest.h"
33
+
34
+ GTEST_API_ int main(int argc, char **argv) {
35
+ std::cout << "Running main() from gtest_main.cc\n";
36
+
37
+ testing::InitGoogleTest(&argc, argv);
38
+ return RUN_ALL_TESTS();
39
+ }
@@ -0,0 +1,39 @@
1
+ // Copyright 2006, Google Inc.
2
+ // All rights reserved.
3
+ //
4
+ // Redistribution and use in source and binary forms, with or without
5
+ // modification, are permitted provided that the following conditions are
6
+ // met:
7
+ //
8
+ // * Redistributions of source code must retain the above copyright
9
+ // notice, this list of conditions and the following disclaimer.
10
+ // * Redistributions in binary form must reproduce the above
11
+ // copyright notice, this list of conditions and the following disclaimer
12
+ // in the documentation and/or other materials provided with the
13
+ // distribution.
14
+ // * Neither the name of Google Inc. nor the names of its
15
+ // contributors may be used to endorse or promote products derived from
16
+ // this software without specific prior written permission.
17
+ //
18
+ // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
+ // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
+ // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
+ // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
+ // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
+ // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
+ // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
+ // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
+ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
+ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ #include <iostream>
31
+
32
+ #include "gtest/gtest.h"
33
+
34
+ GTEST_API_ int main(int argc, char **argv) {
35
+ std::cout << "Running main() from gtest_main.cc\n";
36
+
37
+ testing::InitGoogleTest(&argc, argv);
38
+ return RUN_ALL_TESTS();
39
+ }
@@ -0,0 +1,26 @@
1
# Build configuration for the `jieba` native extension: points the compiler
# at the bundled cppjieba headers, adds the C++ flags, writes the Makefile,
# and finally appends a rule so objects are rebuilt when a local header
# changes.
require "mkmf"

# Directory that contains this extconf.rb (and the extension's own headers).
ext_dir = File.expand_path File.dirname(__FILE__)

LIBDIR     = RbConfig::CONFIG['libdir']
INCLUDEDIR = RbConfig::CONFIG['includedir']

HEADER_DIRS = [
  INCLUDEDIR,
  "#{ext_dir}/../cppjieba/src"
]

LIB_DIRS = [
  LIBDIR
]

dir_config('cppjieba_src', HEADER_DIRS, LIB_DIRS)

# The flags are applied to both CONFIG and $CXXFLAGS, exactly as before
# (presumably because different mkmf versions read one or the other --
# TODO confirm). Interpolation is used instead of `+=` so a missing (nil)
# CONFIG entry does not raise NoMethodError.
extra_cxxflags = " -std=c++0x -O3"
CONFIG["CXXFLAGS"] = "#{CONFIG['CXXFLAGS']}#{extra_cxxflags}"
$CXXFLAGS = "#{$CXXFLAGS}#{extra_cxxflags}"

create_makefile 'jieba'

# respect header changes
# The headers are looked up next to this script rather than in the current
# working directory: when the build runs elsewhere (e.g. a separate build
# directory) a cwd-relative glob finds nothing and the rule would be empty.
# They are referenced through $(srcdir) so make can resolve them from the
# build directory.
headers = Dir.entries(ext_dir)
             .grep(/\.(?:hpp|h)\z/)
             .sort
             .map { |name| "$(srcdir)/#{name}" }
             .join(' ')
File.open 'Makefile', 'a' do |f|
  f.puts "\n$(OBJS): #{headers}"
end
data/ext/jieba/jieba.c ADDED
@@ -0,0 +1,9 @@
1
+ #include <jieba.h>
2
+ VALUE mJieba;
3
+
4
+ void Init_jieba()
5
+ {
6
+ mJieba = rb_define_module("JiebaRb");
7
+
8
+ Init_segment();
9
+ }
data/ext/jieba/jieba.h ADDED
@@ -0,0 +1,9 @@
1
+ #ifndef RUBY_JIEBA
2
+ #define RUBY_JIEBA
3
+
4
+ #include <ruby.h>
5
+ #include <segment.h>
6
+
7
+ extern VALUE mJieba;
8
+
9
+ #endif
@@ -0,0 +1,88 @@
1
+ #include "segment.h"
2
+ #include <ruby/encoding.h>
3
+ #include <MPSegment.hpp>
4
+ #include <HMMSegment.hpp>
5
+ #include <MixSegment.hpp>
6
+
7
+ static rb_encoding* u8_enc;
8
+
9
+ struct SegWrapper{
10
+ CppJieba::ISegment * segp;
11
+ };
12
+ static void seg_free(void *p){
13
+ delete ((SegWrapper*) p) -> segp;
14
+ delete (SegWrapper*)p;
15
+ }
16
+
17
+ static VALUE allocate(VALUE klass)
18
+ {
19
+ SegWrapper* seg_wrapper = new SegWrapper();
20
+ return Data_Wrap_Struct(klass, NULL, seg_free, seg_wrapper);
21
+ }
22
+
23
+ static void seg_init(VALUE self,
24
+ VALUE type_rb_sym,
25
+ VALUE jieba_dict_rbs,
26
+ VALUE hmm_dict_rbs,
27
+ VALUE user_dict_rbs)
28
+ {
29
+ SegWrapper* seg_wrapper;
30
+ Data_Get_Struct(self, SegWrapper, seg_wrapper);
31
+
32
+ Check_Type(jieba_dict_rbs, T_STRING);
33
+ Check_Type(hmm_dict_rbs, T_STRING);
34
+ Check_Type(user_dict_rbs, T_STRING);
35
+
36
+ std::string jieba_dict = StringValueCStr(jieba_dict_rbs);
37
+ std::string hmm_dict = StringValueCStr(hmm_dict_rbs);
38
+ std::string user_dict = StringValueCStr(user_dict_rbs);
39
+
40
+ ID type = SYM2ID(type_rb_sym);
41
+ if ( type == rb_intern("mix") )
42
+ {
43
+ seg_wrapper->segp = new CppJieba::MixSegment(jieba_dict, hmm_dict, user_dict);
44
+ }
45
+ else if ( type == rb_intern("hmm") )
46
+ {
47
+ seg_wrapper->segp = new CppJieba::HMMSegment(hmm_dict);
48
+ }
49
+ else if ( type == rb_intern("mp"))
50
+ {
51
+ seg_wrapper->segp = new CppJieba::MPSegment(jieba_dict);
52
+ }
53
+ }
54
+
55
+ static VALUE seg_cut(VALUE self, VALUE text_rbs)
56
+ {
57
+ Check_Type(text_rbs, T_STRING);
58
+ std::string text = StringValueCStr(text_rbs);
59
+
60
+ SegWrapper* seg_wrapper;
61
+ Data_Get_Struct(self, SegWrapper, seg_wrapper);
62
+
63
+ std::vector<std::string> words;
64
+ seg_wrapper->segp->cut(text, words);
65
+
66
+ volatile VALUE arr = rb_ary_new();
67
+ for (std::vector<std::string>::const_iterator j = words.begin(); j != words.end(); j++)
68
+ {
69
+
70
+ rb_ary_push(arr, rb_enc_str_new((*j).c_str(), (*j).length(), u8_enc));
71
+
72
+ }
73
+ return arr;
74
+ }
75
+
76
+ #define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
77
+
78
+ extern "C" {
79
+ void Init_segment()
80
+ {
81
+ VALUE cSegment = rb_define_class_under(mJieba, "Segment", rb_cObject);
82
+ u8_enc = rb_utf8_encoding();
83
+ rb_define_alloc_func(cSegment, allocate);
84
+ DEF(cSegment, "_init",seg_init,4);
85
+ DEF(cSegment, "cut",seg_cut,1);
86
+ }
87
+
88
+ }
@@ -0,0 +1,17 @@
1
+ #ifndef RUBY_JIEBA_SEGMENT
2
+ #define RUBY_JIEBA_SEGMENT
3
+
4
+ #include <jieba.h>
5
+
6
+ #ifdef __cplusplus
7
+ extern "C" {
8
+ #endif
9
+
10
+ void Init_segment();
11
+
12
+ #ifdef __cplusplus
13
+ }
14
+ #endif
15
+
16
+
17
+ #endif
data/jieba_rb.gemspec ADDED
@@ -0,0 +1,51 @@
1
# coding: utf-8
# Gem specification for jieba_rb. Besides the usual metadata it adds the
# files tracked inside every git submodule to the package, because
# `git ls-files` in the parent repository does not list them.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'jieba_rb/version'

Gem::Specification.new do |spec|
  spec.name          = "jieba_rb"
  spec.version       = JiebaRb::VERSION
  spec.authors       = ["Chris Li"]
  spec.email         = ["liqi8822@gmail.com"]
  spec.summary       = "cppjieba binding for ruby"
  spec.description   = "cppjieba binding for ruby"
  spec.homepage      = "https://github.com/altkatz/jieba_rb"
  spec.required_ruby_version = ">=1.9.2"
  spec.license       = "MIT"
  spec.extensions    = ["ext/jieba/extconf.rb"]

  spec.files         = `git ls-files -z`.split("\x0")
  relative_path = File.expand_path("../", __FILE__) + '/'

  # One absolute submodule path per output line. Splitting by line (not on
  # `$\`, which is nil by default and therefore splits on any whitespace)
  # keeps paths that contain spaces intact.
  submodule_dirs = `git submodule --quiet foreach pwd`.each_line.map(&:chomp).reject(&:empty?)
  submodule_dirs.each do |submodule_path|
    if (ENV['OS'] == 'Windows_NT') && submodule_path[0] == '/'
      # Detect if cygwin path is being used by git
      submodule_path = submodule_path[1..-1]
      submodule_path.insert(1, ':')
    end
    # for each submodule, change working directory to that submodule
    Dir.chdir(submodule_path) do
      # Make the submodule path relative. The repository root is escaped and
      # anchored so regex metacharacters in the directory name are matched
      # literally and only the leading prefix is removed.
      relative_submodule = submodule_path.sub(/\A#{Regexp.escape(relative_path)}/i, '')
      # issue git ls-files in submodule's directory; -z yields NUL-separated,
      # unquoted names, matching how the top-level file list is read
      submodule_files = `git ls-files -z`.split("\x0")

      # prepend the submodule path to create relative file paths
      submodule_files_paths = submodule_files.map do |filename|
        File.join(relative_submodule, filename)
      end

      # add relative paths to gem.files
      spec.files += submodule_files_paths
    end
  end

  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.5"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rake-compiler"
  spec.add_development_dependency "minitest"
end
@@ -0,0 +1,3 @@
1
module JiebaRb
  # Version string of the gem; the gemspec reads it as spec.version.
  VERSION = "0.0.1"
end
data/lib/jieba_rb.rb ADDED
@@ -0,0 +1,28 @@
1
+ require "jieba_rb/version"
2
+ require "jieba"
3
module JiebaRb
  # Ruby-side half of Segment. It supplies default dictionary locations and
  # an option-hash constructor on top of `_init`, which is made private here.
  class Segment
    lib_dir = File.expand_path(File.dirname(__FILE__))

    # Dictionaries that ship with the bundled cppjieba sources.
    EXT_BASE        = "#{lib_dir}/../ext/cppjieba/"
    JIEBA_DICT_FILE = "#{EXT_BASE}dict/jieba.dict.utf8"
    HMM_DICT_FILE   = "#{EXT_BASE}dict/hmm_model.utf8"
    USER_DICT_FILE  = "#{EXT_BASE}dict/user.dict.utf8"

    private :_init

    # Creates a segmenter.
    #
    # opts - Hash of options:
    #        :type       - :mix, :hmm or :mp (:mix when absent).
    #        :jieba_dict - main dictionary path (default JIEBA_DICT_FILE).
    #        :hmm_dict   - HMM model path (default HMM_DICT_FILE).
    #        :user_dict  - user dictionary path, or :default to use
    #                      USER_DICT_FILE; "" when absent.
    #
    # Raises RuntimeError when :type is given but is not a known type.
    def initialize(opts = {})
      type = opts[:type]
      if type
        known_types = [:mix, :hmm, :mp]
        unless known_types.include?(type)
          raise "Type must be one of :mix :hmm :mp"
        end
      else
        type = :mix # default
      end

      requested_user_dict = opts[:user_dict]
      user_dict =
        if requested_user_dict == :default
          USER_DICT_FILE
        else
          requested_user_dict || ""
        end

      _init(type,
            opts[:jieba_dict] || JIEBA_DICT_FILE,
            opts[:hmm_dict] || HMM_DICT_FILE,
            user_dict)
    end
  end
end
@@ -0,0 +1,32 @@
1
+ # coding: utf-8
2
+ require 'minitest/autorun'
3
+ require 'jieba_rb'
4
# Checks the word lists produced by each segmenter type on fixed sentences.
class JiebaTest < Minitest::Test
  # Default (mix) segmenter, no user dictionary.
  def test_mix_segment
    segmenter = JiebaRb::Segment.new

    assert_equal ["我", "来到", "南京市", "长江大桥"],
                 segmenter.cut("我来到南京市长江大桥")

    assert_equal ["令狐冲", "是", "云", "计算", "行业", "的", "专家"],
                 segmenter.cut("令狐冲是云计算行业的专家")
  end

  # Mix segmenter with the bundled user dictionary enabled.
  def test_mix_segment_with_user_dict
    segmenter = JiebaRb::Segment.new(user_dict: :default)

    assert_equal ["令狐冲", "是", "云计算", "行业", "的", "专家"],
                 segmenter.cut("令狐冲是云计算行业的专家")
  end

  # HMM-only segmenter.
  def test_hmm_segment
    segmenter = JiebaRb::Segment.new(type: :hmm)

    assert_equal ["令狐冲", "是", "云计算", "行业", "的", "专家"],
                 segmenter.cut("令狐冲是云计算行业的专家")
  end

  # Max-probability segmenter.
  def test_max_prob_segment
    segmenter = JiebaRb::Segment.new(type: :mp)

    assert_equal ["令狐冲", "是", "云", "计算", "行业", "的", "专家"],
                 segmenter.cut("令狐冲是云计算行业的专家")
  end
end
metadata ADDED
@@ -0,0 +1,246 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: jieba_rb
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Chris Li
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-12-21 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.5'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.5'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake-compiler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description: cppjieba binding for ruby
70
+ email:
71
+ - liqi8822@gmail.com
72
+ executables: []
73
+ extensions:
74
+ - ext/jieba/extconf.rb
75
+ extra_rdoc_files: []
76
+ files:
77
+ - .gitignore
78
+ - .gitmodules
79
+ - .travis.yml
80
+ - Gemfile
81
+ - LICENSE.txt
82
+ - README.md
83
+ - Rakefile
84
+ - ext/jieba/extconf.rb
85
+ - ext/jieba/jieba.c
86
+ - ext/jieba/jieba.h
87
+ - ext/jieba/segment.cc
88
+ - ext/jieba/segment.h
89
+ - jieba_rb.gemspec
90
+ - lib/jieba_rb.rb
91
+ - lib/jieba_rb/version.rb
92
+ - test/test_segment.rb
93
+ - ext/cppjieba/.gitignore
94
+ - ext/cppjieba/.travis.yml
95
+ - ext/cppjieba/CMakeLists.txt
96
+ - ext/cppjieba/ChangeLog.md
97
+ - ext/cppjieba/Dockerfile
98
+ - ext/cppjieba/LICENSE
99
+ - ext/cppjieba/README.md
100
+ - ext/cppjieba/conf/CMakeLists.txt
101
+ - ext/cppjieba/conf/server.conf
102
+ - ext/cppjieba/dict/CMakeLists.txt
103
+ - ext/cppjieba/dict/README.md
104
+ - ext/cppjieba/dict/extra_dict/jieba.dict.small.utf8
105
+ - ext/cppjieba/dict/gbk_dict/hmm_model.gbk
106
+ - ext/cppjieba/dict/gbk_dict/jieba.dict.gbk
107
+ - ext/cppjieba/dict/hmm_model.utf8
108
+ - ext/cppjieba/dict/idf.utf8
109
+ - ext/cppjieba/dict/jieba.dict.utf8
110
+ - ext/cppjieba/dict/pos_dict/char_state_tab.utf8
111
+ - ext/cppjieba/dict/pos_dict/prob_emit.utf8
112
+ - ext/cppjieba/dict/pos_dict/prob_start.utf8
113
+ - ext/cppjieba/dict/pos_dict/prob_trans.utf8
114
+ - ext/cppjieba/dict/stop_words.utf8
115
+ - ext/cppjieba/dict/user.dict.utf8
116
+ - ext/cppjieba/script/CMakeLists.txt
117
+ - ext/cppjieba/script/cjserver.start
118
+ - ext/cppjieba/script/cjserver.stop
119
+ - ext/cppjieba/server/CMakeLists.txt
120
+ - ext/cppjieba/server/Husky/HttpReqInfo.hpp
121
+ - ext/cppjieba/server/Husky/IRequestHandler.hpp
122
+ - ext/cppjieba/server/Husky/ThreadPoolServer.hpp
123
+ - ext/cppjieba/server/Husky/WorkerThread.hpp
124
+ - ext/cppjieba/server/server.cpp
125
+ - ext/cppjieba/src/DictTrie.hpp
126
+ - ext/cppjieba/src/FullSegment.hpp
127
+ - ext/cppjieba/src/HMMSegment.hpp
128
+ - ext/cppjieba/src/ISegment.hpp
129
+ - ext/cppjieba/src/KeywordExtractor.hpp
130
+ - ext/cppjieba/src/Limonp/ArgvContext.hpp
131
+ - ext/cppjieba/src/Limonp/BlockingQueue.hpp
132
+ - ext/cppjieba/src/Limonp/BoundedQueue.hpp
133
+ - ext/cppjieba/src/Limonp/CastFloat.hpp
134
+ - ext/cppjieba/src/Limonp/Condition.hpp
135
+ - ext/cppjieba/src/Limonp/Config.hpp
136
+ - ext/cppjieba/src/Limonp/HandyMacro.hpp
137
+ - ext/cppjieba/src/Limonp/InitOnOff.hpp
138
+ - ext/cppjieba/src/Limonp/LocalVector.hpp
139
+ - ext/cppjieba/src/Limonp/Logger.hpp
140
+ - ext/cppjieba/src/Limonp/Md5.hpp
141
+ - ext/cppjieba/src/Limonp/MutexLock.hpp
142
+ - ext/cppjieba/src/Limonp/MysqlClient.hpp
143
+ - ext/cppjieba/src/Limonp/NonCopyable.hpp
144
+ - ext/cppjieba/src/Limonp/StdExtension.hpp
145
+ - ext/cppjieba/src/Limonp/StringUtil.hpp
146
+ - ext/cppjieba/src/Limonp/Thread.hpp
147
+ - ext/cppjieba/src/Limonp/ThreadPool.hpp
148
+ - ext/cppjieba/src/MPSegment.hpp
149
+ - ext/cppjieba/src/MixSegment.hpp
150
+ - ext/cppjieba/src/PosTagger.hpp
151
+ - ext/cppjieba/src/QuerySegment.hpp
152
+ - ext/cppjieba/src/SegmentBase.hpp
153
+ - ext/cppjieba/src/TransCode.hpp
154
+ - ext/cppjieba/src/Trie.hpp
155
+ - ext/cppjieba/test/CMakeLists.txt
156
+ - ext/cppjieba/test/keyword_demo.cpp
157
+ - ext/cppjieba/test/load_test.cpp
158
+ - ext/cppjieba/test/segment_demo.cpp
159
+ - ext/cppjieba/test/servertest/go_load_test.sh
160
+ - ext/cppjieba/test/servertest/load_test.py
161
+ - ext/cppjieba/test/servertest/run_curl.sh
162
+ - ext/cppjieba/test/tagging_demo.cpp
163
+ - ext/cppjieba/test/testdata/curl.res
164
+ - ext/cppjieba/test/testdata/jieba.dict.0.1.utf8
165
+ - ext/cppjieba/test/testdata/jieba.dict.0.utf8
166
+ - ext/cppjieba/test/testdata/jieba.dict.1.utf8
167
+ - ext/cppjieba/test/testdata/jieba.dict.2.utf8
168
+ - ext/cppjieba/test/testdata/load_test.urls
169
+ - ext/cppjieba/test/testdata/review.100
170
+ - ext/cppjieba/test/testdata/review.100.res
171
+ - ext/cppjieba/test/testdata/server.conf
172
+ - ext/cppjieba/test/testdata/testlines.gbk
173
+ - ext/cppjieba/test/testdata/testlines.utf8
174
+ - ext/cppjieba/test/testdata/userdict.utf8
175
+ - ext/cppjieba/test/testdata/weicheng.utf8
176
+ - ext/cppjieba/test/unittest/CMakeLists.txt
177
+ - ext/cppjieba/test/unittest/TKeywordExtractor.cpp
178
+ - ext/cppjieba/test/unittest/TPosTagger.cpp
179
+ - ext/cppjieba/test/unittest/TSegments.cpp
180
+ - ext/cppjieba/test/unittest/TTrie.cpp
181
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-death-test.h
182
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-message.h
183
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-param-test.h
184
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-param-test.h.pump
185
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-printers.h
186
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-spi.h
187
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-test-part.h
188
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest-typed-test.h
189
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest.h
190
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest_pred_impl.h
191
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/gtest_prod.h
192
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-death-test-internal.h
193
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-filepath.h
194
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-internal.h
195
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-linked_ptr.h
196
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util-generated.h
197
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util-generated.h.pump
198
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-param-util.h
199
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-port.h
200
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-string.h
201
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-tuple.h
202
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-tuple.h.pump
203
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-type-util.h
204
+ - ext/cppjieba/test/unittest/gtest-1.6.0/include/gtest/internal/gtest-type-util.h.pump
205
+ - ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/.dirstamp
206
+ - ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/gtest-all.Plo
207
+ - ext/cppjieba/test/unittest/gtest-1.6.0/src/.deps/gtest_main.Plo
208
+ - ext/cppjieba/test/unittest/gtest-1.6.0/src/.dirstamp
209
+ - ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-all.cc
210
+ - ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-death-test.cc
211
+ - ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-filepath.cc
212
+ - ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-internal-inl.h
213
+ - ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-port.cc
214
+ - ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-printers.cc
215
+ - ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-test-part.cc
216
+ - ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest-typed-test.cc
217
+ - ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest.cc
218
+ - ext/cppjieba/test/unittest/gtest-1.6.0/src/gtest_main.cc
219
+ - ext/cppjieba/test/unittest/gtest_main.cpp
220
+ homepage: https://github.com/altkatz/jieba_rb
221
+ licenses:
222
+ - MIT
223
+ metadata: {}
224
+ post_install_message:
225
+ rdoc_options: []
226
+ require_paths:
227
+ - lib
228
+ required_ruby_version: !ruby/object:Gem::Requirement
229
+ requirements:
230
+ - - '>='
231
+ - !ruby/object:Gem::Version
232
+ version: 1.9.2
233
+ required_rubygems_version: !ruby/object:Gem::Requirement
234
+ requirements:
235
+ - - '>='
236
+ - !ruby/object:Gem::Version
237
+ version: '0'
238
+ requirements: []
239
+ rubyforge_project:
240
+ rubygems_version: 2.1.11
241
+ signing_key:
242
+ specification_version: 4
243
+ summary: cppjieba binding for ruby
244
+ test_files:
245
+ - test/test_segment.rb
246
+ has_rdoc: