cppjieba_rb 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.gitmodules +3 -0
- data/.travis.yml +26 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +81 -0
- data/Rakefile +20 -0
- data/cppjieba_rb.gemspec +50 -0
- data/ext/cppjieba/.gitignore +17 -0
- data/ext/cppjieba/.travis.yml +22 -0
- data/ext/cppjieba/CMakeLists.txt +28 -0
- data/ext/cppjieba/ChangeLog.md +236 -0
- data/ext/cppjieba/README.md +285 -0
- data/ext/cppjieba/README_EN.md +111 -0
- data/ext/cppjieba/appveyor.yml +32 -0
- data/ext/cppjieba/deps/CMakeLists.txt +1 -0
- data/ext/cppjieba/deps/gtest/CMakeLists.txt +5 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-death-test.h +283 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-message.h +230 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h +1421 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-param-test.h.pump +487 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-printers.h +796 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-spi.h +232 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-test-part.h +176 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest-typed-test.h +259 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest.h +2155 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_pred_impl.h +358 -0
- data/ext/cppjieba/deps/gtest/include/gtest/gtest_prod.h +58 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-death-test-internal.h +308 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-filepath.h +210 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-internal.h +1226 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-linked_ptr.h +233 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h +4822 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util-generated.h.pump +301 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-param-util.h +619 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-port.h +1788 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-string.h +350 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h +968 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-tuple.h.pump +336 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h +3330 -0
- data/ext/cppjieba/deps/gtest/include/gtest/internal/gtest-type-util.h.pump +296 -0
- data/ext/cppjieba/deps/gtest/src/.deps/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest-all.Plo +681 -0
- data/ext/cppjieba/deps/gtest/src/.deps/gtest_main.Plo +509 -0
- data/ext/cppjieba/deps/gtest/src/.dirstamp +0 -0
- data/ext/cppjieba/deps/gtest/src/gtest-all.cc +48 -0
- data/ext/cppjieba/deps/gtest/src/gtest-death-test.cc +1234 -0
- data/ext/cppjieba/deps/gtest/src/gtest-filepath.cc +380 -0
- data/ext/cppjieba/deps/gtest/src/gtest-internal-inl.h +1038 -0
- data/ext/cppjieba/deps/gtest/src/gtest-port.cc +746 -0
- data/ext/cppjieba/deps/gtest/src/gtest-printers.cc +356 -0
- data/ext/cppjieba/deps/gtest/src/gtest-test-part.cc +110 -0
- data/ext/cppjieba/deps/gtest/src/gtest-typed-test.cc +110 -0
- data/ext/cppjieba/deps/gtest/src/gtest.cc +4898 -0
- data/ext/cppjieba/deps/gtest/src/gtest_main.cc +39 -0
- data/ext/cppjieba/deps/limonp/ArgvContext.hpp +70 -0
- data/ext/cppjieba/deps/limonp/BlockingQueue.hpp +49 -0
- data/ext/cppjieba/deps/limonp/BoundedBlockingQueue.hpp +67 -0
- data/ext/cppjieba/deps/limonp/BoundedQueue.hpp +65 -0
- data/ext/cppjieba/deps/limonp/Closure.hpp +206 -0
- data/ext/cppjieba/deps/limonp/Colors.hpp +31 -0
- data/ext/cppjieba/deps/limonp/Condition.hpp +38 -0
- data/ext/cppjieba/deps/limonp/Config.hpp +103 -0
- data/ext/cppjieba/deps/limonp/FileLock.hpp +74 -0
- data/ext/cppjieba/deps/limonp/ForcePublic.hpp +7 -0
- data/ext/cppjieba/deps/limonp/LocalVector.hpp +139 -0
- data/ext/cppjieba/deps/limonp/Logging.hpp +76 -0
- data/ext/cppjieba/deps/limonp/Md5.hpp +411 -0
- data/ext/cppjieba/deps/limonp/MutexLock.hpp +51 -0
- data/ext/cppjieba/deps/limonp/NonCopyable.hpp +21 -0
- data/ext/cppjieba/deps/limonp/StdExtension.hpp +159 -0
- data/ext/cppjieba/deps/limonp/StringUtil.hpp +365 -0
- data/ext/cppjieba/deps/limonp/Thread.hpp +44 -0
- data/ext/cppjieba/deps/limonp/ThreadPool.hpp +86 -0
- data/ext/cppjieba/dict/README.md +31 -0
- data/ext/cppjieba/dict/hmm_model.utf8 +34 -0
- data/ext/cppjieba/dict/idf.utf8 +258826 -0
- data/ext/cppjieba/dict/jieba.dict.utf8 +348982 -0
- data/ext/cppjieba/dict/pos_dict/char_state_tab.utf8 +6653 -0
- data/ext/cppjieba/dict/pos_dict/prob_emit.utf8 +166 -0
- data/ext/cppjieba/dict/pos_dict/prob_start.utf8 +259 -0
- data/ext/cppjieba/dict/pos_dict/prob_trans.utf8 +5222 -0
- data/ext/cppjieba/dict/stop_words.utf8 +1534 -0
- data/ext/cppjieba/dict/user.dict.utf8 +4 -0
- data/ext/cppjieba/include/cppjieba/DictTrie.hpp +227 -0
- data/ext/cppjieba/include/cppjieba/FullSegment.hpp +93 -0
- data/ext/cppjieba/include/cppjieba/HMMModel.hpp +129 -0
- data/ext/cppjieba/include/cppjieba/HMMSegment.hpp +190 -0
- data/ext/cppjieba/include/cppjieba/Jieba.hpp +108 -0
- data/ext/cppjieba/include/cppjieba/KeywordExtractor.hpp +153 -0
- data/ext/cppjieba/include/cppjieba/MPSegment.hpp +137 -0
- data/ext/cppjieba/include/cppjieba/MixSegment.hpp +109 -0
- data/ext/cppjieba/include/cppjieba/PosTagger.hpp +77 -0
- data/ext/cppjieba/include/cppjieba/PreFilter.hpp +54 -0
- data/ext/cppjieba/include/cppjieba/QuerySegment.hpp +90 -0
- data/ext/cppjieba/include/cppjieba/SegmentBase.hpp +46 -0
- data/ext/cppjieba/include/cppjieba/SegmentTagged.hpp +23 -0
- data/ext/cppjieba/include/cppjieba/TextRankExtractor.hpp +190 -0
- data/ext/cppjieba/include/cppjieba/Trie.hpp +174 -0
- data/ext/cppjieba/include/cppjieba/Unicode.hpp +215 -0
- data/ext/cppjieba/test/CMakeLists.txt +5 -0
- data/ext/cppjieba/test/demo.cpp +80 -0
- data/ext/cppjieba/test/load_test.cpp +54 -0
- data/ext/cppjieba/test/testdata/curl.res +1 -0
- data/ext/cppjieba/test/testdata/extra_dict/jieba.dict.small.utf8 +109750 -0
- data/ext/cppjieba/test/testdata/gbk_dict/hmm_model.gbk +34 -0
- data/ext/cppjieba/test/testdata/gbk_dict/jieba.dict.gbk +348982 -0
- data/ext/cppjieba/test/testdata/jieba.dict.0.1.utf8 +93 -0
- data/ext/cppjieba/test/testdata/jieba.dict.0.utf8 +93 -0
- data/ext/cppjieba/test/testdata/jieba.dict.1.utf8 +67 -0
- data/ext/cppjieba/test/testdata/jieba.dict.2.utf8 +64 -0
- data/ext/cppjieba/test/testdata/load_test.urls +2 -0
- data/ext/cppjieba/test/testdata/review.100 +100 -0
- data/ext/cppjieba/test/testdata/review.100.res +200 -0
- data/ext/cppjieba/test/testdata/server.conf +19 -0
- data/ext/cppjieba/test/testdata/testlines.gbk +9 -0
- data/ext/cppjieba/test/testdata/testlines.utf8 +8 -0
- data/ext/cppjieba/test/testdata/userdict.2.utf8 +1 -0
- data/ext/cppjieba/test/testdata/userdict.english +2 -0
- data/ext/cppjieba/test/testdata/userdict.utf8 +8 -0
- data/ext/cppjieba/test/testdata/weicheng.utf8 +247 -0
- data/ext/cppjieba/test/unittest/CMakeLists.txt +24 -0
- data/ext/cppjieba/test/unittest/gtest_main.cpp +39 -0
- data/ext/cppjieba/test/unittest/jieba_test.cpp +133 -0
- data/ext/cppjieba/test/unittest/keyword_extractor_test.cpp +79 -0
- data/ext/cppjieba/test/unittest/pos_tagger_test.cpp +41 -0
- data/ext/cppjieba/test/unittest/pre_filter_test.cpp +43 -0
- data/ext/cppjieba/test/unittest/segments_test.cpp +256 -0
- data/ext/cppjieba/test/unittest/textrank_test.cpp +86 -0
- data/ext/cppjieba/test/unittest/trie_test.cpp +177 -0
- data/ext/cppjieba/test/unittest/unicode_test.cpp +43 -0
- data/ext/cppjieba_rb/cppjieba_rb.c +10 -0
- data/ext/cppjieba_rb/extconf.rb +26 -0
- data/ext/cppjieba_rb/internal.cc +148 -0
- data/lib/cppjieba_rb/segment.rb +20 -0
- data/lib/cppjieba_rb/version.rb +3 -0
- data/lib/cppjieba_rb.rb +34 -0
- data/test/test_keyword.rb +17 -0
- data/test/test_segment.rb +24 -0
- data/test/test_tagging.rb +19 -0
- metadata +244 -0
@@ -0,0 +1,86 @@
|
|
1
|
+
#ifndef LIMONP_THREAD_POOL_HPP
|
2
|
+
#define LIMONP_THREAD_POOL_HPP
|
3
|
+
|
4
|
+
#include "Thread.hpp"
|
5
|
+
#include "BlockingQueue.hpp"
|
6
|
+
#include "BoundedBlockingQueue.hpp"
|
7
|
+
#include "Closure.hpp"
|
8
|
+
|
9
|
+
namespace limonp {
|
10
|
+
|
11
|
+
using namespace std;
|
12
|
+
|
13
|
+
//class ThreadPool;
|
14
|
+
class ThreadPool: NonCopyable {
|
15
|
+
public:
|
16
|
+
class Worker: public IThread {
|
17
|
+
public:
|
18
|
+
Worker(ThreadPool* pool): ptThreadPool_(pool) {
|
19
|
+
assert(ptThreadPool_);
|
20
|
+
}
|
21
|
+
virtual ~Worker() {
|
22
|
+
}
|
23
|
+
|
24
|
+
virtual void Run() {
|
25
|
+
while (true) {
|
26
|
+
ClosureInterface* closure = ptThreadPool_->queue_.Pop();
|
27
|
+
if (closure == NULL) {
|
28
|
+
break;
|
29
|
+
}
|
30
|
+
try {
|
31
|
+
closure->Run();
|
32
|
+
} catch(std::exception& e) {
|
33
|
+
XLOG(ERROR) << e.what();
|
34
|
+
} catch(...) {
|
35
|
+
XLOG(ERROR) << " unknown exception.";
|
36
|
+
}
|
37
|
+
delete closure;
|
38
|
+
}
|
39
|
+
}
|
40
|
+
private:
|
41
|
+
ThreadPool * ptThreadPool_;
|
42
|
+
}; // class Worker
|
43
|
+
|
44
|
+
ThreadPool(size_t thread_num)
|
45
|
+
: threads_(thread_num),
|
46
|
+
queue_(thread_num) {
|
47
|
+
assert(thread_num);
|
48
|
+
for(size_t i = 0; i < threads_.size(); i ++) {
|
49
|
+
threads_[i] = new Worker(this);
|
50
|
+
}
|
51
|
+
}
|
52
|
+
~ThreadPool() {
|
53
|
+
Stop();
|
54
|
+
}
|
55
|
+
|
56
|
+
void Start() {
|
57
|
+
for(size_t i = 0; i < threads_.size(); i++) {
|
58
|
+
threads_[i]->Start();
|
59
|
+
}
|
60
|
+
}
|
61
|
+
void Stop() {
|
62
|
+
for(size_t i = 0; i < threads_.size(); i ++) {
|
63
|
+
queue_.Push(NULL);
|
64
|
+
}
|
65
|
+
for(size_t i = 0; i < threads_.size(); i ++) {
|
66
|
+
threads_[i]->Join();
|
67
|
+
delete threads_[i];
|
68
|
+
}
|
69
|
+
threads_.clear();
|
70
|
+
}
|
71
|
+
|
72
|
+
void Add(ClosureInterface* task) {
|
73
|
+
assert(task);
|
74
|
+
queue_.Push(task);
|
75
|
+
}
|
76
|
+
|
77
|
+
private:
|
78
|
+
friend class Worker;
|
79
|
+
|
80
|
+
vector<IThread*> threads_;
|
81
|
+
BoundedBlockingQueue<ClosureInterface*> queue_;
|
82
|
+
}; // class ThreadPool
|
83
|
+
|
84
|
+
} // namespace limonp
|
85
|
+
|
86
|
+
#endif // LIMONP_THREAD_POOL_HPP
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# CppJieba字典
|
2
|
+
|
3
|
+
文件后缀名代表的是词典的编码方式。
|
4
|
+
比如filename.utf8 是 utf8编码,filename.gbk 是 gbk编码方式。
|
5
|
+
|
6
|
+
|
7
|
+
## 分词
|
8
|
+
|
9
|
+
### jieba.dict.utf8/gbk
|
10
|
+
|
11
|
+
作为最大概率法(MPSegment: Max Probability)分词所使用的词典。
|
12
|
+
|
13
|
+
### hmm_model.utf8/gbk
|
14
|
+
|
15
|
+
作为隐式马尔科夫模型(HMMSegment: Hidden Markov Model)分词所使用的词典。
|
16
|
+
|
17
|
+
__对于MixSegment(混合MPSegment和HMMSegment两者)则同时使用以上两个词典__
|
18
|
+
|
19
|
+
|
20
|
+
## 关键词抽取
|
21
|
+
|
22
|
+
### idf.utf8
|
23
|
+
|
24
|
+
IDF(Inverse Document Frequency)
|
25
|
+
在KeywordExtractor中,使用的是经典的TF-IDF算法,所以需要这么一个词典提供IDF信息。
|
26
|
+
|
27
|
+
### stop_words.utf8
|
28
|
+
|
29
|
+
停用词词典
|
30
|
+
|
31
|
+
|