jieba_rb 0.0.2 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +6 -1
- data/README.md +16 -0
- data/ext/jieba/jieba.c +1 -0
- data/ext/jieba/jieba.h +1 -0
- data/ext/jieba/tagging.cc +75 -0
- data/ext/jieba/tagging.h +17 -0
- data/lib/jieba_rb.rb +12 -1
- data/lib/jieba_rb/version.rb +1 -1
- data/test/test_tagging.rb +17 -0
- metadata +20 -17
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 2aab0ba015d462b0ab6fc1ee344961660e20adeb
|
|
4
|
+
data.tar.gz: d15db1389073584381a83aad79967152949cda7c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b63793a07432aaf9811cb0dc16c4a3e61b3c5930019cd417a0f8e86b310909a48974199bc8c731c0656c75c744bb37ebe734a1515967f6fb29d299a9fa977bdd
|
|
7
|
+
data.tar.gz: 1ec9fde21352676ba4be0e6a2cf99310dfec07c3933e23554e7b473d527dcea19ccc8d435d22328bcf1ced19860b1d9d141671221f1bbb9391961065f74d5761
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
|
@@ -41,6 +41,22 @@ HMM or Max probability (mp) Segment mode:
|
|
|
41
41
|
seg = JiebaRb::Segment.new mode: :hmm # or mode: :mp
|
|
42
42
|
words = seg.cut "令狐冲是云计算行业的专家"
|
|
43
43
|
|
|
44
|
+
## Word tagging Usage
|
|
45
|
+
|
|
46
|
+
Default tagging:
|
|
47
|
+
|
|
48
|
+
require 'jieba_rb'
|
|
49
|
+
tagging = JiebaRb::Tagging.new
|
|
50
|
+
pairs = tagging.tag "我是蓝翔技工拖拉机学院手扶拖拉机专业的。"
|
|
51
|
+
# [{"我"=>"r"}, {"是"=>"v"}, {"蓝翔"=>"x"}, {"技工"=>"n"}, {"拖拉机"=>"n"}, {"学院"=>"n"}, {"手扶拖拉机"=>"n"}, {"专业"=>"n"}, {"的"=>"uj"}, {"。"=>"x"}]
|
|
52
|
+
|
|
53
|
+
Tagging with user-defined dictionary:
|
|
54
|
+
|
|
55
|
+
require 'jieba_rb'
|
|
56
|
+
tagging = JiebaRb::Tagging.new user_dict: :default
|
|
57
|
+
pairs = tagging.tag "我是蓝翔技工拖拉机学院手扶拖拉机专业的。"
|
|
58
|
+
# [{"我"=>"r"}, {"是"=>"v"}, {"蓝翔"=>"nz"}, {"技工"=>"n"}, {"拖拉机"=>"n"}, {"学院"=>"n"}, {"手扶拖拉机"=>"n"}, {"专业"=>"n"}, {"的"=>"uj"}, {"。"=>"x"}]
|
|
59
|
+
|
|
44
60
|
## Keyword Extractor Usage
|
|
45
61
|
|
|
46
62
|
* only support TF-IDF currently
|
data/ext/jieba/jieba.c
CHANGED
data/ext/jieba/jieba.h
CHANGED
data/ext/jieba/tagging.cc
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
#include "tagging.h"
|
|
2
|
+
#include <ruby/encoding.h>
|
|
3
|
+
#include <PosTagger.hpp>
|
|
4
|
+
|
|
5
|
+
static rb_encoding* u8_enc;
|
|
6
|
+
|
|
7
|
+
struct Tagging{
|
|
8
|
+
CppJieba::PosTagger * p;
|
|
9
|
+
};
|
|
10
|
+
|
|
11
|
+
static void tagger_free(void *p){
|
|
12
|
+
delete ((Tagging*) p) -> p;
|
|
13
|
+
delete (Tagging*)p;
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
static VALUE alloc(VALUE klass)
|
|
17
|
+
{
|
|
18
|
+
Tagging * tagging = new Tagging();
|
|
19
|
+
return Data_Wrap_Struct(klass, NULL, tagger_free, tagging);
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
static void init(VALUE self,
|
|
23
|
+
VALUE jieba_dict_rbs,
|
|
24
|
+
VALUE hmm_dict_rbs,
|
|
25
|
+
VALUE user_dict_rbs)
|
|
26
|
+
{
|
|
27
|
+
Tagging *tagging;
|
|
28
|
+
Data_Get_Struct(self, Tagging, tagging);
|
|
29
|
+
|
|
30
|
+
Check_Type(jieba_dict_rbs, T_STRING);
|
|
31
|
+
Check_Type(hmm_dict_rbs, T_STRING);
|
|
32
|
+
Check_Type(user_dict_rbs, T_STRING);
|
|
33
|
+
|
|
34
|
+
std::string jieba_dict = StringValueCStr(jieba_dict_rbs);
|
|
35
|
+
std::string hmm_dict = StringValueCStr(hmm_dict_rbs);
|
|
36
|
+
std::string user_dict = StringValueCStr(user_dict_rbs);
|
|
37
|
+
|
|
38
|
+
tagging->p = new CppJieba::PosTagger(jieba_dict, hmm_dict, user_dict);
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
static VALUE tag(VALUE self, VALUE text_rbs)
|
|
42
|
+
{
|
|
43
|
+
Check_Type(text_rbs, T_STRING);
|
|
44
|
+
std::string text = StringValueCStr(text_rbs);
|
|
45
|
+
|
|
46
|
+
Tagging *tagging;
|
|
47
|
+
Data_Get_Struct(self, Tagging, tagging);
|
|
48
|
+
|
|
49
|
+
std::vector<std::pair<std::string, std::string>> pairs;
|
|
50
|
+
tagging->p->tag(text, pairs);
|
|
51
|
+
|
|
52
|
+
volatile VALUE arr = rb_ary_new();
|
|
53
|
+
for (std::vector<std::pair<std::string, std::string>>::const_iterator j = pairs.begin(); j != pairs.end(); j++)
|
|
54
|
+
{
|
|
55
|
+
VALUE pair = rb_hash_new();
|
|
56
|
+
rb_hash_aset(pair, rb_enc_str_new(std::get<0>(*j).c_str(), std::get<0>(*j).length(), u8_enc), rb_enc_str_new(std::get<1>(*j).c_str(), std::get<1>(*j).length(), u8_enc));
|
|
57
|
+
rb_ary_push(arr, pair);
|
|
58
|
+
|
|
59
|
+
}
|
|
60
|
+
return arr;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
#define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
|
|
64
|
+
|
|
65
|
+
extern "C" {
|
|
66
|
+
void Init_tagging()
|
|
67
|
+
{
|
|
68
|
+
VALUE cTagging = rb_define_class_under(mJieba, "Tagging", rb_cObject);
|
|
69
|
+
u8_enc = rb_utf8_encoding();
|
|
70
|
+
rb_define_alloc_func(cTagging, alloc);
|
|
71
|
+
DEF(cTagging, "_init",init,3);
|
|
72
|
+
DEF(cTagging, "tag",tag,1);
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
}
|
data/ext/jieba/tagging.h
ADDED
data/lib/jieba_rb.rb
CHANGED
|
@@ -46,10 +46,21 @@ module JiebaRb
|
|
|
46
46
|
stop_words_path = opts[:stop_words] || DEFAULT_STOP_WORDS
|
|
47
47
|
|
|
48
48
|
user_dict = opts[:user_dict] || ""
|
|
49
|
-
user_dict =
|
|
49
|
+
user_dict = DEFAULT_USER_DICT if user_dict == :default
|
|
50
50
|
|
|
51
51
|
_init mode, jieba_dict, hmm_dict, idf_path, stop_words_path, user_dict
|
|
52
52
|
end
|
|
53
53
|
end
|
|
54
54
|
|
|
55
|
+
class Tagging
|
|
56
|
+
private :_init
|
|
57
|
+
def initialize opts = {}
|
|
58
|
+
jieba_dict = opts[:jieba_dict] || DEFAULT_JIEBA_DICT
|
|
59
|
+
hmm_dict = opts[:hmm_dict] || DEFAULT_HMM_DICT
|
|
60
|
+
user_dict = opts[:user_dict] || ""
|
|
61
|
+
user_dict = DEFAULT_USER_DICT if user_dict == :default
|
|
62
|
+
|
|
63
|
+
_init jieba_dict, hmm_dict, user_dict
|
|
64
|
+
end
|
|
65
|
+
end
|
|
55
66
|
end
|
data/lib/jieba_rb/version.rb
CHANGED
data/test/test_tagging.rb
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# coding: utf-8
|
|
2
|
+
require 'minitest/autorun'
|
|
3
|
+
require 'jieba_rb'
|
|
4
|
+
class JiebaTest < Minitest::Test
|
|
5
|
+
def test_tagging
|
|
6
|
+
tagging = JiebaRb::Tagging.new
|
|
7
|
+
pairs = tagging.tag "我是蓝翔技工拖拉机学院手扶拖拉机专业的。"
|
|
8
|
+
assert_equal [{"我"=>"r"}, {"是"=>"v"}, {"蓝翔"=>"x"}, {"技工"=>"n"}, {"拖拉机"=>"n"}, {"学院"=>"n"}, {"手扶拖拉机"=>"n"}, {"专业"=>"n"}, {"的"=>"uj"}, {"。"=>"x"}], pairs
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def test_tagging_with_user_dict
|
|
12
|
+
tagging = JiebaRb::Tagging.new user_dict: :default
|
|
13
|
+
pairs = tagging.tag "我是蓝翔技工拖拉机学院手扶拖拉机专业的。"
|
|
14
|
+
assert_equal [{"我"=>"r"}, {"是"=>"v"}, {"蓝翔"=>"nz"}, {"技工"=>"n"}, {"拖拉机"=>"n"}, {"学院"=>"n"}, {"手扶拖拉机"=>"n"}, {"专业"=>"n"}, {"的"=>"uj"}, {"。"=>"x"}], pairs
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
end
|
metadata
CHANGED
|
@@ -1,69 +1,69 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: jieba_rb
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.2
|
|
4
|
+
version: 0.0.5
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Chris Li
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2016-12-23 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
|
16
16
|
requirements:
|
|
17
|
-
- - ~>
|
|
17
|
+
- - "~>"
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
19
|
version: '1.5'
|
|
20
20
|
type: :development
|
|
21
21
|
prerelease: false
|
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
|
-
- - ~>
|
|
24
|
+
- - "~>"
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
26
|
version: '1.5'
|
|
27
27
|
- !ruby/object:Gem::Dependency
|
|
28
28
|
name: rake
|
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
|
30
30
|
requirements:
|
|
31
|
-
- -
|
|
31
|
+
- - ">="
|
|
32
32
|
- !ruby/object:Gem::Version
|
|
33
33
|
version: '0'
|
|
34
34
|
type: :development
|
|
35
35
|
prerelease: false
|
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
|
37
37
|
requirements:
|
|
38
|
-
- -
|
|
38
|
+
- - ">="
|
|
39
39
|
- !ruby/object:Gem::Version
|
|
40
40
|
version: '0'
|
|
41
41
|
- !ruby/object:Gem::Dependency
|
|
42
42
|
name: rake-compiler
|
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
|
44
44
|
requirements:
|
|
45
|
-
- -
|
|
45
|
+
- - ">="
|
|
46
46
|
- !ruby/object:Gem::Version
|
|
47
47
|
version: '0'
|
|
48
48
|
type: :development
|
|
49
49
|
prerelease: false
|
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
|
51
51
|
requirements:
|
|
52
|
-
- -
|
|
52
|
+
- - ">="
|
|
53
53
|
- !ruby/object:Gem::Version
|
|
54
54
|
version: '0'
|
|
55
55
|
- !ruby/object:Gem::Dependency
|
|
56
56
|
name: minitest
|
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
|
58
58
|
requirements:
|
|
59
|
-
- -
|
|
59
|
+
- - ">="
|
|
60
60
|
- !ruby/object:Gem::Version
|
|
61
61
|
version: '0'
|
|
62
62
|
type: :development
|
|
63
63
|
prerelease: false
|
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
|
65
65
|
requirements:
|
|
66
|
-
- -
|
|
66
|
+
- - ">="
|
|
67
67
|
- !ruby/object:Gem::Version
|
|
68
68
|
version: '0'
|
|
69
69
|
description: cppjieba binding for ruby
|
|
@@ -74,9 +74,9 @@ extensions:
|
|
|
74
74
|
- ext/jieba/extconf.rb
|
|
75
75
|
extra_rdoc_files: []
|
|
76
76
|
files:
|
|
77
|
-
- .gitignore
|
|
78
|
-
- .gitmodules
|
|
79
|
-
- .travis.yml
|
|
77
|
+
- ".gitignore"
|
|
78
|
+
- ".gitmodules"
|
|
79
|
+
- ".travis.yml"
|
|
80
80
|
- Gemfile
|
|
81
81
|
- LICENSE.txt
|
|
82
82
|
- README.md
|
|
@@ -215,11 +215,14 @@ files:
|
|
|
215
215
|
- ext/jieba/keyword.h
|
|
216
216
|
- ext/jieba/segment.cc
|
|
217
217
|
- ext/jieba/segment.h
|
|
218
|
+
- ext/jieba/tagging.cc
|
|
219
|
+
- ext/jieba/tagging.h
|
|
218
220
|
- jieba_rb.gemspec
|
|
219
221
|
- lib/jieba_rb.rb
|
|
220
222
|
- lib/jieba_rb/version.rb
|
|
221
223
|
- test/test_keyword.rb
|
|
222
224
|
- test/test_segment.rb
|
|
225
|
+
- test/test_tagging.rb
|
|
223
226
|
homepage: https://github.com/altkatz/jieba_rb
|
|
224
227
|
licenses:
|
|
225
228
|
- MIT
|
|
@@ -230,21 +233,21 @@ require_paths:
|
|
|
230
233
|
- lib
|
|
231
234
|
required_ruby_version: !ruby/object:Gem::Requirement
|
|
232
235
|
requirements:
|
|
233
|
-
- -
|
|
236
|
+
- - ">="
|
|
234
237
|
- !ruby/object:Gem::Version
|
|
235
238
|
version: 1.9.2
|
|
236
239
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
237
240
|
requirements:
|
|
238
|
-
- -
|
|
241
|
+
- - ">="
|
|
239
242
|
- !ruby/object:Gem::Version
|
|
240
243
|
version: '0'
|
|
241
244
|
requirements: []
|
|
242
245
|
rubyforge_project:
|
|
243
|
-
rubygems_version: 2.
|
|
246
|
+
rubygems_version: 2.5.1
|
|
244
247
|
signing_key:
|
|
245
248
|
specification_version: 4
|
|
246
249
|
summary: cppjieba binding for ruby
|
|
247
250
|
test_files:
|
|
248
251
|
- test/test_keyword.rb
|
|
249
252
|
- test/test_segment.rb
|
|
250
|
-
|
|
253
|
+
- test/test_tagging.rb
|