jieba_rb 0.0.2 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +6 -1
- data/README.md +16 -0
- data/ext/jieba/jieba.c +1 -0
- data/ext/jieba/jieba.h +1 -0
- data/ext/jieba/tagging.cc +75 -0
- data/ext/jieba/tagging.h +17 -0
- data/lib/jieba_rb.rb +12 -1
- data/lib/jieba_rb/version.rb +1 -1
- data/test/test_tagging.rb +17 -0
- metadata +20 -17
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2aab0ba015d462b0ab6fc1ee344961660e20adeb
|
4
|
+
data.tar.gz: d15db1389073584381a83aad79967152949cda7c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b63793a07432aaf9811cb0dc16c4a3e61b3c5930019cd417a0f8e86b310909a48974199bc8c731c0656c75c744bb37ebe734a1515967f6fb29d299a9fa977bdd
|
7
|
+
data.tar.gz: 1ec9fde21352676ba4be0e6a2cf99310dfec07c3933e23554e7b473d527dcea19ccc8d435d22328bcf1ced19860b1d9d141671221f1bbb9391961065f74d5761
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -41,6 +41,22 @@ HMM or Max probability (mp) Segment mode:
|
|
41
41
|
seg = JiebaRb::Segment.new mode: :hmm # or mode: :mp
|
42
42
|
words = seg.cut "令狐冲是云计算行业的专家"
|
43
43
|
|
44
|
+
## Word Tagging Usage
|
45
|
+
|
46
|
+
Default tagging:
|
47
|
+
|
48
|
+
require 'jieba_rb'
|
49
|
+
tagging = JiebaRb::Tagging.new
|
50
|
+
pairs = tagging.tag "我是蓝翔技工拖拉机学院手扶拖拉机专业的。"
|
51
|
+
# [{"我"=>"r"}, {"是"=>"v"}, {"蓝翔"=>"x"}, {"技工"=>"n"}, {"拖拉机"=>"n"}, {"学院"=>"n"}, {"手扶拖拉机"=>"n"}, {"专业"=>"n"}, {"的"=>"uj"}, {"。"=>"x"}]
|
52
|
+
|
53
|
+
Tagging with user-defined dictionary:
|
54
|
+
|
55
|
+
require 'jieba_rb'
|
56
|
+
tagging = JiebaRb::Tagging.new user_dict: :default
|
57
|
+
pairs = tagging.tag "我是蓝翔技工拖拉机学院手扶拖拉机专业的。"
|
58
|
+
# [{"我"=>"r"}, {"是"=>"v"}, {"蓝翔"=>"nz"}, {"技工"=>"n"}, {"拖拉机"=>"n"}, {"学院"=>"n"}, {"手扶拖拉机"=>"n"}, {"专业"=>"n"}, {"的"=>"uj"}, {"。"=>"x"}]
|
59
|
+
|
44
60
|
## Keyword Extractor Usage
|
45
61
|
|
46
62
|
* Only TF-IDF is supported currently
|
data/ext/jieba/jieba.c
CHANGED
data/ext/jieba/jieba.h
CHANGED
@@ -0,0 +1,75 @@
|
|
1
|
+
#include "tagging.h"
|
2
|
+
#include <ruby/encoding.h>
|
3
|
+
#include <PosTagger.hpp>
|
4
|
+
|
5
|
+
static rb_encoding* u8_enc;
|
6
|
+
|
7
|
+
struct Tagging{
|
8
|
+
CppJieba::PosTagger * p;
|
9
|
+
};
|
10
|
+
|
11
|
+
static void tagger_free(void *p){
|
12
|
+
delete ((Tagging*) p) -> p;
|
13
|
+
delete (Tagging*)p;
|
14
|
+
}
|
15
|
+
|
16
|
+
// Allocation function for the Tagging class: heap-allocates a
// value-initialized Tagging and wraps it in a Ruby object of `klass`, with no
// mark function and tagger_free as the free function.
static VALUE alloc(VALUE klass)
{
    Tagging *wrapper = new Tagging();
    VALUE wrapped = Data_Wrap_Struct(klass, NULL, tagger_free, wrapper);
    return wrapped;
}
|
21
|
+
|
22
|
+
static void init(VALUE self,
|
23
|
+
VALUE jieba_dict_rbs,
|
24
|
+
VALUE hmm_dict_rbs,
|
25
|
+
VALUE user_dict_rbs)
|
26
|
+
{
|
27
|
+
Tagging *tagging;
|
28
|
+
Data_Get_Struct(self, Tagging, tagging);
|
29
|
+
|
30
|
+
Check_Type(jieba_dict_rbs, T_STRING);
|
31
|
+
Check_Type(hmm_dict_rbs, T_STRING);
|
32
|
+
Check_Type(user_dict_rbs, T_STRING);
|
33
|
+
|
34
|
+
std::string jieba_dict = StringValueCStr(jieba_dict_rbs);
|
35
|
+
std::string hmm_dict = StringValueCStr(hmm_dict_rbs);
|
36
|
+
std::string user_dict = StringValueCStr(user_dict_rbs);
|
37
|
+
|
38
|
+
tagging->p = new CppJieba::PosTagger(jieba_dict, hmm_dict, user_dict);
|
39
|
+
}
|
40
|
+
|
41
|
+
// Backs the Ruby method `tag(text)`.
//
// Runs the wrapped PosTagger over `text` and returns a Ruby Array containing
// one single-entry Hash per result pair, mapping the first string of the pair
// to the second. Both strings are created with the UTF-8 encoding cached in
// u8_enc.
//
// Raises: TypeError if `text_rbs` is not a String (Check_Type), an error from
//         StringValueCStr if it contains a NUL byte, and RuntimeError if the
//         tagger has not been set up by `_init` yet (previously this case
//         dereferenced a null pointer).
static VALUE tag(VALUE self, VALUE text_rbs)
{
    Check_Type(text_rbs, T_STRING);
    const char *text_c = StringValueCStr(text_rbs);

    Tagging *tagging;
    Data_Get_Struct(self, Tagging, tagging);

    // Raise before any C++ object with a destructor is alive: rb_raise
    // unwinds with longjmp and would skip those destructors.
    if (tagging->p == NULL)
    {
        rb_raise(rb_eRuntimeError, "tagger is not initialized");
    }

    std::string text(text_c);
    std::vector<std::pair<std::string, std::string>> pairs;
    tagging->p->tag(text, pairs);

    // volatile keeps the array reference visible on the stack while the loop
    // allocates further Ruby objects.
    volatile VALUE arr = rb_ary_new();
    for (std::vector<std::pair<std::string, std::string>>::const_iterator it = pairs.begin(); it != pairs.end(); ++it)
    {
        VALUE key = rb_enc_str_new(it->first.c_str(), it->first.length(), u8_enc);
        VALUE value = rb_enc_str_new(it->second.c_str(), it->second.length(), u8_enc);

        VALUE pair = rb_hash_new();
        rb_hash_aset(pair, key, value);
        rb_ary_push(arr, pair);
    }
    return arr;
}
|
62
|
+
|
63
|
+
#define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
|
64
|
+
|
65
|
+
extern "C" {
|
66
|
+
void Init_tagging()
|
67
|
+
{
|
68
|
+
VALUE cTagging = rb_define_class_under(mJieba, "Tagging", rb_cObject);
|
69
|
+
u8_enc = rb_utf8_encoding();
|
70
|
+
rb_define_alloc_func(cTagging, alloc);
|
71
|
+
DEF(cTagging, "_init",init,3);
|
72
|
+
DEF(cTagging, "tag",tag,1);
|
73
|
+
}
|
74
|
+
|
75
|
+
}
|
data/ext/jieba/tagging.h
ADDED
data/lib/jieba_rb.rb
CHANGED
@@ -46,10 +46,21 @@ module JiebaRb
|
|
46
46
|
stop_words_path = opts[:stop_words] || DEFAULT_STOP_WORDS
|
47
47
|
|
48
48
|
user_dict = opts[:user_dict] || ""
|
49
|
-
user_dict =
|
49
|
+
user_dict = DEFAULT_USER_DICT if user_dict == :default
|
50
50
|
|
51
51
|
_init mode, jieba_dict, hmm_dict, idf_path, stop_words_path, user_dict
|
52
52
|
end
|
53
53
|
end
|
54
54
|
|
55
|
+
# Ruby-side setup for the tagger; `_init` is provided by the native extension
# and is kept private so callers go through `new`.
class Tagging
  private :_init

  # Options read from +opts+:
  #   :jieba_dict - main dictionary path (falls back to DEFAULT_JIEBA_DICT)
  #   :hmm_dict   - HMM dictionary path (falls back to DEFAULT_HMM_DICT)
  #   :user_dict  - user dictionary path; falls back to "" and the symbol
  #                 :default selects DEFAULT_USER_DICT
  def initialize(opts = {})
    main_dict_path = opts[:jieba_dict] || DEFAULT_JIEBA_DICT
    hmm_dict_path = opts[:hmm_dict] || DEFAULT_HMM_DICT
    requested_user_dict = opts[:user_dict] || ""
    user_dict_path = requested_user_dict == :default ? DEFAULT_USER_DICT : requested_user_dict

    _init(main_dict_path, hmm_dict_path, user_dict_path)
  end
end
|
55
66
|
end
|
data/lib/jieba_rb/version.rb
CHANGED
@@ -0,0 +1,17 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require 'minitest/autorun'
|
3
|
+
require 'jieba_rb'
|
4
|
+
# Checks the word/tag pairs returned by JiebaRb::Tagging#tag, first with the
# default options and then with `user_dict: :default`.
class JiebaTest < Minitest::Test
  # With default options "蓝翔" is expected to be tagged "x".
  def test_tagging
    expected = [{"我"=>"r"}, {"是"=>"v"}, {"蓝翔"=>"x"}, {"技工"=>"n"}, {"拖拉机"=>"n"}, {"学院"=>"n"}, {"手扶拖拉机"=>"n"}, {"专业"=>"n"}, {"的"=>"uj"}, {"。"=>"x"}]
    tagger = JiebaRb::Tagging.new
    actual = tagger.tag("我是蓝翔技工拖拉机学院手扶拖拉机专业的。")
    assert_equal(expected, actual)
  end

  # With `user_dict: :default` "蓝翔" is expected to be tagged "nz" instead.
  def test_tagging_with_user_dict
    expected = [{"我"=>"r"}, {"是"=>"v"}, {"蓝翔"=>"nz"}, {"技工"=>"n"}, {"拖拉机"=>"n"}, {"学院"=>"n"}, {"手扶拖拉机"=>"n"}, {"专业"=>"n"}, {"的"=>"uj"}, {"。"=>"x"}]
    tagger = JiebaRb::Tagging.new(user_dict: :default)
    actual = tagger.tag("我是蓝翔技工拖拉机学院手扶拖拉机专业的。")
    assert_equal(expected, actual)
  end
end
|
metadata
CHANGED
@@ -1,69 +1,69 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: jieba_rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Li
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-12-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '1.5'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - ~>
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '1.5'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- -
|
31
|
+
- - ">="
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: '0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- -
|
38
|
+
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rake-compiler
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- -
|
45
|
+
- - ">="
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: '0'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- -
|
52
|
+
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: minitest
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
|
-
- -
|
59
|
+
- - ">="
|
60
60
|
- !ruby/object:Gem::Version
|
61
61
|
version: '0'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
|
-
- -
|
66
|
+
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
69
|
description: cppjieba binding for ruby
|
@@ -74,9 +74,9 @@ extensions:
|
|
74
74
|
- ext/jieba/extconf.rb
|
75
75
|
extra_rdoc_files: []
|
76
76
|
files:
|
77
|
-
- .gitignore
|
78
|
-
- .gitmodules
|
79
|
-
- .travis.yml
|
77
|
+
- ".gitignore"
|
78
|
+
- ".gitmodules"
|
79
|
+
- ".travis.yml"
|
80
80
|
- Gemfile
|
81
81
|
- LICENSE.txt
|
82
82
|
- README.md
|
@@ -215,11 +215,14 @@ files:
|
|
215
215
|
- ext/jieba/keyword.h
|
216
216
|
- ext/jieba/segment.cc
|
217
217
|
- ext/jieba/segment.h
|
218
|
+
- ext/jieba/tagging.cc
|
219
|
+
- ext/jieba/tagging.h
|
218
220
|
- jieba_rb.gemspec
|
219
221
|
- lib/jieba_rb.rb
|
220
222
|
- lib/jieba_rb/version.rb
|
221
223
|
- test/test_keyword.rb
|
222
224
|
- test/test_segment.rb
|
225
|
+
- test/test_tagging.rb
|
223
226
|
homepage: https://github.com/altkatz/jieba_rb
|
224
227
|
licenses:
|
225
228
|
- MIT
|
@@ -230,21 +233,21 @@ require_paths:
|
|
230
233
|
- lib
|
231
234
|
required_ruby_version: !ruby/object:Gem::Requirement
|
232
235
|
requirements:
|
233
|
-
- -
|
236
|
+
- - ">="
|
234
237
|
- !ruby/object:Gem::Version
|
235
238
|
version: 1.9.2
|
236
239
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
237
240
|
requirements:
|
238
|
-
- -
|
241
|
+
- - ">="
|
239
242
|
- !ruby/object:Gem::Version
|
240
243
|
version: '0'
|
241
244
|
requirements: []
|
242
245
|
rubyforge_project:
|
243
|
-
rubygems_version: 2.
|
246
|
+
rubygems_version: 2.5.1
|
244
247
|
signing_key:
|
245
248
|
specification_version: 4
|
246
249
|
summary: cppjieba binding for ruby
|
247
250
|
test_files:
|
248
251
|
- test/test_keyword.rb
|
249
252
|
- test/test_segment.rb
|
250
|
-
|
253
|
+
- test/test_tagging.rb
|