jieba_rb 0.0.2 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cf9ad1c59145d326f4edd1c639066e107b52c13a
4
- data.tar.gz: 4732514648268711284a8198a4a324889636112d
3
+ metadata.gz: 2aab0ba015d462b0ab6fc1ee344961660e20adeb
4
+ data.tar.gz: d15db1389073584381a83aad79967152949cda7c
5
5
  SHA512:
6
- metadata.gz: 8fdc938f12a9506eb46baef952fd6bd5a261803934673273ba33f8e57cccd614ea31725dc58262b1e6c24828db9f8787027f8e95f4e614504eb943cf75dadf04
7
- data.tar.gz: cd5d7df0f5f46cf4a7f8553d9f09cf05d0aeceb2f7b211a9238b9596aa43c3bd395425d2d278fc971bc3f224ac764b55cdfe88506c1b77a0ab8bcb927b206b48
6
+ metadata.gz: b63793a07432aaf9811cb0dc16c4a3e61b3c5930019cd417a0f8e86b310909a48974199bc8c731c0656c75c744bb37ebe734a1515967f6fb29d299a9fa977bdd
7
+ data.tar.gz: 1ec9fde21352676ba4be0e6a2cf99310dfec07c3933e23554e7b473d527dcea19ccc8d435d22328bcf1ced19860b1d9d141671221f1bbb9391961065f74d5761
@@ -2,10 +2,15 @@ language: ruby
2
2
  rvm:
3
3
  - 1.9.3
4
4
  - 2.0.0
5
- - 2.1.5
5
+ - 2.1.10
6
+ - 2.2.5
7
+ - 2.3.1
6
8
  - ruby-head
7
9
  - rbx-2
8
10
 
11
+ before_install:
12
+ - gem install bundler
13
+
9
14
  before_script: rake compile
10
15
 
11
16
  matrix:
data/README.md CHANGED
@@ -41,6 +41,22 @@ HMM or Max probability (mp) Segment mode:
41
41
  seg = JiebaRb::Segment.new mode: :hmm # or mode: :mp
42
42
  words = seg.cut "令狐冲是云计算行业的专家"
43
43
 
44
+ ## Word Tagging Usage
45
+
46
+ Default tagging:
47
+
48
+ require 'jieba_rb'
49
+ tagging = JiebaRb::Tagging.new
50
+ pairs = tagging.tag "我是蓝翔技工拖拉机学院手扶拖拉机专业的。"
51
+ # [{"我"=>"r"}, {"是"=>"v"}, {"蓝翔"=>"x"}, {"技工"=>"n"}, {"拖拉机"=>"n"}, {"学院"=>"n"}, {"手扶拖拉机"=>"n"}, {"专业"=>"n"}, {"的"=>"uj"}, {"。"=>"x"}]
52
+
53
+ Tagging with user-defined dictionary:
54
+
55
+ require 'jieba_rb'
56
+ tagging = JiebaRb::Tagging.new user_dict: :default
57
+ pairs = tagging.tag "我是蓝翔技工拖拉机学院手扶拖拉机专业的。"
58
+ # [{"我"=>"r"}, {"是"=>"v"}, {"蓝翔"=>"nz"}, {"技工"=>"n"}, {"拖拉机"=>"n"}, {"学院"=>"n"}, {"手扶拖拉机"=>"n"}, {"专业"=>"n"}, {"的"=>"uj"}, {"。"=>"x"}]
59
+
44
60
  ## Keyword Extractor Usage
45
61
 
46
62
  * only support TF-IDF currently
@@ -7,4 +7,5 @@ void Init_jieba()
7
7
 
8
8
  Init_segment();
9
9
  Init_keyword();
10
+ Init_tagging();
10
11
  }
@@ -4,6 +4,7 @@
4
4
  #include <ruby.h>
5
5
  #include <segment.h>
6
6
  #include <keyword.h>
7
+ #include <tagging.h>
7
8
 
8
9
  extern VALUE mJieba;
9
10
 
@@ -0,0 +1,75 @@
1
+ #include "tagging.h"
2
+ #include <ruby/encoding.h>
3
+ #include <PosTagger.hpp>
4
+
5
+ static rb_encoding* u8_enc;
6
+
7
+ struct Tagging{
8
+ CppJieba::PosTagger * p;
9
+ };
10
+
11
+ static void tagger_free(void *p){
12
+ delete ((Tagging*) p) -> p;
13
+ delete (Tagging*)p;
14
+ }
15
+
16
+ static VALUE alloc(VALUE klass)
17
+ {
18
+ Tagging * tagging = new Tagging();
19
+ return Data_Wrap_Struct(klass, NULL, tagger_free, tagging);
20
+ }
21
+
22
+ static void init(VALUE self,
23
+ VALUE jieba_dict_rbs,
24
+ VALUE hmm_dict_rbs,
25
+ VALUE user_dict_rbs)
26
+ {
27
+ Tagging *tagging;
28
+ Data_Get_Struct(self, Tagging, tagging);
29
+
30
+ Check_Type(jieba_dict_rbs, T_STRING);
31
+ Check_Type(hmm_dict_rbs, T_STRING);
32
+ Check_Type(user_dict_rbs, T_STRING);
33
+
34
+ std::string jieba_dict = StringValueCStr(jieba_dict_rbs);
35
+ std::string hmm_dict = StringValueCStr(hmm_dict_rbs);
36
+ std::string user_dict = StringValueCStr(user_dict_rbs);
37
+
38
+ tagging->p = new CppJieba::PosTagger(jieba_dict, hmm_dict, user_dict);
39
+ }
40
+
41
+ static VALUE tag(VALUE self, VALUE text_rbs)
42
+ {
43
+ Check_Type(text_rbs, T_STRING);
44
+ std::string text = StringValueCStr(text_rbs);
45
+
46
+ Tagging *tagging;
47
+ Data_Get_Struct(self, Tagging, tagging);
48
+
49
+ std::vector<std::pair<std::string, std::string>> pairs;
50
+ tagging->p->tag(text, pairs);
51
+
52
+ volatile VALUE arr = rb_ary_new();
53
+ for (std::vector<std::pair<std::string, std::string>>::const_iterator j = pairs.begin(); j != pairs.end(); j++)
54
+ {
55
+ VALUE pair = rb_hash_new();
56
+ rb_hash_aset(pair, rb_enc_str_new(std::get<0>(*j).c_str(), std::get<0>(*j).length(), u8_enc), rb_enc_str_new(std::get<1>(*j).c_str(), std::get<1>(*j).length(), u8_enc));
57
+ rb_ary_push(arr, pair);
58
+
59
+ }
60
+ return arr;
61
+ }
62
+
63
+ #define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
64
+
65
+ extern "C" {
66
+ void Init_tagging()
67
+ {
68
+ VALUE cTagging = rb_define_class_under(mJieba, "Tagging", rb_cObject);
69
+ u8_enc = rb_utf8_encoding();
70
+ rb_define_alloc_func(cTagging, alloc);
71
+ DEF(cTagging, "_init",init,3);
72
+ DEF(cTagging, "tag",tag,1);
73
+ }
74
+
75
+ }
@@ -0,0 +1,17 @@
1
+ #ifndef RUBY_JIEBA_TAGGING
2
+ #define RUBY_JIEBA_TAGGING
3
+
4
+ #include <jieba.h>
5
+
6
+ #ifdef __cplusplus
7
+ extern "C" {
8
+ #endif
9
+
10
+ void Init_tagging();
11
+
12
+ #ifdef __cplusplus
13
+ }
14
+ #endif
15
+
16
+
17
+ #endif
@@ -46,10 +46,21 @@ module JiebaRb
46
46
  stop_words_path = opts[:stop_words] || DEFAULT_STOP_WORDS
47
47
 
48
48
  user_dict = opts[:user_dict] || ""
49
- user_dict = USER_DICT_FILE if user_dict == :default
49
+ user_dict = DEFAULT_USER_DICT if user_dict == :default
50
50
 
51
51
  _init mode, jieba_dict, hmm_dict, idf_path, stop_words_path, user_dict
52
52
  end
53
53
  end
54
54
 
55
# Ruby-side constructor for the tagger; `_init` is kept private and is only
# called from #initialize.
class Tagging
  private :_init

  # opts keys read here:
  #   :jieba_dict - falls back to DEFAULT_JIEBA_DICT
  #   :hmm_dict   - falls back to DEFAULT_HMM_DICT
  #   :user_dict  - falls back to "" ; the symbol :default selects
  #                 DEFAULT_USER_DICT
  def initialize(opts = {})
    main_dict = opts[:jieba_dict] || DEFAULT_JIEBA_DICT
    hmm_model = opts[:hmm_dict] || DEFAULT_HMM_DICT

    requested = opts[:user_dict] || ""
    extra_dict = requested == :default ? DEFAULT_USER_DICT : requested

    _init(main_dict, hmm_model, extra_dict)
  end
end
55
66
  end
@@ -1,3 +1,3 @@
1
1
  module JiebaRb
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.5"
3
3
  end
@@ -0,0 +1,17 @@
1
+ # coding: utf-8
2
+ require 'minitest/autorun'
3
+ require 'jieba_rb'
4
class JiebaTest < Minitest::Test
  # Tagging with the default options.
  def test_tagging
    expected = [{"我"=>"r"}, {"是"=>"v"}, {"蓝翔"=>"x"}, {"技工"=>"n"}, {"拖拉机"=>"n"}, {"学院"=>"n"}, {"手扶拖拉机"=>"n"}, {"专业"=>"n"}, {"的"=>"uj"}, {"。"=>"x"}]
    tagger = JiebaRb::Tagging.new
    actual = tagger.tag("我是蓝翔技工拖拉机学院手扶拖拉机专业的。")
    assert_equal expected, actual
  end

  # Same sentence with `user_dict: :default`; only the tag expected for the
  # third token differs from test_tagging.
  def test_tagging_with_user_dict
    expected = [{"我"=>"r"}, {"是"=>"v"}, {"蓝翔"=>"nz"}, {"技工"=>"n"}, {"拖拉机"=>"n"}, {"学院"=>"n"}, {"手扶拖拉机"=>"n"}, {"专业"=>"n"}, {"的"=>"uj"}, {"。"=>"x"}]
    tagger = JiebaRb::Tagging.new(user_dict: :default)
    actual = tagger.tag("我是蓝翔技工拖拉机学院手扶拖拉机专业的。")
    assert_equal expected, actual
  end
end
metadata CHANGED
@@ -1,69 +1,69 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jieba_rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Li
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-12-24 00:00:00.000000000 Z
11
+ date: 2016-12-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.5'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ~>
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '1.5'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - '>='
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
33
  version: '0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - '>='
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rake-compiler
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - '>='
45
+ - - ">="
46
46
  - !ruby/object:Gem::Version
47
47
  version: '0'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - '>='
52
+ - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: minitest
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - '>='
59
+ - - ">="
60
60
  - !ruby/object:Gem::Version
61
61
  version: '0'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - '>='
66
+ - - ">="
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
69
  description: cppjieba binding for ruby
@@ -74,9 +74,9 @@ extensions:
74
74
  - ext/jieba/extconf.rb
75
75
  extra_rdoc_files: []
76
76
  files:
77
- - .gitignore
78
- - .gitmodules
79
- - .travis.yml
77
+ - ".gitignore"
78
+ - ".gitmodules"
79
+ - ".travis.yml"
80
80
  - Gemfile
81
81
  - LICENSE.txt
82
82
  - README.md
@@ -215,11 +215,14 @@ files:
215
215
  - ext/jieba/keyword.h
216
216
  - ext/jieba/segment.cc
217
217
  - ext/jieba/segment.h
218
+ - ext/jieba/tagging.cc
219
+ - ext/jieba/tagging.h
218
220
  - jieba_rb.gemspec
219
221
  - lib/jieba_rb.rb
220
222
  - lib/jieba_rb/version.rb
221
223
  - test/test_keyword.rb
222
224
  - test/test_segment.rb
225
+ - test/test_tagging.rb
223
226
  homepage: https://github.com/altkatz/jieba_rb
224
227
  licenses:
225
228
  - MIT
@@ -230,21 +233,21 @@ require_paths:
230
233
  - lib
231
234
  required_ruby_version: !ruby/object:Gem::Requirement
232
235
  requirements:
233
- - - '>='
236
+ - - ">="
234
237
  - !ruby/object:Gem::Version
235
238
  version: 1.9.2
236
239
  required_rubygems_version: !ruby/object:Gem::Requirement
237
240
  requirements:
238
- - - '>='
241
+ - - ">="
239
242
  - !ruby/object:Gem::Version
240
243
  version: '0'
241
244
  requirements: []
242
245
  rubyforge_project:
243
- rubygems_version: 2.4.5
246
+ rubygems_version: 2.5.1
244
247
  signing_key:
245
248
  specification_version: 4
246
249
  summary: cppjieba binding for ruby
247
250
  test_files:
248
251
  - test/test_keyword.rb
249
252
  - test/test_segment.rb
250
- has_rdoc:
253
+ - test/test_tagging.rb