pluskid-rmmseg-cpp 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,174 @@
1
+ #include <ruby.h>
2
+ #include <cstdio> // for debug
3
+
4
+ #include "token.h"
5
+ #include "dict.h"
6
+ #include "algor.h"
7
+
8
+ using namespace std;
9
+
10
+ extern "C" {
11
+
12
+ /*****************************************
13
+ *
14
+ * Normal interface
15
+ *
16
+ *****************************************/
17
+
18
+ /*********************
19
+ * RMMSeg module
20
+ *********************/
21
+ static VALUE mRMMSeg;
22
+
23
+
24
+ /*********************
25
+ * Dictionary module
26
+ *********************/
27
+ static VALUE mDictionary;
28
+
29
+ static VALUE dic_load_chars(VALUE mod, VALUE path)
30
+ {
31
+ if (rmmseg::dict::load_chars(RSTRING(path)->ptr))
32
+ return Qtrue;
33
+ return Qfalse;
34
+ }
35
+ static VALUE dic_load_words(VALUE mod, VALUE path)
36
+ {
37
+ if (rmmseg::dict::load_words(RSTRING(path)->ptr))
38
+ return Qtrue;
39
+ return Qfalse;
40
+ }
41
+ static VALUE dic_add(VALUE mod, VALUE word, VALUE len, VALUE freq)
42
+ {
43
+ const char *str = RSTRING(word)->ptr;
44
+ int nbytes = RSTRING(word)->len;
45
+ rmmseg::Word *w = rmmseg::make_word(str, FIX2INT(len), FIX2INT(freq), nbytes);
46
+ rmmseg::dict::add(w);
47
+ return Qnil;
48
+ }
49
+ static VALUE dic_has_word(VALUE mod, VALUE word)
50
+ {
51
+ const char *str = RSTRING(word)->ptr;
52
+ int nbytes = RSTRING(word)->len;
53
+ if (rmmseg::dict::get(str, nbytes) != NULL)
54
+ return Qtrue;
55
+ return Qfalse;
56
+ }
57
+
58
+
59
+ /**********************
60
+ * Token Class
61
+ **********************/
62
+ struct Token
63
+ {
64
+ VALUE text;
65
+ VALUE start;
66
+ VALUE end;
67
+ };
68
+
69
+ static void tk_mark(Token *t)
70
+ {
71
+ // start and end are Fixnums, no need to mark them
72
+ rb_gc_mark(t->text);
73
+ }
74
+ static void tk_free(Token *t)
75
+ {
76
+ free(t);
77
+ }
78
+
79
+ static VALUE tk_text(VALUE self)
80
+ {
81
+ Token *tk = (Token *)DATA_PTR(self);
82
+ return tk->text;
83
+ }
84
+ static VALUE tk_start(VALUE self)
85
+ {
86
+ Token *tk = (Token *)DATA_PTR(self);
87
+ return tk->start;
88
+ }
89
+ static VALUE tk_end(VALUE self)
90
+ {
91
+ Token *tk = (Token *)DATA_PTR(self);
92
+ return tk->end;
93
+ }
94
+
95
+ static VALUE cToken;
96
+ static VALUE tk_create(const char* base, const rmmseg::Token &t)
97
+ {
98
+ Token *tk = (Token *)malloc(sizeof(Token));
99
+ int start = t.text-base;
100
+ tk->text = rb_str_new(t.text, t.length);
101
+ tk->start = INT2FIX(start);
102
+ tk->end = INT2FIX(start + t.length);
103
+ return Data_Wrap_Struct(cToken,
104
+ (RUBY_DATA_FUNC)tk_mark,
105
+ (RUBY_DATA_FUNC)tk_free,
106
+ tk);
107
+ }
108
+
109
+ /*********************
110
+ * Algorithm Class
111
+ *********************/
112
+ struct Algorithm
113
+ {
114
+ VALUE text; // hold to avoid being garbage collected
115
+ rmmseg::Algorithm *algor;
116
+ };
117
+
118
+ static void algor_mark(Algorithm *a)
119
+ {
120
+ rb_gc_mark(a->text);
121
+ }
122
+ static void algor_free(Algorithm *a)
123
+ {
124
+ free(a->algor);
125
+ }
126
+
127
+ static VALUE cAlgorithm;
128
+ static VALUE algor_create(VALUE klass, VALUE text)
129
+ {
130
+ Algorithm *algor = (Algorithm *)malloc(sizeof(Algorithm));
131
+ void *mem;
132
+ algor->text = text;
133
+ mem = malloc(sizeof(rmmseg::Algorithm));
134
+ algor->algor = new(mem) rmmseg::Algorithm(RSTRING(text)->ptr,
135
+ RSTRING(text)->len);
136
+
137
+ return Data_Wrap_Struct(klass,
138
+ (RUBY_DATA_FUNC)algor_mark,
139
+ (RUBY_DATA_FUNC)algor_free,
140
+ algor);
141
+ }
142
+ static VALUE algor_next_token(VALUE self)
143
+ {
144
+ Algorithm *algor = (Algorithm *)DATA_PTR(self);
145
+ rmmseg::Token tk = algor->algor->next_token();
146
+
147
+ if (tk.length == 0)
148
+ return Qnil;
149
+ return tk_create(RSTRING(algor->text)->ptr, tk);
150
+ }
151
+
152
+
153
+ void Init_rmmseg()
154
+ {
155
+ typedef VALUE (*RUBY_METHOD) (...);
156
+ mRMMSeg = rb_define_module("RMMSeg");
157
+
158
+ mDictionary = rb_define_module_under(mRMMSeg, "Dictionary");
159
+ rb_define_singleton_method(mDictionary, "load_chars", (RUBY_METHOD)dic_load_chars, 1);
160
+ rb_define_singleton_method(mDictionary, "load_words", (RUBY_METHOD)dic_load_words, 1);
161
+ rb_define_singleton_method(mDictionary, "load_add", (RUBY_METHOD)dic_add, 3);
162
+ rb_define_singleton_method(mDictionary, "has_word?", (RUBY_METHOD)dic_has_word, 1);
163
+
164
+ cToken = rb_define_class_under(mRMMSeg, "Token", rb_cObject);
165
+ rb_undef_method(rb_singleton_class(cToken), "new");
166
+ rb_define_method(cToken, "text", (RUBY_METHOD)tk_text, 0);
167
+ rb_define_method(cToken, "start", (RUBY_METHOD)tk_start, 0);
168
+ rb_define_method(cToken, "end", (RUBY_METHOD)tk_end, 0);
169
+
170
+ cAlgorithm = rb_define_class_under(mRMMSeg, "Algorithm", rb_cObject);
171
+ rb_define_singleton_method(cAlgorithm, "new", (RUBY_METHOD)algor_create, 1);
172
+ rb_define_method(cAlgorithm, "next_token", (RUBY_METHOD)algor_next_token, 0);
173
+ }
174
+ }
@@ -0,0 +1,87 @@
1
+ #ifndef _RULES_H_
2
+ #define _RULES_H_
3
+
4
+ #include <vector>
5
+ #include <algorithm>
6
+
7
+ #include "chunk.h"
8
+
9
+ namespace rmmseg
10
+ {
11
+ template <typename Cmp>
12
+ void take_highest(std::vector<Chunk> &chunks, Cmp &cmp)
13
+ {
14
+ int i = 1, j;
15
+ Chunk& max = chunks[0];
16
+
17
+ for (j = 1; j < chunks.size(); ++j)
18
+ {
19
+ int rlt = cmp(chunks[j], max);
20
+ if (rlt > 0)
21
+ i = 0;
22
+ if (rlt >= 0)
23
+ std::swap(chunks[i++], chunks[j]);
24
+ }
25
+ chunks.erase(chunks.begin()+i, chunks.end());
26
+ }
27
+
28
+ struct MMCmp_t
29
+ {
30
+ int operator()(Chunk &a, Chunk &b)
31
+ {
32
+ return a.total_length() - b.total_length();
33
+ }
34
+ } MMCmp;
35
+ void mm_filter(std::vector<Chunk> &chunks)
36
+ {
37
+ take_highest(chunks, MMCmp);
38
+ }
39
+
40
+ struct LAWLCmp_t
41
+ {
42
+ int operator()(Chunk &a, Chunk &b)
43
+ {
44
+ double rlt = a.average_length() - b.average_length();
45
+ if (rlt == 0)
46
+ return 0;
47
+ if (rlt > 0)
48
+ return 1;
49
+ return -1;
50
+ }
51
+ } LAWLCmp;
52
+ void lawl_filter(std::vector<Chunk> &chunks)
53
+ {
54
+ take_highest(chunks, LAWLCmp);
55
+ }
56
+
57
+ struct SVWLCmp_t
58
+ {
59
+ int operator()(Chunk &a, Chunk& b)
60
+ {
61
+ double rlt = a.variance() - b.variance();
62
+ if (rlt == 0)
63
+ return 0;
64
+ if (rlt < 0)
65
+ return 1;
66
+ return -1;
67
+ }
68
+ } SVWLCmp;
69
+ void svwl_filter(std::vector<Chunk> &chunks)
70
+ {
71
+ take_highest(chunks, SVWLCmp);
72
+ }
73
+
74
+ struct LSDMFOCWCmp_t
75
+ {
76
+ int operator()(Chunk &a, Chunk& b)
77
+ {
78
+ return a.degree_of_morphemic_freedom() - b.degree_of_morphemic_freedom();
79
+ }
80
+ } LSDMFOCWCmp;
81
+ void lsdmfocw_filter(std::vector<Chunk> &chunks)
82
+ {
83
+ take_highest(chunks, LSDMFOCWCmp);
84
+ }
85
+ }
86
+
87
+ #endif /* _RULES_H_ */
@@ -0,0 +1,19 @@
1
+ #ifndef _TOKEN_H_
2
+ #define _TOKEN_H_
3
+
4
+ namespace rmmseg
5
+ {
6
/* A slice of the text being segmented. */
struct Token
{
    /* Start of the token; not necessarily NUL-terminated, so always
     * pair it with `length'. */
    const char *text;

    /* Size of the token; 0 means this is an empty token. */
    int length;

    Token(const char *txt, int len)
        : text(txt),
          length(len)
    {
    }
};
17
+ }
18
+
19
+ #endif /* _TOKEN_H_ */
data/ext/rmmseg/word.h ADDED
@@ -0,0 +1,44 @@
1
+ #ifndef _WORD_H_
2
+ #define _WORD_H_
3
+
4
+ #include <climits>
5
+ #include <cstring>
6
+
7
+ #include "memory.h"
8
+
9
+ namespace rmmseg
10
+ {
11
/* Bytes of text embedded in every Word: room for at least one character
 * (3 bytes plus the terminating '\0'). */
const int word_embed_len = 4;

/* Dictionary entry header; `text' is the start of the word's bytes. */
struct Word
{
    unsigned char   nbytes;                 /* size of the word in bytes */
    char            length;                 /* size of the word in characters */
    unsigned short  freq;                   /* frequency of the word */
    char            text[word_embed_len];   /* the word's bytes */
};
19
+
20
+ /**
21
+ * text: the text of the word.
22
+ * length: number of characters (not bytes).
23
+ * freq: the frequency of the word.
24
+ */
25
+ inline Word *make_word(const char *text, int length=1,
26
+ int freq=0, int nbytes=-1)
27
+ {
28
+ if (freq > USHRT_MAX)
29
+ freq = USHRT_MAX; /* avoid overflow */
30
+ if (nbytes == -1)
31
+ nbytes = strlen(text);
32
+ Word *w = static_cast<Word *>(pool_alloc(sizeof(Word)
33
+ + nbytes+1
34
+ - word_embed_len));
35
+ w->nbytes = std::strlen(text);
36
+ w->length = length;
37
+ w->freq = freq;
38
+ std::strncpy(w->text, text, nbytes);
39
+ w->text[nbytes] = '\0';
40
+ return w;
41
+ }
42
+ }
43
+
44
+ #endif /* _WORD_H_ */
@@ -0,0 +1,54 @@
1
+ module RMMSeg
2
module Dictionary
  data_dir = File.join(File.dirname(__FILE__), "..", "..", "data")

  @dictionaries = [
    [:chars, File.join(data_dir, "chars.dic")],
    [:words, File.join(data_dir, "words.dic")]
  ]

  class << self
    #
    # The list of dictionaries used by RMMSeg. Every entry has the form
    #
    #   [type, path]
    #
    # where +type+ is either <tt>:chars</tt> or <tt>:words</tt> and +path+
    # is the path to the dictionary file.
    #
    # A <tt>:chars</tt> dictionary is a collection of lines of the form
    #
    #   freq char
    #
    # where +freq+ is a number <b>less than 65535</b> and +char+ is the
    # character. They are separated by <b>exactly one space</b>.
    #
    # A <tt>:words</tt> dictionary looks similar:
    #
    #   length word
    #
    # but the leading number is the number of characters (not bytes) in
    # the word rather than a frequency.
    #
    attr_accessor :dictionaries

    # Register a user defined dictionary. +type+ is +:chars+ or
    # <tt>:words</tt>; see the documentation of +dictionaries+.
    def add_dictionary(path, type)
      @dictionaries.push([type, path])
    end

    # Load every registered dictionary with the loader matching its type.
    # Entries whose type is neither +:chars+ nor +:words+ are skipped.
    def load_dictionaries
      @dictionaries.each do |type, path|
        case type
        when :chars then load_chars(path)
        when :words then load_words(path)
        end
      end
    end
  end
end
54
+ end
@@ -0,0 +1,112 @@
1
+ require 'singleton'
2
+ require 'rubygems'
3
+ require 'rmmseg'
4
+ require 'ferret'
5
+
6
+ module RMMSeg
7
+ module Ferret
8
+ # The Analyzer class can be used with Ferret .
9
class Analyzer < ::Ferret::Analysis::Analyzer

  # Build an Analyzer. An optional block may wrap the token stream in
  # further +TokenFilter+s, e.g.
  #
  #   analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
  #     Ferret::Analysis::LowerCaseFilter.new(tokenizer)
  #   }
  #
  def initialize(&brk)
    @brk = brk
  end

  # Return the token stream for +text+: a Tokenizer wrapped in a
  # PunctuationFilter, passed through the construction-time block when one
  # was given. +field+ is not used.
  def token_stream(field, text)
    stream = PunctuationFilter.new(Tokenizer.new(text))
    return stream unless @brk
    @brk.call(stream)
  end
end
31
+
32
+ # The Tokenizer tokenize text with RMMSeg::Algorithm.
33
class Tokenizer < ::Ferret::Analysis::TokenStream
  # The text being tokenized
  attr_reader :text

  # Create a new Tokenizer to tokenize +str+
  def initialize(str)
    self.text = str
  end

  # Return the next token as a Ferret token, or +nil+ once the
  # underlying Algorithm has no more tokens.
  def next
    tok = @algor.next_token
    return nil if tok.nil?
    ::Ferret::Analysis::Token.new(tok.text, tok.start, tok.end)
  end

  # Replace the text to be tokenized; tokenization starts over with a
  # fresh Algorithm.
  def text=(str)
    @text = str
    @algor = Algorithm.new(@text)
  end
end
60
+
61
+ # PunctuationFilter filter out the stand alone Chinese
62
+ # punctuation tokens.
63
class PunctuationFilter < ::Ferret::Analysis::TokenStream
  # The punctuation dictionary: the set of lines found in DIC_FILE.
  class Dictionary
    include Singleton

    DIC_FILE = File.join(File.dirname(__FILE__),
                         "..", "..", "data", "punctuation.dic")

    # Read DIC_FILE, remembering each line (without its line ending) as
    # a key.
    def initialize
      @dic = {}
      File.foreach(DIC_FILE) do |line|
        @dic[line.chomp.freeze] = nil
      end
    end

    # Is +str+ one of the entries read from DIC_FILE?
    def include?(str)
      @dic.has_key?(str)
    end
  end

  # Wrap +stream+, the token stream to be filtered.
  def initialize(stream)
    @stream = stream
  end

  # Return the next token of the wrapped stream whose text is not a
  # stand alone Chinese punctuation, or +nil+ when the stream runs out.
  def next
    punctuation = Dictionary.instance
    token = @stream.next
    while !token.nil? && punctuation.include?(token.text)
      token = @stream.next
    end
    token
  end

  # The text of the wrapped stream.
  def text
    @stream.text
  end

  # Set the text of the wrapped stream.
  def text=(str)
    @stream.text = str
  end
end
111
+ end
112
+ end
data/lib/rmmseg.rb ADDED
@@ -0,0 +1,3 @@
1
+ require 'rmmseg/dictionary'
2
+ require File.join(File.dirname(__FILE__), '..',
3
+ 'ext', 'rmmseg', 'rmmseg')
metadata ADDED
@@ -0,0 +1,74 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pluskid-rmmseg-cpp
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: ruby
6
+ authors:
7
+ - pluskid
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-05-14 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description:
17
+ email: pluskid@gmail.com
18
+ executables: []
19
+
20
+ extensions:
21
+ - ext/rmmseg/extconf.rb
22
+ extra_rdoc_files:
23
+ - README
24
+ files:
25
+ - lib/rmmseg/dictionary.rb
26
+ - lib/rmmseg/ferret.rb
27
+ - lib/rmmseg.rb
28
+ - ext/rmmseg/dict.cpp
29
+ - ext/rmmseg/memory.cpp
30
+ - ext/rmmseg/rmmseg.cpp
31
+ - ext/rmmseg/algor.cpp
32
+ - ext/rmmseg/rules.h
33
+ - ext/rmmseg/dict.h
34
+ - ext/rmmseg/token.h
35
+ - ext/rmmseg/algor.h
36
+ - ext/rmmseg/word.h
37
+ - ext/rmmseg/chunk.h
38
+ - ext/rmmseg/memory.h
39
+ - data/punctuation.dic
40
+ - data/words.dic
41
+ - data/chars.dic
42
+ - README
43
+ - ext/rmmseg/extconf.rb
44
+ has_rdoc: true
45
+ homepage:
46
+ post_install_message:
47
+ rdoc_options:
48
+ - --inline-source
49
+ - --main
50
+ - README
51
+ - --line-numbers
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: "0"
59
+ version:
60
+ required_rubygems_version: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: "0"
65
+ version:
66
+ requirements: []
67
+
68
+ rubyforge_project:
69
+ rubygems_version: 1.0.1
70
+ signing_key:
71
+ specification_version: 2
72
+ summary: A high performance package for doing Chinese word segmentation.
73
+ test_files: []
74
+