rmmseg-cpp-new 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ #ifndef _TOKEN_H_
2
+ #define _TOKEN_H_
3
+
4
+ namespace rmmseg
5
+ {
6
struct Token
{
    // Start of the token's characters. The data is not guaranteed
    // to end with a nul byte, so always rely on `length' rather
    // than scanning for a terminator.
    const char *text;

    // Length of `text'. A value of 0 denotes an empty token.
    int length;

    // Build a token from a pointer and its length. Only the pointer
    // is stored; the characters themselves are not copied.
    Token(const char *txt, int len)
        : text(txt)
        , length(len)
    {
    }
};
17
+ }
18
+
19
+ #endif /* _TOKEN_H_ */
data/ext/rmmseg/word.h ADDED
@@ -0,0 +1,44 @@
1
+ #ifndef _WORD_H_
2
+ #define _WORD_H_
3
+
4
+ #include <climits>
5
+ #include <cstring>
6
+
7
+ #include "memory.h"
8
+
9
+ namespace rmmseg
10
+ {
11
// Number of text bytes embedded directly in a Word: room for at
// least one character (3 bytes) followed by the closing '\0'.
const int word_embed_len = 4;

// A dictionary word. make_word() allocates extra room behind the
// struct when the text needs more than word_embed_len bytes, so
// `text' may extend past the declared array.
struct Word
{
    unsigned char  nbytes;                // size of the text in bytes
    char           length;                // size of the text in characters
    unsigned short freq;                  // frequency of the word
    char           text[word_embed_len];  // nul-terminated text
};
19
+
20
+ /**
21
+ * text: the text of the word.
22
+ * length: number of characters (not bytes).
23
+ * freq: the frequency of the word.
24
+ */
25
+ inline Word *make_word(const char *text, int length=1,
26
+ int freq=0, int nbytes=-1)
27
+ {
28
+ if (freq > USHRT_MAX)
29
+ freq = USHRT_MAX; /* avoid overflow */
30
+ if (nbytes == -1)
31
+ nbytes = std::strlen(text);
32
+ Word *w = static_cast<Word *>(pool_alloc(sizeof(Word)
33
+ + nbytes+1
34
+ - word_embed_len));
35
+ w->nbytes = nbytes;
36
+ w->length = length;
37
+ w->freq = freq;
38
+ std::strncpy(w->text, text, nbytes);
39
+ w->text[nbytes] = '\0';
40
+ return w;
41
+ }
42
+ }
43
+
44
+ #endif /* _WORD_H_ */
@@ -0,0 +1,2 @@
1
+ require_relative 'rmmseg/dictionary'
2
+ require_relative 'rmmseg'
@@ -0,0 +1,59 @@
1
module RMMSeg
  module Dictionary
    # Default dictionaries, looked up in the "data" directory two
    # levels above this file.
    @dictionaries = [[:chars, "chars.dic"], [:words, "words.dic"]].map do |type, file|
      [type, File.join(File.dirname(__FILE__), "..", "..", "data", file)]
    end

    class << self
      #
      # The list of dictionaries used by RMMSeg. Every entry has the
      # form
      #
      #   [type, path]
      #
      # where +type+ is either <tt>:chars</tt> or <tt>:words</tt> and
      # +path+ is the path to the dictionary file.
      #
      # A <tt>:chars</tt> dictionary is a collection of lines of the
      # form
      #
      #   freq char
      #
      # where +freq+ is a number <b>less than 65535</b> and +char+ is
      # the character. The two are separated by <b>exactly one
      # space</b>.
      #
      # A <tt>:words</tt> dictionary looks similar:
      #
      #   length word
      #
      # but the leading number is the number of characters (not the
      # number of bytes) in the word rather than a frequency.
      #
      # The script convert.rb in the tools directory can be used to
      # convert and normalize dictionaries.
      attr_accessor :dictionaries

      # Register a user defined dictionary located at +path+. +type+
      # is either +:chars+ or <tt>:words</tt>; see the documentation
      # of dictionaries for the file formats.
      def add_dictionary(path, type)
        @dictionaries.push([type, path])
      end

      # Load every registered dictionary. Call this once the paths of
      # all needed dictionaries are set up and before any Algorithm
      # object is created. Entries whose type is neither +:chars+ nor
      # +:words+ are skipped.
      def load_dictionaries()
        @dictionaries.each do |type, path|
          case type
          when :chars then load_chars(path)
          when :words then load_words(path)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,64 @@
1
+ require 'rubygems'
2
+ require 'rmmseg'
3
+ require 'ferret'
4
+
5
module RMMSeg
  module Ferret
    # An analyzer for Ferret that tokenizes text with RMMSeg.
    class Analyzer < ::Ferret::Analysis::Analyzer

      # Create an Analyzer. An optional block receives the tokenizer
      # and may wrap it in further +TokenFilter+s, e.g.
      #
      #   analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
      #     Ferret::Analysis::LowerCaseFilter.new(tokenizer)
      #   }
      #
      def initialize(&brk)
        @brk = brk
      end

      # Build the token stream for +text+: a Tokenizer, passed through
      # the block given to the constructor when there is one. +field+
      # is not used.
      def token_stream(field, text)
        tokenizer = Tokenizer.new(text)
        return tokenizer unless @brk
        @brk.call(tokenizer)
      end
    end

    # A Ferret token stream that splits text with RMMSeg::Algorithm.
    class Tokenizer < ::Ferret::Analysis::TokenStream
      # The text being tokenized
      attr_reader :text

      # Create a new Tokenizer to tokenize +str+
      def initialize(str)
        self.text = str
      end

      # Return the next token, or +nil+ once the algorithm yields no
      # more tokens. The same Token object is updated and returned on
      # every call.
      def next
        segment = @algor.next_token
        return nil if segment.nil?

        @token.text = segment.text
        @token.start = segment.start
        @token.end = segment.end
        @token
      end

      # Set the text to be tokenized. This also resets the reusable
      # token and starts a fresh Algorithm on the new text.
      def text=(str)
        @token = ::Ferret::Analysis::Token.new("", 0, 0)
        @text = str
        @algor = Algorithm.new(str)
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,68 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rmmseg-cpp-new
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.3.0
5
+ platform: ruby
6
+ authors:
7
+ - pluskid
8
+ - seoaqua
9
+ - ChienliMa
10
+ - Eric Guo
11
+ autorequire:
12
+ bindir: bin
13
+ cert_chain: []
14
+ date: 2016-04-16 00:00:00.000000000 Z
15
+ dependencies: []
16
+ description:
17
+ email:
18
+ - pluskid@gmail.com;seoaqua@qq.com;maqianlie@gmail.com;eric.guocz@gmail.com
19
+ executables: []
20
+ extensions:
21
+ - ext/rmmseg/extconf.rb
22
+ extra_rdoc_files: []
23
+ files:
24
+ - LICENSE
25
+ - README.md
26
+ - bin/rmmseg
27
+ - data/chars.dic
28
+ - data/words.dic
29
+ - ext/rmmseg/algor.cpp
30
+ - ext/rmmseg/algor.h
31
+ - ext/rmmseg/chunk.h
32
+ - ext/rmmseg/dict.cpp
33
+ - ext/rmmseg/dict.h
34
+ - ext/rmmseg/extconf.rb
35
+ - ext/rmmseg/memory.cpp
36
+ - ext/rmmseg/memory.h
37
+ - ext/rmmseg/rmmseg.cpp
38
+ - ext/rmmseg/rules.h
39
+ - ext/rmmseg/token.h
40
+ - ext/rmmseg/word.h
41
+ - lib/rmmseg-cpp-new.rb
42
+ - lib/rmmseg/dictionary.rb
43
+ - lib/rmmseg/ferret.rb
44
+ homepage: https://github.com/Eric-Guo/rmmseg-cpp-new
45
+ licenses:
46
+ - MIT
47
+ metadata: {}
48
+ post_install_message:
49
+ rdoc_options: []
50
+ require_paths:
51
+ - lib
52
+ required_ruby_version: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: '0'
57
+ required_rubygems_version: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ requirements: []
63
+ rubyforge_project:
64
+ rubygems_version: 2.4.8
65
+ signing_key:
66
+ specification_version: 4
67
+ summary: rmmseg-cpp new born to support windows as well
68
+ test_files: []