rmmseg-cpp-new 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +23 -0
- data/README.md +3 -0
- data/bin/rmmseg +63 -0
- data/data/chars.dic +12638 -0
- data/data/words.dic +120308 -0
- data/ext/rmmseg/algor.cpp +253 -0
- data/ext/rmmseg/algor.h +79 -0
- data/ext/rmmseg/chunk.h +59 -0
- data/ext/rmmseg/dict.cpp +230 -0
- data/ext/rmmseg/dict.h +34 -0
- data/ext/rmmseg/extconf.rb +13 -0
- data/ext/rmmseg/memory.cpp +9 -0
- data/ext/rmmseg/memory.h +43 -0
- data/ext/rmmseg/rmmseg.cpp +263 -0
- data/ext/rmmseg/rules.h +86 -0
- data/ext/rmmseg/token.h +19 -0
- data/ext/rmmseg/word.h +44 -0
- data/lib/rmmseg-cpp-new.rb +2 -0
- data/lib/rmmseg/dictionary.rb +59 -0
- data/lib/rmmseg/ferret.rb +64 -0
- metadata +68 -0
data/ext/rmmseg/token.h
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
#ifndef _TOKEN_H_
|
2
|
+
#define _TOKEN_H_
|
3
|
+
|
4
|
+
namespace rmmseg
|
5
|
+
{
|
6
|
+
struct Token
{
    // Pointer to the token's characters. The pointer is stored as given
    // (nothing is copied) and the data it refers to is not guaranteed to
    // be nul-terminated, so always consult `length' for its extent.
    const char *text;

    // Extent of `text'. A value of 0 marks an empty token.
    int length;

    // Build a token that refers to `len' units starting at `txt'.
    Token(const char *txt, int len)
        : text(txt),
          length(len)
    {
    }
};
|
17
|
+
}
|
18
|
+
|
19
|
+
#endif /* _TOKEN_H_ */
|
data/ext/rmmseg/word.h
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
#ifndef _WORD_H_
|
2
|
+
#define _WORD_H_
|
3
|
+
|
4
|
+
#include <climits>
|
5
|
+
#include <cstring>
|
6
|
+
|
7
|
+
#include "memory.h"
|
8
|
+
|
9
|
+
namespace rmmseg
|
10
|
+
{
|
11
|
+
/* Size of the `text' buffer embedded in every Word: enough for at
   least one character (3 bytes) plus the terminating '\0'. */
const int word_embed_len = 4;

/* A word record. `text' is declared with word_embed_len bytes; make_word()
   allocates extra room after the struct when the text is longer. */
struct Word
{
    unsigned char  nbytes;                  /* size of the text in bytes */
    char           length;                  /* size of the text in characters */
    unsigned short freq;                    /* frequency of the word */
    char           text[word_embed_len];    /* nul-terminated text */
};
|
19
|
+
|
20
|
+
/**
|
21
|
+
* text: the text of the word.
|
22
|
+
* length: number of characters (not bytes).
|
23
|
+
* freq: the frequency of the word.
|
24
|
+
*/
|
25
|
+
inline Word *make_word(const char *text, int length=1,
|
26
|
+
int freq=0, int nbytes=-1)
|
27
|
+
{
|
28
|
+
if (freq > USHRT_MAX)
|
29
|
+
freq = USHRT_MAX; /* avoid overflow */
|
30
|
+
if (nbytes == -1)
|
31
|
+
nbytes = std::strlen(text);
|
32
|
+
Word *w = static_cast<Word *>(pool_alloc(sizeof(Word)
|
33
|
+
+ nbytes+1
|
34
|
+
- word_embed_len));
|
35
|
+
w->nbytes = nbytes;
|
36
|
+
w->length = length;
|
37
|
+
w->freq = freq;
|
38
|
+
std::strncpy(w->text, text, nbytes);
|
39
|
+
w->text[nbytes] = '\0';
|
40
|
+
return w;
|
41
|
+
}
|
42
|
+
}
|
43
|
+
|
44
|
+
#endif /* _WORD_H_ */
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module RMMSeg
  module Dictionary
    # Built-in dictionaries, located in the "data" directory two levels
    # above this file, stored as [type, path] pairs.
    @dictionaries = [[:chars, "chars.dic"], [:words, "words.dic"]].map do |type, name|
      [type, File.join(File.dirname(__FILE__), "..", "..", "data", name)]
    end

    class << self
      #
      # The list of dictionaries used by RMMSeg. Every entry has the form
      #
      #   [type, path]
      #
      # where +type+ is either <tt>:chars</tt> or <tt>:words</tt> and
      # +path+ is the path to the dictionary file.
      #
      # A <tt>:chars</tt> dictionary is a collection of lines of the form
      #
      #   freq char
      #
      # where +freq+ is a number <b>less than 65535</b> and +char+ is the
      # character. They are separated by <b>exactly one space</b>.
      #
      # A <tt>:words</tt> dictionary looks similar:
      #
      #   length word
      #
      # but here the leading number is not a frequency; it is the number
      # of characters (not the number of bytes) in the word.
      #
      # There's a script (convert.rb) in the tools directory that can be
      # used to convert and normalize dictionaries.
      attr_accessor :dictionaries

      # Register a user defined dictionary. +type+ is +:chars+ or
      # <tt>:words</tt>; see the documentation of +dictionaries+.
      def add_dictionary(path, type)
        @dictionaries.push([type, path])
      end

      # Load every registered dictionary. Call this once the dictionary
      # paths have been set up and before any Algorithm object is created.
      # Entries whose type is neither +:chars+ nor +:words+ are skipped.
      # +load_chars+ and +load_words+ are not defined in this file.
      def load_dictionaries()
        @dictionaries.each do |type, path|
          case type
          when :chars then load_chars(path)
          when :words then load_words(path)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rmmseg'
|
3
|
+
require 'ferret'
|
4
|
+
|
5
|
+
module RMMSeg
|
6
|
+
module Ferret
|
7
|
+
# An analyzer for use with Ferret that tokenizes text through
# RMMSeg::Ferret::Tokenizer.
class Analyzer < ::Ferret::Analysis::Analyzer

  # Construct an Analyzer. An optional block receives the tokenizer and
  # may wrap it in additional +TokenFilter+s; whatever the block returns
  # becomes the token stream. e.g.
  #
  #   analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
  #     Ferret::Analysis::LowerCaseFilter.new(tokenizer)
  #   }
  #
  def initialize(&brk)
    @brk = brk
  end

  # Build the token stream for +text+. +field+ is accepted but not used.
  def token_stream(field, text)
    tokenizer = Tokenizer.new(text)
    return tokenizer unless @brk

    @brk.call(tokenizer)
  end
end
|
30
|
+
|
31
|
+
# A Ferret token stream that splits its text with RMMSeg::Algorithm.
class Tokenizer < ::Ferret::Analysis::TokenStream
  # The text being tokenized
  attr_reader :text

  # Create a new Tokenizer to tokenize +str+
  def initialize(str)
    self.text = str
  end

  # Fetch the next token, or +nil+ once the algorithm has no more.
  # The same Ferret token object is refilled and returned on each call.
  def next
    tok = @algor.next_token
    return nil if tok.nil?

    @token.text = tok.text
    @token.start = tok.start
    @token.end = tok.end
    @token
  end

  # Set the text to be tokenized. This also resets the reusable token
  # and starts a fresh Algorithm over the new text.
  def text=(str)
    @text = str
    @token = ::Ferret::Analysis::Token.new("", 0, 0)
    @algor = Algorithm.new(@text)
  end
end
|
63
|
+
end
|
64
|
+
end
|
metadata
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rmmseg-cpp-new
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.3.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- pluskid
|
8
|
+
- seoaqua
|
9
|
+
- ChienliMa
|
10
|
+
- Eric Guo
|
11
|
+
autorequire:
|
12
|
+
bindir: bin
|
13
|
+
cert_chain: []
|
14
|
+
date: 2016-04-16 00:00:00.000000000 Z
|
15
|
+
dependencies: []
|
16
|
+
description:
|
17
|
+
email:
|
18
|
+
- pluskid@gmail.com;seoaqua@qq.com;maqianlie@gmail.com;eric.guocz@gmail.com
|
19
|
+
executables: []
|
20
|
+
extensions:
|
21
|
+
- ext/rmmseg/extconf.rb
|
22
|
+
extra_rdoc_files: []
|
23
|
+
files:
|
24
|
+
- LICENSE
|
25
|
+
- README.md
|
26
|
+
- bin/rmmseg
|
27
|
+
- data/chars.dic
|
28
|
+
- data/words.dic
|
29
|
+
- ext/rmmseg/algor.cpp
|
30
|
+
- ext/rmmseg/algor.h
|
31
|
+
- ext/rmmseg/chunk.h
|
32
|
+
- ext/rmmseg/dict.cpp
|
33
|
+
- ext/rmmseg/dict.h
|
34
|
+
- ext/rmmseg/extconf.rb
|
35
|
+
- ext/rmmseg/memory.cpp
|
36
|
+
- ext/rmmseg/memory.h
|
37
|
+
- ext/rmmseg/rmmseg.cpp
|
38
|
+
- ext/rmmseg/rules.h
|
39
|
+
- ext/rmmseg/token.h
|
40
|
+
- ext/rmmseg/word.h
|
41
|
+
- lib/rmmseg-cpp-new.rb
|
42
|
+
- lib/rmmseg/dictionary.rb
|
43
|
+
- lib/rmmseg/ferret.rb
|
44
|
+
homepage: https://github.com/Eric-Guo/rmmseg-cpp-new
|
45
|
+
licenses:
|
46
|
+
- MIT
|
47
|
+
metadata: {}
|
48
|
+
post_install_message:
|
49
|
+
rdoc_options: []
|
50
|
+
require_paths:
|
51
|
+
- lib
|
52
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: '0'
|
57
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
requirements: []
|
63
|
+
rubyforge_project:
|
64
|
+
rubygems_version: 2.4.8
|
65
|
+
signing_key:
|
66
|
+
specification_version: 4
|
67
|
+
summary: rmmseg-cpp new born to support windows as well
|
68
|
+
test_files: []
|