mittens 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +30 -0
- data/README.md +62 -0
- data/Rakefile +21 -0
- data/ext/mittens/ext.c +96 -0
- data/ext/mittens/extconf.rb +12 -0
- data/lib/mittens/version.rb +3 -0
- data/lib/mittens.rb +7 -0
- data/mittens.gemspec +22 -0
- data/vendor/snowball/.gitignore +26 -0
- data/vendor/snowball/.travis.yml +112 -0
- data/vendor/snowball/AUTHORS +27 -0
- data/vendor/snowball/CONTRIBUTING.rst +216 -0
- data/vendor/snowball/COPYING +29 -0
- data/vendor/snowball/GNUmakefile +742 -0
- data/vendor/snowball/NEWS +754 -0
- data/vendor/snowball/README.rst +37 -0
- data/vendor/snowball/ada/README.md +74 -0
- data/vendor/snowball/ada/generate/generate.adb +83 -0
- data/vendor/snowball/ada/generate.gpr +21 -0
- data/vendor/snowball/ada/src/stemmer.adb +620 -0
- data/vendor/snowball/ada/src/stemmer.ads +219 -0
- data/vendor/snowball/ada/src/stemwords.adb +70 -0
- data/vendor/snowball/ada/stemmer_config.gpr +83 -0
- data/vendor/snowball/ada/stemwords.gpr +21 -0
- data/vendor/snowball/algorithms/arabic.sbl +558 -0
- data/vendor/snowball/algorithms/armenian.sbl +301 -0
- data/vendor/snowball/algorithms/basque.sbl +149 -0
- data/vendor/snowball/algorithms/catalan.sbl +202 -0
- data/vendor/snowball/algorithms/danish.sbl +93 -0
- data/vendor/snowball/algorithms/dutch.sbl +164 -0
- data/vendor/snowball/algorithms/english.sbl +229 -0
- data/vendor/snowball/algorithms/finnish.sbl +197 -0
- data/vendor/snowball/algorithms/french.sbl +254 -0
- data/vendor/snowball/algorithms/german.sbl +139 -0
- data/vendor/snowball/algorithms/german2.sbl +145 -0
- data/vendor/snowball/algorithms/greek.sbl +701 -0
- data/vendor/snowball/algorithms/hindi.sbl +323 -0
- data/vendor/snowball/algorithms/hungarian.sbl +241 -0
- data/vendor/snowball/algorithms/indonesian.sbl +192 -0
- data/vendor/snowball/algorithms/irish.sbl +149 -0
- data/vendor/snowball/algorithms/italian.sbl +202 -0
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
- data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
- data/vendor/snowball/algorithms/lovins.sbl +208 -0
- data/vendor/snowball/algorithms/nepali.sbl +92 -0
- data/vendor/snowball/algorithms/norwegian.sbl +80 -0
- data/vendor/snowball/algorithms/porter.sbl +139 -0
- data/vendor/snowball/algorithms/portuguese.sbl +218 -0
- data/vendor/snowball/algorithms/romanian.sbl +236 -0
- data/vendor/snowball/algorithms/russian.sbl +221 -0
- data/vendor/snowball/algorithms/serbian.sbl +2379 -0
- data/vendor/snowball/algorithms/spanish.sbl +230 -0
- data/vendor/snowball/algorithms/swedish.sbl +72 -0
- data/vendor/snowball/algorithms/tamil.sbl +405 -0
- data/vendor/snowball/algorithms/turkish.sbl +470 -0
- data/vendor/snowball/algorithms/yiddish.sbl +460 -0
- data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
- data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
- data/vendor/snowball/charsets/cp850.sbl +130 -0
- data/vendor/snowball/compiler/analyser.c +1547 -0
- data/vendor/snowball/compiler/driver.c +615 -0
- data/vendor/snowball/compiler/generator.c +1748 -0
- data/vendor/snowball/compiler/generator_ada.c +1702 -0
- data/vendor/snowball/compiler/generator_csharp.c +1322 -0
- data/vendor/snowball/compiler/generator_go.c +1278 -0
- data/vendor/snowball/compiler/generator_java.c +1313 -0
- data/vendor/snowball/compiler/generator_js.c +1316 -0
- data/vendor/snowball/compiler/generator_pascal.c +1387 -0
- data/vendor/snowball/compiler/generator_python.c +1337 -0
- data/vendor/snowball/compiler/generator_rust.c +1295 -0
- data/vendor/snowball/compiler/header.h +418 -0
- data/vendor/snowball/compiler/space.c +286 -0
- data/vendor/snowball/compiler/syswords.h +86 -0
- data/vendor/snowball/compiler/syswords2.h +13 -0
- data/vendor/snowball/compiler/tokeniser.c +567 -0
- data/vendor/snowball/csharp/.gitignore +8 -0
- data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
- data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
- data/vendor/snowball/csharp/Stemwords/App.config +6 -0
- data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
- data/vendor/snowball/doc/TODO +12 -0
- data/vendor/snowball/doc/libstemmer_c_README +148 -0
- data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
- data/vendor/snowball/doc/libstemmer_java_README +67 -0
- data/vendor/snowball/doc/libstemmer_js_README +48 -0
- data/vendor/snowball/doc/libstemmer_python_README +113 -0
- data/vendor/snowball/examples/stemwords.c +204 -0
- data/vendor/snowball/go/README.md +55 -0
- data/vendor/snowball/go/among.go +16 -0
- data/vendor/snowball/go/env.go +403 -0
- data/vendor/snowball/go/stemwords/generate.go +68 -0
- data/vendor/snowball/go/stemwords/main.go +68 -0
- data/vendor/snowball/go/util.go +34 -0
- data/vendor/snowball/iconv.py +50 -0
- data/vendor/snowball/include/libstemmer.h +78 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
- data/vendor/snowball/javascript/base-stemmer.js +294 -0
- data/vendor/snowball/javascript/stemwords.js +106 -0
- data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
- data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
- data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
- data/vendor/snowball/libstemmer/modules.txt +63 -0
- data/vendor/snowball/libstemmer/test.c +34 -0
- data/vendor/snowball/pascal/.gitignore +4 -0
- data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
- data/vendor/snowball/pascal/generate.pl +23 -0
- data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
- data/vendor/snowball/python/MANIFEST.in +7 -0
- data/vendor/snowball/python/create_init.py +54 -0
- data/vendor/snowball/python/setup.cfg +6 -0
- data/vendor/snowball/python/setup.py +81 -0
- data/vendor/snowball/python/snowballstemmer/among.py +13 -0
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
- data/vendor/snowball/python/stemwords.py +101 -0
- data/vendor/snowball/python/testapp.py +28 -0
- data/vendor/snowball/runtime/api.c +58 -0
- data/vendor/snowball/runtime/api.h +32 -0
- data/vendor/snowball/runtime/header.h +61 -0
- data/vendor/snowball/runtime/utilities.c +513 -0
- data/vendor/snowball/rust/Cargo.toml +7 -0
- data/vendor/snowball/rust/build.rs +55 -0
- data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
- data/vendor/snowball/rust/src/main.rs +102 -0
- data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
- data/vendor/snowball/rust/src/snowball/among.rs +6 -0
- data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
- data/vendor/snowball/tests/stemtest.c +95 -0
- metadata +178 -0
@@ -0,0 +1,204 @@
|
|
1
|
+
/* This is a simple program which uses libstemmer to provide a command
|
2
|
+
* line interface for stemming using any of the algorithms provided.
|
3
|
+
*/
|
4
|
+
|
5
|
+
#include <stdio.h>
|
6
|
+
#include <stdlib.h> /* for malloc, free */
|
7
|
+
#include <string.h> /* for memmove */
|
8
|
+
#include <ctype.h> /* for isupper, tolower */
|
9
|
+
|
10
|
+
#include "libstemmer.h"
|
11
|
+
|
12
|
+
const char * progname;
|
13
|
+
static int pretty = 1;
|
14
|
+
|
15
|
+
static void
|
16
|
+
stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out)
|
17
|
+
{
|
18
|
+
#define INC 10
|
19
|
+
int lim = INC;
|
20
|
+
sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol));
|
21
|
+
|
22
|
+
while (1) {
|
23
|
+
int ch = getc(f_in);
|
24
|
+
if (ch == EOF) {
|
25
|
+
free(b); return;
|
26
|
+
}
|
27
|
+
{
|
28
|
+
int i = 0;
|
29
|
+
int inlen = 0;
|
30
|
+
while (ch != '\n' && ch != EOF) {
|
31
|
+
if (i == lim) {
|
32
|
+
sb_symbol * newb;
|
33
|
+
newb = (sb_symbol *)
|
34
|
+
realloc(b, (lim + INC) * sizeof(sb_symbol));
|
35
|
+
if (newb == 0) goto error;
|
36
|
+
b = newb;
|
37
|
+
lim = lim + INC;
|
38
|
+
}
|
39
|
+
/* Update count of utf-8 characters. */
|
40
|
+
if (ch < 0x80 || ch > 0xBF) inlen += 1;
|
41
|
+
/* force lower case: */
|
42
|
+
ch = tolower(ch);
|
43
|
+
|
44
|
+
b[i] = ch;
|
45
|
+
i++;
|
46
|
+
ch = getc(f_in);
|
47
|
+
}
|
48
|
+
|
49
|
+
{
|
50
|
+
const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i);
|
51
|
+
if (stemmed == NULL)
|
52
|
+
{
|
53
|
+
fprintf(stderr, "Out of memory");
|
54
|
+
exit(1);
|
55
|
+
}
|
56
|
+
|
57
|
+
if (pretty == 1) {
|
58
|
+
fwrite(b, i, 1, f_out);
|
59
|
+
fputs(" -> ", f_out);
|
60
|
+
} else if (pretty == 2) {
|
61
|
+
fwrite(b, i, 1, f_out);
|
62
|
+
if (sb_stemmer_length(stemmer) > 0) {
|
63
|
+
int j;
|
64
|
+
if (inlen < 30) {
|
65
|
+
for (j = 30 - inlen; j > 0; j--)
|
66
|
+
fputs(" ", f_out);
|
67
|
+
} else {
|
68
|
+
fputs("\n", f_out);
|
69
|
+
for (j = 30; j > 0; j--)
|
70
|
+
fputs(" ", f_out);
|
71
|
+
}
|
72
|
+
}
|
73
|
+
}
|
74
|
+
|
75
|
+
fputs((const char *)stemmed, f_out);
|
76
|
+
putc('\n', f_out);
|
77
|
+
}
|
78
|
+
}
|
79
|
+
}
|
80
|
+
error:
|
81
|
+
if (b != 0) free(b);
|
82
|
+
return;
|
83
|
+
}
|
84
|
+
|
85
|
+
/** Display the command line syntax, and then exit.
|
86
|
+
* @param n The value to exit with.
|
87
|
+
*/
|
88
|
+
static void
|
89
|
+
usage(int n)
|
90
|
+
{
|
91
|
+
printf("usage: %s [-l <language>] [-i <input file>] [-o <output file>] [-c <character encoding>] [-p[2]] [-h]\n"
|
92
|
+
"\n"
|
93
|
+
"The input file consists of a list of words to be stemmed, one per\n"
|
94
|
+
"line. Words should be in lower case, but (for English) A-Z letters\n"
|
95
|
+
"are mapped to their a-z equivalents anyway. If omitted, stdin is\n"
|
96
|
+
"used.\n"
|
97
|
+
"\n"
|
98
|
+
"If -c is given, the argument is the character encoding of the input\n"
|
99
|
+
"and output files. If it is omitted, the UTF-8 encoding is used.\n"
|
100
|
+
"\n"
|
101
|
+
"If -p is given the output file consists of each word of the input\n"
|
102
|
+
"file followed by \"->\" followed by its stemmed equivalent.\n"
|
103
|
+
"If -p2 is given the output file is a two column layout containing\n"
|
104
|
+
"the input words in the first column and the stemmed equivalents in\n"
|
105
|
+
"the second column.\n"
|
106
|
+
"Otherwise, the output file consists of the stemmed words, one per\n"
|
107
|
+
"line.\n"
|
108
|
+
"\n"
|
109
|
+
"-h displays this help\n",
|
110
|
+
progname);
|
111
|
+
exit(n);
|
112
|
+
}
|
113
|
+
|
114
|
+
int
|
115
|
+
main(int argc, char * argv[])
|
116
|
+
{
|
117
|
+
const char * in = 0;
|
118
|
+
const char * out = 0;
|
119
|
+
FILE * f_in;
|
120
|
+
FILE * f_out;
|
121
|
+
struct sb_stemmer * stemmer;
|
122
|
+
|
123
|
+
const char * language = "english";
|
124
|
+
const char * charenc = NULL;
|
125
|
+
|
126
|
+
int i = 1;
|
127
|
+
pretty = 0;
|
128
|
+
|
129
|
+
progname = argv[0];
|
130
|
+
|
131
|
+
while (i < argc) {
|
132
|
+
const char * s = argv[i++];
|
133
|
+
if (s[0] == '-') {
|
134
|
+
if (strcmp(s, "-o") == 0) {
|
135
|
+
if (i >= argc) {
|
136
|
+
fprintf(stderr, "%s requires an argument\n", s);
|
137
|
+
exit(1);
|
138
|
+
}
|
139
|
+
out = argv[i++];
|
140
|
+
} else if (strcmp(s, "-i") == 0) {
|
141
|
+
if (i >= argc) {
|
142
|
+
fprintf(stderr, "%s requires an argument\n", s);
|
143
|
+
exit(1);
|
144
|
+
}
|
145
|
+
in = argv[i++];
|
146
|
+
} else if (strcmp(s, "-l") == 0) {
|
147
|
+
if (i >= argc) {
|
148
|
+
fprintf(stderr, "%s requires an argument\n", s);
|
149
|
+
exit(1);
|
150
|
+
}
|
151
|
+
language = argv[i++];
|
152
|
+
} else if (strcmp(s, "-c") == 0) {
|
153
|
+
if (i >= argc) {
|
154
|
+
fprintf(stderr, "%s requires an argument\n", s);
|
155
|
+
exit(1);
|
156
|
+
}
|
157
|
+
charenc = argv[i++];
|
158
|
+
} else if (strcmp(s, "-p2") == 0) {
|
159
|
+
pretty = 2;
|
160
|
+
} else if (strcmp(s, "-p") == 0) {
|
161
|
+
pretty = 1;
|
162
|
+
} else if (strcmp(s, "-h") == 0) {
|
163
|
+
usage(0);
|
164
|
+
} else {
|
165
|
+
fprintf(stderr, "option %s unknown\n", s);
|
166
|
+
usage(1);
|
167
|
+
}
|
168
|
+
} else {
|
169
|
+
fprintf(stderr, "unexpected parameter %s\n", s);
|
170
|
+
usage(1);
|
171
|
+
}
|
172
|
+
}
|
173
|
+
|
174
|
+
/* prepare the files */
|
175
|
+
f_in = (in == 0) ? stdin : fopen(in, "r");
|
176
|
+
if (f_in == 0) {
|
177
|
+
fprintf(stderr, "file %s not found\n", in);
|
178
|
+
exit(1);
|
179
|
+
}
|
180
|
+
f_out = (out == 0) ? stdout : fopen(out, "w");
|
181
|
+
if (f_out == 0) {
|
182
|
+
fprintf(stderr, "file %s cannot be opened\n", out);
|
183
|
+
exit(1);
|
184
|
+
}
|
185
|
+
|
186
|
+
/* do the stemming process: */
|
187
|
+
stemmer = sb_stemmer_new(language, charenc);
|
188
|
+
if (stemmer == 0) {
|
189
|
+
if (charenc == NULL) {
|
190
|
+
fprintf(stderr, "language `%s' not available for stemming\n", language);
|
191
|
+
exit(1);
|
192
|
+
} else {
|
193
|
+
fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc);
|
194
|
+
exit(1);
|
195
|
+
}
|
196
|
+
}
|
197
|
+
stem_file(stemmer, f_in, f_out);
|
198
|
+
sb_stemmer_delete(stemmer);
|
199
|
+
|
200
|
+
if (in != 0) (void) fclose(f_in);
|
201
|
+
if (out != 0) (void) fclose(f_out);
|
202
|
+
|
203
|
+
return 0;
|
204
|
+
}
|
@@ -0,0 +1,55 @@
|
|
1
|
+
# Go Target for Snowball
|
2
|
+
|
3
|
+
The initial implementation was built as a port of the Rust target. The initial focus has been on getting it to function, and making it work correctly. No attempt has been made to beautify the implementation, generated code, or address performance issues.
|
4
|
+
|
5
|
+
## Usage
|
6
|
+
|
7
|
+
To generate Go source for a Snowball algorithm:
|
8
|
+
```
|
9
|
+
$ snowball path/to/algorithm.sbl -go -o algorithm
|
10
|
+
```
|
11
|
+
|
12
|
+
### Go specific options
|
13
|
+
|
14
|
+
`-gop[ackage]` the package name used in the generated go file (defaults to `snowball`)
|
15
|
+
|
16
|
+
`-gor[untime]` the import path used for the Go Snowball runtime (defaults to `github.com/snowballstem/snowball/go`)
|
17
|
+
|
18
|
+
## Code Organization
|
19
|
+
|
20
|
+
`compiler/generator_go.c` has the Go code generation logic
|
21
|
+
|
22
|
+
`go/` contains the default Go Snowball runtime support
|
23
|
+
|
24
|
+
`go/stemwords` contains the source for a Go version of the stemwords utility
|
25
|
+
|
26
|
+
`go/algorithms` is the location where the makefile-generated code will end up
|
27
|
+
|
28
|
+
## Using the Generated Stemmers
|
29
|
+
|
30
|
+
Assuming you generated a stemmer, put that code in a package which is imported by this code as `english`.
|
31
|
+
|
32
|
+
```
|
33
|
+
env := snowball.NewEnv("beautiful")
|
34
|
+
english.Stem(env)
|
35
|
+
fmt.Printf("stemmed word is: %s", env.Current())
|
36
|
+
```
|
37
|
+
|
38
|
+
NOTE: you can call `env.SetCurrent("new_word")` to reuse the env on subsequent calls to the stemmer.
|
39
|
+
|
40
|
+
## Testing
|
41
|
+
|
42
|
+
Only the existing Snowball algorithms have been used for testing. This does not exercise all features of the language.
|
43
|
+
|
44
|
+
Run:
|
45
|
+
|
46
|
+
```
|
47
|
+
$ make check_go
|
48
|
+
```
|
49
|
+
|
50
|
+
An initial pass of fuzz-testing has been performed on the generated stemmers for the algorithms in this repo. Each ran for 5 minutes and used an initial corpus seeded with 10k words from the algorithm's snowballstem-data voc.txt file.
|
51
|
+
|
52
|
+
## Known Limitations
|
53
|
+
|
54
|
+
- Code going through generate_dollar production has not been tested
|
55
|
+
- Code going through generate_debug production has not been tested
|
@@ -0,0 +1,16 @@
|
|
1
|
+
package snowball
|
2
|
+
|
3
|
+
import "fmt"
|
4
|
+
|
5
|
+
type AmongF func(env *Env, ctx interface{}) bool
|
6
|
+
|
7
|
+
type Among struct {
|
8
|
+
Str string
|
9
|
+
A int32
|
10
|
+
B int32
|
11
|
+
F AmongF
|
12
|
+
}
|
13
|
+
|
14
|
+
func (a *Among) String() string {
|
15
|
+
return fmt.Sprintf("str: `%s`, a: %d, b: %d, f: %p", a.Str, a.A, a.B, a.F)
|
16
|
+
}
|
@@ -0,0 +1,403 @@
|
|
1
|
+
package snowball
|
2
|
+
|
3
|
+
import (
|
4
|
+
"log"
|
5
|
+
"strings"
|
6
|
+
"unicode/utf8"
|
7
|
+
)
|
8
|
+
|
9
|
+
// Env represents the Snowball execution environment: the string being
// processed together with the byte offsets that the generated stemmers
// manipulate.
type Env struct {
	current       string // text being worked on
	Cursor        int    // current position
	Limit         int    // forward bound; starts at len(current)
	LimitBackward int    // backward bound; starts at 0
	Bra           int    // start of the marked slice
	Ket           int    // end of the marked slice
}

// NewEnv creates a new Snowball execution environment on the provided string.
func NewEnv(val string) *Env {
	env := new(Env)
	env.SetCurrent(val)
	return env
}

// Current returns the string currently held by the environment.
func (env *Env) Current() string {
	return env.current
}

// SetCurrent replaces the string held by the environment and resets every
// offset, so the environment can be reused for another word.
func (env *Env) SetCurrent(s string) {
	end := len(s)
	env.current = s
	env.Cursor, env.LimitBackward, env.Bra = 0, 0, 0
	env.Limit, env.Ket = end, end
}
|
43
|
+
|
44
|
+
func (env *Env) ReplaceS(bra, ket int, s string) int32 {
|
45
|
+
adjustment := int32(len(s)) - (int32(ket) - int32(bra))
|
46
|
+
result, _ := splitAt(env.current, bra)
|
47
|
+
rsplit := ket
|
48
|
+
if ket < bra {
|
49
|
+
rsplit = bra
|
50
|
+
}
|
51
|
+
_, rhs := splitAt(env.current, rsplit)
|
52
|
+
result += s
|
53
|
+
result += rhs
|
54
|
+
|
55
|
+
newLim := int32(env.Limit) + adjustment
|
56
|
+
env.Limit = int(newLim)
|
57
|
+
|
58
|
+
if env.Cursor >= ket {
|
59
|
+
newCur := int32(env.Cursor) + adjustment
|
60
|
+
env.Cursor = int(newCur)
|
61
|
+
} else if env.Cursor > bra {
|
62
|
+
env.Cursor = bra
|
63
|
+
}
|
64
|
+
|
65
|
+
env.current = result
|
66
|
+
return adjustment
|
67
|
+
}
|
68
|
+
|
69
|
+
func (env *Env) EqS(s string) bool {
|
70
|
+
if env.Cursor >= env.Limit {
|
71
|
+
return false
|
72
|
+
}
|
73
|
+
|
74
|
+
if strings.HasPrefix(env.current[env.Cursor:], s) {
|
75
|
+
env.Cursor += len(s)
|
76
|
+
for !onCharBoundary(env.current, env.Cursor) {
|
77
|
+
env.Cursor++
|
78
|
+
}
|
79
|
+
return true
|
80
|
+
}
|
81
|
+
return false
|
82
|
+
}
|
83
|
+
|
84
|
+
func (env *Env) EqSB(s string) bool {
|
85
|
+
if int32(env.Cursor)-int32(env.LimitBackward) < int32(len(s)) {
|
86
|
+
return false
|
87
|
+
} else if !onCharBoundary(env.current, env.Cursor-len(s)) ||
|
88
|
+
!strings.HasPrefix(env.current[env.Cursor-len(s):], s) {
|
89
|
+
return false
|
90
|
+
} else {
|
91
|
+
env.Cursor -= len(s)
|
92
|
+
return true
|
93
|
+
}
|
94
|
+
}
|
95
|
+
|
96
|
+
func (env *Env) SliceFrom(s string) bool {
|
97
|
+
bra, ket := env.Bra, env.Ket
|
98
|
+
env.ReplaceS(bra, ket, s)
|
99
|
+
return true
|
100
|
+
}
|
101
|
+
|
102
|
+
func (env *Env) NextChar() {
|
103
|
+
env.Cursor++
|
104
|
+
for !onCharBoundary(env.current, env.Cursor) {
|
105
|
+
env.Cursor++
|
106
|
+
}
|
107
|
+
}
|
108
|
+
|
109
|
+
func (env *Env) PrevChar() {
|
110
|
+
env.Cursor--
|
111
|
+
for !onCharBoundary(env.current, env.Cursor) {
|
112
|
+
env.Cursor--
|
113
|
+
}
|
114
|
+
}
|
115
|
+
|
116
|
+
func (env *Env) Hop(delta int32) bool {
|
117
|
+
res := env.Cursor
|
118
|
+
for delta > 0 {
|
119
|
+
delta--
|
120
|
+
if res >= env.Limit {
|
121
|
+
return false
|
122
|
+
}
|
123
|
+
res++
|
124
|
+
for res < env.Limit && !onCharBoundary(env.current, res) {
|
125
|
+
res++
|
126
|
+
}
|
127
|
+
}
|
128
|
+
env.Cursor = res
|
129
|
+
return true
|
130
|
+
}
|
131
|
+
|
132
|
+
func (env *Env) HopChecked(delta int32) bool {
|
133
|
+
return delta >= 0 && env.Hop(delta)
|
134
|
+
}
|
135
|
+
|
136
|
+
func (env *Env) HopBack(delta int32) bool {
|
137
|
+
res := env.Cursor
|
138
|
+
for delta > 0 {
|
139
|
+
delta--
|
140
|
+
if res <= env.LimitBackward {
|
141
|
+
return false
|
142
|
+
}
|
143
|
+
res--
|
144
|
+
for res > env.LimitBackward && !onCharBoundary(env.current, res) {
|
145
|
+
res--
|
146
|
+
}
|
147
|
+
}
|
148
|
+
env.Cursor = res
|
149
|
+
return true
|
150
|
+
}
|
151
|
+
|
152
|
+
func (env *Env) HopBackChecked(delta int32) bool {
|
153
|
+
return delta >= 0 && env.HopBack(delta)
|
154
|
+
}
|
155
|
+
|
156
|
+
func (env *Env) InGrouping(chars []byte, min, max int32) bool {
|
157
|
+
if env.Cursor >= env.Limit {
|
158
|
+
return false
|
159
|
+
}
|
160
|
+
|
161
|
+
r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
|
162
|
+
if r != utf8.RuneError {
|
163
|
+
if r > max || r < min {
|
164
|
+
return false
|
165
|
+
}
|
166
|
+
r -= min
|
167
|
+
if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
|
168
|
+
return false
|
169
|
+
}
|
170
|
+
env.NextChar()
|
171
|
+
return true
|
172
|
+
}
|
173
|
+
return false
|
174
|
+
}
|
175
|
+
|
176
|
+
func (env *Env) InGroupingB(chars []byte, min, max int32) bool {
|
177
|
+
if env.Cursor <= env.LimitBackward {
|
178
|
+
return false
|
179
|
+
}
|
180
|
+
env.PrevChar()
|
181
|
+
r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
|
182
|
+
if r != utf8.RuneError {
|
183
|
+
env.NextChar()
|
184
|
+
if r > max || r < min {
|
185
|
+
return false
|
186
|
+
}
|
187
|
+
r -= min
|
188
|
+
if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
|
189
|
+
return false
|
190
|
+
}
|
191
|
+
env.PrevChar()
|
192
|
+
return true
|
193
|
+
}
|
194
|
+
return false
|
195
|
+
}
|
196
|
+
|
197
|
+
func (env *Env) OutGrouping(chars []byte, min, max int32) bool {
|
198
|
+
if env.Cursor >= env.Limit {
|
199
|
+
return false
|
200
|
+
}
|
201
|
+
r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
|
202
|
+
if r != utf8.RuneError {
|
203
|
+
if r > max || r < min {
|
204
|
+
env.NextChar()
|
205
|
+
return true
|
206
|
+
}
|
207
|
+
r -= min
|
208
|
+
if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
|
209
|
+
env.NextChar()
|
210
|
+
return true
|
211
|
+
}
|
212
|
+
}
|
213
|
+
return false
|
214
|
+
}
|
215
|
+
|
216
|
+
func (env *Env) OutGroupingB(chars []byte, min, max int32) bool {
|
217
|
+
if env.Cursor <= env.LimitBackward {
|
218
|
+
return false
|
219
|
+
}
|
220
|
+
env.PrevChar()
|
221
|
+
r, _ := utf8.DecodeRuneInString(env.current[env.Cursor:])
|
222
|
+
if r != utf8.RuneError {
|
223
|
+
env.NextChar()
|
224
|
+
if r > max || r < min {
|
225
|
+
env.PrevChar()
|
226
|
+
return true
|
227
|
+
}
|
228
|
+
r -= min
|
229
|
+
if (chars[uint(r>>3)] & (0x1 << uint(r&0x7))) == 0 {
|
230
|
+
env.PrevChar()
|
231
|
+
return true
|
232
|
+
}
|
233
|
+
}
|
234
|
+
return false
|
235
|
+
}
|
236
|
+
|
237
|
+
func (env *Env) SliceDel() bool {
|
238
|
+
return env.SliceFrom("")
|
239
|
+
}
|
240
|
+
|
241
|
+
func (env *Env) Insert(bra, ket int, s string) {
|
242
|
+
adjustment := env.ReplaceS(bra, ket, s)
|
243
|
+
if bra <= env.Bra {
|
244
|
+
env.Bra = int(int32(env.Bra) + adjustment)
|
245
|
+
}
|
246
|
+
if bra <= env.Ket {
|
247
|
+
env.Ket = int(int32(env.Ket) + adjustment)
|
248
|
+
}
|
249
|
+
}
|
250
|
+
|
251
|
+
func (env *Env) SliceTo() string {
|
252
|
+
return env.current[env.Bra:env.Ket]
|
253
|
+
}
|
254
|
+
|
255
|
+
func (env *Env) FindAmong(amongs []*Among, ctx interface{}) int32 {
|
256
|
+
var i int32
|
257
|
+
j := int32(len(amongs))
|
258
|
+
|
259
|
+
c := env.Cursor
|
260
|
+
l := env.Limit
|
261
|
+
|
262
|
+
var commonI, commonJ int
|
263
|
+
|
264
|
+
firstKeyInspected := false
|
265
|
+
for {
|
266
|
+
k := i + ((j - i) >> 1)
|
267
|
+
var diff int32
|
268
|
+
common := min(commonI, commonJ)
|
269
|
+
w := amongs[k]
|
270
|
+
for lvar := common; lvar < len(w.Str); lvar++ {
|
271
|
+
if c+common == l {
|
272
|
+
diff--
|
273
|
+
break
|
274
|
+
}
|
275
|
+
diff = int32(env.current[c+common]) - int32(w.Str[lvar])
|
276
|
+
if diff != 0 {
|
277
|
+
break
|
278
|
+
}
|
279
|
+
common++
|
280
|
+
}
|
281
|
+
if diff < 0 {
|
282
|
+
j = k
|
283
|
+
commonJ = common
|
284
|
+
} else {
|
285
|
+
i = k
|
286
|
+
commonI = common
|
287
|
+
}
|
288
|
+
if j-i <= 1 {
|
289
|
+
if i > 0 {
|
290
|
+
break
|
291
|
+
}
|
292
|
+
if j == i {
|
293
|
+
break
|
294
|
+
}
|
295
|
+
if firstKeyInspected {
|
296
|
+
break
|
297
|
+
}
|
298
|
+
firstKeyInspected = true
|
299
|
+
}
|
300
|
+
}
|
301
|
+
|
302
|
+
for {
|
303
|
+
w := amongs[i]
|
304
|
+
if commonI >= len(w.Str) {
|
305
|
+
env.Cursor = c + len(w.Str)
|
306
|
+
if w.F != nil {
|
307
|
+
res := w.F(env, ctx)
|
308
|
+
env.Cursor = c + len(w.Str)
|
309
|
+
if res {
|
310
|
+
return w.B
|
311
|
+
}
|
312
|
+
} else {
|
313
|
+
return w.B
|
314
|
+
}
|
315
|
+
}
|
316
|
+
i = w.A
|
317
|
+
if i < 0 {
|
318
|
+
return 0
|
319
|
+
}
|
320
|
+
}
|
321
|
+
}
|
322
|
+
|
323
|
+
func (env *Env) FindAmongB(amongs []*Among, ctx interface{}) int32 {
|
324
|
+
var i int32
|
325
|
+
j := int32(len(amongs))
|
326
|
+
|
327
|
+
c := env.Cursor
|
328
|
+
lb := env.LimitBackward
|
329
|
+
|
330
|
+
var commonI, commonJ int
|
331
|
+
|
332
|
+
firstKeyInspected := false
|
333
|
+
|
334
|
+
for {
|
335
|
+
k := i + ((j - i) >> 1)
|
336
|
+
diff := int32(0)
|
337
|
+
common := min(commonI, commonJ)
|
338
|
+
w := amongs[k]
|
339
|
+
for lvar := len(w.Str) - int(common) - 1; lvar >= 0; lvar-- {
|
340
|
+
if c-common == lb {
|
341
|
+
diff--
|
342
|
+
break
|
343
|
+
}
|
344
|
+
diff = int32(env.current[c-common-1]) - int32(w.Str[lvar])
|
345
|
+
if diff != 0 {
|
346
|
+
break
|
347
|
+
}
|
348
|
+
// Count up commons. But not one character but the byte width of that char
|
349
|
+
common++
|
350
|
+
}
|
351
|
+
if diff < 0 {
|
352
|
+
j = k
|
353
|
+
commonJ = common
|
354
|
+
} else {
|
355
|
+
i = k
|
356
|
+
commonI = common
|
357
|
+
}
|
358
|
+
if j-i <= 1 {
|
359
|
+
if i > 0 {
|
360
|
+
break
|
361
|
+
}
|
362
|
+
if j == i {
|
363
|
+
break
|
364
|
+
}
|
365
|
+
if firstKeyInspected {
|
366
|
+
break
|
367
|
+
}
|
368
|
+
firstKeyInspected = true
|
369
|
+
}
|
370
|
+
}
|
371
|
+
for {
|
372
|
+
w := amongs[i]
|
373
|
+
if commonI >= len(w.Str) {
|
374
|
+
env.Cursor = c - len(w.Str)
|
375
|
+
if w.F != nil {
|
376
|
+
res := w.F(env, ctx)
|
377
|
+
env.Cursor = c - len(w.Str)
|
378
|
+
if res {
|
379
|
+
return w.B
|
380
|
+
}
|
381
|
+
} else {
|
382
|
+
return w.B
|
383
|
+
}
|
384
|
+
}
|
385
|
+
i = w.A
|
386
|
+
if i < 0 {
|
387
|
+
return 0
|
388
|
+
}
|
389
|
+
}
|
390
|
+
}
|
391
|
+
|
392
|
+
func (env *Env) Debug(count, lineNumber int) {
|
393
|
+
log.Printf("snowball debug, count: %d, line: %d", count, lineNumber)
|
394
|
+
}
|
395
|
+
|
396
|
+
func (env *Env) Clone() *Env {
|
397
|
+
clone := *env
|
398
|
+
return &clone
|
399
|
+
}
|
400
|
+
|
401
|
+
func (env *Env) AssignTo() string {
|
402
|
+
return env.Current()
|
403
|
+
}
|