mittens 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +30 -0
- data/README.md +62 -0
- data/Rakefile +21 -0
- data/ext/mittens/ext.c +96 -0
- data/ext/mittens/extconf.rb +12 -0
- data/lib/mittens/version.rb +3 -0
- data/lib/mittens.rb +7 -0
- data/mittens.gemspec +22 -0
- data/vendor/snowball/.gitignore +26 -0
- data/vendor/snowball/.travis.yml +112 -0
- data/vendor/snowball/AUTHORS +27 -0
- data/vendor/snowball/CONTRIBUTING.rst +216 -0
- data/vendor/snowball/COPYING +29 -0
- data/vendor/snowball/GNUmakefile +742 -0
- data/vendor/snowball/NEWS +754 -0
- data/vendor/snowball/README.rst +37 -0
- data/vendor/snowball/ada/README.md +74 -0
- data/vendor/snowball/ada/generate/generate.adb +83 -0
- data/vendor/snowball/ada/generate.gpr +21 -0
- data/vendor/snowball/ada/src/stemmer.adb +620 -0
- data/vendor/snowball/ada/src/stemmer.ads +219 -0
- data/vendor/snowball/ada/src/stemwords.adb +70 -0
- data/vendor/snowball/ada/stemmer_config.gpr +83 -0
- data/vendor/snowball/ada/stemwords.gpr +21 -0
- data/vendor/snowball/algorithms/arabic.sbl +558 -0
- data/vendor/snowball/algorithms/armenian.sbl +301 -0
- data/vendor/snowball/algorithms/basque.sbl +149 -0
- data/vendor/snowball/algorithms/catalan.sbl +202 -0
- data/vendor/snowball/algorithms/danish.sbl +93 -0
- data/vendor/snowball/algorithms/dutch.sbl +164 -0
- data/vendor/snowball/algorithms/english.sbl +229 -0
- data/vendor/snowball/algorithms/finnish.sbl +197 -0
- data/vendor/snowball/algorithms/french.sbl +254 -0
- data/vendor/snowball/algorithms/german.sbl +139 -0
- data/vendor/snowball/algorithms/german2.sbl +145 -0
- data/vendor/snowball/algorithms/greek.sbl +701 -0
- data/vendor/snowball/algorithms/hindi.sbl +323 -0
- data/vendor/snowball/algorithms/hungarian.sbl +241 -0
- data/vendor/snowball/algorithms/indonesian.sbl +192 -0
- data/vendor/snowball/algorithms/irish.sbl +149 -0
- data/vendor/snowball/algorithms/italian.sbl +202 -0
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
- data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
- data/vendor/snowball/algorithms/lovins.sbl +208 -0
- data/vendor/snowball/algorithms/nepali.sbl +92 -0
- data/vendor/snowball/algorithms/norwegian.sbl +80 -0
- data/vendor/snowball/algorithms/porter.sbl +139 -0
- data/vendor/snowball/algorithms/portuguese.sbl +218 -0
- data/vendor/snowball/algorithms/romanian.sbl +236 -0
- data/vendor/snowball/algorithms/russian.sbl +221 -0
- data/vendor/snowball/algorithms/serbian.sbl +2379 -0
- data/vendor/snowball/algorithms/spanish.sbl +230 -0
- data/vendor/snowball/algorithms/swedish.sbl +72 -0
- data/vendor/snowball/algorithms/tamil.sbl +405 -0
- data/vendor/snowball/algorithms/turkish.sbl +470 -0
- data/vendor/snowball/algorithms/yiddish.sbl +460 -0
- data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
- data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
- data/vendor/snowball/charsets/cp850.sbl +130 -0
- data/vendor/snowball/compiler/analyser.c +1547 -0
- data/vendor/snowball/compiler/driver.c +615 -0
- data/vendor/snowball/compiler/generator.c +1748 -0
- data/vendor/snowball/compiler/generator_ada.c +1702 -0
- data/vendor/snowball/compiler/generator_csharp.c +1322 -0
- data/vendor/snowball/compiler/generator_go.c +1278 -0
- data/vendor/snowball/compiler/generator_java.c +1313 -0
- data/vendor/snowball/compiler/generator_js.c +1316 -0
- data/vendor/snowball/compiler/generator_pascal.c +1387 -0
- data/vendor/snowball/compiler/generator_python.c +1337 -0
- data/vendor/snowball/compiler/generator_rust.c +1295 -0
- data/vendor/snowball/compiler/header.h +418 -0
- data/vendor/snowball/compiler/space.c +286 -0
- data/vendor/snowball/compiler/syswords.h +86 -0
- data/vendor/snowball/compiler/syswords2.h +13 -0
- data/vendor/snowball/compiler/tokeniser.c +567 -0
- data/vendor/snowball/csharp/.gitignore +8 -0
- data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
- data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
- data/vendor/snowball/csharp/Stemwords/App.config +6 -0
- data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
- data/vendor/snowball/doc/TODO +12 -0
- data/vendor/snowball/doc/libstemmer_c_README +148 -0
- data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
- data/vendor/snowball/doc/libstemmer_java_README +67 -0
- data/vendor/snowball/doc/libstemmer_js_README +48 -0
- data/vendor/snowball/doc/libstemmer_python_README +113 -0
- data/vendor/snowball/examples/stemwords.c +204 -0
- data/vendor/snowball/go/README.md +55 -0
- data/vendor/snowball/go/among.go +16 -0
- data/vendor/snowball/go/env.go +403 -0
- data/vendor/snowball/go/stemwords/generate.go +68 -0
- data/vendor/snowball/go/stemwords/main.go +68 -0
- data/vendor/snowball/go/util.go +34 -0
- data/vendor/snowball/iconv.py +50 -0
- data/vendor/snowball/include/libstemmer.h +78 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
- data/vendor/snowball/javascript/base-stemmer.js +294 -0
- data/vendor/snowball/javascript/stemwords.js +106 -0
- data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
- data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
- data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
- data/vendor/snowball/libstemmer/modules.txt +63 -0
- data/vendor/snowball/libstemmer/test.c +34 -0
- data/vendor/snowball/pascal/.gitignore +4 -0
- data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
- data/vendor/snowball/pascal/generate.pl +23 -0
- data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
- data/vendor/snowball/python/MANIFEST.in +7 -0
- data/vendor/snowball/python/create_init.py +54 -0
- data/vendor/snowball/python/setup.cfg +6 -0
- data/vendor/snowball/python/setup.py +81 -0
- data/vendor/snowball/python/snowballstemmer/among.py +13 -0
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
- data/vendor/snowball/python/stemwords.py +101 -0
- data/vendor/snowball/python/testapp.py +28 -0
- data/vendor/snowball/runtime/api.c +58 -0
- data/vendor/snowball/runtime/api.h +32 -0
- data/vendor/snowball/runtime/header.h +61 -0
- data/vendor/snowball/runtime/utilities.c +513 -0
- data/vendor/snowball/rust/Cargo.toml +7 -0
- data/vendor/snowball/rust/build.rs +55 -0
- data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
- data/vendor/snowball/rust/src/main.rs +102 -0
- data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
- data/vendor/snowball/rust/src/snowball/among.rs +6 -0
- data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
- data/vendor/snowball/tests/stemtest.c +95 -0
- metadata +178 -0
@@ -0,0 +1,68 @@
|
|
1
|
+
// +build ignore
|
2
|
+
|
3
|
+
package main
|
4
|
+
|
5
|
+
import (
|
6
|
+
"flag"
|
7
|
+
"fmt"
|
8
|
+
"io"
|
9
|
+
"io/ioutil"
|
10
|
+
"log"
|
11
|
+
"os"
|
12
|
+
)
|
13
|
+
|
14
|
+
// tool to register all algorithms built with the stemwords tool
|
15
|
+
|
16
|
+
// main writes a Go source file that contains one import line and one
// languages[...] registration per entry of the directory named by the first
// command-line argument. The output goes to the file named by the optional
// second argument, or to stdout when only one argument is given.
func main() {
	flag.Parse()

	if flag.NArg() < 1 {
		log.Fatal("must specify algorithms directory")
	}

	var w io.Writer = os.Stdout
	var outFile *os.File
	if flag.NArg() > 1 {
		var err error
		outFile, err = os.Create(flag.Arg(1))
		if err != nil {
			log.Fatalf("error creating output file %v", err)
		}
		w = outFile
	}

	// emit stops the program on the first failed write so that a truncated
	// output file is never mistaken for a complete one.
	emit := func(format string, args ...interface{}) {
		if _, err := fmt.Fprintf(w, format, args...); err != nil {
			log.Fatalf("error writing output %v", err)
		}
	}

	emit("%s", header)

	files, err := ioutil.ReadDir(flag.Arg(0))
	if err != nil {
		log.Fatal(err)
	}

	for _, file := range files {
		emit(" %s \"github.com/snowballstem/snowball/go/algorithms/%s\"\n",
			file.Name(), file.Name())
	}

	// The template is passed as an argument, never as the format string, so
	// any '%' it might contain is written literally.
	emit("%s", closeImportStartInit)

	for _, file := range files {
		emit(" languages[\"%s\"] = %s.Stem\n",
			file.Name(), file.Name())
	}

	emit("%s", footer)

	// log.Fatal exits without running deferred calls, so the file is closed
	// explicitly and the result checked: a failed Close can mean lost data.
	if outFile != nil {
		if err := outFile.Close(); err != nil {
			log.Fatalf("error closing output file %v", err)
		}
	}
}

// header opens the generated file up to and including the "import (" line.
const header = `// generated list of supported algorithms, DO NOT EDIT

package main

import (
`

// closeImportStartInit closes the import list and opens func init.
const closeImportStartInit = `)

func init() {`

// footer closes func init.
const footer = `}
`
|
@@ -0,0 +1,68 @@
|
|
1
|
+
//go:generate go run generate.go ../algorithms algorithms.go
|
2
|
+
//go:generate gofmt -s -w algorithms.go
|
3
|
+
|
4
|
+
package main
|
5
|
+
|
6
|
+
import (
|
7
|
+
"bufio"
|
8
|
+
"flag"
|
9
|
+
"fmt"
|
10
|
+
"log"
|
11
|
+
"os"
|
12
|
+
|
13
|
+
snowballRuntime "github.com/snowballstem/snowball/go"
|
14
|
+
)
|
15
|
+
|
16
|
+
var language = flag.String("l", "", "language")
|
17
|
+
var input = flag.String("i", "", "input file")
|
18
|
+
var output = flag.String("o", "", "output file")
|
19
|
+
|
20
|
+
func main() {
|
21
|
+
flag.Parse()
|
22
|
+
|
23
|
+
if *language == "" {
|
24
|
+
log.Fatal("must specify language")
|
25
|
+
}
|
26
|
+
|
27
|
+
stemmer, ok := languages[*language]
|
28
|
+
if !ok {
|
29
|
+
log.Fatalf("no language support for %s", *language)
|
30
|
+
}
|
31
|
+
|
32
|
+
var reader = os.Stdin
|
33
|
+
if *input != "" {
|
34
|
+
var err error
|
35
|
+
reader, err = os.Open(*input)
|
36
|
+
if err != nil {
|
37
|
+
log.Fatal(err)
|
38
|
+
}
|
39
|
+
defer reader.Close()
|
40
|
+
}
|
41
|
+
|
42
|
+
var writer = os.Stdout
|
43
|
+
if *output != "" {
|
44
|
+
var err error
|
45
|
+
writer, err = os.Create(*output)
|
46
|
+
if err != nil {
|
47
|
+
log.Fatal(err)
|
48
|
+
}
|
49
|
+
defer writer.Close()
|
50
|
+
}
|
51
|
+
|
52
|
+
var err error
|
53
|
+
scanner := bufio.NewScanner(reader)
|
54
|
+
for scanner.Scan() {
|
55
|
+
word := scanner.Text()
|
56
|
+
env := snowballRuntime.NewEnv(word)
|
57
|
+
stemmer(env)
|
58
|
+
fmt.Fprintf(writer, "%s\n", env.Current())
|
59
|
+
}
|
60
|
+
|
61
|
+
if err = scanner.Err(); err != nil {
|
62
|
+
log.Fatal(err)
|
63
|
+
}
|
64
|
+
}
|
65
|
+
|
66
|
+
// StemFunc is the signature of a stemming routine. main calls it with an Env
// built from one input word and then reads the result back via env.Current();
// the bool return value is not used by main.
type StemFunc func(env *snowballRuntime.Env) bool
|
67
|
+
|
68
|
+
// languages maps a language name (the value given with -l) to its StemFunc.
// It starts out empty here; entries are presumably registered by the generated
// algorithms.go named in the go:generate directives — confirm against that file.
var languages = make(map[string]StemFunc)
|
@@ -0,0 +1,34 @@
|
|
1
|
+
package snowball
|
2
|
+
|
3
|
+
import (
|
4
|
+
"math"
|
5
|
+
"unicode/utf8"
|
6
|
+
)
|
7
|
+
|
8
|
+
const MaxInt = math.MaxInt32
|
9
|
+
const MinInt = math.MinInt32
|
10
|
+
|
11
|
+
// splitAt cuts str at byte offset mid and returns the part before the offset
// followed by the part starting at it.
func splitAt(str string, mid int) (string, string) {
	head := str[:mid]
	tail := str[mid:]
	return head, tail
}
|
14
|
+
|
15
|
+
// min returns the smaller of a and b.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
|
21
|
+
|
22
|
+
// onCharBoundary reports whether byte offset pos in s is a rune boundary.
// Offsets at or before the start, and at or past the end, always count as
// boundaries; any other offset is a boundary when the byte there can start a
// UTF-8 sequence.
func onCharBoundary(s string, pos int) bool {
	interior := pos > 0 && pos < len(s)
	return !interior || utf8.RuneStart(s[pos])
}
|
28
|
+
|
29
|
+
// RuneCountInString returns the number of runes in str by delegating to
// utf8.RuneCountInString. Exporting it from this package means generated
// stemmers never need to decide whether to import unicode/utf8 themselves.
func RuneCountInString(str string) int {
	count := utf8.RuneCountInString(str)
	return count
}
|
@@ -0,0 +1,50 @@
|
|
1
|
+
#!/usr/bin/env python
# Simple (but slow) iconv replacement in Python.
#
# Usage: iconv.py -f IN_CHARSET -t OUT_CHARSET [-o OUT_FILE] [IN_FILE]
#
# An option value may be attached to the option (-fUTF-8) or given as the
# next argument (-f UTF-8).  Input defaults to stdin and output to stdout.
import sys


def _binary(stream):
    # Python 3 text streams expose their byte stream as .buffer; streams
    # without that attribute are used as they are.
    return getattr(stream, 'buffer', stream)


def main(argv):
    """Convert the input from one character set to another.

    argv is the argument list without the program name.  Usage errors end
    the program through sys.exit() with a message, which prints it to stderr
    and gives exit status 1.
    """
    in_cs = out_cs = in_path = out_path = pending = None
    for arg in argv:
        if pending is not None:
            # The previous argument was a bare option; join it to its value.
            arg = pending + arg
            pending = None
        if arg.startswith('-'):
            # Slicing keeps a bare '-' from raising IndexError.
            opt = arg[1:2]
            if opt in ('f', 't', 'o'):
                if len(arg) == 2:
                    pending = arg
                elif opt == 'f':
                    in_cs = arg[2:]
                elif opt == 't':
                    out_cs = arg[2:]
                else:
                    out_path = arg[2:]
                continue
            sys.exit("Unknown option: '%s'" % arg)
        if in_path is None:
            in_path = arg
            continue
        sys.exit("Too many arguments")

    if pending is not None:
        sys.exit("Option '%s' requires an argument" % pending)
    if in_cs is None:
        sys.exit("Need to specify input cs with -f")
    if out_cs is None:
        sys.exit("Need to specify output cs with -t")

    if in_path is None:
        data = _binary(sys.stdin).read()
    else:
        with open(in_path, 'rb') as in_file:
            data = in_file.read()

    converted = data.decode(in_cs).encode(out_cs)

    # The output file is opened only after the conversion succeeded, so a
    # usage or decoding error never leaves a truncated file behind.
    if out_path is None:
        out = _binary(sys.stdout)
        out.write(converted)
        out.flush()
    else:
        with open(out_path, 'wb') as out_file:
            out_file.write(converted)


if __name__ == '__main__':
    main(sys.argv[1:])
|
@@ -0,0 +1,78 @@
|
|
1
|
+
|
2
|
+
/* Include guard: without it a second inclusion repeats the sb_symbol typedef,
 * which compilers older than C11 reject. */
#ifndef LIBSTEMMER_H
#define LIBSTEMMER_H

/* Make header file work when included from C++ */
#ifdef __cplusplus
extern "C" {
#endif

struct sb_stemmer;
typedef unsigned char sb_symbol;

/* FIXME - should be able to get a version number for each stemming
 * algorithm (which will be incremented each time the output changes). */

/** Returns an array of the names of the available stemming algorithms.
 *  Note that these are the canonical names - aliases (i.e., other names for
 *  the same algorithm) will not be included in the list.
 *  The list is terminated with a null pointer.
 *
 *  The list must not be modified in any way.
 */
const char ** sb_stemmer_list(void);

/** Create a new stemmer object, using the specified algorithm, for the
 *  specified character encoding.
 *
 *  All algorithms will usually be available in UTF-8, but may also be
 *  available in other character encodings.
 *
 *  @param algorithm The algorithm name.  This is either the English
 *  name of the algorithm, or the 2 or 3 letter ISO 639 codes for the
 *  language.  Note that case is significant in this parameter - the
 *  value should be supplied in lower case.
 *
 *  @param charenc The character encoding.  NULL may be passed as
 *  this value, in which case UTF-8 encoding will be assumed. Otherwise,
 *  the argument may be one of "UTF_8", "ISO_8859_1" (i.e. Latin 1),
 *  "ISO_8859_2" (i.e. Latin 2) or "KOI8_R" (Russian).  Note that case is
 *  significant in this parameter.
 *
 *  @return NULL if the specified algorithm is not recognised, or the
 *  algorithm is not available for the requested encoding.  Otherwise,
 *  returns a pointer to a newly created stemmer for the requested algorithm.
 *  The returned pointer must be deleted by calling sb_stemmer_delete().
 *
 *  @note NULL will also be returned if an out of memory error occurs.
 */
struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc);

/** Delete a stemmer object.
 *
 *  This frees all resources allocated for the stemmer.  After calling
 *  this function, the supplied stemmer may no longer be used in any way.
 *
 *  It is safe to pass a null pointer to this function - this will have
 *  no effect.
 */
void sb_stemmer_delete(struct sb_stemmer * stemmer);

/** Stem a word.
 *
 *  The return value is owned by the stemmer - it must not be freed or
 *  modified, and it will become invalid when the stemmer is called again,
 *  or if the stemmer is freed.
 *
 *  The length of the return value can be obtained using sb_stemmer_length().
 *
 *  If an out-of-memory error occurs, this will return NULL.
 *
 *  @param stemmer A stemmer created by sb_stemmer_new().
 *  @param word The word to stem.
 *  @param size Presumably the number of symbols in word - confirm against
 *  the implementation.
 */
const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer,
                                  const sb_symbol * word, int size);

/** Get the length of the result of the last stemmed word.
 *  This should not be called before sb_stemmer_stem() has been called.
 */
int sb_stemmer_length(struct sb_stemmer * stemmer);

#ifdef __cplusplus
}
#endif

#endif /* LIBSTEMMER_H */
|
@@ -0,0 +1,29 @@
|
|
1
|
+
package org.tartarus.snowball;
|
2
|
+
|
3
|
+
import java.lang.reflect.Method;
|
4
|
+
|
5
|
+
public class Among {
|
6
|
+
public Among (String s, int substring_i, int result) {
|
7
|
+
this.s = s.toCharArray();
|
8
|
+
this.substring_i = substring_i;
|
9
|
+
this.result = result;
|
10
|
+
this.method = null;
|
11
|
+
}
|
12
|
+
|
13
|
+
public Among (String s, int substring_i, int result, String methodname,
|
14
|
+
Class<? extends SnowballProgram> programclass) {
|
15
|
+
this.s = s.toCharArray();
|
16
|
+
this.substring_i = substring_i;
|
17
|
+
this.result = result;
|
18
|
+
try {
|
19
|
+
this.method = programclass.getDeclaredMethod(methodname);
|
20
|
+
} catch (NoSuchMethodException e) {
|
21
|
+
throw new RuntimeException(e);
|
22
|
+
}
|
23
|
+
}
|
24
|
+
|
25
|
+
public final char[] s; /* search string */
|
26
|
+
public final int substring_i; /* index to longest matching substring */
|
27
|
+
public final int result; /* result of the lookup */
|
28
|
+
public final Method method; /* method to use if substring matches */
|
29
|
+
};
|
@@ -0,0 +1,381 @@
|
|
1
|
+
|
2
|
+
package org.tartarus.snowball;
|
3
|
+
import java.lang.reflect.InvocationTargetException;
|
4
|
+
import java.io.Serializable;
|
5
|
+
|
6
|
+
public class SnowballProgram implements Serializable {
|
7
|
+
protected SnowballProgram()
|
8
|
+
{
|
9
|
+
current = new StringBuilder();
|
10
|
+
init();
|
11
|
+
}
|
12
|
+
|
13
|
+
static final long serialVersionUID = 2016072500L;
|
14
|
+
|
15
|
+
private void init() {
|
16
|
+
cursor = 0;
|
17
|
+
limit = current.length();
|
18
|
+
limit_backward = 0;
|
19
|
+
bra = cursor;
|
20
|
+
ket = limit;
|
21
|
+
}
|
22
|
+
|
23
|
+
/**
|
24
|
+
* Set the current string.
|
25
|
+
*/
|
26
|
+
public void setCurrent(String value)
|
27
|
+
{
|
28
|
+
// Make a new StringBuilder. If we reuse the old one, and a user of
|
29
|
+
// the library keeps a reference to the buffer returned (for example,
|
30
|
+
// by converting it to a String in a way which doesn't force a copy),
|
31
|
+
// the buffer size will not decrease, and we will risk wasting a large
|
32
|
+
// amount of memory.
|
33
|
+
// Thanks to Wolfram Esser for spotting this problem.
|
34
|
+
current = new StringBuilder(value);
|
35
|
+
init();
|
36
|
+
}
|
37
|
+
|
38
|
+
/**
|
39
|
+
* Get the current string.
|
40
|
+
*/
|
41
|
+
public String getCurrent()
|
42
|
+
{
|
43
|
+
return current.toString();
|
44
|
+
}
|
45
|
+
|
46
|
+
// current string
|
47
|
+
protected StringBuilder current;
|
48
|
+
|
49
|
+
protected int cursor;
|
50
|
+
protected int limit;
|
51
|
+
protected int limit_backward;
|
52
|
+
protected int bra;
|
53
|
+
protected int ket;
|
54
|
+
|
55
|
+
public SnowballProgram(SnowballProgram other) {
|
56
|
+
current = other.current;
|
57
|
+
cursor = other.cursor;
|
58
|
+
limit = other.limit;
|
59
|
+
limit_backward = other.limit_backward;
|
60
|
+
bra = other.bra;
|
61
|
+
ket = other.ket;
|
62
|
+
}
|
63
|
+
|
64
|
+
protected void copy_from(SnowballProgram other)
|
65
|
+
{
|
66
|
+
current = other.current;
|
67
|
+
cursor = other.cursor;
|
68
|
+
limit = other.limit;
|
69
|
+
limit_backward = other.limit_backward;
|
70
|
+
bra = other.bra;
|
71
|
+
ket = other.ket;
|
72
|
+
}
|
73
|
+
|
74
|
+
protected boolean in_grouping(char [] s, int min, int max)
|
75
|
+
{
|
76
|
+
if (cursor >= limit) return false;
|
77
|
+
char ch = current.charAt(cursor);
|
78
|
+
if (ch > max || ch < min) return false;
|
79
|
+
ch -= min;
|
80
|
+
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
|
81
|
+
cursor++;
|
82
|
+
return true;
|
83
|
+
}
|
84
|
+
|
85
|
+
protected boolean in_grouping_b(char [] s, int min, int max)
|
86
|
+
{
|
87
|
+
if (cursor <= limit_backward) return false;
|
88
|
+
char ch = current.charAt(cursor - 1);
|
89
|
+
if (ch > max || ch < min) return false;
|
90
|
+
ch -= min;
|
91
|
+
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
|
92
|
+
cursor--;
|
93
|
+
return true;
|
94
|
+
}
|
95
|
+
|
96
|
+
protected boolean out_grouping(char [] s, int min, int max)
|
97
|
+
{
|
98
|
+
if (cursor >= limit) return false;
|
99
|
+
char ch = current.charAt(cursor);
|
100
|
+
if (ch > max || ch < min) {
|
101
|
+
cursor++;
|
102
|
+
return true;
|
103
|
+
}
|
104
|
+
ch -= min;
|
105
|
+
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
|
106
|
+
cursor++;
|
107
|
+
return true;
|
108
|
+
}
|
109
|
+
return false;
|
110
|
+
}
|
111
|
+
|
112
|
+
protected boolean out_grouping_b(char [] s, int min, int max)
|
113
|
+
{
|
114
|
+
if (cursor <= limit_backward) return false;
|
115
|
+
char ch = current.charAt(cursor - 1);
|
116
|
+
if (ch > max || ch < min) {
|
117
|
+
cursor--;
|
118
|
+
return true;
|
119
|
+
}
|
120
|
+
ch -= min;
|
121
|
+
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
|
122
|
+
cursor--;
|
123
|
+
return true;
|
124
|
+
}
|
125
|
+
return false;
|
126
|
+
}
|
127
|
+
|
128
|
+
protected boolean eq_s(CharSequence s)
|
129
|
+
{
|
130
|
+
if (limit - cursor < s.length()) return false;
|
131
|
+
int i;
|
132
|
+
for (i = 0; i != s.length(); i++) {
|
133
|
+
if (current.charAt(cursor + i) != s.charAt(i)) return false;
|
134
|
+
}
|
135
|
+
cursor += s.length();
|
136
|
+
return true;
|
137
|
+
}
|
138
|
+
|
139
|
+
protected boolean eq_s_b(CharSequence s)
|
140
|
+
{
|
141
|
+
if (cursor - limit_backward < s.length()) return false;
|
142
|
+
int i;
|
143
|
+
for (i = 0; i != s.length(); i++) {
|
144
|
+
if (current.charAt(cursor - s.length() + i) != s.charAt(i)) return false;
|
145
|
+
}
|
146
|
+
cursor -= s.length();
|
147
|
+
return true;
|
148
|
+
}
|
149
|
+
|
150
|
+
protected int find_among(Among v[])
|
151
|
+
{
|
152
|
+
int i = 0;
|
153
|
+
int j = v.length;
|
154
|
+
|
155
|
+
int c = cursor;
|
156
|
+
int l = limit;
|
157
|
+
|
158
|
+
int common_i = 0;
|
159
|
+
int common_j = 0;
|
160
|
+
|
161
|
+
boolean first_key_inspected = false;
|
162
|
+
|
163
|
+
while (true) {
|
164
|
+
int k = i + ((j - i) >> 1);
|
165
|
+
int diff = 0;
|
166
|
+
int common = common_i < common_j ? common_i : common_j; // smaller
|
167
|
+
Among w = v[k];
|
168
|
+
int i2;
|
169
|
+
for (i2 = common; i2 < w.s.length; i2++) {
|
170
|
+
if (c + common == l) {
|
171
|
+
diff = -1;
|
172
|
+
break;
|
173
|
+
}
|
174
|
+
diff = current.charAt(c + common) - w.s[i2];
|
175
|
+
if (diff != 0) break;
|
176
|
+
common++;
|
177
|
+
}
|
178
|
+
if (diff < 0) {
|
179
|
+
j = k;
|
180
|
+
common_j = common;
|
181
|
+
} else {
|
182
|
+
i = k;
|
183
|
+
common_i = common;
|
184
|
+
}
|
185
|
+
if (j - i <= 1) {
|
186
|
+
if (i > 0) break; // v->s has been inspected
|
187
|
+
if (j == i) break; // only one item in v
|
188
|
+
|
189
|
+
// - but now we need to go round once more to get
|
190
|
+
// v->s inspected. This looks messy, but is actually
|
191
|
+
// the optimal approach.
|
192
|
+
|
193
|
+
if (first_key_inspected) break;
|
194
|
+
first_key_inspected = true;
|
195
|
+
}
|
196
|
+
}
|
197
|
+
while (true) {
|
198
|
+
Among w = v[i];
|
199
|
+
if (common_i >= w.s.length) {
|
200
|
+
cursor = c + w.s.length;
|
201
|
+
if (w.method == null) return w.result;
|
202
|
+
boolean res;
|
203
|
+
try {
|
204
|
+
Object resobj = w.method.invoke(this);
|
205
|
+
res = resobj.toString().equals("true");
|
206
|
+
} catch (InvocationTargetException e) {
|
207
|
+
res = false;
|
208
|
+
// FIXME - debug message
|
209
|
+
} catch (IllegalAccessException e) {
|
210
|
+
res = false;
|
211
|
+
// FIXME - debug message
|
212
|
+
}
|
213
|
+
cursor = c + w.s.length;
|
214
|
+
if (res) return w.result;
|
215
|
+
}
|
216
|
+
i = w.substring_i;
|
217
|
+
if (i < 0) return 0;
|
218
|
+
}
|
219
|
+
}
|
220
|
+
|
221
|
+
// find_among_b is for backwards processing. Same comments apply
|
222
|
+
protected int find_among_b(Among v[])
|
223
|
+
{
|
224
|
+
int i = 0;
|
225
|
+
int j = v.length;
|
226
|
+
|
227
|
+
int c = cursor;
|
228
|
+
int lb = limit_backward;
|
229
|
+
|
230
|
+
int common_i = 0;
|
231
|
+
int common_j = 0;
|
232
|
+
|
233
|
+
boolean first_key_inspected = false;
|
234
|
+
|
235
|
+
while (true) {
|
236
|
+
int k = i + ((j - i) >> 1);
|
237
|
+
int diff = 0;
|
238
|
+
int common = common_i < common_j ? common_i : common_j;
|
239
|
+
Among w = v[k];
|
240
|
+
int i2;
|
241
|
+
for (i2 = w.s.length - 1 - common; i2 >= 0; i2--) {
|
242
|
+
if (c - common == lb) {
|
243
|
+
diff = -1;
|
244
|
+
break;
|
245
|
+
}
|
246
|
+
diff = current.charAt(c - 1 - common) - w.s[i2];
|
247
|
+
if (diff != 0) break;
|
248
|
+
common++;
|
249
|
+
}
|
250
|
+
if (diff < 0) {
|
251
|
+
j = k;
|
252
|
+
common_j = common;
|
253
|
+
} else {
|
254
|
+
i = k;
|
255
|
+
common_i = common;
|
256
|
+
}
|
257
|
+
if (j - i <= 1) {
|
258
|
+
if (i > 0) break;
|
259
|
+
if (j == i) break;
|
260
|
+
if (first_key_inspected) break;
|
261
|
+
first_key_inspected = true;
|
262
|
+
}
|
263
|
+
}
|
264
|
+
while (true) {
|
265
|
+
Among w = v[i];
|
266
|
+
if (common_i >= w.s.length) {
|
267
|
+
cursor = c - w.s.length;
|
268
|
+
if (w.method == null) return w.result;
|
269
|
+
|
270
|
+
boolean res;
|
271
|
+
try {
|
272
|
+
Object resobj = w.method.invoke(this);
|
273
|
+
res = resobj.toString().equals("true");
|
274
|
+
} catch (InvocationTargetException e) {
|
275
|
+
res = false;
|
276
|
+
// FIXME - debug message
|
277
|
+
} catch (IllegalAccessException e) {
|
278
|
+
res = false;
|
279
|
+
// FIXME - debug message
|
280
|
+
}
|
281
|
+
cursor = c - w.s.length;
|
282
|
+
if (res) return w.result;
|
283
|
+
}
|
284
|
+
i = w.substring_i;
|
285
|
+
if (i < 0) return 0;
|
286
|
+
}
|
287
|
+
}
|
288
|
+
|
289
|
+
/* to replace chars between c_bra and c_ket in current by the
|
290
|
+
* chars in s.
|
291
|
+
*/
|
292
|
+
protected int replace_s(int c_bra, int c_ket, String s)
|
293
|
+
{
|
294
|
+
int adjustment = s.length() - (c_ket - c_bra);
|
295
|
+
current.replace(c_bra, c_ket, s);
|
296
|
+
limit += adjustment;
|
297
|
+
if (cursor >= c_ket) cursor += adjustment;
|
298
|
+
else if (cursor > c_bra) cursor = c_bra;
|
299
|
+
return adjustment;
|
300
|
+
}
|
301
|
+
|
302
|
+
protected void slice_check()
|
303
|
+
{
|
304
|
+
if (bra < 0 ||
|
305
|
+
bra > ket ||
|
306
|
+
ket > limit ||
|
307
|
+
limit > current.length()) // this line could be removed
|
308
|
+
{
|
309
|
+
System.err.println("faulty slice operation");
|
310
|
+
// FIXME: report error somehow.
|
311
|
+
/*
|
312
|
+
fprintf(stderr, "faulty slice operation:\n");
|
313
|
+
debug(z, -1, 0);
|
314
|
+
exit(1);
|
315
|
+
*/
|
316
|
+
}
|
317
|
+
}
|
318
|
+
|
319
|
+
protected void slice_from(String s)
|
320
|
+
{
|
321
|
+
slice_check();
|
322
|
+
replace_s(bra, ket, s);
|
323
|
+
}
|
324
|
+
|
325
|
+
protected void slice_from(CharSequence s)
|
326
|
+
{
|
327
|
+
slice_from(s.toString());
|
328
|
+
}
|
329
|
+
|
330
|
+
protected void slice_del()
|
331
|
+
{
|
332
|
+
slice_from("");
|
333
|
+
}
|
334
|
+
|
335
|
+
protected void insert(int c_bra, int c_ket, String s)
|
336
|
+
{
|
337
|
+
int adjustment = replace_s(c_bra, c_ket, s);
|
338
|
+
if (c_bra <= bra) bra += adjustment;
|
339
|
+
if (c_bra <= ket) ket += adjustment;
|
340
|
+
}
|
341
|
+
|
342
|
+
protected void insert(int c_bra, int c_ket, CharSequence s)
|
343
|
+
{
|
344
|
+
insert(c_bra, c_ket, s.toString());
|
345
|
+
}
|
346
|
+
|
347
|
+
/* Copy the slice into the supplied StringBuilder */
|
348
|
+
protected void slice_to(StringBuilder s)
|
349
|
+
{
|
350
|
+
slice_check();
|
351
|
+
s.replace(0, s.length(), current.substring(bra, ket));
|
352
|
+
}
|
353
|
+
|
354
|
+
protected void assign_to(StringBuilder s)
|
355
|
+
{
|
356
|
+
s.replace(0, s.length(), current.substring(0, limit));
|
357
|
+
}
|
358
|
+
|
359
|
+
/*
|
360
|
+
extern void debug(struct SN_env * z, int number, int line_count)
|
361
|
+
{ int i;
|
362
|
+
int limit = SIZE(z->p);
|
363
|
+
//if (number >= 0) printf("%3d (line %4d): '", number, line_count);
|
364
|
+
if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
|
365
|
+
for (i = 0; i <= limit; i++)
|
366
|
+
{ if (z->lb == i) printf("{");
|
367
|
+
if (z->bra == i) printf("[");
|
368
|
+
if (z->c == i) printf("|");
|
369
|
+
if (z->ket == i) printf("]");
|
370
|
+
if (z->l == i) printf("}");
|
371
|
+
if (i < limit)
|
372
|
+
{ int ch = z->p[i];
|
373
|
+
if (ch == 0) ch = '#';
|
374
|
+
printf("%c", ch);
|
375
|
+
}
|
376
|
+
}
|
377
|
+
printf("'\n");
|
378
|
+
}
|
379
|
+
*/
|
380
|
+
|
381
|
+
};
|