mittens 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +30 -0
- data/README.md +62 -0
- data/Rakefile +21 -0
- data/ext/mittens/ext.c +96 -0
- data/ext/mittens/extconf.rb +12 -0
- data/lib/mittens/version.rb +3 -0
- data/lib/mittens.rb +7 -0
- data/mittens.gemspec +22 -0
- data/vendor/snowball/.gitignore +26 -0
- data/vendor/snowball/.travis.yml +112 -0
- data/vendor/snowball/AUTHORS +27 -0
- data/vendor/snowball/CONTRIBUTING.rst +216 -0
- data/vendor/snowball/COPYING +29 -0
- data/vendor/snowball/GNUmakefile +742 -0
- data/vendor/snowball/NEWS +754 -0
- data/vendor/snowball/README.rst +37 -0
- data/vendor/snowball/ada/README.md +74 -0
- data/vendor/snowball/ada/generate/generate.adb +83 -0
- data/vendor/snowball/ada/generate.gpr +21 -0
- data/vendor/snowball/ada/src/stemmer.adb +620 -0
- data/vendor/snowball/ada/src/stemmer.ads +219 -0
- data/vendor/snowball/ada/src/stemwords.adb +70 -0
- data/vendor/snowball/ada/stemmer_config.gpr +83 -0
- data/vendor/snowball/ada/stemwords.gpr +21 -0
- data/vendor/snowball/algorithms/arabic.sbl +558 -0
- data/vendor/snowball/algorithms/armenian.sbl +301 -0
- data/vendor/snowball/algorithms/basque.sbl +149 -0
- data/vendor/snowball/algorithms/catalan.sbl +202 -0
- data/vendor/snowball/algorithms/danish.sbl +93 -0
- data/vendor/snowball/algorithms/dutch.sbl +164 -0
- data/vendor/snowball/algorithms/english.sbl +229 -0
- data/vendor/snowball/algorithms/finnish.sbl +197 -0
- data/vendor/snowball/algorithms/french.sbl +254 -0
- data/vendor/snowball/algorithms/german.sbl +139 -0
- data/vendor/snowball/algorithms/german2.sbl +145 -0
- data/vendor/snowball/algorithms/greek.sbl +701 -0
- data/vendor/snowball/algorithms/hindi.sbl +323 -0
- data/vendor/snowball/algorithms/hungarian.sbl +241 -0
- data/vendor/snowball/algorithms/indonesian.sbl +192 -0
- data/vendor/snowball/algorithms/irish.sbl +149 -0
- data/vendor/snowball/algorithms/italian.sbl +202 -0
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
- data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
- data/vendor/snowball/algorithms/lovins.sbl +208 -0
- data/vendor/snowball/algorithms/nepali.sbl +92 -0
- data/vendor/snowball/algorithms/norwegian.sbl +80 -0
- data/vendor/snowball/algorithms/porter.sbl +139 -0
- data/vendor/snowball/algorithms/portuguese.sbl +218 -0
- data/vendor/snowball/algorithms/romanian.sbl +236 -0
- data/vendor/snowball/algorithms/russian.sbl +221 -0
- data/vendor/snowball/algorithms/serbian.sbl +2379 -0
- data/vendor/snowball/algorithms/spanish.sbl +230 -0
- data/vendor/snowball/algorithms/swedish.sbl +72 -0
- data/vendor/snowball/algorithms/tamil.sbl +405 -0
- data/vendor/snowball/algorithms/turkish.sbl +470 -0
- data/vendor/snowball/algorithms/yiddish.sbl +460 -0
- data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
- data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
- data/vendor/snowball/charsets/cp850.sbl +130 -0
- data/vendor/snowball/compiler/analyser.c +1547 -0
- data/vendor/snowball/compiler/driver.c +615 -0
- data/vendor/snowball/compiler/generator.c +1748 -0
- data/vendor/snowball/compiler/generator_ada.c +1702 -0
- data/vendor/snowball/compiler/generator_csharp.c +1322 -0
- data/vendor/snowball/compiler/generator_go.c +1278 -0
- data/vendor/snowball/compiler/generator_java.c +1313 -0
- data/vendor/snowball/compiler/generator_js.c +1316 -0
- data/vendor/snowball/compiler/generator_pascal.c +1387 -0
- data/vendor/snowball/compiler/generator_python.c +1337 -0
- data/vendor/snowball/compiler/generator_rust.c +1295 -0
- data/vendor/snowball/compiler/header.h +418 -0
- data/vendor/snowball/compiler/space.c +286 -0
- data/vendor/snowball/compiler/syswords.h +86 -0
- data/vendor/snowball/compiler/syswords2.h +13 -0
- data/vendor/snowball/compiler/tokeniser.c +567 -0
- data/vendor/snowball/csharp/.gitignore +8 -0
- data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
- data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
- data/vendor/snowball/csharp/Stemwords/App.config +6 -0
- data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
- data/vendor/snowball/doc/TODO +12 -0
- data/vendor/snowball/doc/libstemmer_c_README +148 -0
- data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
- data/vendor/snowball/doc/libstemmer_java_README +67 -0
- data/vendor/snowball/doc/libstemmer_js_README +48 -0
- data/vendor/snowball/doc/libstemmer_python_README +113 -0
- data/vendor/snowball/examples/stemwords.c +204 -0
- data/vendor/snowball/go/README.md +55 -0
- data/vendor/snowball/go/among.go +16 -0
- data/vendor/snowball/go/env.go +403 -0
- data/vendor/snowball/go/stemwords/generate.go +68 -0
- data/vendor/snowball/go/stemwords/main.go +68 -0
- data/vendor/snowball/go/util.go +34 -0
- data/vendor/snowball/iconv.py +50 -0
- data/vendor/snowball/include/libstemmer.h +78 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
- data/vendor/snowball/javascript/base-stemmer.js +294 -0
- data/vendor/snowball/javascript/stemwords.js +106 -0
- data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
- data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
- data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
- data/vendor/snowball/libstemmer/modules.txt +63 -0
- data/vendor/snowball/libstemmer/test.c +34 -0
- data/vendor/snowball/pascal/.gitignore +4 -0
- data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
- data/vendor/snowball/pascal/generate.pl +23 -0
- data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
- data/vendor/snowball/python/MANIFEST.in +7 -0
- data/vendor/snowball/python/create_init.py +54 -0
- data/vendor/snowball/python/setup.cfg +6 -0
- data/vendor/snowball/python/setup.py +81 -0
- data/vendor/snowball/python/snowballstemmer/among.py +13 -0
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
- data/vendor/snowball/python/stemwords.py +101 -0
- data/vendor/snowball/python/testapp.py +28 -0
- data/vendor/snowball/runtime/api.c +58 -0
- data/vendor/snowball/runtime/api.h +32 -0
- data/vendor/snowball/runtime/header.h +61 -0
- data/vendor/snowball/runtime/utilities.c +513 -0
- data/vendor/snowball/rust/Cargo.toml +7 -0
- data/vendor/snowball/rust/build.rs +55 -0
- data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
- data/vendor/snowball/rust/src/main.rs +102 -0
- data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
- data/vendor/snowball/rust/src/snowball/among.rs +6 -0
- data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
- data/vendor/snowball/tests/stemtest.c +95 -0
- metadata +178 -0
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
// +build ignore
|
|
2
|
+
|
|
3
|
+
package main
|
|
4
|
+
|
|
5
|
+
import (
|
|
6
|
+
"flag"
|
|
7
|
+
"fmt"
|
|
8
|
+
"io"
|
|
9
|
+
"io/ioutil"
|
|
10
|
+
"log"
|
|
11
|
+
"os"
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
// tool to register all algorithms built with the stemwords tool
|
|
15
|
+
|
|
16
|
+
func main() {
|
|
17
|
+
flag.Parse()
|
|
18
|
+
|
|
19
|
+
if flag.NArg() < 1 {
|
|
20
|
+
log.Fatal("must specify algorithms directory")
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
var w io.Writer
|
|
24
|
+
if flag.NArg() > 1 {
|
|
25
|
+
var err error
|
|
26
|
+
w, err = os.Create(flag.Arg(1))
|
|
27
|
+
if err != nil {
|
|
28
|
+
log.Fatalf("error creating output file %v", err)
|
|
29
|
+
}
|
|
30
|
+
} else {
|
|
31
|
+
w = os.Stdout
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
fmt.Fprintf(w, "%s", header)
|
|
35
|
+
|
|
36
|
+
files, err := ioutil.ReadDir(flag.Arg(0))
|
|
37
|
+
if err != nil {
|
|
38
|
+
log.Fatal(err)
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
for _, file := range files {
|
|
42
|
+
fmt.Fprintf(w, " %s \"github.com/snowballstem/snowball/go/algorithms/%s\"\n",
|
|
43
|
+
file.Name(), file.Name())
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
fmt.Fprintf(w, closeImportStartInit)
|
|
47
|
+
|
|
48
|
+
for _, file := range files {
|
|
49
|
+
fmt.Fprintf(w, " languages[\"%s\"] = %s.Stem\n",
|
|
50
|
+
file.Name(), file.Name())
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
fmt.Fprintf(w, "%s", footer)
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
var header = `// generated list of supported algorithms, DO NOT EDIT
|
|
57
|
+
|
|
58
|
+
package main
|
|
59
|
+
|
|
60
|
+
import (
|
|
61
|
+
`
|
|
62
|
+
|
|
63
|
+
var closeImportStartInit = `)
|
|
64
|
+
|
|
65
|
+
func init() {`
|
|
66
|
+
|
|
67
|
+
var footer = `}
|
|
68
|
+
`
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
//go:generate go run generate.go ../algorithms algorithms.go
|
|
2
|
+
//go:generate gofmt -s -w algorithms.go
|
|
3
|
+
|
|
4
|
+
package main
|
|
5
|
+
|
|
6
|
+
import (
|
|
7
|
+
"bufio"
|
|
8
|
+
"flag"
|
|
9
|
+
"fmt"
|
|
10
|
+
"log"
|
|
11
|
+
"os"
|
|
12
|
+
|
|
13
|
+
snowballRuntime "github.com/snowballstem/snowball/go"
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
var language = flag.String("l", "", "language")
|
|
17
|
+
var input = flag.String("i", "", "input file")
|
|
18
|
+
var output = flag.String("o", "", "output file")
|
|
19
|
+
|
|
20
|
+
func main() {
|
|
21
|
+
flag.Parse()
|
|
22
|
+
|
|
23
|
+
if *language == "" {
|
|
24
|
+
log.Fatal("must specify language")
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
stemmer, ok := languages[*language]
|
|
28
|
+
if !ok {
|
|
29
|
+
log.Fatalf("no language support for %s", *language)
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
var reader = os.Stdin
|
|
33
|
+
if *input != "" {
|
|
34
|
+
var err error
|
|
35
|
+
reader, err = os.Open(*input)
|
|
36
|
+
if err != nil {
|
|
37
|
+
log.Fatal(err)
|
|
38
|
+
}
|
|
39
|
+
defer reader.Close()
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
var writer = os.Stdout
|
|
43
|
+
if *output != "" {
|
|
44
|
+
var err error
|
|
45
|
+
writer, err = os.Create(*output)
|
|
46
|
+
if err != nil {
|
|
47
|
+
log.Fatal(err)
|
|
48
|
+
}
|
|
49
|
+
defer writer.Close()
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
var err error
|
|
53
|
+
scanner := bufio.NewScanner(reader)
|
|
54
|
+
for scanner.Scan() {
|
|
55
|
+
word := scanner.Text()
|
|
56
|
+
env := snowballRuntime.NewEnv(word)
|
|
57
|
+
stemmer(env)
|
|
58
|
+
fmt.Fprintf(writer, "%s\n", env.Current())
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
if err = scanner.Err(); err != nil {
|
|
62
|
+
log.Fatal(err)
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
type StemFunc func(env *snowballRuntime.Env) bool
|
|
67
|
+
|
|
68
|
+
var languages = make(map[string]StemFunc)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
package snowball
|
|
2
|
+
|
|
3
|
+
import (
|
|
4
|
+
"math"
|
|
5
|
+
"unicode/utf8"
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
const MaxInt = math.MaxInt32
|
|
9
|
+
const MinInt = math.MinInt32
|
|
10
|
+
|
|
11
|
+
// splitAt cuts str at byte offset mid and returns the part before the offset
// and the part from the offset onwards. It panics if mid is outside
// [0, len(str)].
func splitAt(str string, mid int) (string, string) {
	head := str[:mid]
	tail := str[mid:]
	return head, tail
}
|
|
14
|
+
|
|
15
|
+
// min returns the smaller of the two integers.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
|
|
21
|
+
|
|
22
|
+
// onCharBoundary reports whether byte offset pos in s does not fall inside a
// multi-byte UTF-8 sequence. Offsets at or before the start of s, and at or
// past its end, are always treated as boundaries.
func onCharBoundary(s string, pos int) bool {
	if 0 < pos && pos < len(s) {
		// Interior offset: a boundary only if this byte begins a rune.
		return utf8.RuneStart(s[pos])
	}
	return true
}
|
|
28
|
+
|
|
29
|
+
// RuneCountInString returns the number of runes in str by delegating to
// utf8.RuneCountInString. Exposing it here means callers of this package
// can count runes without importing the utf8 package themselves, so the
// import does not have to be conditionally included in some stemmers and
// not others.
func RuneCountInString(str string) int {
	count := utf8.RuneCountInString(str)
	return count
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
#!/usr/bin/env python
# Simple (but slow) iconv replacement in Python.
#
# Usage: iconv.py -f FROM_CHARSET -t TO_CHARSET [-o OUTPUT_FILE] [INPUT_FILE]
#
# An option value may be attached to the option ("-fUTF-8") or supplied as
# the next argument ("-f UTF-8").  Input defaults to stdin and output to
# stdout.  The whole input is read, decoded and re-encoded in one go.
import sys


def fail(message):
    # Diagnostics go to stderr: stdout may be carrying the converted data,
    # and mixing an error message into it would corrupt the output.
    sys.stderr.write(message + "\n")
    sys.exit(1)


in_cs = out_cs = in_file = out_file = pending = None
for arg in sys.argv[1:]:
    if pending is not None:
        # The previous argument was a bare option letter; glue its value on
        # so the attached-value code path below handles both spellings.
        arg = pending + arg
        pending = None
    if arg.startswith('-'):
        # arg[1:2] is '' for a lone '-', which then falls through to the
        # "Unknown option" error instead of raising IndexError.
        opt = arg[1:2]
        if opt in ('f', 't', 'o'):
            if len(arg) == 2:
                pending = arg
                continue
            value = arg[2:]
            if opt == 'f':
                in_cs = value
            elif opt == 't':
                out_cs = value
            else:
                out_file = open(value, 'wb')
            continue
        fail("Unknown option: '%s'" % arg)
    if in_file is None:
        in_file = open(arg, 'rb')
        continue
    fail("Too many arguments")

if pending is not None:
    # A trailing "-f", "-t" or "-o" with nothing after it.
    fail("Missing value for option '%s'" % pending)

if in_cs is None:
    fail("Need to specify input cs with -f")
if out_cs is None:
    fail("Need to specify output cs with -t")

# Use the underlying byte streams when the standard streams expose one
# (text-mode streams with a .buffer attribute); otherwise use them directly.
if in_file is None:
    in_file = getattr(sys.stdin, 'buffer', sys.stdin)
if out_file is None:
    out_file = getattr(sys.stdout, 'buffer', sys.stdout)

out_file.write(in_file.read().decode(in_cs).encode(out_cs))
# Flush now so that a write error surfaces here rather than at exit.
out_file.flush()
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
|
|
2
|
+
/* Make header file work when included from C++ */
|
|
3
|
+
#ifdef __cplusplus
|
|
4
|
+
extern "C" {
|
|
5
|
+
#endif
|
|
6
|
+
|
|
7
|
+
struct sb_stemmer;
|
|
8
|
+
typedef unsigned char sb_symbol;
|
|
9
|
+
|
|
10
|
+
/* FIXME - should be able to get a version number for each stemming
|
|
11
|
+
* algorithm (which will be incremented each time the output changes). */
|
|
12
|
+
|
|
13
|
+
/** Returns an array of the names of the available stemming algorithms.
|
|
14
|
+
* Note that these are the canonical names - aliases (i.e., other names for
|
|
15
|
+
* the same algorithm) will not be included in the list.
|
|
16
|
+
* The list is terminated with a null pointer.
|
|
17
|
+
*
|
|
18
|
+
* The list must not be modified in any way.
|
|
19
|
+
*/
|
|
20
|
+
const char ** sb_stemmer_list(void);
|
|
21
|
+
|
|
22
|
+
/** Create a new stemmer object, using the specified algorithm, for the
|
|
23
|
+
* specified character encoding.
|
|
24
|
+
*
|
|
25
|
+
* All algorithms will usually be available in UTF-8, but may also be
|
|
26
|
+
* available in other character encodings.
|
|
27
|
+
*
|
|
28
|
+
* @param algorithm The algorithm name. This is either the English
|
|
29
|
+
* name of the algorithm, or the 2 or 3 letter ISO 639 codes for the
|
|
30
|
+
* language. Note that case is significant in this parameter - the
|
|
31
|
+
* value should be supplied in lower case.
|
|
32
|
+
*
|
|
33
|
+
* @param charenc The character encoding. NULL may be passed as
|
|
34
|
+
* this value, in which case UTF-8 encoding will be assumed. Otherwise,
|
|
35
|
+
* the argument may be one of "UTF_8", "ISO_8859_1" (i.e. Latin 1),
|
|
36
|
+
* "ISO_8859_2" (i.e. Latin 2) or "KOI8_R" (Russian). Note that case is
|
|
37
|
+
* significant in this parameter.
|
|
38
|
+
*
|
|
39
|
+
* @return NULL if the specified algorithm is not recognised, or the
|
|
40
|
+
* algorithm is not available for the requested encoding. Otherwise,
|
|
41
|
+
* returns a pointer to a newly created stemmer for the requested algorithm.
|
|
42
|
+
* The returned pointer must be deleted by calling sb_stemmer_delete().
|
|
43
|
+
*
|
|
44
|
+
* @note NULL will also be returned if an out of memory error occurs.
|
|
45
|
+
*/
|
|
46
|
+
struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc);
|
|
47
|
+
|
|
48
|
+
/** Delete a stemmer object.
|
|
49
|
+
*
|
|
50
|
+
* This frees all resources allocated for the stemmer. After calling
|
|
51
|
+
* this function, the supplied stemmer may no longer be used in any way.
|
|
52
|
+
*
|
|
53
|
+
* It is safe to pass a null pointer to this function - this will have
|
|
54
|
+
* no effect.
|
|
55
|
+
*/
|
|
56
|
+
void sb_stemmer_delete(struct sb_stemmer * stemmer);
|
|
57
|
+
|
|
58
|
+
/** Stem a word.
|
|
59
|
+
*
|
|
60
|
+
* The return value is owned by the stemmer - it must not be freed or
|
|
61
|
+
* modified, and it will become invalid when the stemmer is called again,
|
|
62
|
+
* or if the stemmer is freed.
|
|
63
|
+
*
|
|
64
|
+
* The length of the return value can be obtained using sb_stemmer_length().
|
|
65
|
+
*
|
|
66
|
+
* If an out-of-memory error occurs, this will return NULL.
|
|
67
|
+
*/
|
|
68
|
+
const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer,
|
|
69
|
+
const sb_symbol * word, int size);
|
|
70
|
+
|
|
71
|
+
/** Get the length of the result of the last stemmed word.
|
|
72
|
+
* This should not be called before sb_stemmer_stem() has been called.
|
|
73
|
+
*/
|
|
74
|
+
int sb_stemmer_length(struct sb_stemmer * stemmer);
|
|
75
|
+
|
|
76
|
+
#ifdef __cplusplus
|
|
77
|
+
}
|
|
78
|
+
#endif
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
package org.tartarus.snowball;
|
|
2
|
+
|
|
3
|
+
import java.lang.reflect.Method;
|
|
4
|
+
|
|
5
|
+
public class Among {
|
|
6
|
+
public Among (String s, int substring_i, int result) {
|
|
7
|
+
this.s = s.toCharArray();
|
|
8
|
+
this.substring_i = substring_i;
|
|
9
|
+
this.result = result;
|
|
10
|
+
this.method = null;
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
public Among (String s, int substring_i, int result, String methodname,
|
|
14
|
+
Class<? extends SnowballProgram> programclass) {
|
|
15
|
+
this.s = s.toCharArray();
|
|
16
|
+
this.substring_i = substring_i;
|
|
17
|
+
this.result = result;
|
|
18
|
+
try {
|
|
19
|
+
this.method = programclass.getDeclaredMethod(methodname);
|
|
20
|
+
} catch (NoSuchMethodException e) {
|
|
21
|
+
throw new RuntimeException(e);
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
public final char[] s; /* search string */
|
|
26
|
+
public final int substring_i; /* index to longest matching substring */
|
|
27
|
+
public final int result; /* result of the lookup */
|
|
28
|
+
public final Method method; /* method to use if substring matches */
|
|
29
|
+
};
|
|
@@ -0,0 +1,381 @@
|
|
|
1
|
+
|
|
2
|
+
package org.tartarus.snowball;
|
|
3
|
+
import java.lang.reflect.InvocationTargetException;
|
|
4
|
+
import java.io.Serializable;
|
|
5
|
+
|
|
6
|
+
public class SnowballProgram implements Serializable {
|
|
7
|
+
protected SnowballProgram()
|
|
8
|
+
{
|
|
9
|
+
current = new StringBuilder();
|
|
10
|
+
init();
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
static final long serialVersionUID = 2016072500L;
|
|
14
|
+
|
|
15
|
+
private void init() {
|
|
16
|
+
cursor = 0;
|
|
17
|
+
limit = current.length();
|
|
18
|
+
limit_backward = 0;
|
|
19
|
+
bra = cursor;
|
|
20
|
+
ket = limit;
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
/**
|
|
24
|
+
* Set the current string.
|
|
25
|
+
*/
|
|
26
|
+
public void setCurrent(String value)
|
|
27
|
+
{
|
|
28
|
+
// Make a new StringBuilder. If we reuse the old one, and a user of
|
|
29
|
+
// the library keeps a reference to the buffer returned (for example,
|
|
30
|
+
// by converting it to a String in a way which doesn't force a copy),
|
|
31
|
+
// the buffer size will not decrease, and we will risk wasting a large
|
|
32
|
+
// amount of memory.
|
|
33
|
+
// Thanks to Wolfram Esser for spotting this problem.
|
|
34
|
+
current = new StringBuilder(value);
|
|
35
|
+
init();
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
/**
|
|
39
|
+
* Get the current string.
|
|
40
|
+
*/
|
|
41
|
+
public String getCurrent()
|
|
42
|
+
{
|
|
43
|
+
return current.toString();
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
// current string
|
|
47
|
+
protected StringBuilder current;
|
|
48
|
+
|
|
49
|
+
protected int cursor;
|
|
50
|
+
protected int limit;
|
|
51
|
+
protected int limit_backward;
|
|
52
|
+
protected int bra;
|
|
53
|
+
protected int ket;
|
|
54
|
+
|
|
55
|
+
public SnowballProgram(SnowballProgram other) {
|
|
56
|
+
current = other.current;
|
|
57
|
+
cursor = other.cursor;
|
|
58
|
+
limit = other.limit;
|
|
59
|
+
limit_backward = other.limit_backward;
|
|
60
|
+
bra = other.bra;
|
|
61
|
+
ket = other.ket;
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
protected void copy_from(SnowballProgram other)
|
|
65
|
+
{
|
|
66
|
+
current = other.current;
|
|
67
|
+
cursor = other.cursor;
|
|
68
|
+
limit = other.limit;
|
|
69
|
+
limit_backward = other.limit_backward;
|
|
70
|
+
bra = other.bra;
|
|
71
|
+
ket = other.ket;
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
protected boolean in_grouping(char [] s, int min, int max)
|
|
75
|
+
{
|
|
76
|
+
if (cursor >= limit) return false;
|
|
77
|
+
char ch = current.charAt(cursor);
|
|
78
|
+
if (ch > max || ch < min) return false;
|
|
79
|
+
ch -= min;
|
|
80
|
+
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
|
|
81
|
+
cursor++;
|
|
82
|
+
return true;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
protected boolean in_grouping_b(char [] s, int min, int max)
|
|
86
|
+
{
|
|
87
|
+
if (cursor <= limit_backward) return false;
|
|
88
|
+
char ch = current.charAt(cursor - 1);
|
|
89
|
+
if (ch > max || ch < min) return false;
|
|
90
|
+
ch -= min;
|
|
91
|
+
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
|
|
92
|
+
cursor--;
|
|
93
|
+
return true;
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
protected boolean out_grouping(char [] s, int min, int max)
|
|
97
|
+
{
|
|
98
|
+
if (cursor >= limit) return false;
|
|
99
|
+
char ch = current.charAt(cursor);
|
|
100
|
+
if (ch > max || ch < min) {
|
|
101
|
+
cursor++;
|
|
102
|
+
return true;
|
|
103
|
+
}
|
|
104
|
+
ch -= min;
|
|
105
|
+
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
|
|
106
|
+
cursor++;
|
|
107
|
+
return true;
|
|
108
|
+
}
|
|
109
|
+
return false;
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
protected boolean out_grouping_b(char [] s, int min, int max)
|
|
113
|
+
{
|
|
114
|
+
if (cursor <= limit_backward) return false;
|
|
115
|
+
char ch = current.charAt(cursor - 1);
|
|
116
|
+
if (ch > max || ch < min) {
|
|
117
|
+
cursor--;
|
|
118
|
+
return true;
|
|
119
|
+
}
|
|
120
|
+
ch -= min;
|
|
121
|
+
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
|
|
122
|
+
cursor--;
|
|
123
|
+
return true;
|
|
124
|
+
}
|
|
125
|
+
return false;
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
protected boolean eq_s(CharSequence s)
|
|
129
|
+
{
|
|
130
|
+
if (limit - cursor < s.length()) return false;
|
|
131
|
+
int i;
|
|
132
|
+
for (i = 0; i != s.length(); i++) {
|
|
133
|
+
if (current.charAt(cursor + i) != s.charAt(i)) return false;
|
|
134
|
+
}
|
|
135
|
+
cursor += s.length();
|
|
136
|
+
return true;
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
protected boolean eq_s_b(CharSequence s)
|
|
140
|
+
{
|
|
141
|
+
if (cursor - limit_backward < s.length()) return false;
|
|
142
|
+
int i;
|
|
143
|
+
for (i = 0; i != s.length(); i++) {
|
|
144
|
+
if (current.charAt(cursor - s.length() + i) != s.charAt(i)) return false;
|
|
145
|
+
}
|
|
146
|
+
cursor -= s.length();
|
|
147
|
+
return true;
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
protected int find_among(Among v[])
|
|
151
|
+
{
|
|
152
|
+
int i = 0;
|
|
153
|
+
int j = v.length;
|
|
154
|
+
|
|
155
|
+
int c = cursor;
|
|
156
|
+
int l = limit;
|
|
157
|
+
|
|
158
|
+
int common_i = 0;
|
|
159
|
+
int common_j = 0;
|
|
160
|
+
|
|
161
|
+
boolean first_key_inspected = false;
|
|
162
|
+
|
|
163
|
+
while (true) {
|
|
164
|
+
int k = i + ((j - i) >> 1);
|
|
165
|
+
int diff = 0;
|
|
166
|
+
int common = common_i < common_j ? common_i : common_j; // smaller
|
|
167
|
+
Among w = v[k];
|
|
168
|
+
int i2;
|
|
169
|
+
for (i2 = common; i2 < w.s.length; i2++) {
|
|
170
|
+
if (c + common == l) {
|
|
171
|
+
diff = -1;
|
|
172
|
+
break;
|
|
173
|
+
}
|
|
174
|
+
diff = current.charAt(c + common) - w.s[i2];
|
|
175
|
+
if (diff != 0) break;
|
|
176
|
+
common++;
|
|
177
|
+
}
|
|
178
|
+
if (diff < 0) {
|
|
179
|
+
j = k;
|
|
180
|
+
common_j = common;
|
|
181
|
+
} else {
|
|
182
|
+
i = k;
|
|
183
|
+
common_i = common;
|
|
184
|
+
}
|
|
185
|
+
if (j - i <= 1) {
|
|
186
|
+
if (i > 0) break; // v->s has been inspected
|
|
187
|
+
if (j == i) break; // only one item in v
|
|
188
|
+
|
|
189
|
+
// - but now we need to go round once more to get
|
|
190
|
+
// v->s inspected. This looks messy, but is actually
|
|
191
|
+
// the optimal approach.
|
|
192
|
+
|
|
193
|
+
if (first_key_inspected) break;
|
|
194
|
+
first_key_inspected = true;
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
while (true) {
|
|
198
|
+
Among w = v[i];
|
|
199
|
+
if (common_i >= w.s.length) {
|
|
200
|
+
cursor = c + w.s.length;
|
|
201
|
+
if (w.method == null) return w.result;
|
|
202
|
+
boolean res;
|
|
203
|
+
try {
|
|
204
|
+
Object resobj = w.method.invoke(this);
|
|
205
|
+
res = resobj.toString().equals("true");
|
|
206
|
+
} catch (InvocationTargetException e) {
|
|
207
|
+
res = false;
|
|
208
|
+
// FIXME - debug message
|
|
209
|
+
} catch (IllegalAccessException e) {
|
|
210
|
+
res = false;
|
|
211
|
+
// FIXME - debug message
|
|
212
|
+
}
|
|
213
|
+
cursor = c + w.s.length;
|
|
214
|
+
if (res) return w.result;
|
|
215
|
+
}
|
|
216
|
+
i = w.substring_i;
|
|
217
|
+
if (i < 0) return 0;
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
// find_among_b is for backwards processing. Same comments apply
|
|
222
|
+
protected int find_among_b(Among v[])
|
|
223
|
+
{
|
|
224
|
+
int i = 0;
|
|
225
|
+
int j = v.length;
|
|
226
|
+
|
|
227
|
+
int c = cursor;
|
|
228
|
+
int lb = limit_backward;
|
|
229
|
+
|
|
230
|
+
int common_i = 0;
|
|
231
|
+
int common_j = 0;
|
|
232
|
+
|
|
233
|
+
boolean first_key_inspected = false;
|
|
234
|
+
|
|
235
|
+
while (true) {
|
|
236
|
+
int k = i + ((j - i) >> 1);
|
|
237
|
+
int diff = 0;
|
|
238
|
+
int common = common_i < common_j ? common_i : common_j;
|
|
239
|
+
Among w = v[k];
|
|
240
|
+
int i2;
|
|
241
|
+
for (i2 = w.s.length - 1 - common; i2 >= 0; i2--) {
|
|
242
|
+
if (c - common == lb) {
|
|
243
|
+
diff = -1;
|
|
244
|
+
break;
|
|
245
|
+
}
|
|
246
|
+
diff = current.charAt(c - 1 - common) - w.s[i2];
|
|
247
|
+
if (diff != 0) break;
|
|
248
|
+
common++;
|
|
249
|
+
}
|
|
250
|
+
if (diff < 0) {
|
|
251
|
+
j = k;
|
|
252
|
+
common_j = common;
|
|
253
|
+
} else {
|
|
254
|
+
i = k;
|
|
255
|
+
common_i = common;
|
|
256
|
+
}
|
|
257
|
+
if (j - i <= 1) {
|
|
258
|
+
if (i > 0) break;
|
|
259
|
+
if (j == i) break;
|
|
260
|
+
if (first_key_inspected) break;
|
|
261
|
+
first_key_inspected = true;
|
|
262
|
+
}
|
|
263
|
+
}
|
|
264
|
+
while (true) {
|
|
265
|
+
Among w = v[i];
|
|
266
|
+
if (common_i >= w.s.length) {
|
|
267
|
+
cursor = c - w.s.length;
|
|
268
|
+
if (w.method == null) return w.result;
|
|
269
|
+
|
|
270
|
+
boolean res;
|
|
271
|
+
try {
|
|
272
|
+
Object resobj = w.method.invoke(this);
|
|
273
|
+
res = resobj.toString().equals("true");
|
|
274
|
+
} catch (InvocationTargetException e) {
|
|
275
|
+
res = false;
|
|
276
|
+
// FIXME - debug message
|
|
277
|
+
} catch (IllegalAccessException e) {
|
|
278
|
+
res = false;
|
|
279
|
+
// FIXME - debug message
|
|
280
|
+
}
|
|
281
|
+
cursor = c - w.s.length;
|
|
282
|
+
if (res) return w.result;
|
|
283
|
+
}
|
|
284
|
+
i = w.substring_i;
|
|
285
|
+
if (i < 0) return 0;
|
|
286
|
+
}
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
/* to replace chars between c_bra and c_ket in current by the
|
|
290
|
+
* chars in s.
|
|
291
|
+
*/
|
|
292
|
+
protected int replace_s(int c_bra, int c_ket, String s)
|
|
293
|
+
{
|
|
294
|
+
int adjustment = s.length() - (c_ket - c_bra);
|
|
295
|
+
current.replace(c_bra, c_ket, s);
|
|
296
|
+
limit += adjustment;
|
|
297
|
+
if (cursor >= c_ket) cursor += adjustment;
|
|
298
|
+
else if (cursor > c_bra) cursor = c_bra;
|
|
299
|
+
return adjustment;
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
protected void slice_check()
|
|
303
|
+
{
|
|
304
|
+
if (bra < 0 ||
|
|
305
|
+
bra > ket ||
|
|
306
|
+
ket > limit ||
|
|
307
|
+
limit > current.length()) // this line could be removed
|
|
308
|
+
{
|
|
309
|
+
System.err.println("faulty slice operation");
|
|
310
|
+
// FIXME: report error somehow.
|
|
311
|
+
/*
|
|
312
|
+
fprintf(stderr, "faulty slice operation:\n");
|
|
313
|
+
debug(z, -1, 0);
|
|
314
|
+
exit(1);
|
|
315
|
+
*/
|
|
316
|
+
}
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
protected void slice_from(String s)
|
|
320
|
+
{
|
|
321
|
+
slice_check();
|
|
322
|
+
replace_s(bra, ket, s);
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
protected void slice_from(CharSequence s)
|
|
326
|
+
{
|
|
327
|
+
slice_from(s.toString());
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
protected void slice_del()
|
|
331
|
+
{
|
|
332
|
+
slice_from("");
|
|
333
|
+
}
|
|
334
|
+
|
|
335
|
+
protected void insert(int c_bra, int c_ket, String s)
|
|
336
|
+
{
|
|
337
|
+
int adjustment = replace_s(c_bra, c_ket, s);
|
|
338
|
+
if (c_bra <= bra) bra += adjustment;
|
|
339
|
+
if (c_bra <= ket) ket += adjustment;
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
protected void insert(int c_bra, int c_ket, CharSequence s)
|
|
343
|
+
{
|
|
344
|
+
insert(c_bra, c_ket, s.toString());
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
/* Copy the slice into the supplied StringBuilder */
|
|
348
|
+
protected void slice_to(StringBuilder s)
|
|
349
|
+
{
|
|
350
|
+
slice_check();
|
|
351
|
+
s.replace(0, s.length(), current.substring(bra, ket));
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
protected void assign_to(StringBuilder s)
|
|
355
|
+
{
|
|
356
|
+
s.replace(0, s.length(), current.substring(0, limit));
|
|
357
|
+
}
|
|
358
|
+
|
|
359
|
+
/*
|
|
360
|
+
extern void debug(struct SN_env * z, int number, int line_count)
|
|
361
|
+
{ int i;
|
|
362
|
+
int limit = SIZE(z->p);
|
|
363
|
+
//if (number >= 0) printf("%3d (line %4d): '", number, line_count);
|
|
364
|
+
if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
|
|
365
|
+
for (i = 0; i <= limit; i++)
|
|
366
|
+
{ if (z->lb == i) printf("{");
|
|
367
|
+
if (z->bra == i) printf("[");
|
|
368
|
+
if (z->c == i) printf("|");
|
|
369
|
+
if (z->ket == i) printf("]");
|
|
370
|
+
if (z->l == i) printf("}");
|
|
371
|
+
if (i < limit)
|
|
372
|
+
{ int ch = z->p[i];
|
|
373
|
+
if (ch == 0) ch = '#';
|
|
374
|
+
printf("%c", ch);
|
|
375
|
+
}
|
|
376
|
+
}
|
|
377
|
+
printf("'\n");
|
|
378
|
+
}
|
|
379
|
+
*/
|
|
380
|
+
|
|
381
|
+
};
|