mittens 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE.txt +30 -0
  5. data/README.md +62 -0
  6. data/Rakefile +21 -0
  7. data/ext/mittens/ext.c +96 -0
  8. data/ext/mittens/extconf.rb +12 -0
  9. data/lib/mittens/version.rb +3 -0
  10. data/lib/mittens.rb +7 -0
  11. data/mittens.gemspec +22 -0
  12. data/vendor/snowball/.gitignore +26 -0
  13. data/vendor/snowball/.travis.yml +112 -0
  14. data/vendor/snowball/AUTHORS +27 -0
  15. data/vendor/snowball/CONTRIBUTING.rst +216 -0
  16. data/vendor/snowball/COPYING +29 -0
  17. data/vendor/snowball/GNUmakefile +742 -0
  18. data/vendor/snowball/NEWS +754 -0
  19. data/vendor/snowball/README.rst +37 -0
  20. data/vendor/snowball/ada/README.md +74 -0
  21. data/vendor/snowball/ada/generate/generate.adb +83 -0
  22. data/vendor/snowball/ada/generate.gpr +21 -0
  23. data/vendor/snowball/ada/src/stemmer.adb +620 -0
  24. data/vendor/snowball/ada/src/stemmer.ads +219 -0
  25. data/vendor/snowball/ada/src/stemwords.adb +70 -0
  26. data/vendor/snowball/ada/stemmer_config.gpr +83 -0
  27. data/vendor/snowball/ada/stemwords.gpr +21 -0
  28. data/vendor/snowball/algorithms/arabic.sbl +558 -0
  29. data/vendor/snowball/algorithms/armenian.sbl +301 -0
  30. data/vendor/snowball/algorithms/basque.sbl +149 -0
  31. data/vendor/snowball/algorithms/catalan.sbl +202 -0
  32. data/vendor/snowball/algorithms/danish.sbl +93 -0
  33. data/vendor/snowball/algorithms/dutch.sbl +164 -0
  34. data/vendor/snowball/algorithms/english.sbl +229 -0
  35. data/vendor/snowball/algorithms/finnish.sbl +197 -0
  36. data/vendor/snowball/algorithms/french.sbl +254 -0
  37. data/vendor/snowball/algorithms/german.sbl +139 -0
  38. data/vendor/snowball/algorithms/german2.sbl +145 -0
  39. data/vendor/snowball/algorithms/greek.sbl +701 -0
  40. data/vendor/snowball/algorithms/hindi.sbl +323 -0
  41. data/vendor/snowball/algorithms/hungarian.sbl +241 -0
  42. data/vendor/snowball/algorithms/indonesian.sbl +192 -0
  43. data/vendor/snowball/algorithms/irish.sbl +149 -0
  44. data/vendor/snowball/algorithms/italian.sbl +202 -0
  45. data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +240 -0
  46. data/vendor/snowball/algorithms/lithuanian.sbl +373 -0
  47. data/vendor/snowball/algorithms/lovins.sbl +208 -0
  48. data/vendor/snowball/algorithms/nepali.sbl +92 -0
  49. data/vendor/snowball/algorithms/norwegian.sbl +80 -0
  50. data/vendor/snowball/algorithms/porter.sbl +139 -0
  51. data/vendor/snowball/algorithms/portuguese.sbl +218 -0
  52. data/vendor/snowball/algorithms/romanian.sbl +236 -0
  53. data/vendor/snowball/algorithms/russian.sbl +221 -0
  54. data/vendor/snowball/algorithms/serbian.sbl +2379 -0
  55. data/vendor/snowball/algorithms/spanish.sbl +230 -0
  56. data/vendor/snowball/algorithms/swedish.sbl +72 -0
  57. data/vendor/snowball/algorithms/tamil.sbl +405 -0
  58. data/vendor/snowball/algorithms/turkish.sbl +470 -0
  59. data/vendor/snowball/algorithms/yiddish.sbl +460 -0
  60. data/vendor/snowball/charsets/ISO-8859-2.sbl +98 -0
  61. data/vendor/snowball/charsets/KOI8-R.sbl +74 -0
  62. data/vendor/snowball/charsets/cp850.sbl +130 -0
  63. data/vendor/snowball/compiler/analyser.c +1547 -0
  64. data/vendor/snowball/compiler/driver.c +615 -0
  65. data/vendor/snowball/compiler/generator.c +1748 -0
  66. data/vendor/snowball/compiler/generator_ada.c +1702 -0
  67. data/vendor/snowball/compiler/generator_csharp.c +1322 -0
  68. data/vendor/snowball/compiler/generator_go.c +1278 -0
  69. data/vendor/snowball/compiler/generator_java.c +1313 -0
  70. data/vendor/snowball/compiler/generator_js.c +1316 -0
  71. data/vendor/snowball/compiler/generator_pascal.c +1387 -0
  72. data/vendor/snowball/compiler/generator_python.c +1337 -0
  73. data/vendor/snowball/compiler/generator_rust.c +1295 -0
  74. data/vendor/snowball/compiler/header.h +418 -0
  75. data/vendor/snowball/compiler/space.c +286 -0
  76. data/vendor/snowball/compiler/syswords.h +86 -0
  77. data/vendor/snowball/compiler/syswords2.h +13 -0
  78. data/vendor/snowball/compiler/tokeniser.c +567 -0
  79. data/vendor/snowball/csharp/.gitignore +8 -0
  80. data/vendor/snowball/csharp/Snowball/Algorithms/.gitignore +1 -0
  81. data/vendor/snowball/csharp/Snowball/Among.cs +108 -0
  82. data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +36 -0
  83. data/vendor/snowball/csharp/Snowball/Stemmer.cs +660 -0
  84. data/vendor/snowball/csharp/Stemwords/App.config +6 -0
  85. data/vendor/snowball/csharp/Stemwords/Program.cs +114 -0
  86. data/vendor/snowball/doc/TODO +12 -0
  87. data/vendor/snowball/doc/libstemmer_c_README +148 -0
  88. data/vendor/snowball/doc/libstemmer_csharp_README +53 -0
  89. data/vendor/snowball/doc/libstemmer_java_README +67 -0
  90. data/vendor/snowball/doc/libstemmer_js_README +48 -0
  91. data/vendor/snowball/doc/libstemmer_python_README +113 -0
  92. data/vendor/snowball/examples/stemwords.c +204 -0
  93. data/vendor/snowball/go/README.md +55 -0
  94. data/vendor/snowball/go/among.go +16 -0
  95. data/vendor/snowball/go/env.go +403 -0
  96. data/vendor/snowball/go/stemwords/generate.go +68 -0
  97. data/vendor/snowball/go/stemwords/main.go +68 -0
  98. data/vendor/snowball/go/util.go +34 -0
  99. data/vendor/snowball/iconv.py +50 -0
  100. data/vendor/snowball/include/libstemmer.h +78 -0
  101. data/vendor/snowball/java/org/tartarus/snowball/Among.java +29 -0
  102. data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +381 -0
  103. data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +8 -0
  104. data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +75 -0
  105. data/vendor/snowball/javascript/base-stemmer.js +294 -0
  106. data/vendor/snowball/javascript/stemwords.js +106 -0
  107. data/vendor/snowball/libstemmer/libstemmer_c.in +96 -0
  108. data/vendor/snowball/libstemmer/mkalgorithms.pl +90 -0
  109. data/vendor/snowball/libstemmer/mkmodules.pl +267 -0
  110. data/vendor/snowball/libstemmer/modules.txt +63 -0
  111. data/vendor/snowball/libstemmer/test.c +34 -0
  112. data/vendor/snowball/pascal/.gitignore +4 -0
  113. data/vendor/snowball/pascal/SnowballProgram.pas +430 -0
  114. data/vendor/snowball/pascal/generate.pl +23 -0
  115. data/vendor/snowball/pascal/stemwords-template.dpr +78 -0
  116. data/vendor/snowball/python/MANIFEST.in +7 -0
  117. data/vendor/snowball/python/create_init.py +54 -0
  118. data/vendor/snowball/python/setup.cfg +6 -0
  119. data/vendor/snowball/python/setup.py +81 -0
  120. data/vendor/snowball/python/snowballstemmer/among.py +13 -0
  121. data/vendor/snowball/python/snowballstemmer/basestemmer.py +323 -0
  122. data/vendor/snowball/python/stemwords.py +101 -0
  123. data/vendor/snowball/python/testapp.py +28 -0
  124. data/vendor/snowball/runtime/api.c +58 -0
  125. data/vendor/snowball/runtime/api.h +32 -0
  126. data/vendor/snowball/runtime/header.h +61 -0
  127. data/vendor/snowball/runtime/utilities.c +513 -0
  128. data/vendor/snowball/rust/Cargo.toml +7 -0
  129. data/vendor/snowball/rust/build.rs +55 -0
  130. data/vendor/snowball/rust/rust-pre-1.27-compat.patch +30 -0
  131. data/vendor/snowball/rust/src/main.rs +102 -0
  132. data/vendor/snowball/rust/src/snowball/algorithms/mod.rs +2 -0
  133. data/vendor/snowball/rust/src/snowball/among.rs +6 -0
  134. data/vendor/snowball/rust/src/snowball/mod.rs +6 -0
  135. data/vendor/snowball/rust/src/snowball/snowball_env.rs +421 -0
  136. data/vendor/snowball/tests/stemtest.c +95 -0
  137. metadata +178 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 5685ca729c53bcfeadf2241ffecd3eb536197dcf3863efd33e9b07aac114e4dd
4
+ data.tar.gz: a2a07b4b5d16a691695d1a420ef4acde948276077c9bce7cb98111b172bb71ab
5
+ SHA512:
6
+ metadata.gz: 2b4f3abde51e786665b83c3846cf5aa0409cae5cbf91dd8f715d3aa850a4c967c951dcd1475d9c6887aa991fc7a460be6f9205196ed7c8f4389c301de38b4931
7
+ data.tar.gz: b9ea741946ac5af5a27ae6685297a7416d1b1e64b0c780c8e0871332393990832a59821bed08ff21428a8c703dfd089ec89e0b376e34514f25f3f4c151620d00
data/CHANGELOG.md ADDED
@@ -0,0 +1,3 @@
1
+ ## 0.1.0 (2023-03-20)
2
+
3
+ - First release
data/Gemfile ADDED
@@ -0,0 +1,7 @@
1
+ source "https://rubygems.org"
2
+
3
+ gemspec
4
+
5
+ gem "rake"
6
+ gem "rake-compiler"
7
+ gem "minitest", ">= 5"
data/LICENSE.txt ADDED
@@ -0,0 +1,30 @@
1
+ Copyright (c) 2001, Dr Martin Porter
2
+ Copyright (c) 2004,2005, Richard Boulton
3
+ Copyright (c) 2013, Yoshiki Shibukawa
4
+ Copyright (c) 2006,2007,2009,2010,2011,2014-2019, Olly Betts
5
+ Copyright (c) 2022, Andrew Kane
6
+ All rights reserved.
7
+
8
+ Redistribution and use in source and binary forms, with or without
9
+ modification, are permitted provided that the following conditions
10
+ are met:
11
+
12
+ 1. Redistributions of source code must retain the above copyright notice,
13
+ this list of conditions and the following disclaimer.
14
+ 2. Redistributions in binary form must reproduce the above copyright notice,
15
+ this list of conditions and the following disclaimer in the documentation
16
+ and/or other materials provided with the distribution.
17
+ 3. Neither the name of the Mittens project nor the names of its contributors
18
+ may be used to endorse or promote products derived from this software
19
+ without specific prior written permission.
20
+
21
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
25
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
28
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/README.md ADDED
@@ -0,0 +1,62 @@
1
+ # Mittens
2
+
3
+ Stemming for Ruby, powered by [Snowball](https://github.com/snowballstem/snowball)
4
+
5
+ :snowflake: Supports 28 languages
6
+
7
+ [![Build Status](https://github.com/ankane/mittens/workflows/build/badge.svg?branch=master)](https://github.com/ankane/mittens/actions)
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application’s Gemfile:
12
+
13
+ ```ruby
14
+ gem "mittens"
15
+ ```
16
+
17
+ ## Getting Started
18
+
19
+ Create a stemmer
20
+
21
+ ```ruby
22
+ stemmer = Mittens::Stemmer.new
23
+ ```
24
+
25
+ Stem a word
26
+
27
+ ```ruby
28
+ stemmer.stem("tomatos")
29
+ ```
30
+
31
+ ## Languages
32
+
33
+ Specify the language
34
+
35
+ ```ruby
36
+ stemmer = Mittens::Stemmer.new(language: "french")
37
+ ```
38
+
39
+ Supports `arabic`, `armenian`, `basque`, `catalan`, `danish`, `dutch`, `english`, `finnish`, `french`, `german`, `greek`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `lithuanian`, `nepali`, `norwegian`, `porter`, `portuguese`, `romanian`, `russian`, `serbian`, `spanish`, `swedish`, `tamil`, `turkish`, and `yiddish`
40
+
41
+ ## History
42
+
43
+ View the [changelog](https://github.com/ankane/mittens/blob/master/CHANGELOG.md)
44
+
45
+ ## Contributing
46
+
47
+ Everyone is encouraged to help improve this project. Here are a few ways you can help:
48
+
49
+ - [Report bugs](https://github.com/ankane/mittens/issues)
50
+ - Fix bugs and [submit pull requests](https://github.com/ankane/mittens/pulls)
51
+ - Write, clarify, or fix documentation
52
+ - Suggest or add new features
53
+
54
+ To get started with development:
55
+
56
+ ```sh
57
+ git clone --recursive https://github.com/ankane/mittens.git
58
+ cd mittens
59
+ bundle install
60
+ bundle exec rake compile
61
+ bundle exec rake test
62
+ ```
data/Rakefile ADDED
@@ -0,0 +1,21 @@
1
+ require "bundler/gem_tasks"
2
+ require "rake/testtask"
3
+ require "rake/extensiontask"
4
+
5
# Running bare `rake` runs the test task.
task default: :test

# Test task: adds "test" to the load path and runs every file matching
# test/**/*_test.rb.
Rake::TestTask.new do |test_task|
  test_task.libs.push("test")
  test_task.pattern = "test/**/*_test.rb"
end

# Native extension task for the "mittens" gem: the extension is named "ext"
# and its compiled output is placed under lib/mittens.
Rake::ExtensionTask.new("mittens") do |extension|
  extension.name = "ext"
  extension.lib_dir = "lib/mittens"
end

# Deletes lib/mittens/ext.bundle when it exists; does nothing otherwise.
task :remove_ext do
  bundle = "lib/mittens/ext.bundle"
  File.unlink(bundle) if File.exist?(bundle)
end

# Make `rake build` run :remove_ext first.
Rake::Task["build"].enhance([:remove_ext])
data/ext/mittens/ext.c ADDED
@@ -0,0 +1,96 @@
1
#include <limits.h>

#include "libstemmer.h"
#include "ruby/ruby.h"
3
+
4
// State wrapped by a Ruby Stemmer object: a single pointer to the libstemmer
// handle.
typedef struct stemmer {
    struct sb_stemmer *stemmer;
} stemmer_t;
7
+
8
+ static void stemmer_free(void *ptr)
9
+ {
10
+ stemmer_t *stemmer = (stemmer_t *)ptr;
11
+ // safe to pass null pointer according to docs
12
+ sb_stemmer_delete(stemmer->stemmer);
13
+ xfree(ptr);
14
+ }
15
+
16
+ const rb_data_type_t stemmer_data_type = {
17
+ .wrap_struct_name = "stemmer",
18
+ .function = {
19
+ .dfree = stemmer_free,
20
+ },
21
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY
22
+ };
23
+
24
+ static VALUE stemmer_allocate(VALUE klass)
25
+ {
26
+ stemmer_t *stemmer;
27
+ VALUE obj = TypedData_Make_Struct(klass, stemmer_t, &stemmer_data_type, stemmer);
28
+ stemmer->stemmer = NULL;
29
+ return obj;
30
+ }
31
+
32
+ static VALUE stemmer_initialize(int argc, VALUE* argv, VALUE self)
33
+ {
34
+ VALUE opts;
35
+ rb_scan_args(argc, argv, ":", &opts);
36
+
37
+ const char * algorithm = "english";
38
+ if (!NIL_P(opts)) {
39
+ VALUE language = rb_hash_aref(opts, ID2SYM(rb_intern("language")));
40
+ if (!NIL_P(language)) {
41
+ Check_Type(language, T_STRING);
42
+ algorithm = RSTRING_PTR(language);
43
+ }
44
+ }
45
+
46
+ stemmer_t *stemmer;
47
+ TypedData_Get_Struct(self, stemmer_t, &stemmer_data_type, stemmer);
48
+
49
+ // in case called multiple times
50
+ sb_stemmer_delete(stemmer->stemmer);
51
+
52
+ // if adding support for encoding, may want to change encoding returned from stem
53
+ stemmer->stemmer = sb_stemmer_new(algorithm, NULL);
54
+ if (stemmer->stemmer == NULL) {
55
+ rb_raise(rb_eArgError, "unknown language: %s", algorithm);
56
+ }
57
+
58
+ return self;
59
+ }
60
+
61
+ static VALUE stemmer_stem(VALUE self, VALUE value)
62
+ {
63
+ stemmer_t *stemmer;
64
+ TypedData_Get_Struct(self, stemmer_t, &stemmer_data_type, stemmer);
65
+
66
+ Check_Type(value, T_STRING);
67
+
68
+ const sb_symbol * word = (const sb_symbol *) RSTRING_PTR(value);
69
+ int size = (int) RSTRING_LEN(value);
70
+ const sb_symbol * pointer_out = sb_stemmer_stem(stemmer->stemmer, word, size);
71
+
72
+ return rb_utf8_str_new_cstr((char *) pointer_out);
73
+ }
74
+
75
+ static VALUE stemmer_languages(VALUE klass)
76
+ {
77
+ VALUE out = rb_ary_new();
78
+
79
+ const char **language = sb_stemmer_list();
80
+ while (*language != NULL) {
81
+ rb_ary_push(out, rb_utf8_str_new_cstr(*language));
82
+ language++;
83
+ }
84
+
85
+ return out;
86
+ }
87
+
88
+ void Init_ext(void)
89
+ {
90
+ VALUE rb_mMittens = rb_define_module("Mittens");
91
+ VALUE rb_cStemmer = rb_define_class_under(rb_mMittens, "Stemmer", rb_cObject);
92
+ rb_define_alloc_func(rb_cStemmer, stemmer_allocate);
93
+ rb_define_method(rb_cStemmer, "initialize", stemmer_initialize, -1);
94
+ rb_define_method(rb_cStemmer, "stem", stemmer_stem, 1);
95
+ rb_define_singleton_method(rb_cStemmer, "languages", stemmer_languages, 0);
96
+ }
@@ -0,0 +1,12 @@
1
+ require "mkmf"
2
+ require "open3"
3
+
4
# Run `make` inside the vendored Snowball tree before generating the
# extension Makefile; echo its output and abort if it did not succeed.
snowball_dir = File.expand_path("../../vendor/snowball", __dir__)
make_output, make_status = Open3.capture2("make", chdir: snowball_dir)
puts make_output
raise "Command failed" unless make_status.success?

# Paths are expressed relative to $(srcdir) so they resolve from the
# generated Makefile: add Snowball's include directory and pass the
# libstemmer.a archive to the linker.
snowball_from_srcdir = "$(srcdir)/../../vendor/snowball"
$INCFLAGS += " -I#{snowball_from_srcdir}/include"
$LDFLAGS += " #{snowball_from_srcdir}/libstemmer.a"

create_makefile("mittens/ext")
@@ -0,0 +1,3 @@
1
+ module Mittens
2
+ VERSION = "0.1.0"
3
+ end
data/lib/mittens.rb ADDED
@@ -0,0 +1,7 @@
1
+ # modules
2
+ require_relative "mittens/ext"
3
+ require_relative "mittens/version"
4
+
5
+ module Mittens
6
+ class Error < StandardError; end
7
+ end
data/mittens.gemspec ADDED
@@ -0,0 +1,22 @@
1
+ require_relative "lib/mittens/version"
2
+
3
Gem::Specification.new do |gem|
  gem.name = "mittens"
  gem.version = Mittens::VERSION
  gem.summary = "Stemming for Ruby, powered by Snowball"
  gem.homepage = "https://github.com/ankane/mittens"
  gem.license = "BSD-3-Clause"

  gem.author = "Andrew Kane"
  gem.email = "andrew@ankane.org"

  # Package every path reported by `git ls-files` (submodules included),
  # dropping entries equal to __FILE__ and entries whose path starts with
  # bin/, test/, spec/, features/, .git, .travis, .circleci or appveyor.
  excluded = %r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)}
  gem.files = Dir.chdir(__dir__) do
    tracked = `git ls-files -z --recurse-submodules`.split("\x0")
    tracked.reject { |path| (path == __FILE__) || path.match(excluded) }
  end
  gem.require_path = "lib"
  gem.extensions = ["ext/mittens/extconf.rb"]

  gem.required_ruby_version = ">= 3"
end
@@ -0,0 +1,26 @@
1
+ *.o
2
+ /ada/bin/
3
+ /ada/obj/
4
+ /algorithms.mk
5
+ /libstemmer/libstemmer.c
6
+ /libstemmer/libstemmer_utf8.c
7
+ /libstemmer/mkinc.mak
8
+ /libstemmer/mkinc_utf8.mak
9
+ /libstemmer/modules.h
10
+ /libstemmer/modules_utf8.h
11
+ /libstemmer.a
12
+ /snowball
13
+ /src_c
14
+ /stemtest
15
+ /stemwords
16
+ /dist
17
+ /java/org/tartarus/snowball/ext/
18
+ /js_out
19
+ /python_check
20
+ /python_out
21
+ *.generated.cs
22
+ /rust/Cargo.lock
23
+ /rust/src/snowball/algorithms/*.rs
24
+ /rust/target/
25
+ /go/algorithms/
26
+ /go/stemwords/algorithms.go
@@ -0,0 +1,112 @@
1
+ language: minimal
2
+ arch: arm64
3
+ dist: xenial
4
+ env:
5
+ global: MAKE=make
6
+ matrix:
7
+ include:
8
+ - name: "C distribution build"
9
+ language: c
10
+ dist: focal
11
+ compiler: gcc
12
+ env: CFLAGS_DIST_BUILD='-O2 -Wall -W -std=c90 -Wdeclaration-after-statement -Werror'
13
+ - name: "C distribution build (clang)"
14
+ language: c
15
+ dist: focal
16
+ compiler: clang
17
+ env: CFLAGS_DIST_BUILD='-O2 -Wall -W -std=c90 -Wdeclaration-after-statement -Werror'
18
+ - language: c
19
+ compiler: gcc
20
+ env: c_tests=y CFLAGS='-O2 -Wall -W -std=c99 -Werror'
21
+ - language: c
22
+ compiler: clang
23
+ env: c_tests=y CFLAGS='-O2 -Wall -W -std=c99 -Werror'
24
+ - language: java
25
+ env: JAVA=java JAVAC=javac
26
+ - language: go
27
+ go: "1.8"
28
+ env: GO=go
29
+ - language: go
30
+ dist: bionic
31
+ go: "1.17"
32
+ env: GO=go
33
+ - language: node_js
34
+ node_js: "node"
35
+ env: NODE=node
36
+ dist: focal # seems to be needed for working node
37
+ - language: rust
38
+ rust:
39
+ - stable
40
+ - beta
41
+ dist: bionic
42
+ env: RUST=rust
43
+ - language: csharp
44
+ arch: amd64 # csharp doesn't seem to work on arm64
45
+ env: MCS=mcs
46
+ dist: bionic
47
+ - name: Pascal
48
+ env: FPC=fpc
49
+ dist: bionic
50
+ addons:
51
+ apt:
52
+ packages:
53
+ - fpc
54
+ # The pure Python versions run slowly so we need to thin the testdata
55
+ # for languages such as Arabic where there's a lot, or else the build
56
+ # hits the travis time limit. With pypy, it's enough faster that we
57
+ # can run the full tests.
58
+ - language: python
59
+ python: "3.9"
60
+ env: PYTHON=python THIN_FACTOR=10
61
+ - language: python
62
+ python: "3.7"
63
+ env: PYTHON=python THIN_FACTOR=10
64
+ - language: python
65
+ python: "3.6"
66
+ env: PYTHON=python THIN_FACTOR=10
67
+ - language: python
68
+ python: "pypy3.7-7.3.5"
69
+ env: PYTHON=python
70
+ dist: bionic
71
+ - name: "Ada"
72
+ env: gprbuild=gprbuild
73
+ dist: bionic
74
+ addons:
75
+ apt:
76
+ packages:
77
+ - gnat
78
+ - gprbuild
79
+ - os: windows
80
+ language: c
81
+ env: c_tests=y MAKE=mingw32-make
82
+ - os: windows
83
+ language: go
84
+ env: GO=go MAKE=mingw32-make
85
+
86
+ before_install:
87
+ # Try to check out a branch of the same name from the snowball-data repo
88
+ # sibling of this snowball repo, so that PRs requiring changes to both can be
89
+ # CI tested easily.
90
+ #
91
+ # If that fails, just use the standard snowball-data repo's default branch.
92
+ - GH_BRANCH=${TRAVIS_PULL_REQUEST_BRANCH:-$TRAVIS_BRANCH}
93
+ - GH_REPO_SLUG=${TRAVIS_PULL_REQUEST_SLUG:-$TRAVIS_REPO_SLUG}
94
+ - GH_REPO_URL=https://github.com/${GH_REPO_SLUG%%/*}/snowball-data.git
95
+ - echo "Trying branch $GH_BRANCH from $GH_REPO_URL"
96
+ - git clone --depth=1 -b "$GH_BRANCH" "$GH_REPO_URL" || git clone --depth=1 https://github.com/snowballstem/snowball-data.git
97
+
98
+ script:
99
+ # Ensure CC is set for building the compiler in non-C builds.
100
+ - test -n "$CC" || export CC=gcc
101
+ - $MAKE CC="$CC"
102
+ - test -z "$CFLAGS_DIST_BUILD" || { pip install setuptools && $MAKE dist && mkdir tmp && cd tmp && tar xf ../dist/libstemmer_c-*.tar.gz && cd libstemmer_c-* && $MAKE CFLAGS="$CFLAGS_DIST_BUILD" ; }
103
+ - test -z "$c_tests" || $MAKE check CC="$CC" STEMMING_DATA=snowball-data
104
+ - test -z "$PYTHON" || $MAKE check_python python="$PYTHON" STEMMING_DATA=snowball-data
105
+ - test -z "$JAVA" -o -z "$JAVAC" || $MAKE check_java STEMMING_DATA=snowball-data
106
+ - test -z "$MCS" || $MAKE check_csharp MCS="$MCS" STEMMING_DATA=snowball-data
107
+ - test -z "$NODE" || $MAKE check_js STEMMING_DATA=snowball-data
108
+ - test -z "$RUST" || $MAKE check_rust STEMMING_DATA=snowball-data
109
+ - test -z "$RUST" || $MAKE check_rust STEMMING_DATA=snowball-data
110
+ - test -z "$GO" || $MAKE check_go STEMMING_DATA=snowball-data
111
+ - test -z "$FPC" || $MAKE check_pascal STEMMING_DATA=snowball-data
112
+ - test -z "$gprbuild" || $MAKE check_ada STEMMING_DATA=snowball-data
@@ -0,0 +1,27 @@
1
+ Authors
2
+ =======
3
+
4
+ Martin Porter
5
+ -------------
6
+
7
+ - Designed the snowball language.
8
+ - Implemented the snowball to C compiler.
9
+ - Implemented the stemming algorithms in C.
10
+ - Wrote the documentation.
11
+
12
+ Richard Boulton
13
+ ---------------
14
+
15
+ - Implemented Java backend of the snowball compiler.
16
+ - Developed build system.
17
+ - Assisted with website maintenance.
18
+
19
+
20
+ Assistance from
21
+ ---------------
22
+
23
+ Olivier Bornet - fixes to java packaging and build system.
24
+ Andreas Jung - useful bug reports on the libstemmer library.
25
+ Olly Betts - several patches, bug reports, and performance improvements.
26
+ Sebastiano Vigna and Oerd Cukalla - patches for the Java stemming algorithms.
27
+ Ralf Junker - fix a potential memory leak in sb_stemmer_new().
@@ -0,0 +1,216 @@
1
+ Adding a new stemming algorithm
2
+ ===============================
3
+
4
+ This needs PRs against three repositories. Name the branch the same for
5
+ at least `snowball` and `snowball-data`, push to the latter repo first, and the
6
+ CI should use your new vocabulary list when running the testsuite.
7
+
8
+ Some points to note about algorithm implementations:
9
+
10
+ * Avoid literal non-ASCII characters in snowball string literals - they will
11
+ work OK for languages that use UTF-8, but not wide-character Unicode or other
12
+ encodings. Instead use ``stringdef`` like the existing stemmers do, and
13
+ please use the newer `U+` notation rather than the older ``hex`` or
14
+ ``decimal`` as this allows us to support different encodings without having
15
+ to modify the source files - for example::
16
+
17
+ stringdef o" {U+00F6}
18
+ define foo 'o{o"}'
19
+
20
+ not::
21
+
22
+ stringdef o" hex F6
23
+ define foo 'o{o"}'
24
+
25
+ and definitely not::
26
+
27
+ define foo 'oö'
28
+
29
+ It's helpful to consistently use the same ``stringdef`` codes across the
30
+ different stemmers - the website has `guidance on what to use
31
+ <https://snowballstem.org/codesets/guide.html>`_ and a `list of stringdef
32
+ lines for common characters to cut and paste from
33
+ <https://snowballstem.org/codesets/latin-stringdef-list.txt>`_.
34
+
35
+ snowball repo
36
+ -------------
37
+
38
+ Add `.sbl` source to algorithms subdirectory.
39
+
40
+ Add entry to `libstemmer/modules.txt`, maintaining the current sorted order by
41
+ the first column. The columns are:
42
+
43
+ * Algorithm name (needs to match the `.sbl` source without extension)
44
+ * Encodings to support. Wide-character Unicode is always supported
45
+ and doesn't need to be listed here. You should always include `UTF_8`, and
46
+ also any of `ISO_8859_1`, `ISO_8859_2` and `KOI8_R` which the language can
47
+ usefully be written using only characters from (in particular they need to
48
+ contain all the characters the stemmer explicitly uses). Support for other
49
+ single-byte character sets is easy to add if they're useful.
50
+ * Names and ISO-639 codes for the language. Wikipedia has a handy list of `all
51
+ the ISO-639 codes <https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes>`_ -
52
+ find the row for your new language and include the codes from the "639-1",
53
+ "639-2/T" and (if different) "639-2/B" columns. For example, for the `Afar`
54
+ language you'd put `afar,aa,aar` here.
55
+
56
+ snowball-data repo
57
+ ------------------
58
+
59
+ Add subdirectory named after new stemmer containing:
60
+
61
+ * voc.txt - word list
62
+ * output.txt - stemmed equivalents
63
+ * COPYING - licensing details (word lists need to be under an OSI-approved
64
+ licence)
65
+
66
+ If you don't have access to a suitably licensed word list of a suitable size,
67
+ you may be able to use the `wikipedia-most-common-words` script to generate
68
+ one by extracting the most frequent words from a Wikipedia dump in the
69
+ language the stemmer is for. You need to specify the Unicode "script" (that's
70
+ "script" in the sense of alphabet) to use - you can find the appropriate one
71
+ by looking in the Unicode `Scripts.txt`::
72
+
73
+ https://www.unicode.org/Public/13.0.0/ucd/Scripts.txt
74
+
75
+ The script name is the second column, between `;` and `#`. The first entries
76
+ are all "Common" which isn't what you want - scroll down to get to the entries
77
+ that are useful here.
78
+
79
+ You also need to specify the minimum frequency to select. Picking this value
80
+ will probably need some experimentation as the appropriate threshold depends on
81
+ how much data there is in the wikipedia dump for a particular language, as well
82
+ as the size of the vocabulary for the language, and how inflected the language
83
+ is. Try counting the number of unique words extracted (`wc -l voc.txt` on
84
+ Unix) and also looking through the list - some proper nouns, words from other
85
+ languages, typos, etc are OK (since the stemmer will encounter all these in
86
+ practice too), but at some point "more" stops being "better".
87
+
88
+ snowball-website repo
89
+ ---------------------
90
+
91
+ Create subdirectory of `algorithms/` named after the language.
92
+
93
+ Create `stemmer.tt` which describes the stemming algorithm. This is a
94
+ "template toolkit" template which is essentially a mix of HTML and some
95
+ macros for adding the navigation, sample vocabulary, etc. See the
96
+ existing `stemmer.tt` files for other algorithms for inspiration.
97
+
98
+ If it is based on an academic paper, cite the paper and describe any difference
99
+ between your implementation and that described in the paper (for example,
100
+ sometimes papers have ambiguities that need resolving to re-implement the
101
+ algorithm described).
102
+
103
+ If you have a stopword list, add that as `stop.txt` in your new subdirectory.
104
+ The `generate` script checks if such a file exists and if it does a link to
105
+ it is automatically added.
106
+
107
+ Link to your new `stemmer.tt` from `algorithms/index.tt`.
108
+
109
+ Add a news entry to `index.tt`.
110
+
111
+ Add the new stemmer to the online demo. Assuming you have checkouts of the
112
+ `snowball`, `snowball-data` and `snowball-website` repos in sibling
113
+ directories:
114
+
115
+ * run `make check_js` in the `snowball` repo
116
+ * run `./update-js`
117
+ * add the new stemmer to git with: `git add js/*-stemmer.js`
118
+ * if the new language is written right-to-left (RTL) then add it to the check
119
+ in `demo.tt` (search for `rtl` to find the place to change.)
120
+ * `git commit`.
121
+
122
+ Adding a new programming language generator
123
+ ===========================================
124
+
125
+ This is a short guide to adding support for generating code for another
126
+ programming language.
127
+
128
+ Is a new generator the right solution?
129
+ --------------------------------------
130
+
131
+ Adding a new code generator is probably not your only option if you want
132
+ to use Snowball from another language - most languages have support for
133
+ writing bindings to a C library, so this is probably another option.
134
+
135
+ Generating code can have advantages. For example, it can be simpler to
136
+ deploy without C bindings which need to be built for a specific platform.
137
+
138
+ However, it's likely to be significantly more work to implement a new generator
139
+ than to write bindings to the generated C code, especially as the libstemmer
140
+ C API is a very small and simple one. Generated code can also be slower -
141
+ currently the Snowball compiler often generates code that assumes an optimising
142
+ compiler will clean up redundant constructs, which is not a problem for C, and
143
+ probably not for most compiled languages, but for a language like Python C
144
+ bindings are much faster than the generated Python code (using pypy helps a
145
+ lot, but is still slower). See doc/libstemmer_python_README for some timings.
146
+
147
+ That said, the unoptimised generated code has improved over time, and is likely
148
+ to improve further in the future.
149
+
150
+ Key problems to solve
151
+ ---------------------
152
+
153
+ * A key problem to solve is how to map the required flow of control in response
154
+ to Snowball signals.
155
+
156
+ In the generated C code this is mostly done using `goto`. If your language
157
+ doesn't provide an equivalent to `goto` then you'll need an alternative
158
+ solution.
159
+
160
+ In Java and JavaScript we use labelled `break` from blocks and loops
161
+ instead. If your language has an equivalent to this feature, that will
162
+ probably work.
163
+
164
+ For Python, we currently generate a `try:` ... `raise lab123` ...
165
+ `except lab123: pass` construct. This works, but doesn't seem ideal.
166
+
167
+ If one of the mechanisms above sounds suitable then take a look at the
168
+ generator for the respective generated output and generator code. If
169
+ not, come and talk to us on the snowball-discuss mailing list.
170
+
171
+ * Snowball's division is specified as integer division with semantics
172
+ matching C - i.e. the result should be truncated (rounded towards zero).
173
+ Some languages lack a built-in integer division operation, or have one
174
+ which instead implements rounding towards negative infinity. Existing
175
+ backends with special handling here which may be useful to look at
176
+ include Javascript, Pascal and Python.
177
+
178
+ Don't hardcode algorithm names
179
+ ------------------------------
180
+
181
+ We want to avoid hard-coded lists of algorithms in the language-specific code
182
+ that have to be manually updated each time a new algorithm is added, because
183
+ that adds some extra tedious work for adding a new algorithm, and mechanical
184
+ updates done by hand tend to miss places that need updating, or code gets
185
+ copied and pasted from an existing case but not fully updated.
186
+
187
+ All the existing language backends generate any such code at build time, and
188
+ adding a new algorithm just requires updating `libstemmer/modules.txt`.
189
+
190
+ You can probably copy the approach used for Pascal (script `pascal/generate.pl`
191
+ works from template `stemwords-template.dpr` which has marked blocks of code
192
+ that get expanded for each stemming algorithm with a placeholder replaced by
193
+ the algorithm name). For an alternative approach, see Rust where this is done
194
+ by `rust/build.rs`.
195
+
196
+ Mechanics of adding a new generator
197
+ -----------------------------------
198
+
199
+ Copy an existing `compiler/generator_*.c` for your new language and modify
200
+ away (`generator.c` has the generator for C, but also some common functions
201
+ so if you start from this one you'll need to remove those common functions).
202
+ Please resist reformatting existing C code - there's currently a lot of code
203
+ repeated in each generator which ought to be pulled out as common code, and
204
+ if you reformat that just makes that job harder.
205
+
206
+ Add your new source to `COMPILER_SOURCES` in `GNUmakefile`.
207
+
208
+ Add prototypes for the new functions to `compiler/header.h`.
209
+
210
+ Add support to `compiler/driver.c`.
211
+
212
+ Add targets to `GNUmakefile` to run tests for the new language.
213
+
214
+ Hook up automated testing via CI in `.travis.yml`.
215
+
216
+ Add to the list of languages in `README.rst`.