mittens 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +3 -3
- data/lib/mittens/version.rb +1 -1
- data/vendor/snowball/.github/workflows/ci.yml +216 -0
- data/vendor/snowball/CONTRIBUTING.rst +111 -62
- data/vendor/snowball/GNUmakefile +194 -136
- data/vendor/snowball/NEWS +798 -3
- data/vendor/snowball/README.rst +50 -1
- data/vendor/snowball/ada/src/stemmer.adb +25 -13
- data/vendor/snowball/ada/src/stemmer.ads +9 -9
- data/vendor/snowball/ada/stemmer_config.gpr +7 -7
- data/vendor/snowball/algorithms/basque.sbl +4 -19
- data/vendor/snowball/algorithms/catalan.sbl +2 -9
- data/vendor/snowball/algorithms/danish.sbl +1 -1
- data/vendor/snowball/algorithms/dutch.sbl +284 -122
- data/vendor/snowball/algorithms/dutch_porter.sbl +178 -0
- data/vendor/snowball/algorithms/english.sbl +52 -37
- data/vendor/snowball/algorithms/esperanto.sbl +157 -0
- data/vendor/snowball/algorithms/estonian.sbl +269 -0
- data/vendor/snowball/algorithms/finnish.sbl +2 -3
- data/vendor/snowball/algorithms/french.sbl +42 -16
- data/vendor/snowball/algorithms/german.sbl +35 -14
- data/vendor/snowball/algorithms/greek.sbl +76 -76
- data/vendor/snowball/algorithms/hungarian.sbl +8 -6
- data/vendor/snowball/algorithms/indonesian.sbl +14 -8
- data/vendor/snowball/algorithms/italian.sbl +11 -21
- data/vendor/snowball/algorithms/lithuanian.sbl +36 -37
- data/vendor/snowball/algorithms/lovins.sbl +0 -1
- data/vendor/snowball/algorithms/nepali.sbl +138 -37
- data/vendor/snowball/algorithms/norwegian.sbl +19 -5
- data/vendor/snowball/algorithms/porter.sbl +2 -2
- data/vendor/snowball/algorithms/portuguese.sbl +9 -13
- data/vendor/snowball/algorithms/romanian.sbl +17 -4
- data/vendor/snowball/algorithms/serbian.sbl +467 -468
- data/vendor/snowball/algorithms/spanish.sbl +5 -7
- data/vendor/snowball/algorithms/swedish.sbl +60 -6
- data/vendor/snowball/algorithms/tamil.sbl +207 -176
- data/vendor/snowball/algorithms/turkish.sbl +461 -445
- data/vendor/snowball/algorithms/yiddish.sbl +36 -38
- data/vendor/snowball/compiler/analyser.c +445 -192
- data/vendor/snowball/compiler/driver.c +109 -101
- data/vendor/snowball/compiler/generator.c +853 -464
- data/vendor/snowball/compiler/generator_ada.c +404 -366
- data/vendor/snowball/compiler/generator_csharp.c +297 -260
- data/vendor/snowball/compiler/generator_go.c +323 -254
- data/vendor/snowball/compiler/generator_java.c +326 -252
- data/vendor/snowball/compiler/generator_js.c +362 -252
- data/vendor/snowball/compiler/generator_pascal.c +349 -197
- data/vendor/snowball/compiler/generator_python.c +257 -240
- data/vendor/snowball/compiler/generator_rust.c +423 -251
- data/vendor/snowball/compiler/header.h +117 -71
- data/vendor/snowball/compiler/space.c +137 -68
- data/vendor/snowball/compiler/syswords.h +2 -2
- data/vendor/snowball/compiler/tokeniser.c +125 -107
- data/vendor/snowball/csharp/Snowball/Among.cs +14 -14
- data/vendor/snowball/csharp/Snowball/AssemblyInfo.cs +7 -7
- data/vendor/snowball/csharp/Snowball/Stemmer.cs +57 -37
- data/vendor/snowball/csharp/Stemwords/App.config +2 -2
- data/vendor/snowball/csharp/Stemwords/Program.cs +16 -12
- data/vendor/snowball/doc/libstemmer_c_README +7 -4
- data/vendor/snowball/doc/libstemmer_csharp_README +4 -1
- data/vendor/snowball/doc/libstemmer_java_README +12 -1
- data/vendor/snowball/doc/libstemmer_js_README +6 -4
- data/vendor/snowball/doc/libstemmer_python_README +9 -4
- data/vendor/snowball/examples/stemwords.c +12 -12
- data/vendor/snowball/go/env.go +107 -31
- data/vendor/snowball/go/util.go +0 -4
- data/vendor/snowball/include/libstemmer.h +4 -0
- data/vendor/snowball/java/org/tartarus/snowball/Among.java +32 -15
- data/vendor/snowball/java/org/tartarus/snowball/SnowballProgram.java +347 -261
- data/vendor/snowball/java/org/tartarus/snowball/SnowballStemmer.java +3 -0
- data/vendor/snowball/java/org/tartarus/snowball/TestApp.java +52 -37
- data/vendor/snowball/javascript/base-stemmer.js +186 -2
- data/vendor/snowball/javascript/stemwords.js +3 -6
- data/vendor/snowball/libstemmer/libstemmer_c.in +1 -1
- data/vendor/snowball/libstemmer/mkalgorithms.pl +6 -6
- data/vendor/snowball/libstemmer/mkmodules.pl +2 -2
- data/vendor/snowball/libstemmer/modules.txt +13 -10
- data/vendor/snowball/libstemmer/test.c +1 -1
- data/vendor/snowball/pascal/SnowballProgram.pas +84 -2
- data/vendor/snowball/pascal/generate.pl +13 -13
- data/vendor/snowball/python/create_init.py +4 -1
- data/vendor/snowball/python/setup.cfg +0 -3
- data/vendor/snowball/python/setup.py +8 -3
- data/vendor/snowball/python/snowballstemmer/basestemmer.py +20 -54
- data/vendor/snowball/python/stemwords.py +8 -12
- data/vendor/snowball/runtime/api.c +10 -5
- data/vendor/snowball/runtime/header.h +10 -9
- data/vendor/snowball/runtime/utilities.c +9 -9
- data/vendor/snowball/rust/build.rs +1 -1
- data/vendor/snowball/rust/src/snowball/snowball_env.rs +83 -5
- data/vendor/snowball/tests/stemtest.c +7 -4
- metadata +7 -7
- data/vendor/snowball/.travis.yml +0 -112
- data/vendor/snowball/algorithms/german2.sbl +0 -145
- data/vendor/snowball/algorithms/kraaij_pohlmann.sbl +0 -240
- data/vendor/snowball/compiler/syswords2.h +0 -13
data/vendor/snowball/GNUmakefile
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
# After changing this, run `make update_version` to update various sources
|
4
4
|
# which hard-code it.
|
5
|
-
SNOWBALL_VERSION =
|
5
|
+
SNOWBALL_VERSION = 3.0.1
|
6
6
|
|
7
7
|
ifeq ($(OS),Windows_NT)
|
8
8
|
EXEEXT = .exe
|
@@ -10,8 +10,9 @@ endif
|
|
10
10
|
|
11
11
|
c_src_dir = src_c
|
12
12
|
|
13
|
+
JAVACFLAGS ?=
|
13
14
|
JAVAC ?= javac
|
14
|
-
JAVA ?= java
|
15
|
+
JAVA ?= java -ea
|
15
16
|
java_src_main_dir = java/org/tartarus/snowball
|
16
17
|
java_src_dir = $(java_src_main_dir)/ext
|
17
18
|
|
@@ -22,6 +23,8 @@ csharp_src_dir = $(csharp_src_main_dir)/Algorithms
|
|
22
23
|
csharp_sample_dir = csharp/Stemwords
|
23
24
|
|
24
25
|
FPC ?= fpc
|
26
|
+
# Enable warnings, info, notes; select "FILE:LINE:" diagnostic format.
|
27
|
+
FPC_FLAGS ?= -veiwnr
|
25
28
|
pascal_src_dir = pascal
|
26
29
|
|
27
30
|
python ?= python3
|
@@ -32,7 +35,8 @@ python_sample_dir = sample
|
|
32
35
|
js_output_dir = js_out
|
33
36
|
js_runtime_dir = javascript
|
34
37
|
js_sample_dir = sample
|
35
|
-
|
38
|
+
JSRUN ?= node
|
39
|
+
JSTYPE ?= global
|
36
40
|
|
37
41
|
cargo ?= cargo
|
38
42
|
cargoflags ?= --release
|
@@ -56,6 +60,22 @@ endif
|
|
56
60
|
ICONV = iconv
|
57
61
|
#ICONV = python ./iconv.py
|
58
62
|
|
63
|
+
# Where the data files are located - assumes their repo is checked out as
|
64
|
+
# a sibling to this one.
|
65
|
+
STEMMING_DATA ?= ../snowball-data
|
66
|
+
STEMMING_DATA_ABS := $(abspath $(STEMMING_DATA))
|
67
|
+
|
68
|
+
# Keep one in $(THIN_FACTOR) entries from gzipped vocabularies.
|
69
|
+
THIN_FACTOR ?= 3
|
70
|
+
|
71
|
+
ifneq (1,$(THIN_FACTOR))
|
72
|
+
ifneq (,$(THIN_FACTOR))
|
73
|
+
# Command to thin out the testdata. Used for Python tests, which otherwise
|
74
|
+
# take a long time (unless you use pypy).
|
75
|
+
THIN_TEST_DATA := |awk '(FNR % $(THIN_FACTOR) == 0){print}'
|
76
|
+
endif
|
77
|
+
endif
|
78
|
+
|
59
79
|
tarball_ext = .tar.gz
|
60
80
|
|
61
81
|
# algorithms.mk is generated from libstemmer/modules.txt and defines:
|
@@ -65,7 +85,7 @@ tarball_ext = .tar.gz
|
|
65
85
|
# * KOI8_R_algorithms
|
66
86
|
include algorithms.mk
|
67
87
|
|
68
|
-
other_algorithms =
|
88
|
+
other_algorithms = lovins
|
69
89
|
|
70
90
|
all_algorithms = $(libstemmer_algorithms) $(other_algorithms)
|
71
91
|
|
@@ -84,8 +104,7 @@ COMPILER_SOURCES = compiler/space.c \
|
|
84
104
|
compiler/generator_ada.c
|
85
105
|
|
86
106
|
COMPILER_HEADERS = compiler/header.h \
|
87
|
-
compiler/syswords.h
|
88
|
-
compiler/syswords2.h
|
107
|
+
compiler/syswords.h
|
89
108
|
|
90
109
|
RUNTIME_SOURCES = runtime/api.c \
|
91
110
|
runtime/utilities.c
|
@@ -151,7 +170,8 @@ CSHARP_SOURCES = $(libstemmer_algorithms:%=$(csharp_src_dir)/%Stemmer.generated.
|
|
151
170
|
PASCAL_SOURCES = $(ISO_8859_1_algorithms:%=$(pascal_src_dir)/%Stemmer.pas)
|
152
171
|
PYTHON_SOURCES = $(libstemmer_algorithms:%=$(python_output_dir)/%_stemmer.py) \
|
153
172
|
$(python_output_dir)/__init__.py
|
154
|
-
JS_SOURCES = $(libstemmer_algorithms:%=$(js_output_dir)/%-stemmer.js)
|
173
|
+
JS_SOURCES = $(libstemmer_algorithms:%=$(js_output_dir)/%-stemmer.js) \
|
174
|
+
$(js_output_dir)/base-stemmer.js
|
155
175
|
RUST_SOURCES = $(libstemmer_algorithms:%=$(rust_src_dir)/%_stemmer.rs)
|
156
176
|
GO_SOURCES = $(libstemmer_algorithms:%=$(go_src_dir)/%_stemmer.go) \
|
157
177
|
$(go_src_main_dir)/stemwords/algorithms.go
|
@@ -170,7 +190,7 @@ C_OTHER_OBJECTS = $(C_OTHER_SOURCES:.c=.o)
|
|
170
190
|
JAVA_CLASSES = $(JAVA_SOURCES:.java=.class)
|
171
191
|
JAVA_RUNTIME_CLASSES=$(JAVARUNTIME_SOURCES:.java=.class)
|
172
192
|
|
173
|
-
CFLAGS=-O2 -W -Wall -Wmissing-prototypes -Wmissing-declarations
|
193
|
+
CFLAGS=-g -O2 -W -Wall -Wmissing-prototypes -Wmissing-declarations -Wshadow $(WERROR)
|
174
194
|
CPPFLAGS=
|
175
195
|
|
176
196
|
INCLUDES=-Iinclude
|
@@ -195,6 +215,7 @@ clean:
|
|
195
215
|
$(JS_SOURCES) \
|
196
216
|
$(RUST_SOURCES) \
|
197
217
|
$(ADA_SOURCES) ada/bin/generate ada/bin/stemwords \
|
218
|
+
stemtest$(EXEEXT) $(STEMTEST_OBJECTS) \
|
198
219
|
libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak \
|
199
220
|
libstemmer/libstemmer.c libstemmer/libstemmer_utf8.c \
|
200
221
|
algorithms.mk
|
@@ -203,6 +224,17 @@ clean:
|
|
203
224
|
-rmdir $(python_output_dir)
|
204
225
|
-rmdir $(js_output_dir)
|
205
226
|
|
227
|
+
update_version:
|
228
|
+
perl -pi -e 's/(SNOWBALL_VERSION.*?)\d+\.\d+\.\d+/$${1}$(SNOWBALL_VERSION)/' \
|
229
|
+
compiler/header.h \
|
230
|
+
csharp/Snowball/AssemblyInfo.cs \
|
231
|
+
python/setup.py
|
232
|
+
|
233
|
+
.PHONY: all clean update_version
|
234
|
+
|
235
|
+
$(STEMMING_DATA)/% $(STEMMING_DATA_ABS)/%:
|
236
|
+
@[ -f '$@' ] || { echo '$@: Test data not found'; echo 'Checkout the snowball-data repo as "$(STEMMING_DATA_ABS)"'; exit 1; }
|
237
|
+
|
206
238
|
snowball$(EXEEXT): $(COMPILER_OBJECTS)
|
207
239
|
$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^
|
208
240
|
|
@@ -244,110 +276,72 @@ pascal/stemwords.dpr: pascal/stemwords-template.dpr libstemmer/modules.txt
|
|
244
276
|
pascal/generate.pl $(ISO_8859_1_algorithms) < pascal/stemwords-template.dpr > $@
|
245
277
|
|
246
278
|
pascal/stemwords: $(PASCAL_STEMWORDS_SOURCES) $(PASCAL_RUNTIME_SOURCES) $(PASCAL_SOURCES)
|
247
|
-
$(FPC) -o$@ -Mdelphi $(PASCAL_STEMWORDS_SOURCES)
|
279
|
+
$(FPC) $(FPC_FLAGS) -o$@ -Mdelphi $(PASCAL_STEMWORDS_SOURCES)
|
248
280
|
|
249
281
|
$(c_src_dir)/stem_UTF_8_%.c $(c_src_dir)/stem_UTF_8_%.h: algorithms/%.sbl snowball$(EXEEXT)
|
250
282
|
@mkdir -p $(c_src_dir)
|
251
|
-
|
252
|
-
o="$(c_src_dir)/stem_UTF_8_$${l}"; \
|
253
|
-
echo "./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u"; \
|
254
|
-
./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u
|
283
|
+
./snowball $< -o "$(c_src_dir)/stem_UTF_8_$*" -eprefix $*_UTF_8_ -r ../runtime -u
|
255
284
|
|
256
285
|
$(c_src_dir)/stem_KOI8_R_%.c $(c_src_dir)/stem_KOI8_R_%.h: algorithms/%.sbl snowball$(EXEEXT)
|
257
286
|
@mkdir -p $(c_src_dir)
|
258
|
-
|
259
|
-
o="$(c_src_dir)/stem_KOI8_R_$${l}"; \
|
260
|
-
echo "./snowball charsets/KOI8-R.sbl $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime"; \
|
261
|
-
./snowball charsets/KOI8-R.sbl $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime
|
287
|
+
./snowball charsets/KOI8-R.sbl $< -o "$(c_src_dir)/stem_KOI8_R_$*" -eprefix $*_KOI8_R_ -r ../runtime
|
262
288
|
|
263
289
|
$(c_src_dir)/stem_ISO_8859_1_%.c $(c_src_dir)/stem_ISO_8859_1_%.h: algorithms/%.sbl snowball$(EXEEXT)
|
264
290
|
@mkdir -p $(c_src_dir)
|
265
|
-
|
266
|
-
o="$(c_src_dir)/stem_ISO_8859_1_$${l}"; \
|
267
|
-
echo "./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime"; \
|
268
|
-
./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime
|
291
|
+
./snowball $< -o "$(c_src_dir)/stem_ISO_8859_1_$*" -eprefix $*_ISO_8859_1_ -r ../runtime
|
269
292
|
|
270
293
|
$(c_src_dir)/stem_ISO_8859_2_%.c $(c_src_dir)/stem_ISO_8859_2_%.h: algorithms/%.sbl snowball$(EXEEXT)
|
271
294
|
@mkdir -p $(c_src_dir)
|
272
|
-
|
273
|
-
o="$(c_src_dir)/stem_ISO_8859_2_$${l}"; \
|
274
|
-
echo "./snowball charsets/ISO-8859-2.sbl $< -o $${o} -eprefix $${l}_ISO_8859_2_ -r ../runtime"; \
|
275
|
-
./snowball charsets/ISO-8859-2.sbl $< -o $${o} -eprefix $${l}_ISO_8859_2_ -r ../runtime
|
295
|
+
./snowball charsets/ISO-8859-2.sbl $< -o "$(c_src_dir)/stem_ISO_8859_2_$*" -eprefix $*_ISO_8859_2_ -r ../runtime
|
276
296
|
|
277
297
|
$(c_src_dir)/stem_%.o: $(c_src_dir)/stem_%.c $(c_src_dir)/stem_%.h
|
278
298
|
$(CC) $(CFLAGS) $(INCLUDES) $(CPPFLAGS) -c -o $@ $<
|
279
299
|
|
280
300
|
$(java_src_dir)/%Stemmer.java: algorithms/%.sbl snowball$(EXEEXT)
|
281
301
|
@mkdir -p $(java_src_dir)
|
282
|
-
|
283
|
-
o="$(java_src_dir)/$${l}Stemmer"; \
|
284
|
-
echo "./snowball $< -j -o $${o} -p org.tartarus.snowball.SnowballStemmer"; \
|
285
|
-
./snowball $< -j -o $${o} -p org.tartarus.snowball.SnowballStemmer
|
302
|
+
./snowball $< -j -o "$(java_src_dir)/$*Stemmer" -p org.tartarus.snowball.SnowballStemmer
|
286
303
|
|
287
304
|
$(csharp_src_dir)/%Stemmer.generated.cs: algorithms/%.sbl snowball$(EXEEXT)
|
288
305
|
@mkdir -p $(csharp_src_dir)
|
289
|
-
|
290
|
-
t=`echo "$${l}" | sed 's/.*/\L&/; s/[a-z]*/\u&/g'`; \
|
291
|
-
o="$(csharp_src_dir)/$${l}Stemmer.generated"; \
|
292
|
-
echo "./snowball $< -cs -o $${o}"; \
|
293
|
-
./snowball $< -cs -o $${o}
|
306
|
+
./snowball $< -cs -o "$(csharp_src_dir)/$*Stemmer.generated"
|
294
307
|
|
295
308
|
$(pascal_src_dir)/%Stemmer.pas: algorithms/%.sbl snowball$(EXEEXT)
|
296
309
|
@mkdir -p $(pascal_src_dir)
|
297
|
-
|
298
|
-
t=`echo "$${l}" | sed 's/.*/\L&/; s/[a-z]*/\u&/g'`; \
|
299
|
-
o="$(pascal_src_dir)/$${l}Stemmer"; \
|
300
|
-
echo "./snowball $< -pascal -o $${o}"; \
|
301
|
-
./snowball $< -pascal -o $${o}
|
310
|
+
./snowball $< -pascal -o "$(pascal_src_dir)/$*Stemmer"
|
302
311
|
|
303
312
|
$(python_output_dir)/%_stemmer.py: algorithms/%.sbl snowball$(EXEEXT)
|
304
313
|
@mkdir -p $(python_output_dir)
|
305
|
-
|
306
|
-
o="$(python_output_dir)/$${l}_stemmer"; \
|
307
|
-
echo "./snowball $< -py -o $${o}"; \
|
308
|
-
./snowball $< -py -o $${o}
|
314
|
+
./snowball $< -py -o "$(python_output_dir)/$*_stemmer"
|
309
315
|
|
310
|
-
$(python_output_dir)/__init__.py:
|
311
|
-
@mkdir -p $(python_output_dir)
|
316
|
+
$(python_output_dir)/__init__.py: $(libstemmer_algorithms:%=$(python_output_dir)/%_stemmer.py)
|
312
317
|
$(python) python/create_init.py $(python_output_dir)
|
313
318
|
|
314
319
|
$(rust_src_dir)/%_stemmer.rs: algorithms/%.sbl snowball$(EXEEXT)
|
315
320
|
@mkdir -p $(rust_src_dir)
|
316
|
-
|
317
|
-
o="$(rust_src_dir)/$${l}_stemmer"; \
|
318
|
-
echo "./snowball $< -rust -o $${o}"; \
|
319
|
-
./snowball $< -rust -o $${o}
|
321
|
+
./snowball $< -rust -o "$(rust_src_dir)/$*_stemmer"
|
320
322
|
|
321
323
|
$(go_src_main_dir)/stemwords/algorithms.go: go/stemwords/generate.go libstemmer/modules.txt
|
322
324
|
@echo "Generating algorithms.go"
|
323
325
|
@cd go/stemwords && go generate
|
324
326
|
|
325
327
|
$(go_src_dir)/%_stemmer.go: algorithms/%.sbl snowball$(EXEEXT)
|
326
|
-
@
|
327
|
-
o
|
328
|
-
|
329
|
-
@l=`echo "$<" | sed 's!\(.*\)\.sbl$$!\1!;s!^.*/!!'`; \
|
330
|
-
o="$(go_src_dir)/$${l}/$${l}_stemmer"; \
|
331
|
-
echo "./snowball $< -go -o $${o} -gop $${l}"; \
|
332
|
-
./snowball $< -go -o $${o} -gop $${l}
|
333
|
-
@l=`echo "$<" | sed 's!\(.*\)\.sbl$$!\1!;s!^.*/!!'`; \
|
334
|
-
o="$(go_src_dir)/$${l}/$${l}_stemmer"; \
|
335
|
-
echo "$(gofmt) -s -w $(go_src_dir)/$${l}/$${l}_stemmer.go"; \
|
336
|
-
$(gofmt) -s -w $(go_src_dir)/$${l}/$${l}_stemmer.go
|
328
|
+
@mkdir -p $(go_src_dir)/$*
|
329
|
+
./snowball $< -go -o "$(go_src_dir)/$*/$*_stemmer" -gop $*
|
330
|
+
$(gofmt) -s -w $(go_src_dir)/$*/$*_stemmer.go
|
337
331
|
|
338
332
|
$(js_output_dir)/%-stemmer.js: algorithms/%.sbl snowball$(EXEEXT)
|
339
333
|
@mkdir -p $(js_output_dir)
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
334
|
+
./snowball $< -js -o "$(js_output_dir)/$*-stemmer"
|
335
|
+
|
336
|
+
$(js_output_dir)/base-stemmer.js: $(js_runtime_dir)/base-stemmer.js
|
337
|
+
@mkdir -p $(js_output_dir)
|
338
|
+
cp $< $@
|
344
339
|
|
345
340
|
$(ada_src_dir)/stemmer-%.ads: algorithms/%.sbl snowball
|
346
341
|
@mkdir -p $(ada_src_dir)
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
./snowball $< -ada -P $${l} -o $${o}
|
342
|
+
./snowball $< -ada -P $* -o "$(ada_src_dir)/stemmer-$*"
|
343
|
+
|
344
|
+
.PHONY: dist dist_snowball dist_libstemmer_c dist_libstemmer_csharp dist_libstemmer_java dist_libstemmer_js dist_libstemmer_python
|
351
345
|
|
352
346
|
# Make a full source distribution
|
353
347
|
dist: dist_snowball dist_libstemmer_c dist_libstemmer_csharp dist_libstemmer_java dist_libstemmer_js dist_libstemmer_python
|
@@ -384,6 +378,7 @@ dist_libstemmer_c: \
|
|
384
378
|
$(LIBSTEMMER_EXTRA) \
|
385
379
|
$(C_LIB_SOURCES) \
|
386
380
|
$(C_LIB_HEADERS) \
|
381
|
+
$(COMMON_FILES) \
|
387
382
|
libstemmer/mkinc.mak \
|
388
383
|
libstemmer/mkinc_utf8.mak
|
389
384
|
destname=libstemmer_c-$(SNOWBALL_VERSION); \
|
@@ -428,6 +423,7 @@ dist_libstemmer_c: \
|
|
428
423
|
|
429
424
|
# Make a distribution of all the sources required to compile the Java library.
|
430
425
|
dist_libstemmer_java: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
|
426
|
+
$(COMMON_FILES) \
|
431
427
|
$(LIBSTEMMER_EXTRA) \
|
432
428
|
$(JAVA_SOURCES)
|
433
429
|
destname=libstemmer_java-$(SNOWBALL_VERSION); \
|
@@ -450,6 +446,7 @@ dist_libstemmer_java: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
|
|
450
446
|
|
451
447
|
# Make a distribution of all the sources required to compile the C# library.
|
452
448
|
dist_libstemmer_csharp: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
|
449
|
+
$(COMMON_FILES) \
|
453
450
|
$(LIBSTEMMER_EXTRA) \
|
454
451
|
$(CSHARP_SOURCES)
|
455
452
|
destname=libstemmer_csharp-$(SNOWBALL_VERSION); \
|
@@ -468,7 +465,7 @@ dist_libstemmer_csharp: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
|
|
468
465
|
(cd dist && tar zcf $${destname}$(tarball_ext) $${destname}) && \
|
469
466
|
rm -rf $${dest}
|
470
467
|
|
471
|
-
dist_libstemmer_python: $(PYTHON_SOURCES)
|
468
|
+
dist_libstemmer_python: $(PYTHON_SOURCES) $(COMMON_FILES)
|
472
469
|
destname=snowballstemmer-$(SNOWBALL_VERSION); \
|
473
470
|
dest=dist/$${destname}; \
|
474
471
|
rm -rf $${dest} && \
|
@@ -482,10 +479,10 @@ dist_libstemmer_python: $(PYTHON_SOURCES)
|
|
482
479
|
cp -a $(PYTHON_SAMPLE_SOURCES) $${dest}/src/$(python_sample_dir) && \
|
483
480
|
cp -a $(PYTHON_RUNTIME_SOURCES) $${dest}/src/$(python_runtime_dir) && \
|
484
481
|
cp -a $(COMMON_FILES) $(PYTHON_PACKAGE_FILES) $${dest} && \
|
485
|
-
(cd $${dest} && $(python)
|
482
|
+
(cd $${dest} && $(python) -m build && cp dist/*.tar.gz dist/*.whl ..) && \
|
486
483
|
rm -rf $${dest}
|
487
484
|
|
488
|
-
dist_libstemmer_js: $(JS_SOURCES)
|
485
|
+
dist_libstemmer_js: $(JS_SOURCES) $(COMMON_FILES)
|
489
486
|
destname=jsstemmer-$(SNOWBALL_VERSION); \
|
490
487
|
dest=dist/$${destname}; \
|
491
488
|
rm -rf $${dest} && \
|
@@ -503,6 +500,12 @@ dist_libstemmer_js: $(JS_SOURCES)
|
|
503
500
|
(cd dist && tar zcf $${destname}$(tarball_ext) $${destname}) && \
|
504
501
|
rm -rf $${dest}
|
505
502
|
|
503
|
+
###############################################################################
|
504
|
+
# C
|
505
|
+
###############################################################################
|
506
|
+
|
507
|
+
.PHONY: check check_stemtest check_utf8 check_iso_8859_1 check_iso_8859_2 check_koi8r
|
508
|
+
|
506
509
|
check: check_stemtest check_utf8 check_iso_8859_1 check_iso_8859_2 check_koi8r
|
507
510
|
|
508
511
|
check_stemtest: stemtest$(EXEEXT)
|
@@ -516,17 +519,12 @@ check_iso_8859_2: $(ISO_8859_2_algorithms:%=check_iso_8859_2_%)
|
|
516
519
|
|
517
520
|
check_koi8r: $(KOI8_R_algorithms:%=check_koi8r_%)
|
518
521
|
|
519
|
-
# Where the data files are located - assumes their repo is checked out as
|
520
|
-
# a sibling to this one.
|
521
|
-
STEMMING_DATA ?= ../snowball-data
|
522
|
-
STEMMING_DATA_ABS := $(abspath $(STEMMING_DATA))
|
523
|
-
|
524
522
|
check_utf8_%: $(STEMMING_DATA)/% stemwords$(EXEEXT)
|
525
|
-
@echo "Checking output of
|
523
|
+
@echo "Checking output of $* stemmer with UTF-8"
|
526
524
|
@if test -f '$</voc.txt.gz' ; then \
|
527
|
-
gzip -dc '$</voc.txt.gz'|./stemwords$(EXEEXT) -c UTF_8 -l
|
525
|
+
gzip -dc '$</voc.txt.gz'|./stemwords$(EXEEXT) -c UTF_8 -l $* -o tmp.txt; \
|
528
526
|
else \
|
529
|
-
./stemwords$(EXEEXT) -c UTF_8 -l
|
527
|
+
./stemwords$(EXEEXT) -c UTF_8 -l $* -i $</voc.txt -o tmp.txt; \
|
530
528
|
fi
|
531
529
|
@if test -f '$</output.txt.gz' ; then \
|
532
530
|
gzip -dc '$</output.txt.gz'|$(DIFF) -u - tmp.txt; \
|
@@ -536,44 +534,52 @@ check_utf8_%: $(STEMMING_DATA)/% stemwords$(EXEEXT)
|
|
536
534
|
@rm tmp.txt
|
537
535
|
|
538
536
|
check_iso_8859_1_%: $(STEMMING_DATA)/% stemwords$(EXEEXT)
|
539
|
-
@echo "Checking output of
|
537
|
+
@echo "Checking output of $* stemmer with ISO_8859_1"
|
540
538
|
@$(ICONV) -f UTF-8 -t ISO-8859-1 '$</voc.txt' |\
|
541
|
-
./stemwords -c ISO_8859_1 -l
|
539
|
+
./stemwords -c ISO_8859_1 -l $* -o tmp.txt
|
542
540
|
@$(ICONV) -f UTF-8 -t ISO-8859-1 '$</output.txt' |\
|
543
541
|
$(DIFF) -u - tmp.txt
|
544
542
|
@rm tmp.txt
|
545
543
|
|
546
544
|
check_iso_8859_2_%: $(STEMMING_DATA)/% stemwords$(EXEEXT)
|
547
|
-
@echo "Checking output of
|
545
|
+
@echo "Checking output of $* stemmer with ISO_8859_2"
|
548
546
|
@$(ICONV) -f UTF-8 -t ISO-8859-2 '$</voc.txt' |\
|
549
|
-
./stemwords -c ISO_8859_2 -l
|
547
|
+
./stemwords -c ISO_8859_2 -l $* -o tmp.txt
|
550
548
|
@$(ICONV) -f UTF-8 -t ISO-8859-2 '$</output.txt' |\
|
551
549
|
$(DIFF) -u - tmp.txt
|
552
550
|
@rm tmp.txt
|
553
551
|
|
554
552
|
check_koi8r_%: $(STEMMING_DATA)/% stemwords$(EXEEXT)
|
555
|
-
@echo "Checking output of
|
553
|
+
@echo "Checking output of $* stemmer with KOI8R"
|
556
554
|
@$(ICONV) -f UTF-8 -t KOI8-R '$</voc.txt' |\
|
557
|
-
./stemwords -c KOI8_R -l
|
555
|
+
./stemwords -c KOI8_R -l $* -o tmp.txt
|
558
556
|
@$(ICONV) -f UTF-8 -t KOI8-R '$</output.txt' |\
|
559
557
|
$(DIFF) -u - tmp.txt
|
560
558
|
@rm tmp.txt
|
561
559
|
|
560
|
+
###############################################################################
|
561
|
+
# Java
|
562
|
+
###############################################################################
|
563
|
+
|
564
|
+
.PHONY: java check_java do_check_java
|
565
|
+
|
566
|
+
java: $(JAVA_CLASSES) $(JAVA_RUNTIME_CLASSES)
|
567
|
+
|
562
568
|
.java.class:
|
563
|
-
cd java && $(JAVAC)
|
569
|
+
cd java && $(JAVAC) $(JAVACFLAGS) $(patsubst java/%,%,$<)
|
564
570
|
|
565
|
-
check_java:
|
571
|
+
check_java: java
|
566
572
|
$(MAKE) do_check_java
|
567
573
|
|
568
574
|
do_check_java: $(libstemmer_algorithms:%=check_java_%)
|
569
575
|
|
570
576
|
check_java_%: $(STEMMING_DATA_ABS)/%
|
571
|
-
@echo "Checking output of
|
577
|
+
@echo "Checking output of $* stemmer for Java"
|
572
578
|
@cd java && if test -f '$</voc.txt.gz' ; then \
|
573
579
|
gzip -dc '$</voc.txt.gz' |\
|
574
|
-
$(JAVA) org/tartarus/snowball/TestApp
|
580
|
+
$(JAVA) org/tartarus/snowball/TestApp $* -o $(PWD)/tmp.txt; \
|
575
581
|
else \
|
576
|
-
$(JAVA) org/tartarus/snowball/TestApp
|
582
|
+
$(JAVA) org/tartarus/snowball/TestApp $* $</voc.txt -o $(PWD)/tmp.txt; \
|
577
583
|
fi
|
578
584
|
@if test -f '$</output.txt.gz' ; then \
|
579
585
|
gzip -dc '$</output.txt.gz'|$(DIFF) -u - tmp.txt; \
|
@@ -582,18 +588,26 @@ check_java_%: $(STEMMING_DATA_ABS)/%
|
|
582
588
|
fi
|
583
589
|
@rm tmp.txt
|
584
590
|
|
585
|
-
|
591
|
+
###############################################################################
|
592
|
+
# C#
|
593
|
+
###############################################################################
|
594
|
+
|
595
|
+
.PHONY: csharp check_csharp do_check_csharp
|
596
|
+
|
597
|
+
csharp: csharp_stemwords$(EXEEXT)
|
598
|
+
|
599
|
+
check_csharp: csharp
|
586
600
|
$(MAKE) do_check_csharp
|
587
601
|
|
588
602
|
do_check_csharp: $(libstemmer_algorithms:%=check_csharp_%)
|
589
603
|
|
590
604
|
check_csharp_%: $(STEMMING_DATA_ABS)/%
|
591
|
-
@echo "Checking output of
|
605
|
+
@echo "Checking output of $* stemmer for C#"
|
592
606
|
@if test -f '$</voc.txt.gz' ; then \
|
593
607
|
gzip -dc '$</voc.txt.gz' |\
|
594
|
-
$(MONO) csharp_stemwords$(EXEEXT) -l
|
608
|
+
$(MONO) csharp_stemwords$(EXEEXT) -l $* -i /dev/stdin -o tmp.txt; \
|
595
609
|
else \
|
596
|
-
$(MONO) csharp_stemwords$(EXEEXT) -l
|
610
|
+
$(MONO) csharp_stemwords$(EXEEXT) -l $* -i $</voc.txt -o tmp.txt; \
|
597
611
|
fi
|
598
612
|
@if test -f '$</output.txt.gz' ; then \
|
599
613
|
gzip -dc '$</output.txt.gz'|$(DIFF) -u - tmp.txt; \
|
@@ -602,38 +616,49 @@ check_csharp_%: $(STEMMING_DATA_ABS)/%
|
|
602
616
|
fi
|
603
617
|
@rm tmp.txt
|
604
618
|
|
605
|
-
|
619
|
+
###############################################################################
|
620
|
+
# Pascal
|
621
|
+
###############################################################################
|
622
|
+
|
623
|
+
.PHONY: pascal check_pascal do_check_pascal
|
624
|
+
|
625
|
+
pascal: pascal/stemwords
|
626
|
+
|
627
|
+
check_pascal: pascal
|
606
628
|
$(MAKE) do_check_pascal
|
607
629
|
|
608
630
|
do_check_pascal: $(ISO_8859_1_algorithms:%=check_pascal_%)
|
609
631
|
|
610
632
|
check_pascal_%: $(STEMMING_DATA_ABS)/%
|
611
|
-
@echo "Checking output of
|
633
|
+
@echo "Checking output of $* stemmer with ISO_8859_1 for Pascal"
|
612
634
|
@$(ICONV) -f UTF-8 -t ISO-8859-1 '$</voc.txt' |\
|
613
|
-
./pascal/stemwords -l
|
635
|
+
./pascal/stemwords -l $* > tmp.txt
|
614
636
|
@$(ICONV) -f UTF-8 -t ISO-8859-1 '$</output.txt' |\
|
615
637
|
$(DIFF) -u - tmp.txt
|
616
638
|
@rm tmp.txt
|
617
639
|
|
618
|
-
|
640
|
+
###############################################################################
|
641
|
+
# Javascript
|
642
|
+
###############################################################################
|
619
643
|
|
620
|
-
|
621
|
-
THIN_FACTOR ?= 3
|
644
|
+
.PHONY: js check_js do_check_js
|
622
645
|
|
623
|
-
|
624
|
-
# take a long time (unless you use pypy).
|
625
|
-
THIN_TEST_DATA := awk '(FNR % $(THIN_FACTOR) == 0){print}'
|
646
|
+
js: $(JS_SOURCES)
|
626
647
|
|
627
|
-
|
648
|
+
check_js: js
|
649
|
+
$(MAKE) do_check_js
|
628
650
|
|
629
|
-
|
630
|
-
|
631
|
-
|
651
|
+
do_check_js: $(libstemmer_algorithms:%=check_js_%)
|
652
|
+
|
653
|
+
check_js_%: export NODE_PATH=$(js_output_dir)
|
654
|
+
check_js_%: $(STEMMING_DATA)/%
|
655
|
+
@echo "Checking output of $* stemmer for JS"
|
656
|
+
@if test -f '$</voc.txt.gz' ; then \
|
632
657
|
gzip -dc '$</voc.txt.gz' > tmp.in; \
|
633
|
-
$(
|
658
|
+
$(JSRUN) javascript/stemwords.js -l $* -i tmp.in -o tmp.txt; \
|
634
659
|
rm tmp.in; \
|
635
660
|
else \
|
636
|
-
$(
|
661
|
+
$(JSRUN) javascript/stemwords.js -l $* -i $</voc.txt -o tmp.txt; \
|
637
662
|
fi
|
638
663
|
@if test -f '$</output.txt.gz' ; then \
|
639
664
|
gzip -dc '$</output.txt.gz'|$(DIFF) -u - tmp.txt; \
|
@@ -642,16 +667,27 @@ check_rust_%: $(STEMMING_DATA_ABS)/%
|
|
642
667
|
fi
|
643
668
|
@rm tmp.txt
|
644
669
|
|
645
|
-
|
670
|
+
###############################################################################
|
671
|
+
# Rust
|
672
|
+
###############################################################################
|
646
673
|
|
647
|
-
|
648
|
-
|
649
|
-
|
674
|
+
.PHONY: rust check_rust do_check_rust
|
675
|
+
|
676
|
+
rust: $(RUST_SOURCES)
|
677
|
+
|
678
|
+
check_rust: rust
|
679
|
+
$(MAKE) do_check_rust
|
680
|
+
|
681
|
+
do_check_rust: $(libstemmer_algorithms:%=check_rust_%)
|
682
|
+
|
683
|
+
check_rust_%: $(STEMMING_DATA_ABS)/%
|
684
|
+
@echo "Checking output of $* stemmer for Rust"
|
685
|
+
@cd rust && if test -f '$</voc.txt.gz' ; then \
|
650
686
|
gzip -dc '$</voc.txt.gz' > tmp.in; \
|
651
|
-
$(
|
687
|
+
$(cargo) run $(cargoflags) -- -l $* -i tmp.in -o $(PWD)/tmp.txt; \
|
652
688
|
rm tmp.in; \
|
653
689
|
else \
|
654
|
-
$(
|
690
|
+
$(cargo) run $(cargoflags) -- -l $* -i $</voc.txt -o $(PWD)/tmp.txt; \
|
655
691
|
fi
|
656
692
|
@if test -f '$</output.txt.gz' ; then \
|
657
693
|
gzip -dc '$</output.txt.gz'|$(DIFF) -u - tmp.txt; \
|
@@ -660,16 +696,27 @@ check_go_%: $(STEMMING_DATA_ABS)/%
|
|
660
696
|
fi
|
661
697
|
@rm tmp.txt
|
662
698
|
|
663
|
-
|
699
|
+
###############################################################################
|
700
|
+
# Go
|
701
|
+
###############################################################################
|
664
702
|
|
665
|
-
|
666
|
-
|
667
|
-
|
703
|
+
.PHONY: go check_go do_check_go
|
704
|
+
|
705
|
+
go: $(GO_SOURCES)
|
706
|
+
|
707
|
+
check_go: go
|
708
|
+
$(MAKE) do_check_go
|
709
|
+
|
710
|
+
do_check_go: $(libstemmer_algorithms:%=check_go_%)
|
711
|
+
|
712
|
+
check_go_%: $(STEMMING_DATA_ABS)/%
|
713
|
+
@echo "Checking output of $* stemmer for Go"
|
714
|
+
@cd go && if test -f '$</voc.txt.gz' ; then \
|
668
715
|
gzip -dc '$</voc.txt.gz' > tmp.in; \
|
669
|
-
$(
|
716
|
+
$(go) run $(goflags) -l $* -i tmp.in -o $(PWD)/tmp.txt; \
|
670
717
|
rm tmp.in; \
|
671
718
|
else \
|
672
|
-
$(
|
719
|
+
$(go) run $(goflags) -l $* -i $</voc.txt -o $(PWD)/tmp.txt; \
|
673
720
|
fi
|
674
721
|
@if test -f '$</output.txt.gz' ; then \
|
675
722
|
gzip -dc '$</output.txt.gz'|$(DIFF) -u - tmp.txt; \
|
@@ -678,19 +725,28 @@ check_js_%: $(STEMMING_DATA)/%
|
|
678
725
|
fi
|
679
726
|
@rm tmp.txt
|
680
727
|
|
681
|
-
|
728
|
+
###############################################################################
|
729
|
+
# Python
|
730
|
+
###############################################################################
|
731
|
+
|
732
|
+
.PHONY: python check_python do_check_python
|
733
|
+
|
734
|
+
python: check_python_stemwords
|
735
|
+
|
736
|
+
check_python: python
|
737
|
+
$(MAKE) $(libstemmer_algorithms:%=check_python_%)
|
682
738
|
|
683
739
|
check_python_%: $(STEMMING_DATA_ABS)/%
|
684
|
-
@echo "Checking output of
|
740
|
+
@echo "Checking output of $* stemmer for Python (THIN_FACTOR=$(THIN_FACTOR))"
|
685
741
|
@cd python_check && if test -f '$</voc.txt.gz' ; then \
|
686
|
-
gzip -dc '$</voc.txt.gz'
|
687
|
-
$(python) stemwords.py -c utf8 -l
|
742
|
+
gzip -dc '$</voc.txt.gz' $(THIN_TEST_DATA) > tmp.in; \
|
743
|
+
$(python) stemwords.py -c utf8 -l $* -i tmp.in -o $(PWD)/tmp.txt; \
|
688
744
|
rm tmp.in; \
|
689
745
|
else \
|
690
|
-
$(python) stemwords.py -c utf8 -l
|
746
|
+
$(python) stemwords.py -c utf8 -l $* -i $</voc.txt -o $(PWD)/tmp.txt; \
|
691
747
|
fi
|
692
748
|
@if test -f '$</output.txt.gz' ; then \
|
693
|
-
gzip -dc '$</output.txt.gz'
|
749
|
+
gzip -dc '$</output.txt.gz' $(THIN_TEST_DATA)|$(DIFF) -u - tmp.txt; \
|
694
750
|
else \
|
695
751
|
$(DIFF) -u $</output.txt tmp.txt; \
|
696
752
|
fi
|
@@ -703,25 +759,27 @@ check_python_stemwords: $(PYTHON_STEMWORDS_SOURCE) $(PYTHON_SOURCES)
|
|
703
759
|
cp -a $(PYTHON_SOURCES) python_check/snowballstemmer
|
704
760
|
cp -a $(PYTHON_STEMWORDS_SOURCE) python_check/
|
705
761
|
|
706
|
-
|
707
|
-
|
708
|
-
|
709
|
-
|
710
|
-
|
762
|
+
###############################################################################
|
763
|
+
# Ada
|
764
|
+
###############################################################################
|
765
|
+
|
766
|
+
.PHONY: ada check_ada do_check_ada
|
767
|
+
|
768
|
+
ada: ada/bin/stemwords
|
711
769
|
|
712
|
-
check_ada: ada
|
770
|
+
check_ada: ada
|
713
771
|
$(MAKE) do_check_ada
|
714
772
|
|
715
773
|
do_check_ada: $(libstemmer_algorithms:%=check_ada_%)
|
716
774
|
|
717
775
|
check_ada_%: $(STEMMING_DATA_ABS)/%
|
718
|
-
@echo "Checking output of
|
776
|
+
@echo "Checking output of $* stemmer for Ada"
|
719
777
|
@cd ada && if test -f '$</voc.txt.gz' ; then \
|
720
778
|
gzip -dc '$</voc.txt.gz' > tmp.in; \
|
721
|
-
./bin/stemwords
|
779
|
+
./bin/stemwords $* tmp.in $(PWD)/tmp.txt; \
|
722
780
|
rm tmp.in; \
|
723
781
|
else \
|
724
|
-
./bin/stemwords
|
782
|
+
./bin/stemwords $* $</voc.txt $(PWD)/tmp.txt; \
|
725
783
|
fi
|
726
784
|
@if test -f '$</output.txt.gz' ; then \
|
727
785
|
gzip -dc '$</output.txt.gz'|$(DIFF) -u - tmp.txt; \
|