ruby-stemmer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. data/MIT-LICENSE +21 -0
  2. data/README +79 -0
  3. data/Rakefile +52 -0
  4. data/extconf.rb +14 -0
  5. data/libstemmer_c/MANIFEST +72 -0
  6. data/libstemmer_c/Makefile +9 -0
  7. data/libstemmer_c/README +125 -0
  8. data/libstemmer_c/include/libstemmer.h +79 -0
  9. data/libstemmer_c/libstemmer/libstemmer.c +93 -0
  10. data/libstemmer_c/libstemmer/libstemmer_utf8.c +93 -0
  11. data/libstemmer_c/libstemmer/modules.h +190 -0
  12. data/libstemmer_c/libstemmer/modules.txt +50 -0
  13. data/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  14. data/libstemmer_c/libstemmer/modules_utf8.txt +49 -0
  15. data/libstemmer_c/mkinc.mak +82 -0
  16. data/libstemmer_c/mkinc_utf8.mak +52 -0
  17. data/libstemmer_c/runtime/api.c +66 -0
  18. data/libstemmer_c/runtime/api.h +26 -0
  19. data/libstemmer_c/runtime/header.h +58 -0
  20. data/libstemmer_c/runtime/utilities.c +478 -0
  21. data/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  22. data/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  23. data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  24. data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  25. data/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  26. data/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  27. data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  28. data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  29. data/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  30. data/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  31. data/libstemmer_c/src_c/stem_ISO_8859_1_german.c +503 -0
  32. data/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  33. data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  34. data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  35. data/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  36. data/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  37. data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  38. data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  39. data/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  40. data/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  41. data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  42. data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  43. data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  44. data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  45. data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  46. data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  47. data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  48. data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  49. data/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  50. data/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  51. data/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  52. data/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  53. data/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  54. data/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  55. data/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  56. data/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  57. data/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  58. data/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  59. data/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  60. data/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  61. data/libstemmer_c/src_c/stem_UTF_8_german.c +509 -0
  62. data/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  63. data/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  64. data/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  65. data/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  66. data/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  67. data/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  68. data/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  69. data/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  70. data/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  71. data/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  72. data/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  73. data/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  74. data/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  75. data/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  76. data/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  77. data/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  78. data/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  79. data/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  80. data/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  81. data/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  82. data/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  83. data/ruby-stemmer.c +108 -0
  84. data/test.rb +31 -0
  85. metadata +141 -0
@@ -0,0 +1,190 @@
1
+ /* libstemmer/modules.h: List of stemming modules.
2
+ *
3
+ * This file is generated by mkmodules.pl from a list of module names.
4
+ * Do not edit manually.
5
+ *
6
+ * Modules included by this file are: danish, dutch, english, finnish, french,
7
+ * german, hungarian, italian, norwegian, porter, portuguese, romanian,
8
+ * russian, spanish, swedish, turkish
9
+ */
10
+
11
+ #include "../src_c/stem_ISO_8859_1_danish.h"
12
+ #include "../src_c/stem_UTF_8_danish.h"
13
+ #include "../src_c/stem_ISO_8859_1_dutch.h"
14
+ #include "../src_c/stem_UTF_8_dutch.h"
15
+ #include "../src_c/stem_ISO_8859_1_english.h"
16
+ #include "../src_c/stem_UTF_8_english.h"
17
+ #include "../src_c/stem_ISO_8859_1_finnish.h"
18
+ #include "../src_c/stem_UTF_8_finnish.h"
19
+ #include "../src_c/stem_ISO_8859_1_french.h"
20
+ #include "../src_c/stem_UTF_8_french.h"
21
+ #include "../src_c/stem_ISO_8859_1_german.h"
22
+ #include "../src_c/stem_UTF_8_german.h"
23
+ #include "../src_c/stem_ISO_8859_1_hungarian.h"
24
+ #include "../src_c/stem_UTF_8_hungarian.h"
25
+ #include "../src_c/stem_ISO_8859_1_italian.h"
26
+ #include "../src_c/stem_UTF_8_italian.h"
27
+ #include "../src_c/stem_ISO_8859_1_norwegian.h"
28
+ #include "../src_c/stem_UTF_8_norwegian.h"
29
+ #include "../src_c/stem_ISO_8859_1_porter.h"
30
+ #include "../src_c/stem_UTF_8_porter.h"
31
+ #include "../src_c/stem_ISO_8859_1_portuguese.h"
32
+ #include "../src_c/stem_UTF_8_portuguese.h"
33
+ #include "../src_c/stem_ISO_8859_2_romanian.h"
34
+ #include "../src_c/stem_UTF_8_romanian.h"
35
+ #include "../src_c/stem_KOI8_R_russian.h"
36
+ #include "../src_c/stem_UTF_8_russian.h"
37
+ #include "../src_c/stem_ISO_8859_1_spanish.h"
38
+ #include "../src_c/stem_UTF_8_spanish.h"
39
+ #include "../src_c/stem_ISO_8859_1_swedish.h"
40
+ #include "../src_c/stem_UTF_8_swedish.h"
41
+ #include "../src_c/stem_UTF_8_turkish.h"
42
+
43
+ typedef enum {
44
+ ENC_UNKNOWN=0,
45
+ ENC_ISO_8859_1,
46
+ ENC_ISO_8859_2,
47
+ ENC_KOI8_R,
48
+ ENC_UTF_8
49
+ } stemmer_encoding_t;
50
+
51
+ struct stemmer_encoding {
52
+ const char * name;
53
+ stemmer_encoding_t enc;
54
+ };
55
+ static struct stemmer_encoding encodings[] = {
56
+ {"ISO_8859_1", ENC_ISO_8859_1},
57
+ {"ISO_8859_2", ENC_ISO_8859_2},
58
+ {"KOI8_R", ENC_KOI8_R},
59
+ {"UTF_8", ENC_UTF_8},
60
+ {0,ENC_UNKNOWN}
61
+ };
62
+
63
+ struct stemmer_modules {
64
+ const char * name;
65
+ stemmer_encoding_t enc;
66
+ struct SN_env * (*create)(void);
67
+ void (*close)(struct SN_env *);
68
+ int (*stem)(struct SN_env *);
69
+ };
70
+ static struct stemmer_modules modules[] = {
71
+ {"da", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
72
+ {"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
73
+ {"dan", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
74
+ {"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
75
+ {"danish", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
76
+ {"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
77
+ {"de", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
78
+ {"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
79
+ {"deu", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
80
+ {"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
81
+ {"dut", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
82
+ {"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
83
+ {"dutch", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
84
+ {"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
85
+ {"en", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
86
+ {"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
87
+ {"eng", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
88
+ {"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
89
+ {"english", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
90
+ {"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
91
+ {"es", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
92
+ {"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
93
+ {"esl", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
94
+ {"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
95
+ {"fi", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
96
+ {"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
97
+ {"fin", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
98
+ {"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
99
+ {"finnish", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
100
+ {"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
101
+ {"fr", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
102
+ {"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
103
+ {"fra", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
104
+ {"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
105
+ {"fre", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
106
+ {"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
107
+ {"french", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
108
+ {"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
109
+ {"ger", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
110
+ {"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
111
+ {"german", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
112
+ {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
113
+ {"hu", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
114
+ {"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
115
+ {"hun", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
116
+ {"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
117
+ {"hungarian", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
118
+ {"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
119
+ {"it", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
120
+ {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
121
+ {"ita", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
122
+ {"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
123
+ {"italian", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
124
+ {"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
125
+ {"nl", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
126
+ {"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
127
+ {"nld", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
128
+ {"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
129
+ {"no", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
130
+ {"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
131
+ {"nor", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
132
+ {"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
133
+ {"norwegian", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem},
134
+ {"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
135
+ {"por", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
136
+ {"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
137
+ {"porter", ENC_ISO_8859_1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem},
138
+ {"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
139
+ {"portuguese", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
140
+ {"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
141
+ {"pt", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem},
142
+ {"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
143
+ {"ro", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
144
+ {"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
145
+ {"romanian", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
146
+ {"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
147
+ {"ron", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
148
+ {"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
149
+ {"ru", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
150
+ {"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
151
+ {"rum", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem},
152
+ {"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
153
+ {"rus", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
154
+ {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
155
+ {"russian", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
156
+ {"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
157
+ {"spa", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
158
+ {"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
159
+ {"spanish", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
160
+ {"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
161
+ {"sv", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
162
+ {"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
163
+ {"swe", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
164
+ {"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
165
+ {"swedish", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
166
+ {"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
167
+ {"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
168
+ {"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
169
+ {"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
170
+ {0,ENC_UNKNOWN,0,0,0}
171
+ };
172
+ static const char * algorithm_names[] = {
173
+ "danish",
174
+ "dutch",
175
+ "english",
176
+ "finnish",
177
+ "french",
178
+ "german",
179
+ "hungarian",
180
+ "italian",
181
+ "norwegian",
182
+ "porter",
183
+ "portuguese",
184
+ "romanian",
185
+ "russian",
186
+ "spanish",
187
+ "swedish",
188
+ "turkish",
189
+ 0
190
+ };
@@ -0,0 +1,50 @@
1
+ # This file contains a list of stemmers to include in the distribution.
2
+ # The format is a set of space separated lines - on each line:
3
+ # First item is name of stemmer.
4
+ # Second item is comma separated list of character sets.
5
+ # Third item is comma separated list of names to refer to the stemmer by.
6
+ #
7
+ # Lines starting with a #, or blank lines, are ignored.
8
+
9
+ # List all the main algorithms for each language, in UTF-8, and also with
10
+ # the most commonly used encoding.
11
+
12
+ danish UTF_8,ISO_8859_1 danish,da,dan
13
+ dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld
14
+ english UTF_8,ISO_8859_1 english,en,eng
15
+ finnish UTF_8,ISO_8859_1 finnish,fi,fin
16
+ french UTF_8,ISO_8859_1 french,fr,fre,fra
17
+ german UTF_8,ISO_8859_1 german,de,ger,deu
18
+ hungarian UTF_8,ISO_8859_1 hungarian,hu,hun
19
+ italian UTF_8,ISO_8859_1 italian,it,ita
20
+ norwegian UTF_8,ISO_8859_1 norwegian,no,nor
21
+ portuguese UTF_8,ISO_8859_1 portuguese,pt,por
22
+ romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron
23
+ russian UTF_8,KOI8_R russian,ru,rus
24
+ spanish UTF_8,ISO_8859_1 spanish,es,esl,spa
25
+ swedish UTF_8,ISO_8859_1 swedish,sv,swe
26
+ turkish UTF_8 turkish,tr,tur
27
+
28
+ # Also include the traditional porter algorithm for english.
29
+ # The porter algorithm is included in the libstemmer distribution to assist
30
+ # with backwards compatibility, but for new systems the english algorithm
31
+ # should be used in preference.
32
+ porter UTF_8,ISO_8859_1 porter
33
+
34
+ # Some other stemmers in the snowball project are not included in the standard
35
+ # distribution. To compile a libstemmer with them in, add them to this list,
36
+ # and regenerate the distribution. (You will need a full source checkout for
37
+ # this.) They are included in the snowball website as curiosities, but are not
38
+ # intended for general use, and use of them is not fully supported. These
39
+ # algorithms are:
40
+ #
41
+ # german2 - This is a slight modification of the german stemmer.
42
+ #german2 UTF_8,ISO_8859_1 german2
43
+ #
44
+ # kraaij_pohlmann - This is a different dutch stemmer.
45
+ #kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann
46
+ #
47
+ # lovins - This is an english stemmer, but fairly outdated, and
48
+ # only really applicable to a restricted type of input text
49
+ # (keywords in academic publications).
50
+ #lovins UTF_8,ISO_8859_1 lovins
@@ -0,0 +1,121 @@
1
+ /* libstemmer/modules_utf8.h: List of stemming modules.
2
+ *
3
+ * This file is generated by mkmodules.pl from a list of module names.
4
+ * Do not edit manually.
5
+ *
6
+ * Modules included by this file are: danish, dutch, english, finnish, french,
7
+ * german, hungarian, italian, norwegian, porter, portuguese, romanian,
8
+ * russian, spanish, swedish, turkish
9
+ */
10
+
11
+ #include "../src_c/stem_UTF_8_danish.h"
12
+ #include "../src_c/stem_UTF_8_dutch.h"
13
+ #include "../src_c/stem_UTF_8_english.h"
14
+ #include "../src_c/stem_UTF_8_finnish.h"
15
+ #include "../src_c/stem_UTF_8_french.h"
16
+ #include "../src_c/stem_UTF_8_german.h"
17
+ #include "../src_c/stem_UTF_8_hungarian.h"
18
+ #include "../src_c/stem_UTF_8_italian.h"
19
+ #include "../src_c/stem_UTF_8_norwegian.h"
20
+ #include "../src_c/stem_UTF_8_porter.h"
21
+ #include "../src_c/stem_UTF_8_portuguese.h"
22
+ #include "../src_c/stem_UTF_8_romanian.h"
23
+ #include "../src_c/stem_UTF_8_russian.h"
24
+ #include "../src_c/stem_UTF_8_spanish.h"
25
+ #include "../src_c/stem_UTF_8_swedish.h"
26
+ #include "../src_c/stem_UTF_8_turkish.h"
27
+
28
+ typedef enum {
29
+ ENC_UNKNOWN=0,
30
+ ENC_UTF_8
31
+ } stemmer_encoding_t;
32
+
33
+ struct stemmer_encoding {
34
+ const char * name;
35
+ stemmer_encoding_t enc;
36
+ };
37
+ static struct stemmer_encoding encodings[] = {
38
+ {"UTF_8", ENC_UTF_8},
39
+ {0,ENC_UNKNOWN}
40
+ };
41
+
42
+ struct stemmer_modules {
43
+ const char * name;
44
+ stemmer_encoding_t enc;
45
+ struct SN_env * (*create)(void);
46
+ void (*close)(struct SN_env *);
47
+ int (*stem)(struct SN_env *);
48
+ };
49
+ static struct stemmer_modules modules[] = {
50
+ {"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
51
+ {"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
52
+ {"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
53
+ {"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
54
+ {"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
55
+ {"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
56
+ {"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
57
+ {"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
58
+ {"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
59
+ {"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
60
+ {"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
61
+ {"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
62
+ {"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
63
+ {"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
64
+ {"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
65
+ {"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
66
+ {"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
67
+ {"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
68
+ {"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
69
+ {"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
70
+ {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
71
+ {"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
72
+ {"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
73
+ {"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
74
+ {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
75
+ {"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
76
+ {"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
77
+ {"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
78
+ {"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
79
+ {"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
80
+ {"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
81
+ {"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
82
+ {"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
83
+ {"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem},
84
+ {"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
85
+ {"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
86
+ {"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
87
+ {"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
88
+ {"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
89
+ {"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
90
+ {"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem},
91
+ {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
92
+ {"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
93
+ {"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
94
+ {"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
95
+ {"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
96
+ {"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
97
+ {"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
98
+ {"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
99
+ {"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
100
+ {"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
101
+ {0,ENC_UNKNOWN,0,0,0}
102
+ };
103
+ static const char * algorithm_names[] = {
104
+ "danish",
105
+ "dutch",
106
+ "english",
107
+ "finnish",
108
+ "french",
109
+ "german",
110
+ "hungarian",
111
+ "italian",
112
+ "norwegian",
113
+ "porter",
114
+ "portuguese",
115
+ "romanian",
116
+ "russian",
117
+ "spanish",
118
+ "swedish",
119
+ "turkish",
120
+ 0
121
+ };
@@ -0,0 +1,49 @@
1
+ # This file contains a list of stemmers to include in the distribution.
2
+ # The format is a set of space separated lines - on each line:
3
+ # First item is name of stemmer.
4
+ # Second item is comma separated list of character sets.
5
+ # Third item is comma separated list of names to refer to the stemmer by.
6
+ #
7
+ # Lines starting with a #, or blank lines, are ignored.
8
+
9
+ # List all the main algorithms for each language, in UTF-8.
10
+
11
+ danish UTF_8 danish,da,dan
12
+ dutch UTF_8 dutch,nl,dut,nld
13
+ english UTF_8 english,en,eng
14
+ finnish UTF_8 finnish,fi,fin
15
+ french UTF_8 french,fr,fre,fra
16
+ german UTF_8 german,de,ger,deu
17
+ hungarian UTF_8 hungarian,hu,hun
18
+ italian UTF_8 italian,it,ita
19
+ norwegian UTF_8 norwegian,no,nor
20
+ portuguese UTF_8 portuguese,pt,por
21
+ romanian UTF_8 romanian,ro,rum,ron
22
+ russian UTF_8 russian,ru,rus
23
+ spanish UTF_8 spanish,es,esl,spa
24
+ swedish UTF_8 swedish,sv,swe
25
+ turkish UTF_8 turkish,tr,tur
26
+
27
+ # Also include the traditional porter algorithm for english.
28
+ # The porter algorithm is included in the libstemmer distribution to assist
29
+ # with backwards compatibility, but for new systems the english algorithm
30
+ # should be used in preference.
31
+ porter UTF_8 porter
32
+
33
+ # Some other stemmers in the snowball project are not included in the standard
34
+ # distribution. To compile a libstemmer with them in, add them to this list,
35
+ # and regenerate the distribution. (You will need a full source checkout for
36
+ # this.) They are included in the snowball website as curiosities, but are not
37
+ # intended for general use, and use of them is not fully supported. These
38
+ # algorithms are:
39
+ #
40
+ # german2 - This is a slight modification of the german stemmer.
41
+ #german2 UTF_8 german2
42
+ #
43
+ # kraaij_pohlmann - This is a different dutch stemmer.
44
+ #kraaij_pohlmann UTF_8 kraaij_pohlmann
45
+ #
46
+ # lovins - This is an english stemmer, but fairly outdated, and
47
+ # only really applicable to a restricted type of input text
48
+ # (keywords in academic publications).
49
+ #lovins UTF_8 lovins
@@ -0,0 +1,82 @@
1
+ # libstemmer/mkinc.mak: List of stemming module source files
2
+ #
3
+ # This file is generated by mkmodules.pl from a list of module names.
4
+ # Do not edit manually.
5
+ #
6
+ # Modules included by this file are: danish, dutch, english, finnish, french,
7
+ # german, hungarian, italian, norwegian, porter, portuguese, romanian,
8
+ # russian, spanish, swedish, turkish
9
+
10
+ snowball_sources= \
11
+ src_c/stem_ISO_8859_1_danish.c \
12
+ src_c/stem_UTF_8_danish.c \
13
+ src_c/stem_ISO_8859_1_dutch.c \
14
+ src_c/stem_UTF_8_dutch.c \
15
+ src_c/stem_ISO_8859_1_english.c \
16
+ src_c/stem_UTF_8_english.c \
17
+ src_c/stem_ISO_8859_1_finnish.c \
18
+ src_c/stem_UTF_8_finnish.c \
19
+ src_c/stem_ISO_8859_1_french.c \
20
+ src_c/stem_UTF_8_french.c \
21
+ src_c/stem_ISO_8859_1_german.c \
22
+ src_c/stem_UTF_8_german.c \
23
+ src_c/stem_ISO_8859_1_hungarian.c \
24
+ src_c/stem_UTF_8_hungarian.c \
25
+ src_c/stem_ISO_8859_1_italian.c \
26
+ src_c/stem_UTF_8_italian.c \
27
+ src_c/stem_ISO_8859_1_norwegian.c \
28
+ src_c/stem_UTF_8_norwegian.c \
29
+ src_c/stem_ISO_8859_1_porter.c \
30
+ src_c/stem_UTF_8_porter.c \
31
+ src_c/stem_ISO_8859_1_portuguese.c \
32
+ src_c/stem_UTF_8_portuguese.c \
33
+ src_c/stem_ISO_8859_2_romanian.c \
34
+ src_c/stem_UTF_8_romanian.c \
35
+ src_c/stem_KOI8_R_russian.c \
36
+ src_c/stem_UTF_8_russian.c \
37
+ src_c/stem_ISO_8859_1_spanish.c \
38
+ src_c/stem_UTF_8_spanish.c \
39
+ src_c/stem_ISO_8859_1_swedish.c \
40
+ src_c/stem_UTF_8_swedish.c \
41
+ src_c/stem_UTF_8_turkish.c \
42
+ runtime/api.c \
43
+ runtime/utilities.c \
44
+ libstemmer/libstemmer.c
45
+
46
+ snowball_headers= \
47
+ src_c/stem_ISO_8859_1_danish.h \
48
+ src_c/stem_UTF_8_danish.h \
49
+ src_c/stem_ISO_8859_1_dutch.h \
50
+ src_c/stem_UTF_8_dutch.h \
51
+ src_c/stem_ISO_8859_1_english.h \
52
+ src_c/stem_UTF_8_english.h \
53
+ src_c/stem_ISO_8859_1_finnish.h \
54
+ src_c/stem_UTF_8_finnish.h \
55
+ src_c/stem_ISO_8859_1_french.h \
56
+ src_c/stem_UTF_8_french.h \
57
+ src_c/stem_ISO_8859_1_german.h \
58
+ src_c/stem_UTF_8_german.h \
59
+ src_c/stem_ISO_8859_1_hungarian.h \
60
+ src_c/stem_UTF_8_hungarian.h \
61
+ src_c/stem_ISO_8859_1_italian.h \
62
+ src_c/stem_UTF_8_italian.h \
63
+ src_c/stem_ISO_8859_1_norwegian.h \
64
+ src_c/stem_UTF_8_norwegian.h \
65
+ src_c/stem_ISO_8859_1_porter.h \
66
+ src_c/stem_UTF_8_porter.h \
67
+ src_c/stem_ISO_8859_1_portuguese.h \
68
+ src_c/stem_UTF_8_portuguese.h \
69
+ src_c/stem_ISO_8859_2_romanian.h \
70
+ src_c/stem_UTF_8_romanian.h \
71
+ src_c/stem_KOI8_R_russian.h \
72
+ src_c/stem_UTF_8_russian.h \
73
+ src_c/stem_ISO_8859_1_spanish.h \
74
+ src_c/stem_UTF_8_spanish.h \
75
+ src_c/stem_ISO_8859_1_swedish.h \
76
+ src_c/stem_UTF_8_swedish.h \
77
+ src_c/stem_UTF_8_turkish.h \
78
+ include/libstemmer.h \
79
+ libstemmer/modules.h \
80
+ runtime/api.h \
81
+ runtime/header.h
82
+
@@ -0,0 +1,52 @@
1
+ # libstemmer/mkinc_utf8.mak: List of stemming module source files
2
+ #
3
+ # This file is generated by mkmodules.pl from a list of module names.
4
+ # Do not edit manually.
5
+ #
6
+ # Modules included by this file are: danish, dutch, english, finnish, french,
7
+ # german, hungarian, italian, norwegian, porter, portuguese, romanian,
8
+ # russian, spanish, swedish, turkish
9
+
10
+ snowball_sources= \
11
+ src_c/stem_UTF_8_danish.c \
12
+ src_c/stem_UTF_8_dutch.c \
13
+ src_c/stem_UTF_8_english.c \
14
+ src_c/stem_UTF_8_finnish.c \
15
+ src_c/stem_UTF_8_french.c \
16
+ src_c/stem_UTF_8_german.c \
17
+ src_c/stem_UTF_8_hungarian.c \
18
+ src_c/stem_UTF_8_italian.c \
19
+ src_c/stem_UTF_8_norwegian.c \
20
+ src_c/stem_UTF_8_porter.c \
21
+ src_c/stem_UTF_8_portuguese.c \
22
+ src_c/stem_UTF_8_romanian.c \
23
+ src_c/stem_UTF_8_russian.c \
24
+ src_c/stem_UTF_8_spanish.c \
25
+ src_c/stem_UTF_8_swedish.c \
26
+ src_c/stem_UTF_8_turkish.c \
27
+ runtime/api.c \
28
+ runtime/utilities.c \
29
+ libstemmer/libstemmer_utf8.c
30
+
31
+ snowball_headers= \
32
+ src_c/stem_UTF_8_danish.h \
33
+ src_c/stem_UTF_8_dutch.h \
34
+ src_c/stem_UTF_8_english.h \
35
+ src_c/stem_UTF_8_finnish.h \
36
+ src_c/stem_UTF_8_french.h \
37
+ src_c/stem_UTF_8_german.h \
38
+ src_c/stem_UTF_8_hungarian.h \
39
+ src_c/stem_UTF_8_italian.h \
40
+ src_c/stem_UTF_8_norwegian.h \
41
+ src_c/stem_UTF_8_porter.h \
42
+ src_c/stem_UTF_8_portuguese.h \
43
+ src_c/stem_UTF_8_romanian.h \
44
+ src_c/stem_UTF_8_russian.h \
45
+ src_c/stem_UTF_8_spanish.h \
46
+ src_c/stem_UTF_8_swedish.h \
47
+ src_c/stem_UTF_8_turkish.h \
48
+ include/libstemmer.h \
49
+ libstemmer/modules_utf8.h \
50
+ runtime/api.h \
51
+ runtime/header.h
52
+
@@ -0,0 +1,66 @@
1
+
2
+ #include <stdlib.h> /* for calloc, free */
3
+ #include "header.h"
4
+
5
+ extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
6
+ {
7
+ struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env));
8
+ if (z == NULL) return NULL;
9
+ z->p = create_s();
10
+ if (z->p == NULL) goto error;
11
+ if (S_size)
12
+ {
13
+ int i;
14
+ z->S = (symbol * *) calloc(S_size, sizeof(symbol *));
15
+ if (z->S == NULL) goto error;
16
+
17
+ for (i = 0; i < S_size; i++)
18
+ {
19
+ z->S[i] = create_s();
20
+ if (z->S[i] == NULL) goto error;
21
+ }
22
+ }
23
+
24
+ if (I_size)
25
+ {
26
+ z->I = (int *) calloc(I_size, sizeof(int));
27
+ if (z->I == NULL) goto error;
28
+ }
29
+
30
+ if (B_size)
31
+ {
32
+ z->B = (unsigned char *) calloc(B_size, sizeof(unsigned char));
33
+ if (z->B == NULL) goto error;
34
+ }
35
+
36
+ return z;
37
+ error:
38
+ SN_close_env(z, S_size);
39
+ return NULL;
40
+ }
41
+
42
+ extern void SN_close_env(struct SN_env * z, int S_size)
43
+ {
44
+ if (z == NULL) return;
45
+ if (S_size)
46
+ {
47
+ int i;
48
+ for (i = 0; i < S_size; i++)
49
+ {
50
+ lose_s(z->S[i]);
51
+ }
52
+ free(z->S);
53
+ }
54
+ free(z->I);
55
+ free(z->B);
56
+ if (z->p) lose_s(z->p);
57
+ free(z);
58
+ }
59
+
60
+ extern int SN_set_current(struct SN_env * z, int size, const symbol * s)
61
+ {
62
+ int err = replace_s(z, 0, z->l, size, s, NULL);
63
+ z->c = 0;
64
+ return err;
65
+ }
66
+
@@ -0,0 +1,26 @@
1
+
2
typedef unsigned char symbol;

/* For 16-bit characters, replace 'char' above with 'short'.

   More precisely, replace 'char' with whatever type guarantees the
   character width you need. Note, however, that sizeof(symbol) should
   divide HEAD (defined in header.h as 2*sizeof(int)) without remainder;
   otherwise there is an alignment problem. In the unlikely event of a
   problem here, consult Martin Porter.

*/

/* State of one stemmer instance, created by SN_create_env() and released
   by SN_close_env(). */
struct SN_env {
    symbol * p;         /* string obtained from create_s() */

    /* Integer state.  NOTE(review): presumably cursor, limit, backward
       limit and the bra/ket slice markers; confirm against the runtime. */
    int c;
    int l;
    int lb;
    int bra;
    int ket;

    symbol * * S;       /* S_size strings, each from create_s(), or NULL */
    int * I;            /* I_size ints, or NULL */
    unsigned char * B;  /* B_size unsigned chars, or NULL */
};
21
+
22
+ extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size);
23
+ extern void SN_close_env(struct SN_env * z, int S_size);
24
+
25
+ extern int SN_set_current(struct SN_env * z, int size, const symbol * s);
26
+