isomorfeus-ferret 0.12.2 → 0.12.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125)
  1. checksums.yaml +4 -4
  2. data/LICENSE +612 -612
  3. data/README.md +80 -44
  4. data/ext/isomorfeus_ferret_ext/bm_hash.c +9 -6
  5. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +4 -2
  6. data/ext/isomorfeus_ferret_ext/frb_search.c +14 -2
  7. data/ext/isomorfeus_ferret_ext/frb_store.c +34 -5
  8. data/ext/isomorfeus_ferret_ext/frt_posh.h +11 -19
  9. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +1844 -1911
  10. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +12 -15
  11. data/ext/isomorfeus_ferret_ext/frt_scanner.c +1 -0
  12. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +1 -0
  13. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +1 -0
  14. data/ext/isomorfeus_ferret_ext/frt_search.h +1 -1
  15. data/ext/isomorfeus_ferret_ext/libstemmer.c +14 -11
  16. data/ext/isomorfeus_ferret_ext/libstemmer.h +4 -9
  17. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +1167 -0
  18. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +6 -0
  19. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +1433 -0
  20. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +6 -0
  21. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +120 -143
  22. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +1 -2
  23. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +217 -237
  24. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +1 -1
  25. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +377 -432
  26. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +1 -1
  27. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +298 -342
  28. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +1 -2
  29. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +530 -524
  30. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +1 -1
  31. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +201 -214
  32. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +1 -1
  33. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +1 -1
  34. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +394 -0
  35. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +6 -0
  36. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +457 -0
  37. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +6 -0
  38. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +396 -439
  39. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +1 -1
  40. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +104 -128
  41. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +1 -1
  42. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +242 -273
  43. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +1 -1
  44. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +406 -461
  45. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +1 -2
  46. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +405 -456
  47. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +1 -1
  48. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +108 -126
  49. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +1 -1
  50. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +849 -0
  51. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +6 -0
  52. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +373 -405
  53. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +1 -1
  54. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +288 -305
  55. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +1 -1
  56. data/ext/isomorfeus_ferret_ext/stem_UTF_8_arabic.c +1651 -0
  57. data/ext/isomorfeus_ferret_ext/stem_UTF_8_arabic.h +6 -0
  58. data/ext/isomorfeus_ferret_ext/stem_UTF_8_armenian.c +546 -0
  59. data/ext/isomorfeus_ferret_ext/stem_UTF_8_armenian.h +6 -0
  60. data/ext/isomorfeus_ferret_ext/stem_UTF_8_basque.c +1171 -0
  61. data/ext/isomorfeus_ferret_ext/stem_UTF_8_basque.h +6 -0
  62. data/ext/isomorfeus_ferret_ext/stem_UTF_8_catalan.c +1436 -0
  63. data/ext/isomorfeus_ferret_ext/stem_UTF_8_catalan.h +6 -0
  64. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.c +121 -141
  65. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.h +1 -1
  66. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.c +221 -241
  67. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.h +1 -1
  68. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.c +381 -431
  69. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.h +1 -1
  70. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.c +300 -345
  71. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.h +1 -1
  72. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.c +518 -511
  73. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.h +1 -1
  74. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.c +201 -209
  75. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.h +1 -1
  76. data/ext/isomorfeus_ferret_ext/stem_UTF_8_greek.c +3660 -0
  77. data/ext/isomorfeus_ferret_ext/stem_UTF_8_greek.h +6 -0
  78. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hindi.c +309 -0
  79. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hindi.h +6 -0
  80. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.c +306 -671
  81. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.h +1 -1
  82. data/ext/isomorfeus_ferret_ext/stem_UTF_8_indonesian.c +394 -0
  83. data/ext/isomorfeus_ferret_ext/stem_UTF_8_indonesian.h +6 -0
  84. data/ext/isomorfeus_ferret_ext/stem_UTF_8_irish.c +457 -0
  85. data/ext/isomorfeus_ferret_ext/stem_UTF_8_irish.h +6 -0
  86. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.c +400 -442
  87. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.h +1 -1
  88. data/ext/isomorfeus_ferret_ext/stem_UTF_8_lithuanian.c +824 -0
  89. data/ext/isomorfeus_ferret_ext/stem_UTF_8_lithuanian.h +6 -0
  90. data/ext/isomorfeus_ferret_ext/stem_UTF_8_nepali.c +408 -0
  91. data/ext/isomorfeus_ferret_ext/stem_UTF_8_nepali.h +6 -0
  92. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.c +105 -127
  93. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.h +1 -1
  94. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.c +245 -276
  95. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.h +1 -1
  96. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.c +409 -464
  97. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.h +1 -1
  98. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.c +376 -408
  99. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.h +1 -1
  100. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.c +272 -287
  101. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.h +1 -1
  102. data/ext/isomorfeus_ferret_ext/stem_UTF_8_serbian.c +6530 -0
  103. data/ext/isomorfeus_ferret_ext/stem_UTF_8_serbian.h +6 -0
  104. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.c +407 -458
  105. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.h +1 -1
  106. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.c +110 -125
  107. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.h +1 -1
  108. data/ext/isomorfeus_ferret_ext/stem_UTF_8_tamil.c +1865 -0
  109. data/ext/isomorfeus_ferret_ext/stem_UTF_8_tamil.h +6 -0
  110. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.c +698 -806
  111. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.h +1 -1
  112. data/ext/isomorfeus_ferret_ext/stem_UTF_8_yiddish.c +1220 -0
  113. data/ext/isomorfeus_ferret_ext/stem_UTF_8_yiddish.h +6 -0
  114. data/ext/isomorfeus_ferret_ext/stem_api.c +1 -9
  115. data/ext/isomorfeus_ferret_ext/stem_api.h +1 -3
  116. data/ext/isomorfeus_ferret_ext/stem_header.h +30 -26
  117. data/ext/isomorfeus_ferret_ext/stem_modules.h +113 -26
  118. data/ext/isomorfeus_ferret_ext/stem_modules.txt +18 -5
  119. data/ext/isomorfeus_ferret_ext/stem_utilities.c +167 -132
  120. data/ext/isomorfeus_ferret_ext/test.c +7 -1
  121. data/ext/isomorfeus_ferret_ext/test_search.c +0 -1
  122. data/lib/isomorfeus/ferret/index/index.rb +1 -1
  123. data/lib/isomorfeus/ferret/version.rb +1 -1
  124. metadata +43 -7
  125. data/ext/isomorfeus_ferret_ext/q_parser.y +0 -1366
@@ -0,0 +1,6 @@
1
+ /* Generated by Snowball 2.2.0 - https://snowballstem.org/ */
2
+
3
+ extern struct SN_env * yiddish_UTF_8_create_env(void);
4
+ extern void yiddish_UTF_8_close_env(struct SN_env * z);
5
+
6
+ extern int yiddish_UTF_8_stem(struct SN_env * z);
@@ -2,7 +2,7 @@
2
2
  #include <stdlib.h> /* for calloc, free */
3
3
  #include "stem_header.h"
4
4
 
5
- extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
5
+ extern struct SN_env * SN_create_env(int S_size, int I_size)
6
6
  {
7
7
  struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env));
8
8
  if (z == NULL) return NULL;
@@ -27,12 +27,6 @@ extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
27
27
  if (z->I == NULL) goto error;
28
28
  }
29
29
 
30
- if (B_size)
31
- {
32
- z->B = (unsigned char *) calloc(B_size, sizeof(unsigned char));
33
- if (z->B == NULL) goto error;
34
- }
35
-
36
30
  return z;
37
31
  error:
38
32
  SN_close_env(z, S_size);
@@ -52,7 +46,6 @@ extern void SN_close_env(struct SN_env * z, int S_size)
52
46
  free(z->S);
53
47
  }
54
48
  free(z->I);
55
- free(z->B);
56
49
  if (z->p) lose_s(z->p);
57
50
  free(z);
58
51
  }
@@ -63,4 +56,3 @@ extern int SN_set_current(struct SN_env * z, int size, const symbol * s)
63
56
  z->c = 0;
64
57
  return err;
65
58
  }
66
-
@@ -16,11 +16,9 @@ struct SN_env {
16
16
  int c; int l; int lb; int bra; int ket;
17
17
  symbol * * S;
18
18
  int * I;
19
- unsigned char * B;
20
19
  };
21
20
 
22
- extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size);
21
+ extern struct SN_env * SN_create_env(int S_size, int I_size);
23
22
  extern void SN_close_env(struct SN_env * z, int S_size);
24
23
 
25
24
  extern int SN_set_current(struct SN_env * z, int size, const symbol * s);
26
-
@@ -20,38 +20,42 @@ struct among
20
20
  int (* function)(struct SN_env *);
21
21
  };
22
22
 
23
- symbol * create_s(void);
24
- void lose_s(symbol * p);
23
+ extern symbol * create_s(void);
24
+ extern void lose_s(symbol * p);
25
25
 
26
- int skip_utf8(const symbol * p, int c, int lb, int l, int n);
26
+ extern int skip_utf8(const symbol * p, int c, int limit, int n);
27
27
 
28
- int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
29
- int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
30
- int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
31
- int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
28
+ extern int skip_b_utf8(const symbol * p, int c, int limit, int n);
32
29
 
33
- int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
34
- int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
35
- int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
36
- int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
30
+ extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
31
+ extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
32
+ extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
33
+ extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
37
34
 
38
- int eq_s(struct SN_env * z, int s_size, const symbol * s);
39
- int eq_s_b(struct SN_env * z, int s_size, const symbol * s);
40
- int eq_v(struct SN_env * z, const symbol * p);
41
- int eq_v_b(struct SN_env * z, const symbol * p);
35
+ extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
36
+ extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
37
+ extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
38
+ extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
42
39
 
43
- int find_among(struct SN_env * z, const struct among * v, int v_size);
44
- int find_among_b(struct SN_env * z, const struct among * v, int v_size);
40
+ extern int eq_s(struct SN_env * z, int s_size, const symbol * s);
41
+ extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s);
42
+ extern int eq_v(struct SN_env * z, const symbol * p);
43
+ extern int eq_v_b(struct SN_env * z, const symbol * p);
45
44
 
46
- int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment);
47
- int slice_from_s(struct SN_env * z, int s_size, const symbol * s);
48
- int slice_from_v(struct SN_env * z, const symbol * p);
49
- int slice_del(struct SN_env * z);
45
+ extern int find_among(struct SN_env * z, const struct among * v, int v_size);
46
+ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size);
50
47
 
51
- int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s);
52
- int insert_v(struct SN_env * z, int bra, int ket, const symbol * p);
48
+ extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment);
49
+ extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s);
50
+ extern int slice_from_v(struct SN_env * z, const symbol * p);
51
+ extern int slice_del(struct SN_env * z);
53
52
 
54
- symbol * slice_to(struct SN_env * z, symbol * p);
55
- symbol * assign_to(struct SN_env * z, symbol * p);
53
+ extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s);
54
+ extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p);
56
55
 
57
- void debug(struct SN_env * z, int number, int line_count);
56
+ extern symbol * slice_to(struct SN_env * z, symbol * p);
57
+ extern symbol * assign_to(struct SN_env * z, symbol * p);
58
+
59
+ extern int len_utf8(const symbol * p);
60
+
61
+ extern void debug(struct SN_env * z, int number, int line_count);
@@ -3,11 +3,19 @@
3
3
  * This file is generated by mkmodules.pl from a list of module names.
4
4
  * Do not edit manually.
5
5
  *
6
- * Modules included by this file are: danish, dutch, english, finnish, french,
7
- * german, hungarian, italian, norwegian, porter, portuguese, romanian,
8
- * russian, spanish, swedish, turkish
6
+ * Modules included by this file are: arabic, armenian, basque, catalan,
7
+ * danish, dutch, english, finnish, french, german, greek, hindi, hungarian,
8
+ * indonesian, irish, italian, lithuanian, nepali, norwegian, porter,
9
+ * portuguese, romanian, russian, serbian, spanish, swedish, tamil, turkish,
10
+ * yiddish
9
11
  */
10
12
 
13
+ #include "stem_UTF_8_arabic.h"
14
+ #include "stem_UTF_8_armenian.h"
15
+ #include "stem_ISO_8859_1_basque.h"
16
+ #include "stem_UTF_8_basque.h"
17
+ #include "stem_ISO_8859_1_catalan.h"
18
+ #include "stem_UTF_8_catalan.h"
11
19
  #include "stem_ISO_8859_1_danish.h"
12
20
  #include "stem_UTF_8_danish.h"
13
21
  #include "stem_ISO_8859_1_dutch.h"
@@ -20,10 +28,18 @@
20
28
  #include "stem_UTF_8_french.h"
21
29
  #include "stem_ISO_8859_1_german.h"
22
30
  #include "stem_UTF_8_german.h"
23
- #include "stem_ISO_8859_1_hungarian.h"
31
+ #include "stem_UTF_8_greek.h"
32
+ #include "stem_UTF_8_hindi.h"
33
+ #include "stem_ISO_8859_2_hungarian.h"
24
34
  #include "stem_UTF_8_hungarian.h"
35
+ #include "stem_ISO_8859_1_indonesian.h"
36
+ #include "stem_UTF_8_indonesian.h"
37
+ #include "stem_ISO_8859_1_irish.h"
38
+ #include "stem_UTF_8_irish.h"
25
39
  #include "stem_ISO_8859_1_italian.h"
26
40
  #include "stem_UTF_8_italian.h"
41
+ #include "stem_UTF_8_lithuanian.h"
42
+ #include "stem_UTF_8_nepali.h"
27
43
  #include "stem_ISO_8859_1_norwegian.h"
28
44
  #include "stem_UTF_8_norwegian.h"
29
45
  #include "stem_ISO_8859_1_porter.h"
@@ -34,11 +50,14 @@
34
50
  #include "stem_UTF_8_romanian.h"
35
51
  #include "stem_KOI8_R_russian.h"
36
52
  #include "stem_UTF_8_russian.h"
53
+ #include "stem_UTF_8_serbian.h"
37
54
  #include "stem_ISO_8859_1_spanish.h"
38
55
  #include "stem_UTF_8_spanish.h"
39
56
  #include "stem_ISO_8859_1_swedish.h"
40
57
  #include "stem_UTF_8_swedish.h"
58
+ #include "stem_UTF_8_tamil.h"
41
59
  #include "stem_UTF_8_turkish.h"
60
+ #include "stem_UTF_8_yiddish.h"
42
61
 
43
62
  typedef enum {
44
63
  ENC_UNKNOWN=0,
@@ -52,7 +71,7 @@ struct stemmer_encoding {
52
71
  const char * name;
53
72
  stemmer_encoding_t enc;
54
73
  };
55
- static struct stemmer_encoding encodings[] = {
74
+ static const struct stemmer_encoding encodings[] = {
56
75
  {"ISO_8859_1", ENC_ISO_8859_1},
57
76
  {"ISO_8859_2", ENC_ISO_8859_2},
58
77
  {"KOI8_R", ENC_KOI8_R},
@@ -62,12 +81,27 @@ static struct stemmer_encoding encodings[] = {
62
81
 
63
82
  struct stemmer_modules {
64
83
  const char * name;
65
- stemmer_encoding_t enc;
84
+ stemmer_encoding_t enc;
66
85
  struct SN_env * (*create)(void);
67
86
  void (*close)(struct SN_env *);
68
87
  int (*stem)(struct SN_env *);
69
88
  };
70
- static struct stemmer_modules modules[] = {
89
+ static const struct stemmer_modules modules[] = {
90
+ {"ar", ENC_UTF_8, arabic_UTF_8_create_env, arabic_UTF_8_close_env, arabic_UTF_8_stem},
91
+ {"ara", ENC_UTF_8, arabic_UTF_8_create_env, arabic_UTF_8_close_env, arabic_UTF_8_stem},
92
+ {"arabic", ENC_UTF_8, arabic_UTF_8_create_env, arabic_UTF_8_close_env, arabic_UTF_8_stem},
93
+ {"arm", ENC_UTF_8, armenian_UTF_8_create_env, armenian_UTF_8_close_env, armenian_UTF_8_stem},
94
+ {"armenian", ENC_UTF_8, armenian_UTF_8_create_env, armenian_UTF_8_close_env, armenian_UTF_8_stem},
95
+ {"baq", ENC_ISO_8859_1, basque_ISO_8859_1_create_env, basque_ISO_8859_1_close_env, basque_ISO_8859_1_stem},
96
+ {"baq", ENC_UTF_8, basque_UTF_8_create_env, basque_UTF_8_close_env, basque_UTF_8_stem},
97
+ {"basque", ENC_ISO_8859_1, basque_ISO_8859_1_create_env, basque_ISO_8859_1_close_env, basque_ISO_8859_1_stem},
98
+ {"basque", ENC_UTF_8, basque_UTF_8_create_env, basque_UTF_8_close_env, basque_UTF_8_stem},
99
+ {"ca", ENC_ISO_8859_1, catalan_ISO_8859_1_create_env, catalan_ISO_8859_1_close_env, catalan_ISO_8859_1_stem},
100
+ {"ca", ENC_UTF_8, catalan_UTF_8_create_env, catalan_UTF_8_close_env, catalan_UTF_8_stem},
101
+ {"cat", ENC_ISO_8859_1, catalan_ISO_8859_1_create_env, catalan_ISO_8859_1_close_env, catalan_ISO_8859_1_stem},
102
+ {"cat", ENC_UTF_8, catalan_UTF_8_create_env, catalan_UTF_8_close_env, catalan_UTF_8_stem},
103
+ {"catalan", ENC_ISO_8859_1, catalan_ISO_8859_1_create_env, catalan_ISO_8859_1_close_env, catalan_ISO_8859_1_stem},
104
+ {"catalan", ENC_UTF_8, catalan_UTF_8_create_env, catalan_UTF_8_close_env, catalan_UTF_8_stem},
71
105
  {"da", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
72
106
  {"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
73
107
  {"dan", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
@@ -82,6 +116,8 @@ static struct stemmer_modules modules[] = {
82
116
  {"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
83
117
  {"dutch", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
84
118
  {"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
119
+ {"el", ENC_UTF_8, greek_UTF_8_create_env, greek_UTF_8_close_env, greek_UTF_8_stem},
120
+ {"ell", ENC_UTF_8, greek_UTF_8_create_env, greek_UTF_8_close_env, greek_UTF_8_stem},
85
121
  {"en", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
86
122
  {"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
87
123
  {"eng", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
@@ -92,6 +128,10 @@ static struct stemmer_modules modules[] = {
92
128
  {"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
93
129
  {"esl", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
94
130
  {"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
131
+ {"eu", ENC_ISO_8859_1, basque_ISO_8859_1_create_env, basque_ISO_8859_1_close_env, basque_ISO_8859_1_stem},
132
+ {"eu", ENC_UTF_8, basque_UTF_8_create_env, basque_UTF_8_close_env, basque_UTF_8_stem},
133
+ {"eus", ENC_ISO_8859_1, basque_ISO_8859_1_create_env, basque_ISO_8859_1_close_env, basque_ISO_8859_1_stem},
134
+ {"eus", ENC_UTF_8, basque_UTF_8_create_env, basque_UTF_8_close_env, basque_UTF_8_stem},
95
135
  {"fi", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
96
136
  {"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
97
137
  {"fin", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
@@ -106,22 +146,47 @@ static struct stemmer_modules modules[] = {
106
146
  {"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
107
147
  {"french", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
108
148
  {"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
149
+ {"ga", ENC_ISO_8859_1, irish_ISO_8859_1_create_env, irish_ISO_8859_1_close_env, irish_ISO_8859_1_stem},
150
+ {"ga", ENC_UTF_8, irish_UTF_8_create_env, irish_UTF_8_close_env, irish_UTF_8_stem},
109
151
  {"ger", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
110
152
  {"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
111
153
  {"german", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
112
154
  {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
113
- {"hu", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
155
+ {"gle", ENC_ISO_8859_1, irish_ISO_8859_1_create_env, irish_ISO_8859_1_close_env, irish_ISO_8859_1_stem},
156
+ {"gle", ENC_UTF_8, irish_UTF_8_create_env, irish_UTF_8_close_env, irish_UTF_8_stem},
157
+ {"gre", ENC_UTF_8, greek_UTF_8_create_env, greek_UTF_8_close_env, greek_UTF_8_stem},
158
+ {"greek", ENC_UTF_8, greek_UTF_8_create_env, greek_UTF_8_close_env, greek_UTF_8_stem},
159
+ {"hi", ENC_UTF_8, hindi_UTF_8_create_env, hindi_UTF_8_close_env, hindi_UTF_8_stem},
160
+ {"hin", ENC_UTF_8, hindi_UTF_8_create_env, hindi_UTF_8_close_env, hindi_UTF_8_stem},
161
+ {"hindi", ENC_UTF_8, hindi_UTF_8_create_env, hindi_UTF_8_close_env, hindi_UTF_8_stem},
162
+ {"hu", ENC_ISO_8859_2, hungarian_ISO_8859_2_create_env, hungarian_ISO_8859_2_close_env, hungarian_ISO_8859_2_stem},
114
163
  {"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
115
- {"hun", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
164
+ {"hun", ENC_ISO_8859_2, hungarian_ISO_8859_2_create_env, hungarian_ISO_8859_2_close_env, hungarian_ISO_8859_2_stem},
116
165
  {"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
117
- {"hungarian", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
166
+ {"hungarian", ENC_ISO_8859_2, hungarian_ISO_8859_2_create_env, hungarian_ISO_8859_2_close_env, hungarian_ISO_8859_2_stem},
118
167
  {"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
168
+ {"hy", ENC_UTF_8, armenian_UTF_8_create_env, armenian_UTF_8_close_env, armenian_UTF_8_stem},
169
+ {"hye", ENC_UTF_8, armenian_UTF_8_create_env, armenian_UTF_8_close_env, armenian_UTF_8_stem},
170
+ {"id", ENC_ISO_8859_1, indonesian_ISO_8859_1_create_env, indonesian_ISO_8859_1_close_env, indonesian_ISO_8859_1_stem},
171
+ {"id", ENC_UTF_8, indonesian_UTF_8_create_env, indonesian_UTF_8_close_env, indonesian_UTF_8_stem},
172
+ {"ind", ENC_ISO_8859_1, indonesian_ISO_8859_1_create_env, indonesian_ISO_8859_1_close_env, indonesian_ISO_8859_1_stem},
173
+ {"ind", ENC_UTF_8, indonesian_UTF_8_create_env, indonesian_UTF_8_close_env, indonesian_UTF_8_stem},
174
+ {"indonesian", ENC_ISO_8859_1, indonesian_ISO_8859_1_create_env, indonesian_ISO_8859_1_close_env, indonesian_ISO_8859_1_stem},
175
+ {"indonesian", ENC_UTF_8, indonesian_UTF_8_create_env, indonesian_UTF_8_close_env, indonesian_UTF_8_stem},
176
+ {"irish", ENC_ISO_8859_1, irish_ISO_8859_1_create_env, irish_ISO_8859_1_close_env, irish_ISO_8859_1_stem},
177
+ {"irish", ENC_UTF_8, irish_UTF_8_create_env, irish_UTF_8_close_env, irish_UTF_8_stem},
119
178
  {"it", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
120
179
  {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
121
180
  {"ita", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
122
181
  {"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
123
182
  {"italian", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
124
183
  {"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
184
+ {"lit", ENC_UTF_8, lithuanian_UTF_8_create_env, lithuanian_UTF_8_close_env, lithuanian_UTF_8_stem},
185
+ {"lithuanian", ENC_UTF_8, lithuanian_UTF_8_create_env, lithuanian_UTF_8_close_env, lithuanian_UTF_8_stem},
186
+ {"lt", ENC_UTF_8, lithuanian_UTF_8_create_env, lithuanian_UTF_8_close_env, lithuanian_UTF_8_stem},
187
+ {"ne", ENC_UTF_8, nepali_UTF_8_create_env, nepali_UTF_8_close_env, nepali_UTF_8_stem},
188
+ {"nep", ENC_UTF_8, nepali_UTF_8_create_env, nepali_UTF_8_close_env, nepali_UTF_8_stem},
189
+ {"nepali", ENC_UTF_8, nepali_UTF_8_create_env, nepali_UTF_8_close_env, nepali_UTF_8_stem},
125
190
  {"nl", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
126
191
  {"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
127
192
  {"nld", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
@@ -154,37 +219,59 @@ static struct stemmer_modules modules[] = {
154
219
  {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
155
220
  {"russian", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
156
221
  {"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
222
+ {"serbian", ENC_UTF_8, serbian_UTF_8_create_env, serbian_UTF_8_close_env, serbian_UTF_8_stem},
157
223
  {"spa", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
158
224
  {"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
159
225
  {"spanish", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
160
226
  {"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
227
+ {"sr", ENC_UTF_8, serbian_UTF_8_create_env, serbian_UTF_8_close_env, serbian_UTF_8_stem},
228
+ {"srp", ENC_UTF_8, serbian_UTF_8_create_env, serbian_UTF_8_close_env, serbian_UTF_8_stem},
161
229
  {"sv", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
162
230
  {"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
163
231
  {"swe", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
164
232
  {"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
165
233
  {"swedish", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
166
234
  {"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
235
+ {"ta", ENC_UTF_8, tamil_UTF_8_create_env, tamil_UTF_8_close_env, tamil_UTF_8_stem},
236
+ {"tam", ENC_UTF_8, tamil_UTF_8_create_env, tamil_UTF_8_close_env, tamil_UTF_8_stem},
237
+ {"tamil", ENC_UTF_8, tamil_UTF_8_create_env, tamil_UTF_8_close_env, tamil_UTF_8_stem},
167
238
  {"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
168
239
  {"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
169
240
  {"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
241
+ {"yi", ENC_UTF_8, yiddish_UTF_8_create_env, yiddish_UTF_8_close_env, yiddish_UTF_8_stem},
242
+ {"yid", ENC_UTF_8, yiddish_UTF_8_create_env, yiddish_UTF_8_close_env, yiddish_UTF_8_stem},
243
+ {"yiddish", ENC_UTF_8, yiddish_UTF_8_create_env, yiddish_UTF_8_close_env, yiddish_UTF_8_stem},
170
244
  {0,ENC_UNKNOWN,0,0,0}
171
245
  };
172
246
  static const char * algorithm_names[] = {
173
- "danish",
174
- "dutch",
175
- "english",
176
- "finnish",
177
- "french",
178
- "german",
179
- "hungarian",
180
- "italian",
181
- "norwegian",
182
- "porter",
183
- "portuguese",
184
- "romanian",
185
- "russian",
186
- "spanish",
187
- "swedish",
188
- "turkish",
247
+ "arabic",
248
+ "armenian",
249
+ "basque",
250
+ "catalan",
251
+ "danish",
252
+ "dutch",
253
+ "english",
254
+ "finnish",
255
+ "french",
256
+ "german",
257
+ "greek",
258
+ "hindi",
259
+ "hungarian",
260
+ "indonesian",
261
+ "irish",
262
+ "italian",
263
+ "lithuanian",
264
+ "nepali",
265
+ "norwegian",
266
+ "porter",
267
+ "portuguese",
268
+ "romanian",
269
+ "russian",
270
+ "serbian",
271
+ "spanish",
272
+ "swedish",
273
+ "tamil",
274
+ "turkish",
275
+ "yiddish",
189
276
  0
190
277
  };
@@ -9,27 +9,40 @@
9
9
  # List all the main algorithms for each language, in UTF-8, and also with
10
10
  # the most commonly used encoding.
11
11
 
12
+ arabic UTF_8 arabic,ar,ara
13
+ armenian UTF_8 armenian,hy,hye,arm
14
+ basque UTF_8,ISO_8859_1 basque,eu,eus,baq
15
+ catalan UTF_8,ISO_8859_1 catalan,ca,cat
12
16
  danish UTF_8,ISO_8859_1 danish,da,dan
13
17
  dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld
14
18
  english UTF_8,ISO_8859_1 english,en,eng
15
19
  finnish UTF_8,ISO_8859_1 finnish,fi,fin
16
20
  french UTF_8,ISO_8859_1 french,fr,fre,fra
17
21
  german UTF_8,ISO_8859_1 german,de,ger,deu
18
- hungarian UTF_8,ISO_8859_1 hungarian,hu,hun
22
+ greek UTF_8 greek,el,gre,ell
23
+ hindi UTF_8 hindi,hi,hin
24
+ hungarian UTF_8,ISO_8859_2 hungarian,hu,hun
25
+ indonesian UTF_8,ISO_8859_1 indonesian,id,ind
26
+ irish UTF_8,ISO_8859_1 irish,ga,gle
19
27
  italian UTF_8,ISO_8859_1 italian,it,ita
28
+ lithuanian UTF_8 lithuanian,lt,lit
29
+ nepali UTF_8 nepali,ne,nep
20
30
  norwegian UTF_8,ISO_8859_1 norwegian,no,nor
21
31
  portuguese UTF_8,ISO_8859_1 portuguese,pt,por
22
32
  romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron
23
33
  russian UTF_8,KOI8_R russian,ru,rus
34
+ serbian UTF_8 serbian,sr,srp
24
35
  spanish UTF_8,ISO_8859_1 spanish,es,esl,spa
25
36
  swedish UTF_8,ISO_8859_1 swedish,sv,swe
37
+ tamil UTF_8 tamil,ta,tam
26
38
  turkish UTF_8 turkish,tr,tur
39
+ yiddish UTF_8 yiddish,yi,yid
27
40
 
28
41
  # Also include the traditional porter algorithm for english.
29
42
  # The porter algorithm is included in the libstemmer distribution to assist
30
43
  # with backwards compatibility, but for new systems the english algorithm
31
44
  # should be used in preference.
32
- porter UTF_8,ISO_8859_1 porter
45
+ porter UTF_8,ISO_8859_1 porter english
33
46
 
34
47
  # Some other stemmers in the snowball project are not included in the standard
35
48
  # distribution. To compile a libstemmer with them in, add them to this list,
@@ -39,12 +52,12 @@ porter UTF_8,ISO_8859_1 porter
39
52
  # algorithms are:
40
53
  #
41
54
  # german2 - This is a slight modification of the german stemmer.
42
- #german2 UTF_8,ISO_8859_1 german2
55
+ #german2 UTF_8,ISO_8859_1 german2 german
43
56
  #
44
57
  # kraaij_pohlmann - This is a different dutch stemmer.
45
- #kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann
58
+ #kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch
46
59
  #
47
60
  # lovins - This is an english stemmer, but fairly outdated, and
48
61
  # only really applicable to a restricted type of input text
49
62
  # (keywords in academic publications).
50
- #lovins UTF_8,ISO_8859_1 lovins
63
+ #lovins UTF_8,ISO_8859_1 lovins english