isomorfeus-ferret 0.12.4 → 0.12.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (123) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +612 -612
  3. data/README.md +80 -48
  4. data/ext/isomorfeus_ferret_ext/bm_hash.c +9 -6
  5. data/ext/isomorfeus_ferret_ext/bm_micro_string.c +4 -2
  6. data/ext/isomorfeus_ferret_ext/frb_store.c +34 -5
  7. data/ext/isomorfeus_ferret_ext/frt_posh.h +11 -19
  8. data/ext/isomorfeus_ferret_ext/frt_q_parser.c +1844 -1911
  9. data/ext/isomorfeus_ferret_ext/frt_q_phrase.c +7 -7
  10. data/ext/isomorfeus_ferret_ext/frt_scanner.c +1 -0
  11. data/ext/isomorfeus_ferret_ext/frt_scanner_mb.c +1 -0
  12. data/ext/isomorfeus_ferret_ext/frt_scanner_utf8.c +1 -0
  13. data/ext/isomorfeus_ferret_ext/frt_search.h +1 -1
  14. data/ext/isomorfeus_ferret_ext/libstemmer.c +14 -11
  15. data/ext/isomorfeus_ferret_ext/libstemmer.h +4 -9
  16. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.c +1167 -0
  17. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_basque.h +6 -0
  18. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.c +1433 -0
  19. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_catalan.h +6 -0
  20. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.c +120 -143
  21. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_danish.h +1 -2
  22. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.c +217 -237
  23. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_dutch.h +1 -1
  24. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.c +377 -432
  25. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_english.h +1 -1
  26. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.c +298 -342
  27. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_finnish.h +1 -2
  28. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.c +530 -524
  29. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_french.h +1 -1
  30. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.c +201 -214
  31. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_german.h +1 -1
  32. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_hungarian.c +1 -1
  33. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.c +394 -0
  34. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_indonesian.h +6 -0
  35. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.c +457 -0
  36. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_irish.h +6 -0
  37. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.c +396 -439
  38. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_italian.h +1 -1
  39. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.c +104 -128
  40. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_norwegian.h +1 -1
  41. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.c +242 -273
  42. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_porter.h +1 -1
  43. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.c +406 -461
  44. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_portuguese.h +1 -2
  45. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.c +405 -456
  46. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_spanish.h +1 -1
  47. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.c +108 -126
  48. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_1_swedish.h +1 -1
  49. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.c +849 -0
  50. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_hungarian.h +6 -0
  51. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.c +373 -405
  52. data/ext/isomorfeus_ferret_ext/stem_ISO_8859_2_romanian.h +1 -1
  53. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.c +288 -305
  54. data/ext/isomorfeus_ferret_ext/stem_KOI8_R_russian.h +1 -1
  55. data/ext/isomorfeus_ferret_ext/stem_UTF_8_arabic.c +1651 -0
  56. data/ext/isomorfeus_ferret_ext/stem_UTF_8_arabic.h +6 -0
  57. data/ext/isomorfeus_ferret_ext/stem_UTF_8_armenian.c +546 -0
  58. data/ext/isomorfeus_ferret_ext/stem_UTF_8_armenian.h +6 -0
  59. data/ext/isomorfeus_ferret_ext/stem_UTF_8_basque.c +1171 -0
  60. data/ext/isomorfeus_ferret_ext/stem_UTF_8_basque.h +6 -0
  61. data/ext/isomorfeus_ferret_ext/stem_UTF_8_catalan.c +1436 -0
  62. data/ext/isomorfeus_ferret_ext/stem_UTF_8_catalan.h +6 -0
  63. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.c +121 -141
  64. data/ext/isomorfeus_ferret_ext/stem_UTF_8_danish.h +1 -1
  65. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.c +221 -241
  66. data/ext/isomorfeus_ferret_ext/stem_UTF_8_dutch.h +1 -1
  67. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.c +381 -431
  68. data/ext/isomorfeus_ferret_ext/stem_UTF_8_english.h +1 -1
  69. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.c +300 -345
  70. data/ext/isomorfeus_ferret_ext/stem_UTF_8_finnish.h +1 -1
  71. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.c +518 -511
  72. data/ext/isomorfeus_ferret_ext/stem_UTF_8_french.h +1 -1
  73. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.c +201 -209
  74. data/ext/isomorfeus_ferret_ext/stem_UTF_8_german.h +1 -1
  75. data/ext/isomorfeus_ferret_ext/stem_UTF_8_greek.c +3660 -0
  76. data/ext/isomorfeus_ferret_ext/stem_UTF_8_greek.h +6 -0
  77. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hindi.c +309 -0
  78. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hindi.h +6 -0
  79. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.c +306 -671
  80. data/ext/isomorfeus_ferret_ext/stem_UTF_8_hungarian.h +1 -1
  81. data/ext/isomorfeus_ferret_ext/stem_UTF_8_indonesian.c +394 -0
  82. data/ext/isomorfeus_ferret_ext/stem_UTF_8_indonesian.h +6 -0
  83. data/ext/isomorfeus_ferret_ext/stem_UTF_8_irish.c +457 -0
  84. data/ext/isomorfeus_ferret_ext/stem_UTF_8_irish.h +6 -0
  85. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.c +400 -442
  86. data/ext/isomorfeus_ferret_ext/stem_UTF_8_italian.h +1 -1
  87. data/ext/isomorfeus_ferret_ext/stem_UTF_8_lithuanian.c +824 -0
  88. data/ext/isomorfeus_ferret_ext/stem_UTF_8_lithuanian.h +6 -0
  89. data/ext/isomorfeus_ferret_ext/stem_UTF_8_nepali.c +408 -0
  90. data/ext/isomorfeus_ferret_ext/stem_UTF_8_nepali.h +6 -0
  91. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.c +105 -127
  92. data/ext/isomorfeus_ferret_ext/stem_UTF_8_norwegian.h +1 -1
  93. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.c +245 -276
  94. data/ext/isomorfeus_ferret_ext/stem_UTF_8_porter.h +1 -1
  95. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.c +409 -464
  96. data/ext/isomorfeus_ferret_ext/stem_UTF_8_portuguese.h +1 -1
  97. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.c +376 -408
  98. data/ext/isomorfeus_ferret_ext/stem_UTF_8_romanian.h +1 -1
  99. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.c +272 -287
  100. data/ext/isomorfeus_ferret_ext/stem_UTF_8_russian.h +1 -1
  101. data/ext/isomorfeus_ferret_ext/stem_UTF_8_serbian.c +6530 -0
  102. data/ext/isomorfeus_ferret_ext/stem_UTF_8_serbian.h +6 -0
  103. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.c +407 -458
  104. data/ext/isomorfeus_ferret_ext/stem_UTF_8_spanish.h +1 -1
  105. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.c +110 -125
  106. data/ext/isomorfeus_ferret_ext/stem_UTF_8_swedish.h +1 -1
  107. data/ext/isomorfeus_ferret_ext/stem_UTF_8_tamil.c +1865 -0
  108. data/ext/isomorfeus_ferret_ext/stem_UTF_8_tamil.h +6 -0
  109. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.c +698 -806
  110. data/ext/isomorfeus_ferret_ext/stem_UTF_8_turkish.h +1 -1
  111. data/ext/isomorfeus_ferret_ext/stem_UTF_8_yiddish.c +1220 -0
  112. data/ext/isomorfeus_ferret_ext/stem_UTF_8_yiddish.h +6 -0
  113. data/ext/isomorfeus_ferret_ext/stem_api.c +1 -9
  114. data/ext/isomorfeus_ferret_ext/stem_api.h +1 -3
  115. data/ext/isomorfeus_ferret_ext/stem_header.h +30 -26
  116. data/ext/isomorfeus_ferret_ext/stem_modules.h +113 -26
  117. data/ext/isomorfeus_ferret_ext/stem_modules.txt +18 -5
  118. data/ext/isomorfeus_ferret_ext/stem_utilities.c +167 -132
  119. data/ext/isomorfeus_ferret_ext/test.c +7 -1
  120. data/ext/isomorfeus_ferret_ext/test_search.c +0 -1
  121. data/lib/isomorfeus/ferret/version.rb +1 -1
  122. metadata +39 -4
  123. data/ext/isomorfeus_ferret_ext/q_parser.y +0 -1366
@@ -0,0 +1,6 @@
1
+ /* Generated by Snowball 2.2.0 - https://snowballstem.org/ */
2
+
3
+ extern struct SN_env * yiddish_UTF_8_create_env(void);
4
+ extern void yiddish_UTF_8_close_env(struct SN_env * z);
5
+
6
+ extern int yiddish_UTF_8_stem(struct SN_env * z);
@@ -2,7 +2,7 @@
2
2
  #include <stdlib.h> /* for calloc, free */
3
3
  #include "stem_header.h"
4
4
 
5
- extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
5
+ extern struct SN_env * SN_create_env(int S_size, int I_size)
6
6
  {
7
7
  struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env));
8
8
  if (z == NULL) return NULL;
@@ -27,12 +27,6 @@ extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
27
27
  if (z->I == NULL) goto error;
28
28
  }
29
29
 
30
- if (B_size)
31
- {
32
- z->B = (unsigned char *) calloc(B_size, sizeof(unsigned char));
33
- if (z->B == NULL) goto error;
34
- }
35
-
36
30
  return z;
37
31
  error:
38
32
  SN_close_env(z, S_size);
@@ -52,7 +46,6 @@ extern void SN_close_env(struct SN_env * z, int S_size)
52
46
  free(z->S);
53
47
  }
54
48
  free(z->I);
55
- free(z->B);
56
49
  if (z->p) lose_s(z->p);
57
50
  free(z);
58
51
  }
@@ -63,4 +56,3 @@ extern int SN_set_current(struct SN_env * z, int size, const symbol * s)
63
56
  z->c = 0;
64
57
  return err;
65
58
  }
66
-
@@ -16,11 +16,9 @@ struct SN_env {
16
16
  int c; int l; int lb; int bra; int ket;
17
17
  symbol * * S;
18
18
  int * I;
19
- unsigned char * B;
20
19
  };
21
20
 
22
- extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size);
21
+ extern struct SN_env * SN_create_env(int S_size, int I_size);
23
22
  extern void SN_close_env(struct SN_env * z, int S_size);
24
23
 
25
24
  extern int SN_set_current(struct SN_env * z, int size, const symbol * s);
26
-
@@ -20,38 +20,42 @@ struct among
20
20
  int (* function)(struct SN_env *);
21
21
  };
22
22
 
23
- symbol * create_s(void);
24
- void lose_s(symbol * p);
23
+ extern symbol * create_s(void);
24
+ extern void lose_s(symbol * p);
25
25
 
26
- int skip_utf8(const symbol * p, int c, int lb, int l, int n);
26
+ extern int skip_utf8(const symbol * p, int c, int limit, int n);
27
27
 
28
- int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
29
- int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
30
- int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
31
- int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
28
+ extern int skip_b_utf8(const symbol * p, int c, int limit, int n);
32
29
 
33
- int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
34
- int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
35
- int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
36
- int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
30
+ extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
31
+ extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
32
+ extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
33
+ extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
37
34
 
38
- int eq_s(struct SN_env * z, int s_size, const symbol * s);
39
- int eq_s_b(struct SN_env * z, int s_size, const symbol * s);
40
- int eq_v(struct SN_env * z, const symbol * p);
41
- int eq_v_b(struct SN_env * z, const symbol * p);
35
+ extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
36
+ extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
37
+ extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
38
+ extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
42
39
 
43
- int find_among(struct SN_env * z, const struct among * v, int v_size);
44
- int find_among_b(struct SN_env * z, const struct among * v, int v_size);
40
+ extern int eq_s(struct SN_env * z, int s_size, const symbol * s);
41
+ extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s);
42
+ extern int eq_v(struct SN_env * z, const symbol * p);
43
+ extern int eq_v_b(struct SN_env * z, const symbol * p);
45
44
 
46
- int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment);
47
- int slice_from_s(struct SN_env * z, int s_size, const symbol * s);
48
- int slice_from_v(struct SN_env * z, const symbol * p);
49
- int slice_del(struct SN_env * z);
45
+ extern int find_among(struct SN_env * z, const struct among * v, int v_size);
46
+ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size);
50
47
 
51
- int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s);
52
- int insert_v(struct SN_env * z, int bra, int ket, const symbol * p);
48
+ extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment);
49
+ extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s);
50
+ extern int slice_from_v(struct SN_env * z, const symbol * p);
51
+ extern int slice_del(struct SN_env * z);
53
52
 
54
- symbol * slice_to(struct SN_env * z, symbol * p);
55
- symbol * assign_to(struct SN_env * z, symbol * p);
53
+ extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s);
54
+ extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p);
56
55
 
57
- void debug(struct SN_env * z, int number, int line_count);
56
+ extern symbol * slice_to(struct SN_env * z, symbol * p);
57
+ extern symbol * assign_to(struct SN_env * z, symbol * p);
58
+
59
+ extern int len_utf8(const symbol * p);
60
+
61
+ extern void debug(struct SN_env * z, int number, int line_count);
@@ -3,11 +3,19 @@
3
3
  * This file is generated by mkmodules.pl from a list of module names.
4
4
  * Do not edit manually.
5
5
  *
6
- * Modules included by this file are: danish, dutch, english, finnish, french,
7
- * german, hungarian, italian, norwegian, porter, portuguese, romanian,
8
- * russian, spanish, swedish, turkish
6
+ * Modules included by this file are: arabic, armenian, basque, catalan,
7
+ * danish, dutch, english, finnish, french, german, greek, hindi, hungarian,
8
+ * indonesian, irish, italian, lithuanian, nepali, norwegian, porter,
9
+ * portuguese, romanian, russian, serbian, spanish, swedish, tamil, turkish,
10
+ * yiddish
9
11
  */
10
12
 
13
+ #include "stem_UTF_8_arabic.h"
14
+ #include "stem_UTF_8_armenian.h"
15
+ #include "stem_ISO_8859_1_basque.h"
16
+ #include "stem_UTF_8_basque.h"
17
+ #include "stem_ISO_8859_1_catalan.h"
18
+ #include "stem_UTF_8_catalan.h"
11
19
  #include "stem_ISO_8859_1_danish.h"
12
20
  #include "stem_UTF_8_danish.h"
13
21
  #include "stem_ISO_8859_1_dutch.h"
@@ -20,10 +28,18 @@
20
28
  #include "stem_UTF_8_french.h"
21
29
  #include "stem_ISO_8859_1_german.h"
22
30
  #include "stem_UTF_8_german.h"
23
- #include "stem_ISO_8859_1_hungarian.h"
31
+ #include "stem_UTF_8_greek.h"
32
+ #include "stem_UTF_8_hindi.h"
33
+ #include "stem_ISO_8859_2_hungarian.h"
24
34
  #include "stem_UTF_8_hungarian.h"
35
+ #include "stem_ISO_8859_1_indonesian.h"
36
+ #include "stem_UTF_8_indonesian.h"
37
+ #include "stem_ISO_8859_1_irish.h"
38
+ #include "stem_UTF_8_irish.h"
25
39
  #include "stem_ISO_8859_1_italian.h"
26
40
  #include "stem_UTF_8_italian.h"
41
+ #include "stem_UTF_8_lithuanian.h"
42
+ #include "stem_UTF_8_nepali.h"
27
43
  #include "stem_ISO_8859_1_norwegian.h"
28
44
  #include "stem_UTF_8_norwegian.h"
29
45
  #include "stem_ISO_8859_1_porter.h"
@@ -34,11 +50,14 @@
34
50
  #include "stem_UTF_8_romanian.h"
35
51
  #include "stem_KOI8_R_russian.h"
36
52
  #include "stem_UTF_8_russian.h"
53
+ #include "stem_UTF_8_serbian.h"
37
54
  #include "stem_ISO_8859_1_spanish.h"
38
55
  #include "stem_UTF_8_spanish.h"
39
56
  #include "stem_ISO_8859_1_swedish.h"
40
57
  #include "stem_UTF_8_swedish.h"
58
+ #include "stem_UTF_8_tamil.h"
41
59
  #include "stem_UTF_8_turkish.h"
60
+ #include "stem_UTF_8_yiddish.h"
42
61
 
43
62
  typedef enum {
44
63
  ENC_UNKNOWN=0,
@@ -52,7 +71,7 @@ struct stemmer_encoding {
52
71
  const char * name;
53
72
  stemmer_encoding_t enc;
54
73
  };
55
- static struct stemmer_encoding encodings[] = {
74
+ static const struct stemmer_encoding encodings[] = {
56
75
  {"ISO_8859_1", ENC_ISO_8859_1},
57
76
  {"ISO_8859_2", ENC_ISO_8859_2},
58
77
  {"KOI8_R", ENC_KOI8_R},
@@ -62,12 +81,27 @@ static struct stemmer_encoding encodings[] = {
62
81
 
63
82
  struct stemmer_modules {
64
83
  const char * name;
65
- stemmer_encoding_t enc;
84
+ stemmer_encoding_t enc;
66
85
  struct SN_env * (*create)(void);
67
86
  void (*close)(struct SN_env *);
68
87
  int (*stem)(struct SN_env *);
69
88
  };
70
- static struct stemmer_modules modules[] = {
89
+ static const struct stemmer_modules modules[] = {
90
+ {"ar", ENC_UTF_8, arabic_UTF_8_create_env, arabic_UTF_8_close_env, arabic_UTF_8_stem},
91
+ {"ara", ENC_UTF_8, arabic_UTF_8_create_env, arabic_UTF_8_close_env, arabic_UTF_8_stem},
92
+ {"arabic", ENC_UTF_8, arabic_UTF_8_create_env, arabic_UTF_8_close_env, arabic_UTF_8_stem},
93
+ {"arm", ENC_UTF_8, armenian_UTF_8_create_env, armenian_UTF_8_close_env, armenian_UTF_8_stem},
94
+ {"armenian", ENC_UTF_8, armenian_UTF_8_create_env, armenian_UTF_8_close_env, armenian_UTF_8_stem},
95
+ {"baq", ENC_ISO_8859_1, basque_ISO_8859_1_create_env, basque_ISO_8859_1_close_env, basque_ISO_8859_1_stem},
96
+ {"baq", ENC_UTF_8, basque_UTF_8_create_env, basque_UTF_8_close_env, basque_UTF_8_stem},
97
+ {"basque", ENC_ISO_8859_1, basque_ISO_8859_1_create_env, basque_ISO_8859_1_close_env, basque_ISO_8859_1_stem},
98
+ {"basque", ENC_UTF_8, basque_UTF_8_create_env, basque_UTF_8_close_env, basque_UTF_8_stem},
99
+ {"ca", ENC_ISO_8859_1, catalan_ISO_8859_1_create_env, catalan_ISO_8859_1_close_env, catalan_ISO_8859_1_stem},
100
+ {"ca", ENC_UTF_8, catalan_UTF_8_create_env, catalan_UTF_8_close_env, catalan_UTF_8_stem},
101
+ {"cat", ENC_ISO_8859_1, catalan_ISO_8859_1_create_env, catalan_ISO_8859_1_close_env, catalan_ISO_8859_1_stem},
102
+ {"cat", ENC_UTF_8, catalan_UTF_8_create_env, catalan_UTF_8_close_env, catalan_UTF_8_stem},
103
+ {"catalan", ENC_ISO_8859_1, catalan_ISO_8859_1_create_env, catalan_ISO_8859_1_close_env, catalan_ISO_8859_1_stem},
104
+ {"catalan", ENC_UTF_8, catalan_UTF_8_create_env, catalan_UTF_8_close_env, catalan_UTF_8_stem},
71
105
  {"da", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
72
106
  {"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem},
73
107
  {"dan", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem},
@@ -82,6 +116,8 @@ static struct stemmer_modules modules[] = {
82
116
  {"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
83
117
  {"dutch", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
84
118
  {"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
119
+ {"el", ENC_UTF_8, greek_UTF_8_create_env, greek_UTF_8_close_env, greek_UTF_8_stem},
120
+ {"ell", ENC_UTF_8, greek_UTF_8_create_env, greek_UTF_8_close_env, greek_UTF_8_stem},
85
121
  {"en", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
86
122
  {"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem},
87
123
  {"eng", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem},
@@ -92,6 +128,10 @@ static struct stemmer_modules modules[] = {
92
128
  {"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
93
129
  {"esl", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
94
130
  {"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
131
+ {"eu", ENC_ISO_8859_1, basque_ISO_8859_1_create_env, basque_ISO_8859_1_close_env, basque_ISO_8859_1_stem},
132
+ {"eu", ENC_UTF_8, basque_UTF_8_create_env, basque_UTF_8_close_env, basque_UTF_8_stem},
133
+ {"eus", ENC_ISO_8859_1, basque_ISO_8859_1_create_env, basque_ISO_8859_1_close_env, basque_ISO_8859_1_stem},
134
+ {"eus", ENC_UTF_8, basque_UTF_8_create_env, basque_UTF_8_close_env, basque_UTF_8_stem},
95
135
  {"fi", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
96
136
  {"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem},
97
137
  {"fin", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem},
@@ -106,22 +146,47 @@ static struct stemmer_modules modules[] = {
106
146
  {"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
107
147
  {"french", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem},
108
148
  {"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem},
149
+ {"ga", ENC_ISO_8859_1, irish_ISO_8859_1_create_env, irish_ISO_8859_1_close_env, irish_ISO_8859_1_stem},
150
+ {"ga", ENC_UTF_8, irish_UTF_8_create_env, irish_UTF_8_close_env, irish_UTF_8_stem},
109
151
  {"ger", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
110
152
  {"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
111
153
  {"german", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem},
112
154
  {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem},
113
- {"hu", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
155
+ {"gle", ENC_ISO_8859_1, irish_ISO_8859_1_create_env, irish_ISO_8859_1_close_env, irish_ISO_8859_1_stem},
156
+ {"gle", ENC_UTF_8, irish_UTF_8_create_env, irish_UTF_8_close_env, irish_UTF_8_stem},
157
+ {"gre", ENC_UTF_8, greek_UTF_8_create_env, greek_UTF_8_close_env, greek_UTF_8_stem},
158
+ {"greek", ENC_UTF_8, greek_UTF_8_create_env, greek_UTF_8_close_env, greek_UTF_8_stem},
159
+ {"hi", ENC_UTF_8, hindi_UTF_8_create_env, hindi_UTF_8_close_env, hindi_UTF_8_stem},
160
+ {"hin", ENC_UTF_8, hindi_UTF_8_create_env, hindi_UTF_8_close_env, hindi_UTF_8_stem},
161
+ {"hindi", ENC_UTF_8, hindi_UTF_8_create_env, hindi_UTF_8_close_env, hindi_UTF_8_stem},
162
+ {"hu", ENC_ISO_8859_2, hungarian_ISO_8859_2_create_env, hungarian_ISO_8859_2_close_env, hungarian_ISO_8859_2_stem},
114
163
  {"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
115
- {"hun", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
164
+ {"hun", ENC_ISO_8859_2, hungarian_ISO_8859_2_create_env, hungarian_ISO_8859_2_close_env, hungarian_ISO_8859_2_stem},
116
165
  {"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
117
- {"hungarian", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem},
166
+ {"hungarian", ENC_ISO_8859_2, hungarian_ISO_8859_2_create_env, hungarian_ISO_8859_2_close_env, hungarian_ISO_8859_2_stem},
118
167
  {"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
168
+ {"hy", ENC_UTF_8, armenian_UTF_8_create_env, armenian_UTF_8_close_env, armenian_UTF_8_stem},
169
+ {"hye", ENC_UTF_8, armenian_UTF_8_create_env, armenian_UTF_8_close_env, armenian_UTF_8_stem},
170
+ {"id", ENC_ISO_8859_1, indonesian_ISO_8859_1_create_env, indonesian_ISO_8859_1_close_env, indonesian_ISO_8859_1_stem},
171
+ {"id", ENC_UTF_8, indonesian_UTF_8_create_env, indonesian_UTF_8_close_env, indonesian_UTF_8_stem},
172
+ {"ind", ENC_ISO_8859_1, indonesian_ISO_8859_1_create_env, indonesian_ISO_8859_1_close_env, indonesian_ISO_8859_1_stem},
173
+ {"ind", ENC_UTF_8, indonesian_UTF_8_create_env, indonesian_UTF_8_close_env, indonesian_UTF_8_stem},
174
+ {"indonesian", ENC_ISO_8859_1, indonesian_ISO_8859_1_create_env, indonesian_ISO_8859_1_close_env, indonesian_ISO_8859_1_stem},
175
+ {"indonesian", ENC_UTF_8, indonesian_UTF_8_create_env, indonesian_UTF_8_close_env, indonesian_UTF_8_stem},
176
+ {"irish", ENC_ISO_8859_1, irish_ISO_8859_1_create_env, irish_ISO_8859_1_close_env, irish_ISO_8859_1_stem},
177
+ {"irish", ENC_UTF_8, irish_UTF_8_create_env, irish_UTF_8_close_env, irish_UTF_8_stem},
119
178
  {"it", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
120
179
  {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
121
180
  {"ita", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
122
181
  {"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
123
182
  {"italian", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem},
124
183
  {"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem},
184
+ {"lit", ENC_UTF_8, lithuanian_UTF_8_create_env, lithuanian_UTF_8_close_env, lithuanian_UTF_8_stem},
185
+ {"lithuanian", ENC_UTF_8, lithuanian_UTF_8_create_env, lithuanian_UTF_8_close_env, lithuanian_UTF_8_stem},
186
+ {"lt", ENC_UTF_8, lithuanian_UTF_8_create_env, lithuanian_UTF_8_close_env, lithuanian_UTF_8_stem},
187
+ {"ne", ENC_UTF_8, nepali_UTF_8_create_env, nepali_UTF_8_close_env, nepali_UTF_8_stem},
188
+ {"nep", ENC_UTF_8, nepali_UTF_8_create_env, nepali_UTF_8_close_env, nepali_UTF_8_stem},
189
+ {"nepali", ENC_UTF_8, nepali_UTF_8_create_env, nepali_UTF_8_close_env, nepali_UTF_8_stem},
125
190
  {"nl", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
126
191
  {"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem},
127
192
  {"nld", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem},
@@ -154,37 +219,59 @@ static struct stemmer_modules modules[] = {
154
219
  {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
155
220
  {"russian", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem},
156
221
  {"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem},
222
+ {"serbian", ENC_UTF_8, serbian_UTF_8_create_env, serbian_UTF_8_close_env, serbian_UTF_8_stem},
157
223
  {"spa", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
158
224
  {"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
159
225
  {"spanish", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem},
160
226
  {"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem},
227
+ {"sr", ENC_UTF_8, serbian_UTF_8_create_env, serbian_UTF_8_close_env, serbian_UTF_8_stem},
228
+ {"srp", ENC_UTF_8, serbian_UTF_8_create_env, serbian_UTF_8_close_env, serbian_UTF_8_stem},
161
229
  {"sv", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
162
230
  {"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
163
231
  {"swe", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
164
232
  {"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
165
233
  {"swedish", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem},
166
234
  {"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem},
235
+ {"ta", ENC_UTF_8, tamil_UTF_8_create_env, tamil_UTF_8_close_env, tamil_UTF_8_stem},
236
+ {"tam", ENC_UTF_8, tamil_UTF_8_create_env, tamil_UTF_8_close_env, tamil_UTF_8_stem},
237
+ {"tamil", ENC_UTF_8, tamil_UTF_8_create_env, tamil_UTF_8_close_env, tamil_UTF_8_stem},
167
238
  {"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
168
239
  {"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
169
240
  {"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem},
241
+ {"yi", ENC_UTF_8, yiddish_UTF_8_create_env, yiddish_UTF_8_close_env, yiddish_UTF_8_stem},
242
+ {"yid", ENC_UTF_8, yiddish_UTF_8_create_env, yiddish_UTF_8_close_env, yiddish_UTF_8_stem},
243
+ {"yiddish", ENC_UTF_8, yiddish_UTF_8_create_env, yiddish_UTF_8_close_env, yiddish_UTF_8_stem},
170
244
  {0,ENC_UNKNOWN,0,0,0}
171
245
  };
172
246
  static const char * algorithm_names[] = {
173
- "danish",
174
- "dutch",
175
- "english",
176
- "finnish",
177
- "french",
178
- "german",
179
- "hungarian",
180
- "italian",
181
- "norwegian",
182
- "porter",
183
- "portuguese",
184
- "romanian",
185
- "russian",
186
- "spanish",
187
- "swedish",
188
- "turkish",
247
+ "arabic",
248
+ "armenian",
249
+ "basque",
250
+ "catalan",
251
+ "danish",
252
+ "dutch",
253
+ "english",
254
+ "finnish",
255
+ "french",
256
+ "german",
257
+ "greek",
258
+ "hindi",
259
+ "hungarian",
260
+ "indonesian",
261
+ "irish",
262
+ "italian",
263
+ "lithuanian",
264
+ "nepali",
265
+ "norwegian",
266
+ "porter",
267
+ "portuguese",
268
+ "romanian",
269
+ "russian",
270
+ "serbian",
271
+ "spanish",
272
+ "swedish",
273
+ "tamil",
274
+ "turkish",
275
+ "yiddish",
189
276
  0
190
277
  };
@@ -9,27 +9,40 @@
9
9
  # List all the main algorithms for each language, in UTF-8, and also with
10
10
  # the most commonly used encoding.
11
11
 
12
+ arabic UTF_8 arabic,ar,ara
13
+ armenian UTF_8 armenian,hy,hye,arm
14
+ basque UTF_8,ISO_8859_1 basque,eu,eus,baq
15
+ catalan UTF_8,ISO_8859_1 catalan,ca,cat
12
16
  danish UTF_8,ISO_8859_1 danish,da,dan
13
17
  dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld
14
18
  english UTF_8,ISO_8859_1 english,en,eng
15
19
  finnish UTF_8,ISO_8859_1 finnish,fi,fin
16
20
  french UTF_8,ISO_8859_1 french,fr,fre,fra
17
21
  german UTF_8,ISO_8859_1 german,de,ger,deu
18
- hungarian UTF_8,ISO_8859_1 hungarian,hu,hun
22
+ greek UTF_8 greek,el,gre,ell
23
+ hindi UTF_8 hindi,hi,hin
24
+ hungarian UTF_8,ISO_8859_2 hungarian,hu,hun
25
+ indonesian UTF_8,ISO_8859_1 indonesian,id,ind
26
+ irish UTF_8,ISO_8859_1 irish,ga,gle
19
27
  italian UTF_8,ISO_8859_1 italian,it,ita
28
+ lithuanian UTF_8 lithuanian,lt,lit
29
+ nepali UTF_8 nepali,ne,nep
20
30
  norwegian UTF_8,ISO_8859_1 norwegian,no,nor
21
31
  portuguese UTF_8,ISO_8859_1 portuguese,pt,por
22
32
  romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron
23
33
  russian UTF_8,KOI8_R russian,ru,rus
34
+ serbian UTF_8 serbian,sr,srp
24
35
  spanish UTF_8,ISO_8859_1 spanish,es,esl,spa
25
36
  swedish UTF_8,ISO_8859_1 swedish,sv,swe
37
+ tamil UTF_8 tamil,ta,tam
26
38
  turkish UTF_8 turkish,tr,tur
39
+ yiddish UTF_8 yiddish,yi,yid
27
40
 
28
41
  # Also include the traditional porter algorithm for english.
29
42
  # The porter algorithm is included in the libstemmer distribution to assist
30
43
  # with backwards compatibility, but for new systems the english algorithm
31
44
  # should be used in preference.
32
- porter UTF_8,ISO_8859_1 porter
45
+ porter UTF_8,ISO_8859_1 porter english
33
46
 
34
47
  # Some other stemmers in the snowball project are not included in the standard
35
48
  # distribution. To compile a libstemmer with them in, add them to this list,
@@ -39,12 +52,12 @@ porter UTF_8,ISO_8859_1 porter
39
52
  # algorithms are:
40
53
  #
41
54
  # german2 - This is a slight modification of the german stemmer.
42
- #german2 UTF_8,ISO_8859_1 german2
55
+ #german2 UTF_8,ISO_8859_1 german2 german
43
56
  #
44
57
  # kraaij_pohlmann - This is a different dutch stemmer.
45
- #kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann
58
+ #kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch
46
59
  #
47
60
  # lovins - This is an english stemmer, but fairly outdated, and
48
61
  # only really applicable to a restricted type of input text
49
62
  # (keywords in academic publications).
50
- #lovins UTF_8,ISO_8859_1 lovins
63
+ #lovins UTF_8,ISO_8859_1 lovins english