ruby-stemmer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. data/MIT-LICENSE +21 -0
  2. data/README +79 -0
  3. data/Rakefile +52 -0
  4. data/extconf.rb +14 -0
  5. data/libstemmer_c/MANIFEST +72 -0
  6. data/libstemmer_c/Makefile +9 -0
  7. data/libstemmer_c/README +125 -0
  8. data/libstemmer_c/include/libstemmer.h +79 -0
  9. data/libstemmer_c/libstemmer/libstemmer.c +93 -0
  10. data/libstemmer_c/libstemmer/libstemmer_utf8.c +93 -0
  11. data/libstemmer_c/libstemmer/modules.h +190 -0
  12. data/libstemmer_c/libstemmer/modules.txt +50 -0
  13. data/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  14. data/libstemmer_c/libstemmer/modules_utf8.txt +49 -0
  15. data/libstemmer_c/mkinc.mak +82 -0
  16. data/libstemmer_c/mkinc_utf8.mak +52 -0
  17. data/libstemmer_c/runtime/api.c +66 -0
  18. data/libstemmer_c/runtime/api.h +26 -0
  19. data/libstemmer_c/runtime/header.h +58 -0
  20. data/libstemmer_c/runtime/utilities.c +478 -0
  21. data/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  22. data/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  23. data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  24. data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  25. data/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  26. data/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  27. data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  28. data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  29. data/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  30. data/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  31. data/libstemmer_c/src_c/stem_ISO_8859_1_german.c +503 -0
  32. data/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  33. data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  34. data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  35. data/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  36. data/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  37. data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  38. data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  39. data/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  40. data/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  41. data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  42. data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  43. data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  44. data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  45. data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  46. data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  47. data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  48. data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  49. data/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  50. data/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  51. data/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  52. data/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  53. data/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  54. data/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  55. data/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  56. data/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  57. data/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  58. data/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  59. data/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  60. data/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  61. data/libstemmer_c/src_c/stem_UTF_8_german.c +509 -0
  62. data/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  63. data/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  64. data/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  65. data/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  66. data/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  67. data/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  68. data/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  69. data/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  70. data/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  71. data/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  72. data/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  73. data/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  74. data/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  75. data/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  76. data/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  77. data/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  78. data/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  79. data/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  80. data/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  81. data/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  82. data/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  83. data/ruby-stemmer.c +108 -0
  84. data/test.rb +31 -0
  85. metadata +141 -0
@@ -0,0 +1,16 @@
1
+
2
+ /* This file was generated automatically by the Snowball to ANSI C compiler */
3
+
4
+ #ifdef __cplusplus
5
+ extern "C" {
6
+ #endif
7
+
8
+ extern struct SN_env * turkish_UTF_8_create_env(void);
9
+ extern void turkish_UTF_8_close_env(struct SN_env * z);
10
+
11
+ extern int turkish_UTF_8_stem(struct SN_env * z);
12
+
13
+ #ifdef __cplusplus
14
+ }
15
+ #endif
16
+
data/ruby-stemmer.c ADDED
@@ -0,0 +1,108 @@
1
+ //
2
+ // $Id: ruby-stemmer.c 17 2008-01-07 16:59:10Z aurelian $
3
+ //
4
+
5
+ #include "ruby.h"
6
+ #include <libstemmer.h>
7
+
8
/*
 * GetStemmer(obj, sb_data)
 *
 * Unwraps the Ruby data object `obj` and stores its wrapped pointer, typed
 * as `struct sb_stemmer_data *`, into the variable `sb_data`.
 *
 * The body is wrapped in do { ... } while (0) so that `GetStemmer(a, b);`
 * expands to exactly one statement; the previous bare `{ ... }` form left a
 * stray empty statement behind and broke when used in an unbraced if/else.
 */
#define GetStemmer(obj, sb_data) do {\
    Data_Get_Struct(obj, struct sb_stemmer_data, sb_data);\
} while (0)
11
+
12
+ VALUE rb_mLingua;
13
+ VALUE rb_cStemmer;
14
+
15
+ struct sb_stemmer_data {
16
+ struct sb_stemmer * stemmer;
17
+ const char * lang;
18
+ const char * enc;
19
+ };
20
+
21
+ static VALUE
22
+ rb_stemmer_init(int argc, VALUE *argv, VALUE self) {
23
+ VALUE roptions, rlang, renc;
24
+
25
+ struct sb_stemmer * stemmer;
26
+ struct sb_stemmer_data *sb_data;
27
+
28
+ rb_scan_args(argc, argv, "01", &roptions);
29
+
30
+ if(argc > 0) {
31
+ Check_Type(roptions, T_HASH);
32
+ if((rlang = rb_hash_aref(roptions, ID2SYM(rb_intern("language")))) != Qnil) {
33
+ Check_Type(rlang, T_STRING);
34
+ } else {
35
+ rlang = rb_str_new2("en");
36
+ }
37
+ if((renc = rb_hash_aref(roptions, ID2SYM(rb_intern("encoding")))) != Qnil) {
38
+ Check_Type(renc, T_STRING);
39
+ } else {
40
+ renc = rb_str_new2("UTF_8");
41
+ }
42
+ } else {
43
+ rlang = rb_str_new2("en");
44
+ renc = rb_str_new2("UTF_8");
45
+ }
46
+
47
+ stemmer = sb_stemmer_new( RSTRING(rlang)->ptr, RSTRING(renc)->ptr );
48
+ if (stemmer == 0) {
49
+ printf(">>[libstemmer]: got a null stemmer!\n");
50
+ if (renc == 0 ) {
51
+ rb_raise(rb_eRuntimeError, "Language %s not available for stemming", RSTRING(rlang)->ptr);
52
+ exit(1);
53
+ } else {
54
+ rb_raise(rb_eRuntimeError, "Language %s not available for stemming in encoding",
55
+ RSTRING(rlang)->ptr, RSTRING(renc)->ptr);
56
+ exit(1);
57
+ }
58
+ }
59
+
60
+ sb_data = ALLOC(struct sb_stemmer_data);
61
+ DATA_PTR(self) = sb_data;
62
+ sb_data->stemmer= stemmer;
63
+ sb_data->lang = RSTRING(rlang)->ptr;
64
+ sb_data->enc = RSTRING(renc)->ptr;
65
+
66
+ return self;
67
+ }
68
+
69
+ static VALUE
70
+ rb_stemmer_stem(VALUE self, VALUE word) {
71
+ struct sb_stemmer_data * sb_data;
72
+ const sb_symbol * stemmed;
73
+ GetStemmer(self, sb_data);
74
+ stemmed = sb_stemmer_stem(sb_data->stemmer, (sb_symbol *)RSTRING(word)->ptr, RSTRING(word)->len);
75
+ /* printf(">>[libstemmer %s/%s]: %s-> %s\n", sb_data->lang, sb_data->enc, RSTRING(word)->ptr, stemmed); */
76
+ return rb_str_new2((char *)stemmed);
77
+ }
78
+
79
+ static VALUE
80
+ rb_stemmer_length(VALUE self) {
81
+ struct sb_stemmer_data * sb_data;
82
+ int length;
83
+ GetStemmer(self, sb_data);
84
+ length = sb_stemmer_length(sb_data->stemmer);
85
+ return INT2FIX(length);
86
+ }
87
+
88
+ static void
89
+ sb_stemmer_free(struct sb_stemmer_data * sb_data)
90
+ {
91
+ sb_stemmer_delete(sb_data->stemmer);
92
+ }
93
+
94
+ static VALUE
95
+ sb_stemmer_alloc(VALUE klass)
96
+ {
97
+ return Data_Wrap_Struct(klass, 0, sb_stemmer_free, 0);
98
+ }
99
+
100
+ void Init_stemmer() {
101
+ rb_mLingua = rb_define_module("Lingua");
102
+ rb_cStemmer = rb_define_class_under(rb_mLingua, "Stemmer", rb_cObject);
103
+ rb_define_alloc_func(rb_cStemmer, sb_stemmer_alloc);
104
+ rb_define_method(rb_cStemmer, "initialize", rb_stemmer_init, -1);
105
+ rb_define_method(rb_cStemmer, "stem", rb_stemmer_stem, 1);
106
+ rb_define_method(rb_cStemmer, "length", rb_stemmer_length, 0);
107
+ }
108
+
data/test.rb ADDED
@@ -0,0 +1,31 @@
1
#!/usr/bin/env ruby

#
# $Id: test.rb 15 2008-01-05 11:56:57Z aurelian $
#
# Smoke test for the Lingua::Stemmer extension: builds stemmers with each
# combination of the :language / :encoding options and prints the stems.
#

# $KCODE (upper case) is the interpreter's multibyte switch on Ruby 1.8.
# The lower-case $kcode used before is just an ordinary global and had no
# effect. Interpreters that define Encoding no longer honour $KCODE, so it
# is only set where it still means something.
$KCODE = "utf-8" unless defined?(Encoding)

require "stemmer"

# puts ">>> test 1."
stemmer = Lingua::Stemmer.new()
puts stemmer.stem("installation")
puts stemmer.length
puts stemmer.stem("popularity")

puts ">>> test 2."
stemmer = Lingua::Stemmer.new(:language => 'en')
puts stemmer.stem("obnoxious")

puts ">>> test 3."
stemmer = Lingua::Stemmer.new(:encoding => 'UTF_8')
puts stemmer.stem("găinațul")

puts ">>> test 4."
stemmer = Lingua::Stemmer.new(:language => 'en', :encoding => 'UTF_8')
puts stemmer.stem("personalities")

puts ">>> test 5."
stemmer = Lingua::Stemmer.new(:encoding => 'UTF_8', :language => 'ro')
puts stemmer.stem("întrebător")
metadata ADDED
@@ -0,0 +1,141 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ruby-stemmer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Aurelian Oancea
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-01-07 00:00:00 +02:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Stemmer implementation to ruby using libstemmer_c.
17
+ email: aurelian@locknet.ro
18
+ executables: []
19
+
20
+ extensions:
21
+ - extconf.rb
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - extconf.rb
26
+ - ruby-stemmer.c
27
+ - test.rb
28
+ - MIT-LICENSE
29
+ - Rakefile
30
+ - README
31
+ - libstemmer_c/examples
32
+ - libstemmer_c/include
33
+ - libstemmer_c/include/libstemmer.h
34
+ - libstemmer_c/libstemmer
35
+ - libstemmer_c/libstemmer/libstemmer.c
36
+ - libstemmer_c/libstemmer/libstemmer_utf8.c
37
+ - libstemmer_c/libstemmer/modules.h
38
+ - libstemmer_c/libstemmer/modules.txt
39
+ - libstemmer_c/libstemmer/modules_utf8.h
40
+ - libstemmer_c/libstemmer/modules_utf8.txt
41
+ - libstemmer_c/Makefile
42
+ - libstemmer_c/MANIFEST
43
+ - libstemmer_c/mkinc.mak
44
+ - libstemmer_c/mkinc_utf8.mak
45
+ - libstemmer_c/README
46
+ - libstemmer_c/runtime
47
+ - libstemmer_c/runtime/api.c
48
+ - libstemmer_c/runtime/api.h
49
+ - libstemmer_c/runtime/header.h
50
+ - libstemmer_c/runtime/utilities.c
51
+ - libstemmer_c/src_c
52
+ - libstemmer_c/src_c/stem_ISO_8859_1_danish.c
53
+ - libstemmer_c/src_c/stem_ISO_8859_1_danish.h
54
+ - libstemmer_c/src_c/stem_ISO_8859_1_dutch.c
55
+ - libstemmer_c/src_c/stem_ISO_8859_1_dutch.h
56
+ - libstemmer_c/src_c/stem_ISO_8859_1_english.c
57
+ - libstemmer_c/src_c/stem_ISO_8859_1_english.h
58
+ - libstemmer_c/src_c/stem_ISO_8859_1_finnish.c
59
+ - libstemmer_c/src_c/stem_ISO_8859_1_finnish.h
60
+ - libstemmer_c/src_c/stem_ISO_8859_1_french.c
61
+ - libstemmer_c/src_c/stem_ISO_8859_1_french.h
62
+ - libstemmer_c/src_c/stem_ISO_8859_1_german.c
63
+ - libstemmer_c/src_c/stem_ISO_8859_1_german.h
64
+ - libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c
65
+ - libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h
66
+ - libstemmer_c/src_c/stem_ISO_8859_1_italian.c
67
+ - libstemmer_c/src_c/stem_ISO_8859_1_italian.h
68
+ - libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c
69
+ - libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h
70
+ - libstemmer_c/src_c/stem_ISO_8859_1_porter.c
71
+ - libstemmer_c/src_c/stem_ISO_8859_1_porter.h
72
+ - libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c
73
+ - libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h
74
+ - libstemmer_c/src_c/stem_ISO_8859_1_spanish.c
75
+ - libstemmer_c/src_c/stem_ISO_8859_1_spanish.h
76
+ - libstemmer_c/src_c/stem_ISO_8859_1_swedish.c
77
+ - libstemmer_c/src_c/stem_ISO_8859_1_swedish.h
78
+ - libstemmer_c/src_c/stem_ISO_8859_2_romanian.c
79
+ - libstemmer_c/src_c/stem_ISO_8859_2_romanian.h
80
+ - libstemmer_c/src_c/stem_KOI8_R_russian.c
81
+ - libstemmer_c/src_c/stem_KOI8_R_russian.h
82
+ - libstemmer_c/src_c/stem_UTF_8_danish.c
83
+ - libstemmer_c/src_c/stem_UTF_8_danish.h
84
+ - libstemmer_c/src_c/stem_UTF_8_dutch.c
85
+ - libstemmer_c/src_c/stem_UTF_8_dutch.h
86
+ - libstemmer_c/src_c/stem_UTF_8_english.c
87
+ - libstemmer_c/src_c/stem_UTF_8_english.h
88
+ - libstemmer_c/src_c/stem_UTF_8_finnish.c
89
+ - libstemmer_c/src_c/stem_UTF_8_finnish.h
90
+ - libstemmer_c/src_c/stem_UTF_8_french.c
91
+ - libstemmer_c/src_c/stem_UTF_8_french.h
92
+ - libstemmer_c/src_c/stem_UTF_8_german.c
93
+ - libstemmer_c/src_c/stem_UTF_8_german.h
94
+ - libstemmer_c/src_c/stem_UTF_8_hungarian.c
95
+ - libstemmer_c/src_c/stem_UTF_8_hungarian.h
96
+ - libstemmer_c/src_c/stem_UTF_8_italian.c
97
+ - libstemmer_c/src_c/stem_UTF_8_italian.h
98
+ - libstemmer_c/src_c/stem_UTF_8_norwegian.c
99
+ - libstemmer_c/src_c/stem_UTF_8_norwegian.h
100
+ - libstemmer_c/src_c/stem_UTF_8_porter.c
101
+ - libstemmer_c/src_c/stem_UTF_8_porter.h
102
+ - libstemmer_c/src_c/stem_UTF_8_portuguese.c
103
+ - libstemmer_c/src_c/stem_UTF_8_portuguese.h
104
+ - libstemmer_c/src_c/stem_UTF_8_romanian.c
105
+ - libstemmer_c/src_c/stem_UTF_8_romanian.h
106
+ - libstemmer_c/src_c/stem_UTF_8_russian.c
107
+ - libstemmer_c/src_c/stem_UTF_8_russian.h
108
+ - libstemmer_c/src_c/stem_UTF_8_spanish.c
109
+ - libstemmer_c/src_c/stem_UTF_8_spanish.h
110
+ - libstemmer_c/src_c/stem_UTF_8_swedish.c
111
+ - libstemmer_c/src_c/stem_UTF_8_swedish.h
112
+ - libstemmer_c/src_c/stem_UTF_8_turkish.c
113
+ - libstemmer_c/src_c/stem_UTF_8_turkish.h
114
+ has_rdoc: false
115
+ homepage: http://nrr.rubyforge.org
116
+ post_install_message:
117
+ rdoc_options: []
118
+
119
+ require_paths:
120
+ - lib
121
+ required_ruby_version: !ruby/object:Gem::Requirement
122
+ requirements:
123
+ - - ">="
124
+ - !ruby/object:Gem::Version
125
+ version: "0"
126
+ version:
127
+ required_rubygems_version: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: "0"
132
+ version:
133
+ requirements: []
134
+
135
+ rubyforge_project: nrr
136
+ rubygems_version: 1.0.1
137
+ signing_key:
138
+ specification_version: 2
139
+ summary: Stemmer implementation to ruby using libstemmer_c.
140
+ test_files: []
141
+