ruby-stemmer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (85) hide show
  1. data/MIT-LICENSE +21 -0
  2. data/README +79 -0
  3. data/Rakefile +52 -0
  4. data/extconf.rb +14 -0
  5. data/libstemmer_c/MANIFEST +72 -0
  6. data/libstemmer_c/Makefile +9 -0
  7. data/libstemmer_c/README +125 -0
  8. data/libstemmer_c/include/libstemmer.h +79 -0
  9. data/libstemmer_c/libstemmer/libstemmer.c +93 -0
  10. data/libstemmer_c/libstemmer/libstemmer_utf8.c +93 -0
  11. data/libstemmer_c/libstemmer/modules.h +190 -0
  12. data/libstemmer_c/libstemmer/modules.txt +50 -0
  13. data/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  14. data/libstemmer_c/libstemmer/modules_utf8.txt +49 -0
  15. data/libstemmer_c/mkinc.mak +82 -0
  16. data/libstemmer_c/mkinc_utf8.mak +52 -0
  17. data/libstemmer_c/runtime/api.c +66 -0
  18. data/libstemmer_c/runtime/api.h +26 -0
  19. data/libstemmer_c/runtime/header.h +58 -0
  20. data/libstemmer_c/runtime/utilities.c +478 -0
  21. data/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  22. data/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  23. data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  24. data/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  25. data/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  26. data/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  27. data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  28. data/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  29. data/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  30. data/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  31. data/libstemmer_c/src_c/stem_ISO_8859_1_german.c +503 -0
  32. data/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  33. data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  34. data/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  35. data/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  36. data/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  37. data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  38. data/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  39. data/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  40. data/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  41. data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  42. data/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  43. data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  44. data/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  45. data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  46. data/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  47. data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  48. data/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  49. data/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  50. data/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  51. data/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  52. data/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  53. data/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  54. data/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  55. data/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  56. data/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  57. data/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  58. data/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  59. data/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  60. data/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  61. data/libstemmer_c/src_c/stem_UTF_8_german.c +509 -0
  62. data/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  63. data/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  64. data/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  65. data/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  66. data/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  67. data/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  68. data/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  69. data/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  70. data/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  71. data/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  72. data/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  73. data/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  74. data/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  75. data/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  76. data/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  77. data/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  78. data/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  79. data/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  80. data/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  81. data/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  82. data/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  83. data/ruby-stemmer.c +108 -0
  84. data/test.rb +31 -0
  85. metadata +141 -0
@@ -0,0 +1,16 @@
1
+
2
+ /* This file was generated automatically by the Snowball to ANSI C compiler */
3
+
4
+ #ifdef __cplusplus
5
+ extern "C" {
6
+ #endif
7
+
8
+ extern struct SN_env * turkish_UTF_8_create_env(void);
9
+ extern void turkish_UTF_8_close_env(struct SN_env * z);
10
+
11
+ extern int turkish_UTF_8_stem(struct SN_env * z);
12
+
13
+ #ifdef __cplusplus
14
+ }
15
+ #endif
16
+
data/ruby-stemmer.c ADDED
@@ -0,0 +1,108 @@
1
+ //
2
+ // $Id: ruby-stemmer.c 17 2008-01-07 16:59:10Z aurelian $
3
+ //
4
+
5
+ #include "ruby.h"
6
+ #include <libstemmer.h>
7
+
8
/*
 * Fetch the struct sb_stemmer_data wrapped by the Ruby object `obj` and
 * store the pointer in `sb_data`.
 *
 * Wrapped in do { } while (0) so the macro expands to a single statement
 * and stays correct inside an unbraced if/else.
 */
#define GetStemmer(obj, sb_data) do {\
    Data_Get_Struct(obj, struct sb_stemmer_data, sb_data);\
} while (0)
11
+
12
+ VALUE rb_mLingua;
13
+ VALUE rb_cStemmer;
14
+
15
/*
 * Native state wrapped by every Lingua::Stemmer instance.
 */
struct sb_stemmer_data {
    /* libstemmer handle; released with sb_stemmer_delete() */
    struct sb_stemmer *stemmer;
    /* language name the stemmer was created with (not freed individually) */
    const char *lang;
    /* encoding name the stemmer was created with (not freed individually) */
    const char *enc;
};
20
+
21
+ static VALUE
22
+ rb_stemmer_init(int argc, VALUE *argv, VALUE self) {
23
+ VALUE roptions, rlang, renc;
24
+
25
+ struct sb_stemmer * stemmer;
26
+ struct sb_stemmer_data *sb_data;
27
+
28
+ rb_scan_args(argc, argv, "01", &roptions);
29
+
30
+ if(argc > 0) {
31
+ Check_Type(roptions, T_HASH);
32
+ if((rlang = rb_hash_aref(roptions, ID2SYM(rb_intern("language")))) != Qnil) {
33
+ Check_Type(rlang, T_STRING);
34
+ } else {
35
+ rlang = rb_str_new2("en");
36
+ }
37
+ if((renc = rb_hash_aref(roptions, ID2SYM(rb_intern("encoding")))) != Qnil) {
38
+ Check_Type(renc, T_STRING);
39
+ } else {
40
+ renc = rb_str_new2("UTF_8");
41
+ }
42
+ } else {
43
+ rlang = rb_str_new2("en");
44
+ renc = rb_str_new2("UTF_8");
45
+ }
46
+
47
+ stemmer = sb_stemmer_new( RSTRING(rlang)->ptr, RSTRING(renc)->ptr );
48
+ if (stemmer == 0) {
49
+ printf(">>[libstemmer]: got a null stemmer!\n");
50
+ if (renc == 0 ) {
51
+ rb_raise(rb_eRuntimeError, "Language %s not available for stemming", RSTRING(rlang)->ptr);
52
+ exit(1);
53
+ } else {
54
+ rb_raise(rb_eRuntimeError, "Language %s not available for stemming in encoding",
55
+ RSTRING(rlang)->ptr, RSTRING(renc)->ptr);
56
+ exit(1);
57
+ }
58
+ }
59
+
60
+ sb_data = ALLOC(struct sb_stemmer_data);
61
+ DATA_PTR(self) = sb_data;
62
+ sb_data->stemmer= stemmer;
63
+ sb_data->lang = RSTRING(rlang)->ptr;
64
+ sb_data->enc = RSTRING(renc)->ptr;
65
+
66
+ return self;
67
+ }
68
+
69
+ static VALUE
70
+ rb_stemmer_stem(VALUE self, VALUE word) {
71
+ struct sb_stemmer_data * sb_data;
72
+ const sb_symbol * stemmed;
73
+ GetStemmer(self, sb_data);
74
+ stemmed = sb_stemmer_stem(sb_data->stemmer, (sb_symbol *)RSTRING(word)->ptr, RSTRING(word)->len);
75
+ /* printf(">>[libstemmer %s/%s]: %s-> %s\n", sb_data->lang, sb_data->enc, RSTRING(word)->ptr, stemmed); */
76
+ return rb_str_new2((char *)stemmed);
77
+ }
78
+
79
+ static VALUE
80
+ rb_stemmer_length(VALUE self) {
81
+ struct sb_stemmer_data * sb_data;
82
+ int length;
83
+ GetStemmer(self, sb_data);
84
+ length = sb_stemmer_length(sb_data->stemmer);
85
+ return INT2FIX(length);
86
+ }
87
+
88
+ static void
89
+ sb_stemmer_free(struct sb_stemmer_data * sb_data)
90
+ {
91
+ sb_stemmer_delete(sb_data->stemmer);
92
+ }
93
+
94
+ static VALUE
95
+ sb_stemmer_alloc(VALUE klass)
96
+ {
97
+ return Data_Wrap_Struct(klass, 0, sb_stemmer_free, 0);
98
+ }
99
+
100
+ void Init_stemmer() {
101
+ rb_mLingua = rb_define_module("Lingua");
102
+ rb_cStemmer = rb_define_class_under(rb_mLingua, "Stemmer", rb_cObject);
103
+ rb_define_alloc_func(rb_cStemmer, sb_stemmer_alloc);
104
+ rb_define_method(rb_cStemmer, "initialize", rb_stemmer_init, -1);
105
+ rb_define_method(rb_cStemmer, "stem", rb_stemmer_stem, 1);
106
+ rb_define_method(rb_cStemmer, "length", rb_stemmer_length, 0);
107
+ }
108
+
data/test.rb ADDED
@@ -0,0 +1,31 @@
1
#!/usr/bin/env ruby
# encoding: utf-8

#
# $Id: test.rb 15 2008-01-05 11:56:57Z aurelian $
#
# Smoke test for the Lingua::Stemmer extension: builds stemmers with the
# supported option combinations and prints what they return.
#

# The interpreter's encoding switch on Ruby 1.8 is the upper-case $KCODE;
# the lower-case $kcode used before was an ordinary global with no effect.
# Ruby 1.9+ ignores $KCODE (and warns), so it is only set on 1.8.
$KCODE = "utf-8" if RUBY_VERSION < "1.9"

require "stemmer"

# puts ">>> test 1."
# No options: language and encoding both take their defaults.
stemmer = Lingua::Stemmer.new()
puts stemmer.stem("installation")
puts stemmer.length
puts stemmer.stem("popularity")

puts ">>> test 2."
stemmer = Lingua::Stemmer.new(:language => 'en')
puts stemmer.stem("obnoxious")

puts ">>> test 3."
# Only the encoding is given, so the default language is used for this word.
stemmer = Lingua::Stemmer.new(:encoding => 'UTF_8')
puts stemmer.stem("găinațul")

puts ">>> test 4."
stemmer = Lingua::Stemmer.new(:language => 'en', :encoding => 'UTF_8')
puts stemmer.stem("personalities")

puts ">>> test 5."
stemmer = Lingua::Stemmer.new(:encoding => 'UTF_8', :language => 'ro')
puts stemmer.stem("întrebător")
metadata ADDED
@@ -0,0 +1,141 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ruby-stemmer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Aurelian Oancea
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-01-07 00:00:00 +02:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Stemmer implementation to ruby using libstemmer_c.
17
+ email: aurelian@locknet.ro
18
+ executables: []
19
+
20
+ extensions:
21
+ - extconf.rb
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - extconf.rb
26
+ - ruby-stemmer.c
27
+ - test.rb
28
+ - MIT-LICENSE
29
+ - Rakefile
30
+ - README
31
+ - libstemmer_c/examples
32
+ - libstemmer_c/include
33
+ - libstemmer_c/include/libstemmer.h
34
+ - libstemmer_c/libstemmer
35
+ - libstemmer_c/libstemmer/libstemmer.c
36
+ - libstemmer_c/libstemmer/libstemmer_utf8.c
37
+ - libstemmer_c/libstemmer/modules.h
38
+ - libstemmer_c/libstemmer/modules.txt
39
+ - libstemmer_c/libstemmer/modules_utf8.h
40
+ - libstemmer_c/libstemmer/modules_utf8.txt
41
+ - libstemmer_c/Makefile
42
+ - libstemmer_c/MANIFEST
43
+ - libstemmer_c/mkinc.mak
44
+ - libstemmer_c/mkinc_utf8.mak
45
+ - libstemmer_c/README
46
+ - libstemmer_c/runtime
47
+ - libstemmer_c/runtime/api.c
48
+ - libstemmer_c/runtime/api.h
49
+ - libstemmer_c/runtime/header.h
50
+ - libstemmer_c/runtime/utilities.c
51
+ - libstemmer_c/src_c
52
+ - libstemmer_c/src_c/stem_ISO_8859_1_danish.c
53
+ - libstemmer_c/src_c/stem_ISO_8859_1_danish.h
54
+ - libstemmer_c/src_c/stem_ISO_8859_1_dutch.c
55
+ - libstemmer_c/src_c/stem_ISO_8859_1_dutch.h
56
+ - libstemmer_c/src_c/stem_ISO_8859_1_english.c
57
+ - libstemmer_c/src_c/stem_ISO_8859_1_english.h
58
+ - libstemmer_c/src_c/stem_ISO_8859_1_finnish.c
59
+ - libstemmer_c/src_c/stem_ISO_8859_1_finnish.h
60
+ - libstemmer_c/src_c/stem_ISO_8859_1_french.c
61
+ - libstemmer_c/src_c/stem_ISO_8859_1_french.h
62
+ - libstemmer_c/src_c/stem_ISO_8859_1_german.c
63
+ - libstemmer_c/src_c/stem_ISO_8859_1_german.h
64
+ - libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c
65
+ - libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h
66
+ - libstemmer_c/src_c/stem_ISO_8859_1_italian.c
67
+ - libstemmer_c/src_c/stem_ISO_8859_1_italian.h
68
+ - libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c
69
+ - libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h
70
+ - libstemmer_c/src_c/stem_ISO_8859_1_porter.c
71
+ - libstemmer_c/src_c/stem_ISO_8859_1_porter.h
72
+ - libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c
73
+ - libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h
74
+ - libstemmer_c/src_c/stem_ISO_8859_1_spanish.c
75
+ - libstemmer_c/src_c/stem_ISO_8859_1_spanish.h
76
+ - libstemmer_c/src_c/stem_ISO_8859_1_swedish.c
77
+ - libstemmer_c/src_c/stem_ISO_8859_1_swedish.h
78
+ - libstemmer_c/src_c/stem_ISO_8859_2_romanian.c
79
+ - libstemmer_c/src_c/stem_ISO_8859_2_romanian.h
80
+ - libstemmer_c/src_c/stem_KOI8_R_russian.c
81
+ - libstemmer_c/src_c/stem_KOI8_R_russian.h
82
+ - libstemmer_c/src_c/stem_UTF_8_danish.c
83
+ - libstemmer_c/src_c/stem_UTF_8_danish.h
84
+ - libstemmer_c/src_c/stem_UTF_8_dutch.c
85
+ - libstemmer_c/src_c/stem_UTF_8_dutch.h
86
+ - libstemmer_c/src_c/stem_UTF_8_english.c
87
+ - libstemmer_c/src_c/stem_UTF_8_english.h
88
+ - libstemmer_c/src_c/stem_UTF_8_finnish.c
89
+ - libstemmer_c/src_c/stem_UTF_8_finnish.h
90
+ - libstemmer_c/src_c/stem_UTF_8_french.c
91
+ - libstemmer_c/src_c/stem_UTF_8_french.h
92
+ - libstemmer_c/src_c/stem_UTF_8_german.c
93
+ - libstemmer_c/src_c/stem_UTF_8_german.h
94
+ - libstemmer_c/src_c/stem_UTF_8_hungarian.c
95
+ - libstemmer_c/src_c/stem_UTF_8_hungarian.h
96
+ - libstemmer_c/src_c/stem_UTF_8_italian.c
97
+ - libstemmer_c/src_c/stem_UTF_8_italian.h
98
+ - libstemmer_c/src_c/stem_UTF_8_norwegian.c
99
+ - libstemmer_c/src_c/stem_UTF_8_norwegian.h
100
+ - libstemmer_c/src_c/stem_UTF_8_porter.c
101
+ - libstemmer_c/src_c/stem_UTF_8_porter.h
102
+ - libstemmer_c/src_c/stem_UTF_8_portuguese.c
103
+ - libstemmer_c/src_c/stem_UTF_8_portuguese.h
104
+ - libstemmer_c/src_c/stem_UTF_8_romanian.c
105
+ - libstemmer_c/src_c/stem_UTF_8_romanian.h
106
+ - libstemmer_c/src_c/stem_UTF_8_russian.c
107
+ - libstemmer_c/src_c/stem_UTF_8_russian.h
108
+ - libstemmer_c/src_c/stem_UTF_8_spanish.c
109
+ - libstemmer_c/src_c/stem_UTF_8_spanish.h
110
+ - libstemmer_c/src_c/stem_UTF_8_swedish.c
111
+ - libstemmer_c/src_c/stem_UTF_8_swedish.h
112
+ - libstemmer_c/src_c/stem_UTF_8_turkish.c
113
+ - libstemmer_c/src_c/stem_UTF_8_turkish.h
114
+ has_rdoc: false
115
+ homepage: http://nrr.rubyforge.org
116
+ post_install_message:
117
+ rdoc_options: []
118
+
119
+ require_paths:
120
+ - lib
121
+ required_ruby_version: !ruby/object:Gem::Requirement
122
+ requirements:
123
+ - - ">="
124
+ - !ruby/object:Gem::Version
125
+ version: "0"
126
+ version:
127
+ required_rubygems_version: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: "0"
132
+ version:
133
+ requirements: []
134
+
135
+ rubyforge_project: nrr
136
+ rubygems_version: 1.0.1
137
+ signing_key:
138
+ specification_version: 2
139
+ summary: Stemmer implementation to ruby using libstemmer_c.
140
+ test_files: []
141
+