text_sentencer_c 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: c8b424a5b6ea7cb3ac0b7138f3f9c2ec480144b6a5a21a706937f41e37e11edd
4
+ data.tar.gz: 40f8b0e846f631e1142113b0073b47169ef6cb2348bedf9b1a55c5b8662a3ee9
5
+ SHA512:
6
+ metadata.gz: 68fe2267dbb019c2e3e515527f0cd0026b9a2212ebafafe03b87d3663dc326d66fbe92c5c5fbf909e55b53798158e4f79ff2be79ce7925470025e64f249b61a6
7
+ data.tar.gz: c03d9f4347e6a3e2dbec7730560526ff5eac208fa553c87be9b9ebf78b0c9036346fc08ac5aa3cc1ea2402b3c8d82f5ddef94097cd1e3d5dc1ad71e0ce51b3bc
data/README.md ADDED
@@ -0,0 +1,11 @@
1
+ # text_sentencer_c
2
+
3
+ A Ruby gem for text segmentation using a C extension.
4
+ It is a re-implementation of 'text_sentencer', which was originally written in pure Ruby, as a C extension for better performance.
5
+
6
+ ## Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ ```ruby
11
+ gem 'text_sentencer_c'
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
require 'rake/extensiontask'

# Declares the rake-compiler extension task for the 'text_sentencer_c'
# native extension and configures its ext_dir and lib_dir.
Rake::ExtensionTask.new('text_sentencer_c') do |task|
  task.ext_dir = 'ext/text_sentencer_c'
  task.lib_dir = 'lib/text_sentencer_c'
end
@@ -0,0 +1,52 @@
1
#!/usr/bin/env ruby
# Command-line front end for TextSentencer.
#
# Reads text from the files named on the command line (or from standard input),
# segments it into sentences and prints either one sentence per line or, with
# -j, the whole annotation as pretty-printed JSON.
require 'json'
require 'optparse'
require 'text_sentencer_c'

rules_filename = nil
output_mode = :sentences

## command line option processing
option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: text_sentencer [options]"

  opts.on('-r', '--rules=rules_filename', 'specifies the rules JSON file.') do |c|
    rules_filename = c
  end

  opts.on('-j', '--json_output', 'outputs the result in JSON.') do
    output_mode = :json
  end

  opts.on('-h', '--help', 'displays this screen.') do
    puts opts
    exit
  end
end

option_parser.parse!

rules =
  if rules_filename.nil?
    TextSentencer::DEFAULT_RULES
  elsif File.file?(rules_filename)
    # The extension looks rules up by Symbol key, so the JSON object keys have
    # to be symbolized. Keys missing from the file fall back to the defaults.
    TextSentencer.update_rules(JSON.parse(File.read(rules_filename), symbolize_names: true))
  else
    # Keep the historical fallback to the default rules, but say so.
    warn "text_sentencer: rules file '#{rules_filename}' not found; using the default rules."
    TextSentencer::DEFAULT_RULES
  end

sentencer = TextSentencer.new(rules)

text = ARGF.read

annotations = sentencer.annotate(text)

if output_mode == :json
  puts JSON.pretty_generate(annotations)
else
  # The annotation hash produced by the extension uses Symbol keys.
  # NOTE(review): the span offsets come from ICU and count UTF-16 code units;
  # they differ from Ruby character indices when the text contains characters
  # outside the Basic Multilingual Plane.
  annotations[:blocks]&.each do |block|
    span = block[:span]
    puts text[span[:begin]...span[:end]]
  end
end
@@ -0,0 +1,269 @@
1
+
2
+ SHELL = /bin/sh
3
+
4
+ # V=0 quiet, V=1 verbose. other values don't work.
5
+ V = 0
6
+ V0 = $(V:0=)
7
+ Q1 = $(V:1=)
8
+ Q = $(Q1:0=@)
9
+ ECHO1 = $(V:1=@ :)
10
+ ECHO = $(ECHO1:0=@ echo)
11
+ NULLCMD = :
12
+
13
+ #### Start of system configuration section. ####
14
+
15
+ srcdir = .
16
+ topdir = /home/jdkim/.rvm/rubies/ruby-3.3.4/include/ruby-3.3.0
17
+ hdrdir = $(topdir)
18
+ arch_hdrdir = /home/jdkim/.rvm/rubies/ruby-3.3.4/include/ruby-3.3.0/x86_64-linux
19
+ PATH_SEPARATOR = :
20
+ VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
21
+ prefix = $(DESTDIR)/home/jdkim/.rvm/rubies/ruby-3.3.4
22
+ rubysitearchprefix = $(rubylibprefix)/$(sitearch)
23
+ rubyarchprefix = $(rubylibprefix)/$(arch)
24
+ rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
25
+ exec_prefix = $(prefix)
26
+ vendorarchhdrdir = $(vendorhdrdir)/$(sitearch)
27
+ sitearchhdrdir = $(sitehdrdir)/$(sitearch)
28
+ rubyarchhdrdir = $(rubyhdrdir)/$(arch)
29
+ vendorhdrdir = $(rubyhdrdir)/vendor_ruby
30
+ sitehdrdir = $(rubyhdrdir)/site_ruby
31
+ rubyhdrdir = $(includedir)/$(RUBY_VERSION_NAME)
32
+ vendorarchdir = $(vendorlibdir)/$(sitearch)
33
+ vendorlibdir = $(vendordir)/$(ruby_version)
34
+ vendordir = $(rubylibprefix)/vendor_ruby
35
+ sitearchdir = $(sitelibdir)/$(sitearch)
36
+ sitelibdir = $(sitedir)/$(ruby_version)
37
+ sitedir = $(rubylibprefix)/site_ruby
38
+ rubyarchdir = $(rubylibdir)/$(arch)
39
+ rubylibdir = $(rubylibprefix)/$(ruby_version)
40
+ sitearchincludedir = $(includedir)/$(sitearch)
41
+ archincludedir = $(includedir)/$(arch)
42
+ sitearchlibdir = $(libdir)/$(sitearch)
43
+ archlibdir = $(libdir)/$(arch)
44
+ ridir = $(datarootdir)/$(RI_BASE_NAME)
45
+ mandir = $(datarootdir)/man
46
+ localedir = $(datarootdir)/locale
47
+ libdir = $(exec_prefix)/lib
48
+ psdir = $(docdir)
49
+ pdfdir = $(docdir)
50
+ dvidir = $(docdir)
51
+ htmldir = $(docdir)
52
+ infodir = $(datarootdir)/info
53
+ docdir = $(datarootdir)/doc/$(PACKAGE)
54
+ oldincludedir = $(DESTDIR)/usr/include
55
+ includedir = $(prefix)/include
56
+ runstatedir = $(localstatedir)/run
57
+ localstatedir = $(prefix)/var
58
+ sharedstatedir = $(prefix)/com
59
+ sysconfdir = $(prefix)/etc
60
+ datadir = $(datarootdir)
61
+ datarootdir = $(prefix)/share
62
+ libexecdir = $(exec_prefix)/libexec
63
+ sbindir = $(exec_prefix)/sbin
64
+ bindir = $(exec_prefix)/bin
65
+ archdir = $(rubyarchdir)
66
+
67
+
68
+ CC_WRAPPER =
69
+ CC = gcc
70
+ CXX = g++
71
+ LIBRUBY = $(LIBRUBY_SO)
72
+ LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
73
+ LIBRUBYARG_SHARED = -Wl,-rpath,$(libdir) -L$(libdir) -l$(RUBY_SO_NAME)
74
+ LIBRUBYARG_STATIC = -Wl,-rpath,$(libdir) -L$(libdir) -l$(RUBY_SO_NAME)-static $(MAINLIBS)
75
+ empty =
76
+ OUTFLAG = -o $(empty)
77
+ COUTFLAG = -o $(empty)
78
+ CSRCFLAG = $(empty)
79
+
80
+ RUBY_EXTCONF_H =
81
+ cflags = $(optflags) $(debugflags) $(warnflags)
82
+ cxxflags =
83
+ optflags = -O3 -fno-fast-math
84
+ debugflags = -ggdb3
85
+ warnflags = -Wall -Wextra -Wdeprecated-declarations -Wdiv-by-zero -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wpointer-arith -Wwrite-strings -Wold-style-definition -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable -Wmisleading-indentation -Wundef
86
+ cppflags =
87
+ CCDLFLAGS = -fPIC
88
+ CFLAGS = $(CCDLFLAGS) $(cflags) -fPIC $(ARCH_FLAG)
89
+ INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
90
+ DEFS =
91
+ CPPFLAGS = $(DEFS) $(cppflags)
92
+ CXXFLAGS = $(CCDLFLAGS) $(ARCH_FLAG)
93
+ ldflags = -L. -fstack-protector-strong -rdynamic -Wl,-export-dynamic -Wl,--no-as-needed
94
+ dldflags = -Wl,--compress-debug-sections=zlib
95
+ ARCH_FLAG =
96
+ DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
97
+ LDSHARED = $(CC) -shared
98
+ LDSHAREDXX = $(CXX) -shared
99
+ AR = gcc-ar
100
+ EXEEXT =
101
+
102
+ RUBY_INSTALL_NAME = $(RUBY_BASE_NAME)
103
+ RUBY_SO_NAME = ruby
104
+ RUBYW_INSTALL_NAME =
105
+ RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version)
106
+ RUBYW_BASE_NAME = rubyw
107
+ RUBY_BASE_NAME = ruby
108
+
109
+ arch = x86_64-linux
110
+ sitearch = $(arch)
111
+ ruby_version = 3.3.0
112
+ ruby = $(bindir)/$(RUBY_BASE_NAME)
113
+ RUBY = $(ruby)
114
+ BUILTRUBY = $(bindir)/$(RUBY_BASE_NAME)
115
+ ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
116
+
117
+ RM = rm -f
118
+ RM_RF = rm -fr
119
+ RMDIRS = rmdir --ignore-fail-on-non-empty -p
120
+ MAKEDIRS = /usr/bin/mkdir -p
121
+ INSTALL = /usr/bin/install -c
122
+ INSTALL_PROG = $(INSTALL) -m 0755
123
+ INSTALL_DATA = $(INSTALL) -m 644
124
+ COPY = cp
125
+ TOUCH = exit >
126
+
127
+ #### End of system configuration section. ####
128
+
129
+ preload =
130
+ libpath = . $(libdir)
131
+ LIBPATH = -L. -L$(libdir) -Wl,-rpath,$(libdir)
132
+ DEFFILE =
133
+
134
+ CLEANFILES = mkmf.log
135
+ DISTCLEANFILES =
136
+ DISTCLEANDIRS =
137
+
138
+ extout =
139
+ extout_prefix =
140
+ target_prefix = /text_sentencer_c
141
+ LOCAL_LIBS =
142
+ LIBS = $(LIBRUBYARG_SHARED) -lm -lpthread -lc
143
+ ORIG_SRCS = text_sentencer.c
144
+ SRCS = $(ORIG_SRCS)
145
+ OBJS = text_sentencer.o
146
+ HDRS = $(srcdir)/text_sentencer.h
147
+ LOCAL_HDRS =
148
+ TARGET = text_sentencer_c
149
+ TARGET_NAME = text_sentencer_c
150
+ TARGET_ENTRY = Init_$(TARGET_NAME)
151
+ DLLIB = $(TARGET).so
152
+ EXTSTATIC =
153
+ STATIC_LIB =
154
+
155
+ TIMESTAMP_DIR = .
156
+ BINDIR = $(bindir)
157
+ RUBYCOMMONDIR = $(sitedir)$(target_prefix)
158
+ RUBYLIBDIR = $(sitelibdir)$(target_prefix)
159
+ RUBYARCHDIR = $(sitearchdir)$(target_prefix)
160
+ HDRDIR = $(sitehdrdir)$(target_prefix)
161
+ ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
162
+ TARGET_SO_DIR =
163
+ TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
164
+ CLEANLIBS = $(TARGET_SO) false
165
+ CLEANOBJS = $(OBJS) *.bak
166
+ TARGET_SO_DIR_TIMESTAMP = $(TIMESTAMP_DIR)/.sitearchdir.-.text_sentencer_c.time
167
+
168
+ all: $(DLLIB)
169
+ static: $(STATIC_LIB)
170
+ .PHONY: all install static install-so install-rb
171
+ .PHONY: clean clean-so clean-static clean-rb
172
+
173
+ clean-static::
174
+ clean-rb-default::
175
+ clean-rb::
176
+ clean-so::
177
+ clean: clean-so clean-static clean-rb-default clean-rb
178
+ -$(Q)$(RM_RF) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
179
+
180
+ distclean-rb-default::
181
+ distclean-rb::
182
+ distclean-so::
183
+ distclean-static::
184
+ distclean: clean distclean-so distclean-static distclean-rb-default distclean-rb
185
+ -$(Q)$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
186
+ -$(Q)$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
187
+ -$(Q)$(RMDIRS) $(DISTCLEANDIRS) 2> /dev/null || true
188
+
189
+ realclean: distclean
190
+ install: install-so install-rb
191
+
192
+ install-so: $(DLLIB) $(TARGET_SO_DIR_TIMESTAMP)
193
+ $(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
194
+ clean-static::
195
+ -$(Q)$(RM) $(STATIC_LIB)
196
+ install-rb: pre-install-rb do-install-rb install-rb-default
197
+ install-rb-default: pre-install-rb-default do-install-rb-default
198
+ pre-install-rb: Makefile
199
+ pre-install-rb-default: Makefile
200
+ do-install-rb:
201
+ do-install-rb-default:
202
+ pre-install-rb-default:
203
+ @$(NULLCMD)
204
+ $(TARGET_SO_DIR_TIMESTAMP):
205
+ $(Q) $(MAKEDIRS) $(@D) $(RUBYARCHDIR)
206
+ $(Q) $(TOUCH) $@
207
+
208
+ site-install: site-install-so site-install-rb
209
+ site-install-so: install-so
210
+ site-install-rb: install-rb
211
+
212
+ .SUFFIXES: .c .m .cc .mm .cxx .cpp .o .S
213
+
214
+ .cc.o:
215
+ $(ECHO) compiling $(<)
216
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
217
+
218
+ .cc.S:
219
+ $(ECHO) translating $(<)
220
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
221
+
222
+ .mm.o:
223
+ $(ECHO) compiling $(<)
224
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
225
+
226
+ .mm.S:
227
+ $(ECHO) translating $(<)
228
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
229
+
230
+ .cxx.o:
231
+ $(ECHO) compiling $(<)
232
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
233
+
234
+ .cxx.S:
235
+ $(ECHO) translating $(<)
236
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
237
+
238
+ .cpp.o:
239
+ $(ECHO) compiling $(<)
240
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
241
+
242
+ .cpp.S:
243
+ $(ECHO) translating $(<)
244
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
245
+
246
+ .c.o:
247
+ $(ECHO) compiling $(<)
248
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
249
+
250
+ .c.S:
251
+ $(ECHO) translating $(<)
252
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
253
+
254
+ .m.o:
255
+ $(ECHO) compiling $(<)
256
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
257
+
258
+ .m.S:
259
+ $(ECHO) translating $(<)
260
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
261
+
262
+ $(TARGET_SO): $(OBJS) Makefile
263
+ $(ECHO) linking shared-object text_sentencer_c/$(DLLIB)
264
+ -$(Q)$(RM) $(@)
265
+ $(Q) $(LDSHARED) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
266
+
267
+
268
+
269
+ $(OBJS): $(HDRS) $(ruby_headers)
@@ -0,0 +1,7 @@
1
require 'mkmf'

# The extension includes the ICU headers and calls the ICU C regex API, so a
# Makefile generated without ICU cannot produce a working library. Stop here
# with a readable message instead of failing later at compile or load time.
%w[icuuc icui18n].each do |library|
  next if have_library(library)

  abort "text_sentencer_c: ICU library '#{library}' was not found. " \
        'Install the ICU development package (e.g. libicu-dev) and retry.'
end

unless have_header('unicode/uregex.h')
  abort 'text_sentencer_c: ICU header unicode/uregex.h was not found. ' \
        'Install the ICU development package (e.g. libicu-dev) and retry.'
end

create_makefile('text_sentencer_c/text_sentencer_c')
@@ -0,0 +1,600 @@
1
+ #include "text_sentencer.h"
2
+ #include <ruby/thread.h>
3
+ #include <stdio.h>
4
+ #include <stdlib.h>
5
+
6
+ // Define the TextSentencer class
7
+ VALUE rb_cTextSentencer;
8
+
9
+ static void text_sentencer_free(void *p) {
10
+ TextSentencer *ts = (TextSentencer *)p;
11
+
12
+ // Free each compiled regex
13
+ uregex_close(ts->break_pattern);
14
+ uregex_close(ts->candidate_pattern);
15
+
16
+ for (size_t i = 0; i < ts->num_positive_rules; i++) {
17
+ uregex_close(ts->positive_rules_pre[i]);
18
+ uregex_close(ts->positive_rules_post[i]);
19
+ }
20
+ for (size_t i = 0; i < ts->num_negative_rules; i++) {
21
+ uregex_close(ts->negative_rules_pre[i]);
22
+ uregex_close(ts->negative_rules_post[i]);
23
+ }
24
+
25
+ free(ts->positive_rules_pre);
26
+ free(ts->positive_rules_post);
27
+ free(ts->negative_rules_pre);
28
+ free(ts->negative_rules_post);
29
+
30
+ free(ts);
31
+ }
32
+
33
+ static size_t text_sentencer_memsize(const void *p) {
34
+ const TextSentencer *ts = (const TextSentencer *)p;
35
+
36
+ // Calculate the size of the TextSentencer structure itself
37
+ size_t size = sizeof(TextSentencer);
38
+
39
+ // Add the size of the arrays of pointers to URegularExpression
40
+ size += ts->num_positive_rules * sizeof(URegularExpression *);
41
+ size += ts->num_negative_rules * sizeof(URegularExpression *);
42
+
43
+ // Add the two main pointers for break_pattern and candidate_pattern
44
+ size += sizeof(URegularExpression *) * 2;
45
+
46
+ return size;
47
+ }
48
+
49
+
50
+ static const rb_data_type_t text_sentencer_type = {
51
+ "TextSentencer",
52
+ {
53
+ 0, // Mark function (not needed here)
54
+ text_sentencer_free, // Free function
55
+ text_sentencer_memsize, // Size function
56
+ },
57
+ NULL, NULL, RUBY_TYPED_FREE_IMMEDIATELY,
58
+ };
59
+
60
+ /*
61
+ // Helper function to convert UTF-8 string to UTF-16
62
+ static UChar* utf8_to_utf16(const char* utf8, int32_t* utf16_len, UErrorCode* status) {
63
+ int32_t utf16_capacity = 0;
64
+ u_strFromUTF8(NULL, 0, &utf16_capacity, utf8, -1, status);
65
+ if (*status != U_BUFFER_OVERFLOW_ERROR) {
66
+ return NULL;
67
+ }
68
+
69
+ *status = U_ZERO_ERROR;
70
+ UChar* utf16 = (UChar*)malloc(utf16_capacity * sizeof(UChar));
71
+ u_strFromUTF8(utf16, utf16_capacity, utf16_len, utf8, -1, status);
72
+ return utf16;
73
+ }
74
+ */
75
+
76
+ // Helper function to convert UTF-8 string to UTF-16
77
+ static UChar* utf8_to_utf16(const char* utf8, int32_t* utf16_len, UErrorCode* status) {
78
+ int32_t utf16_capacity = 0;
79
+
80
+ // First, get the required capacity for the UTF-16 string
81
+ u_strFromUTF8(NULL, 0, &utf16_capacity, utf8, -1, status);
82
+ if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
83
+ printf("utf8_to_utf16 returning NULL.\n");
84
+ return NULL; // Return if an error occurred that is not buffer overflow
85
+ }
86
+
87
+ *status = U_ZERO_ERROR; // Reset the status before the actual conversion
88
+
89
+ // Allocate memory for the UTF-16 string
90
+ UChar* utf16 = (UChar*)malloc(utf16_capacity * sizeof(UChar));
91
+ if (utf16 == NULL) {
92
+ *status = U_MEMORY_ALLOCATION_ERROR;
93
+ printf("utf8_to_utf16 returning NULL.\n");
94
+ return NULL;
95
+ }
96
+
97
+ // Perform the conversion from UTF-8 to UTF-16
98
+ u_strFromUTF8(utf16, utf16_capacity, utf16_len, utf8, -1, status);
99
+ if (U_FAILURE(*status)) {
100
+ free(utf16); // Free allocated memory if conversion fails
101
+ printf("utf8_to_utf16 returning NULL.\n");
102
+ return NULL;
103
+ }
104
+
105
+ return utf16;
106
+ }
107
+
108
+ /*
109
+ // Utility function to scan text with a regex pattern and return offsets using ICU
110
+ static long scan_offsets(const UChar *text, URegularExpression *pattern, long **span_starts, long **span_ends, UErrorCode *status) {
111
+ long allocated = 10;
112
+ long used = 0;
113
+
114
+ *span_starts = malloc(allocated * sizeof(long));
115
+ *span_ends = malloc(allocated * sizeof(long));
116
+ if (*span_starts == NULL || *span_ends == NULL) {
117
+ rb_raise(rb_eRuntimeError, "Memory allocation failed");
118
+ }
119
+
120
+ uregex_setText(pattern, text, -1, status);
121
+ if (U_FAILURE(*status)) {
122
+ rb_raise(rb_eRuntimeError, "Failed to set text for regex");
123
+ }
124
+
125
+ while (uregex_findNext(pattern, status)) {
126
+ if (used >= allocated) {
127
+ allocated *= 2;
128
+ *span_starts = realloc(*span_starts, allocated * sizeof(long));
129
+ *span_ends = realloc(*span_ends, allocated * sizeof(long));
130
+ if (*span_starts == NULL || *span_ends == NULL) {
131
+ rb_raise(rb_eRuntimeError, "Memory allocation failed");
132
+ }
133
+ }
134
+
135
+ (*span_starts)[used] = uregex_start(pattern, 0, status);
136
+ (*span_ends)[used] = uregex_end(pattern, 0, status);
137
+ if (U_FAILURE(*status)) {
138
+ rb_raise(rb_eRuntimeError, "Error during regex match: %s", u_errorName(*status));
139
+ }
140
+
141
+ used++;
142
+ }
143
+
144
+ return used;
145
+ }
146
+ */
147
+ // Utility function to match a regex against a substring using ICU
148
+ static int match_regex(const UChar *text, long start, long end, URegularExpression *pattern, UErrorCode *status) {
149
+ uregex_setText(pattern, text + start, end - start, status);
150
+ if (U_FAILURE(*status)) {
151
+ rb_raise(rb_eRuntimeError, "Failed to set text for regex");
152
+ }
153
+
154
+ return uregex_find(pattern, 0, status);
155
+ }
156
+
157
+ void print_utf16_hex(const UChar *utf16_text, int32_t utf16_len) {
158
+ printf("UTF-16 Length: %d\n", utf16_len);
159
+ printf("UTF-16 Hex Values: ");
160
+ for (int32_t i = 0; i < utf16_len; i++) {
161
+ printf("%04X ", utf16_text[i]); // Print each UChar as a 4-digit hexadecimal
162
+ }
163
+ printf("\n");
164
+ }
165
+
166
+ // Function to print UChar array after converting back to UTF-8
167
+ void print_utf16_as_utf8(const UChar *utf16_text, int32_t utf16_len) {
168
+ UErrorCode status = U_ZERO_ERROR;
169
+ int32_t utf8_len = 0;
170
+
171
+ // First, determine the length of the UTF-8 string
172
+ u_strToUTF8(NULL, 0, &utf8_len, utf16_text, utf16_len, &status);
173
+ if (status != U_BUFFER_OVERFLOW_ERROR) {
174
+ printf("Error in calculating UTF-8 length: %s\n", u_errorName(status));
175
+ return;
176
+ }
177
+
178
+ status = U_ZERO_ERROR; // Reset the error code
179
+ char *utf8_text = (char *)malloc(utf8_len + 1); // Allocate memory for the UTF-8 string
180
+ if (utf8_text == NULL) {
181
+ printf("Memory allocation failed for UTF-8 string\n");
182
+ return;
183
+ }
184
+
185
+ // Convert the UTF-16 string back to UTF-8
186
+ u_strToUTF8(utf8_text, utf8_len + 1, NULL, utf16_text, utf16_len, &status);
187
+ if (U_FAILURE(status)) {
188
+ printf("Error in converting UTF-16 to UTF-8: %s\n", u_errorName(status));
189
+ } else {
190
+ printf("UTF-8 Text: %s\n", utf8_text);
191
+ }
192
+
193
+ free(utf8_text); // Free the allocated memory for UTF-8 string
194
+ }
195
+
196
+ // Utility function to scan text with a regex pattern and return offsets using ICU
197
+ static long scan_offsets(const UChar *text, URegularExpression *pattern, long **span_starts, long **span_ends, UErrorCode *status) {
198
+ long allocated = 10;
199
+ long used = 0;
200
+
201
+ if (status == NULL || U_FAILURE(*status)) {
202
+ rb_raise(rb_eRuntimeError, "Invalid or uninitialized UErrorCode provided");
203
+ }
204
+
205
+ *span_starts = malloc(allocated * sizeof(long));
206
+ *span_ends = malloc(allocated * sizeof(long));
207
+ if (*span_starts == NULL || *span_ends == NULL) {
208
+ rb_raise(rb_eRuntimeError, "Memory allocation failed");
209
+ }
210
+
211
+ // Set the text to search with the regex
212
+ uregex_setText(pattern, text, -1, status);
213
+ if (U_FAILURE(*status)) {
214
+ rb_raise(rb_eRuntimeError, "Failed to set text for regex: %s", u_errorName(*status));
215
+ }
216
+
217
+ // Find matches and store offsets
218
+ while (uregex_findNext(pattern, status)) {
219
+ if (U_FAILURE(*status)) {
220
+ rb_raise(rb_eRuntimeError, "Error during regex match: %s", u_errorName(*status));
221
+ }
222
+
223
+ if (used >= allocated) {
224
+ allocated *= 2;
225
+ *span_starts = realloc(*span_starts, allocated * sizeof(long));
226
+ *span_ends = realloc(*span_ends, allocated * sizeof(long));
227
+ if (*span_starts == NULL || *span_ends == NULL) {
228
+ rb_raise(rb_eRuntimeError, "Memory allocation failed");
229
+ }
230
+ }
231
+
232
+ (*span_starts)[used] = uregex_start(pattern, 0, status);
233
+ (*span_ends)[used] = uregex_end(pattern, 0, status);
234
+ if (U_FAILURE(*status)) {
235
+ rb_raise(rb_eRuntimeError, "Error retrieving match offsets: %s", u_errorName(*status));
236
+ }
237
+
238
+ used++;
239
+ }
240
+
241
+ return used;
242
+ }
243
+
244
+ // Function to print UChar array as hexadecimal
245
+ struct segment_args {
246
+ // input
247
+ TextSentencer *ts;
248
+ const char *text;
249
+
250
+ // output
251
+ long *segment_starts;
252
+ long *segment_ends;
253
+ long num_segments;
254
+ };
255
+
256
+ static void *segment_without_gvl(void *arg) {
257
+ struct segment_args *args = (struct segment_args *)arg;
258
+
259
+ const TextSentencer *ts = args->ts;
260
+ const char *text = args->text;
261
+
262
+ UErrorCode status = U_ZERO_ERROR;
263
+ int32_t utf16_len;
264
+ UChar *utf16_text = utf8_to_utf16(text, &utf16_len, &status);
265
+ if (U_FAILURE(status)) {
266
+ rb_raise(rb_eRuntimeError, "Failed to convert text to UTF-16: %s", u_errorName(status));
267
+ }
268
+
269
+ // Scan for break patterns
270
+ long *break_starts, *break_ends;
271
+ long num_breaks = scan_offsets(utf16_text, ts->break_pattern, &break_starts, &break_ends, &status);
272
+
273
+ // Scan for candidate patterns
274
+ long *candidate_starts, *candidate_ends;
275
+ long num_candidates = scan_offsets(utf16_text, ts->candidate_pattern, &candidate_starts, &candidate_ends, &status);
276
+
277
+ // Filter candidates using positive rules
278
+ long *filtered_candidate_starts = malloc(num_candidates * sizeof(long));
279
+ long *filtered_candidate_ends = malloc(num_candidates * sizeof(long));
280
+ long num_filtered_candidates = 0;
281
+
282
+ for (long i = 0; i < num_candidates; ++i) {
283
+ long start = candidate_starts[i];
284
+ long end = candidate_ends[i];
285
+
286
+ int matched = 0;
287
+ for (size_t j = 0; j < ts->num_positive_rules; ++j) {
288
+ if ((start == 0 || match_regex(utf16_text, 0, start, ts->positive_rules_pre[j], &status)) &&
289
+ (end == utf16_len || match_regex(utf16_text, end, utf16_len, ts->positive_rules_post[j], &status))) {
290
+ matched = 1;
291
+ break;
292
+ }
293
+ }
294
+
295
+ if (matched) {
296
+ filtered_candidate_starts[num_filtered_candidates] = start;
297
+ filtered_candidate_ends[num_filtered_candidates] = end;
298
+ num_filtered_candidates++;
299
+ }
300
+ }
301
+
302
+ free(candidate_starts);
303
+ free(candidate_ends);
304
+
305
+ // Remove candidates using negative rules
306
+ long *final_candidate_starts = malloc(num_filtered_candidates * sizeof(long));
307
+ long *final_candidate_ends = malloc(num_filtered_candidates * sizeof(long));
308
+ long num_final_candidates = 0;
309
+
310
+ for (long i = 0; i < num_filtered_candidates; ++i) {
311
+ long start = filtered_candidate_starts[i];
312
+ long end = filtered_candidate_ends[i];
313
+
314
+ int reject = 0;
315
+ for (size_t j = 0; j < ts->num_negative_rules; ++j) {
316
+ if (match_regex(utf16_text, 0, start, ts->negative_rules_pre[j], &status) &&
317
+ match_regex(utf16_text, end, utf16_len, ts->negative_rules_post[j], &status)) {
318
+ reject = 1;
319
+ break;
320
+ }
321
+ }
322
+
323
+ if (!reject) {
324
+ final_candidate_starts[num_final_candidates] = start;
325
+ final_candidate_ends[num_final_candidates] = end;
326
+ num_final_candidates++;
327
+ }
328
+ }
329
+
330
+ free(filtered_candidate_starts);
331
+ free(filtered_candidate_ends);
332
+
333
+ // Merge the breaks and the filtered candidates
334
+ long total_breaks = num_breaks + num_final_candidates;
335
+ long *all_break_starts = malloc(total_breaks * sizeof(long));
336
+ long *all_break_ends = malloc(total_breaks * sizeof(long));
337
+
338
+ int i = 0, j = 0, k = 0;
339
+ while (i < num_breaks && j < num_final_candidates) {
340
+ if (break_starts[i] <= final_candidate_starts[j]) {
341
+ all_break_starts[k] = break_starts[i];
342
+ all_break_ends[k] = break_ends[i];
343
+ k++;
344
+ i++;
345
+
346
+ // To avoid duplicates
347
+ if (break_starts[i] == final_candidate_starts[j]) {
348
+ j++;
349
+ }
350
+ } else {
351
+ all_break_starts[k] = final_candidate_starts[j];
352
+ all_break_ends[k] = final_candidate_ends[j];
353
+ k++;
354
+ j++;
355
+ }
356
+ }
357
+
358
+ // Store remaining elements of the breaks
359
+ while (i < num_breaks) {
360
+ all_break_starts[k] = break_starts[i];
361
+ all_break_ends[k] = break_ends[i];
362
+ k++;
363
+ i++;
364
+ }
365
+
366
+ // Store remaining elements of the final candidates
367
+ while (j < num_final_candidates) {
368
+ all_break_starts[k] = final_candidate_starts[j];
369
+ all_break_ends[k] = final_candidate_ends[j];
370
+ k++;
371
+ j++;
372
+ }
373
+
374
+ free(break_starts);
375
+ free(break_ends);
376
+ free(final_candidate_starts);
377
+ free(final_candidate_ends);
378
+
379
+ // Convert breaks into sentence segments
380
+ long num_segments = 0;
381
+ long *segment_starts = malloc((k + 1) * sizeof(long));
382
+ long *segment_ends = malloc((k + 1) * sizeof(long));
383
+
384
+ long start = 0;
385
+ for (long i = 0; i < k; i++) {
386
+ if (all_break_starts[i] == 0) {
387
+ start = all_break_ends[i];
388
+ continue;
389
+ }
390
+
391
+ segment_starts[num_segments] = start;
392
+ segment_ends[num_segments] = all_break_starts[i];
393
+ num_segments++;
394
+ start = all_break_ends[i];
395
+ }
396
+
397
+ // Add the last segment
398
+ if (all_break_ends[k - 1] < utf16_len) {
399
+ segment_starts[num_segments] = start;
400
+ segment_ends[num_segments] = utf16_len;
401
+ num_segments++;
402
+ }
403
+
404
+ free(all_break_starts);
405
+ free(all_break_ends);
406
+ free(utf16_text); // Free the UTF-16 text after processing
407
+
408
+ // Pass the results back to the caller
409
+ args->segment_starts = segment_starts;
410
+ args->segment_ends = segment_ends;
411
+ args->num_segments = num_segments;
412
+
413
+ return NULL;
414
+ }
415
+
416
+ VALUE text_sentencer_segment(VALUE self, VALUE rb_text) {
417
+ TextSentencer *ts;
418
+ TypedData_Get_Struct(self, TextSentencer, &text_sentencer_type, ts);
419
+
420
+ Check_Type(rb_text, T_STRING);
421
+ const char *text = StringValueCStr(rb_text);
422
+
423
+ long *segment_starts = NULL;
424
+ long *segment_ends = NULL;
425
+ long num_segments = 0;
426
+
427
+ struct segment_args args = {ts, text, segment_starts, segment_ends, num_segments};
428
+ rb_thread_call_without_gvl(segment_without_gvl, &args, RUBY_UBF_IO, NULL);
429
+
430
+ VALUE segments = rb_ary_new();
431
+ for (long i = 0; i < args.num_segments; ++i) {
432
+ VALUE segment = rb_ary_new();
433
+ rb_ary_push(segment, LONG2NUM(args.segment_starts[i]));
434
+ rb_ary_push(segment, LONG2NUM(args.segment_ends[i]));
435
+ rb_ary_push(segments, segment);
436
+ }
437
+
438
+ free(args.segment_starts);
439
+ free(args.segment_ends);
440
+
441
+ return segments;
442
+ }
443
+
444
+ // Annotate method
445
+ VALUE text_sentencer_annotate(VALUE self, VALUE rb_text) {
446
+ Check_Type(rb_text, T_STRING);
447
+
448
+ VALUE segments = text_sentencer_segment(self, rb_text);
449
+ VALUE blocks = rb_ary_new();
450
+
451
+ for (long i = 0; i < RARRAY_LEN(segments); i++) {
452
+ VALUE span = rb_hash_new();
453
+ VALUE block = rb_hash_new();
454
+
455
+ VALUE segment = rb_ary_entry(segments, i);
456
+ long start = NUM2LONG(rb_ary_entry(segment, 0));
457
+ long end = NUM2LONG(rb_ary_entry(segment, 1));
458
+
459
+ rb_hash_aset(span, ID2SYM(rb_intern("begin")), LONG2NUM(start));
460
+ rb_hash_aset(span, ID2SYM(rb_intern("end")), LONG2NUM(end));
461
+
462
+ rb_hash_aset(block, ID2SYM(rb_intern("span")), span);
463
+ rb_hash_aset(block, ID2SYM(rb_intern("obj")), rb_str_new_cstr("Sentence"));
464
+
465
+ rb_ary_push(blocks, block);
466
+ }
467
+
468
+ VALUE result = rb_hash_new();
469
+ rb_hash_aset(result, ID2SYM(rb_intern("text")), rb_text);
470
+ rb_hash_aset(result, ID2SYM(rb_intern("blocks")), blocks);
471
+
472
+ return result;
473
+ }
474
+
475
+ // Allocate memory for TextSentencer structure
476
+ VALUE text_sentencer_allocate(VALUE klass) {
477
+ TextSentencer *ts;
478
+ VALUE obj = TypedData_Make_Struct(klass, TextSentencer, &text_sentencer_type, ts);
479
+
480
+ // Initialize fields to avoid uninitialized memory issues
481
+ ts->break_pattern = NULL;
482
+ ts->candidate_pattern = NULL;
483
+ ts->num_positive_rules = 0;
484
+ ts->positive_rules_pre = NULL;
485
+ ts->positive_rules_post = NULL;
486
+ ts->num_negative_rules = 0;
487
+ ts->negative_rules_pre = NULL;
488
+ ts->negative_rules_post = NULL;
489
+
490
+ return obj;
491
+ }
492
+
493
+ // Helper function to compile ICU regex
494
+ URegularExpression* compile_pattern(VALUE rb_pattern) {
495
+ Check_Type(rb_pattern, T_STRING);
496
+ const char *pattern_str = StringValueCStr(rb_pattern);
497
+
498
+ if (strlen(pattern_str) == 0) {
499
+ rb_raise(rb_eArgError, "Empty regex pattern is not allowed");
500
+ }
501
+
502
+ UErrorCode status = U_ZERO_ERROR;
503
+
504
+ // Convert the UTF-8 pattern to UTF-16
505
+ int32_t pattern_utf16_len = 0;
506
+ UChar *pattern_utf16 = utf8_to_utf16(pattern_str, &pattern_utf16_len, &status);
507
+
508
+ if (U_FAILURE(status)) {
509
+ rb_raise(rb_eRuntimeError, "Failed to convert regex pattern to UTF-16: %s", u_errorName(status));
510
+ }
511
+
512
+ // Compile the regular expression
513
+ UParseError parse_error;
514
+ URegularExpression *regex = uregex_open(pattern_utf16, pattern_utf16_len, 0, &parse_error, &status);
515
+ free(pattern_utf16); // Free the UTF-16 pattern after compilation
516
+
517
+ if (U_FAILURE(status)) {
518
+ char error_message[256];
519
+ snprintf(error_message, sizeof(error_message), "ICU regex compilation failed: %s", u_errorName(status));
520
+ rb_raise(rb_eRuntimeError, "%s", error_message);
521
+ }
522
+
523
+ return regex;
524
+ }
525
+
526
+ VALUE text_sentencer_initialize(VALUE self, VALUE rules) {
527
+ TextSentencer *ts;
528
+ TypedData_Get_Struct(self, TextSentencer, &text_sentencer_type, ts);
529
+
530
+ // Extract and compile the break pattern from the rules hash
531
+ VALUE rb_break_pattern = rb_hash_aref(rules, ID2SYM(rb_intern("break_pattern")));
532
+
533
+ if (!NIL_P(rb_break_pattern)) {
534
+ ts->break_pattern = compile_pattern(rb_break_pattern);
535
+ } else {
536
+ rb_raise(rb_eArgError, "break_pattern is required in rules");
537
+ }
538
+
539
+ // Extract and compile the candidate pattern from the rules hash
540
+ VALUE rb_candidate_pattern = rb_hash_aref(rules, ID2SYM(rb_intern("candidate_pattern")));
541
+
542
+ if (!NIL_P(rb_candidate_pattern)) {
543
+ ts->candidate_pattern = compile_pattern(rb_candidate_pattern);
544
+ } else {
545
+ rb_raise(rb_eArgError, "candidate_pattern is required in rules");
546
+ }
547
+
548
+ // Compile positive rules
549
+ VALUE rb_positive_rules = rb_hash_aref(rules, ID2SYM(rb_intern("positive_rules")));
550
+ if (NIL_P(rb_positive_rules) || TYPE(rb_positive_rules) != T_ARRAY) {
551
+ rb_raise(rb_eArgError, "positive_rules must be an array");
552
+ }
553
+
554
+ ts->num_positive_rules = RARRAY_LEN(rb_positive_rules);
555
+ ts->positive_rules_pre = malloc(ts->num_positive_rules * sizeof(URegularExpression *));
556
+ ts->positive_rules_post = malloc(ts->num_positive_rules * sizeof(URegularExpression *));
557
+ if ((ts->positive_rules_pre == NULL) || (ts->positive_rules_post == NULL)) {
558
+ rb_raise(rb_eNoMemError, "Failed to allocate memory for positive rules");
559
+ }
560
+
561
+ for (size_t i = 0; i < ts->num_positive_rules; i++) {
562
+ VALUE rb_rule = rb_ary_entry(rb_positive_rules, i);
563
+ VALUE rb_rule_pre = rb_ary_entry(rb_rule, 0);
564
+ VALUE rb_rule_post = rb_ary_entry(rb_rule, 1);
565
+ ts->positive_rules_pre[i] = compile_pattern(rb_rule_pre);
566
+ ts->positive_rules_post[i] = compile_pattern(rb_rule_post);
567
+ }
568
+
569
+ // Compile negative rules
570
+ VALUE rb_negative_rules = rb_hash_aref(rules, ID2SYM(rb_intern("negative_rules")));
571
+ if (NIL_P(rb_negative_rules) || TYPE(rb_negative_rules) != T_ARRAY) {
572
+ rb_raise(rb_eArgError, "negative_rules must be an array");
573
+ }
574
+
575
+ ts->num_negative_rules = RARRAY_LEN(rb_negative_rules);
576
+ ts->negative_rules_pre = malloc(ts->num_negative_rules * sizeof(URegularExpression *));
577
+ ts->negative_rules_post = malloc(ts->num_negative_rules * sizeof(URegularExpression *));
578
+ if ((ts->negative_rules_pre == NULL) || (ts->negative_rules_post == NULL)) {
579
+ rb_raise(rb_eNoMemError, "Failed to allocate memory for negative rules");
580
+ }
581
+
582
+ for (size_t i = 0; i < ts->num_negative_rules; i++) {
583
+ VALUE rb_rule = rb_ary_entry(rb_negative_rules, i);
584
+ VALUE rb_rule_pre = rb_ary_entry(rb_rule, 0);
585
+ VALUE rb_rule_post = rb_ary_entry(rb_rule, 1);
586
+ ts->negative_rules_pre[i] = compile_pattern(rb_rule_pre);
587
+ ts->negative_rules_post[i] = compile_pattern(rb_rule_post);
588
+ }
589
+
590
+ return self;
591
+ }
592
+
593
+ // Module initialization
594
+ void Init_text_sentencer_c(void) {
595
+ rb_cTextSentencer = rb_define_class("TextSentencer", rb_cObject);
596
+ rb_define_alloc_func(rb_cTextSentencer, text_sentencer_allocate);
597
+ rb_define_method(rb_cTextSentencer, "initialize", text_sentencer_initialize, 1);
598
+ rb_define_method(rb_cTextSentencer, "segment", text_sentencer_segment, 1);
599
+ rb_define_method(rb_cTextSentencer, "annotate", text_sentencer_annotate, 1);
600
+ }
@@ -0,0 +1,31 @@
1
+ #ifndef TEXT_SENTENCER_H
2
+ #define TEXT_SENTENCER_H
3
+
4
+ #include "ruby.h"
5
+ #include <unicode/uregex.h>
6
+ #include <unicode/ustring.h>
7
+ #include <unicode/utypes.h>
8
+ #include <unicode/utext.h>
9
+
10
+
11
+ typedef struct { // compiled-regex state of one TextSentencer object; the rule arrays are filled by text_sentencer_initialize
12
+ URegularExpression *break_pattern; // a single compiled pattern (presumably built from rules[:break_pattern] -- confirm in the initializer)
13
+ URegularExpression *candidate_pattern; // a single compiled pattern (presumably built from rules[:candidate_pattern] -- confirm in the initializer)
14
+
15
+ size_t num_positive_rules; // element count shared by both positive_rules_* arrays
16
+ URegularExpression **positive_rules_pre; // one compiled pre-pattern per positive rule (element 0 of each [pre, post] pair)
17
+ URegularExpression **positive_rules_post; // one compiled post-pattern per positive rule (element 1 of each [pre, post] pair)
18
+
19
+ size_t num_negative_rules; // element count shared by both negative_rules_* arrays (length of the negative_rules Ruby array)
20
+ URegularExpression **negative_rules_pre; // malloc'd array: one compiled pre-pattern per negative rule (element 0 of each pair)
21
+ URegularExpression **negative_rules_post; // malloc'd array: one compiled post-pattern per negative rule (element 1 of each pair)
22
+ } TextSentencer;
23
+
24
+ // Function prototypes: the extension entry point plus the C functions it registers on the Ruby class TextSentencer
25
+ void Init_text_sentencer_c(void); // entry point: defines class TextSentencer and registers the functions below
26
+ VALUE text_sentencer_allocate(VALUE klass); // registered as the allocation function of TextSentencer
27
+ VALUE text_sentencer_initialize(VALUE self, VALUE rules); // bound to TextSentencer#initialize; rules is a Hash read with symbol keys; returns self
28
+ VALUE text_sentencer_segment(VALUE self, VALUE text); // bound to TextSentencer#segment (one argument)
29
+ VALUE text_sentencer_annotate(VALUE self, VALUE text); // bound to TextSentencer#annotate (one argument)
30
+
31
+ #endif // TEXT_SENTENCER_H
@@ -0,0 +1,26 @@
1
+ require 'text_sentencer_c/text_sentencer_c'
2
+
3
# Ruby-side companion of the C extension: holds the default rule set and a
# helper for layering user-supplied rules on top of it.
class TextSentencer
  # Default rule set handed to TextSentencer.new.
  #
  # Every pattern is a regular-expression source string that the C extension
  # compiles. Each positive/negative rule is a [pre, post] pair of patterns.
  # NOTE(review): the pre-pattern presumably applies to the text before a
  # candidate position and the post-pattern to the text after it -- confirm
  # against the C implementation.
  #
  # The hash and its rule arrays are frozen so the shared defaults cannot be
  # modified by accident; the pattern strings themselves are left untouched.
  DEFAULT_RULES = {
    break_pattern: "([ \t]*\n+)+[ \t]*", # one or more consecutive blank lines
    candidate_pattern: "[ \t]+",
    positive_rules: [
      ["[.!?]$", "^[0-9A-Z]"].freeze,
      [":$", "^[0-9]"].freeze,
      [":$", "^[A-Z][a-z]"].freeze
    ].freeze,
    negative_rules: [
      ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.$', '^[A-Z][a-z]'].freeze,
      ['(Sr|Jr)\.$', '^[A-Z][a-z]'].freeze,
      ['\b[A-Z][a-z]*\.$', '^[0-9A-Z]'].freeze,
      ['(cf|vs)\.$', '^.'].freeze,
      ['e\.g\.$', '^.'].freeze,
      ['i\.e\.$', '^.'].freeze,
      ['(Sec|Chap|Fig|Eq)\.$', '^[0-9A-Z]'].freeze
    ].freeze
  }.freeze

  # Builds a complete rule set by overriding the defaults with +rules+.
  #
  # @param rules [Hash, nil] overrides keyed by Symbol or String (for example
  #   the result of JSON.parse); nil means "no overrides".
  # @return [Hash] a new, unfrozen, symbol-keyed hash. Its arrays and strings
  #   are copies, so the caller may modify the result without affecting
  #   DEFAULT_RULES.
  def self.update_rules(rules)
    # The C initializer reads symbol keys only, so normalize string keys
    # instead of letting them sit unused next to the defaults.
    overrides = (rules || {}).transform_keys(&:to_sym)

    DEFAULT_RULES.merge(overrides).transform_values do |value|
      value.is_a?(Array) ? value.map(&:dup) : value.dup
    end
  end
end
@@ -0,0 +1,16 @@
1
# Gem specification for text_sentencer_c: a source gem whose C extension is
# compiled on the installing machine through ext/text_sentencer_c/extconf.rb.
Gem::Specification.new do |spec|
  spec.name = "text_sentencer_c"
  spec.version = "0.1.0"
  spec.authors = ["Jin-Dong Kim"]
  spec.email = ["jindong.kim@gmail.com"]
  spec.summary = "Text segmentation into sentences"
  spec.description = "It is a reimplementation text_sentencer, which is originally written in ruby, using C extension for a better performance. It is a preliminary version, and may not be fully functional."
  spec.homepage = "https://github.com/jdkim/text_sentencer_c"
  spec.license = "MIT"

  # Package sources only. A plain glob over lib/ and ext/ also picks up
  # whatever a local build left behind (the mkmf-generated Makefile, object
  # files, the compiled shared library), which are platform-specific and are
  # regenerated at install time anyway. Directories are skipped as well.
  generated_extensions = %w[.so .o .bundle .dll].freeze
  generated_basenames = %w[Makefile mkmf.log].freeze
  sources = Dir["{lib,ext}/**/*"].select do |path|
    File.file?(path) &&
      !generated_extensions.include?(File.extname(path)) &&
      !generated_basenames.include?(File.basename(path))
  end

  spec.files = sources + ["text_sentencer_c.gemspec", "Rakefile", "README.md"]
  spec.executables = ['text_sentencer']
  spec.require_paths = ["lib", "ext"]

  spec.extensions = ["ext/text_sentencer_c/extconf.rb"]
end
metadata ADDED
@@ -0,0 +1,58 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: text_sentencer_c
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jin-Dong Kim
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2024-08-28 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: It is a reimplementation text_sentencer, which is originally written
14
+ in ruby, using C extension for a better performance. It is a preliminary version,
15
+ and may not be fully functional.
16
+ email:
17
+ - jindong.kim@gmail.com
18
+ executables:
19
+ - text_sentencer
20
+ extensions:
21
+ - ext/text_sentencer_c/extconf.rb
22
+ extra_rdoc_files: []
23
+ files:
24
+ - README.md
25
+ - Rakefile
26
+ - bin/text_sentencer
27
+ - ext/text_sentencer_c/Makefile
28
+ - ext/text_sentencer_c/extconf.rb
29
+ - ext/text_sentencer_c/text_sentencer.c
30
+ - ext/text_sentencer_c/text_sentencer.h
31
+ - lib/text_sentencer_c.rb
32
+ - lib/text_sentencer_c/text_sentencer_c.so
33
+ - text_sentencer_c.gemspec
34
+ homepage: https://github.com/jdkim/text_sentencer_c
35
+ licenses:
36
+ - MIT
37
+ metadata: {}
38
+ post_install_message:
39
+ rdoc_options: []
40
+ require_paths:
41
+ - lib
42
+ - ext
43
+ required_ruby_version: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ required_rubygems_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ requirements: []
54
+ rubygems_version: 3.5.11
55
+ signing_key:
56
+ specification_version: 4
57
+ summary: Text segmentation into sentences
58
+ test_files: []