text_sentencer_c 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +11 -0
- data/Rakefile +6 -0
- data/bin/text_sentencer +52 -0
- data/ext/text_sentencer_c/Makefile +269 -0
- data/ext/text_sentencer_c/extconf.rb +7 -0
- data/ext/text_sentencer_c/text_sentencer.c +600 -0
- data/ext/text_sentencer_c/text_sentencer.h +31 -0
- data/lib/text_sentencer_c/text_sentencer_c.so +0 -0
- data/lib/text_sentencer_c.rb +26 -0
- data/text_sentencer_c.gemspec +16 -0
- metadata +58 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: c8b424a5b6ea7cb3ac0b7138f3f9c2ec480144b6a5a21a706937f41e37e11edd
|
4
|
+
data.tar.gz: 40f8b0e846f631e1142113b0073b47169ef6cb2348bedf9b1a55c5b8662a3ee9
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 68fe2267dbb019c2e3e515527f0cd0026b9a2212ebafafe03b87d3663dc326d66fbe92c5c5fbf909e55b53798158e4f79ff2be79ce7925470025e64f249b61a6
|
7
|
+
data.tar.gz: c03d9f4347e6a3e2dbec7730560526ff5eac208fa553c87be9b9ebf78b0c9036346fc08ac5aa3cc1ea2402b3c8d82f5ddef94097cd1e3d5dc1ad71e0ce51b3bc
|
data/README.md
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# text_sentencer_c
|
2
|
+
|
3
|
+
A Ruby gem for text segmentation using a C extension.
|
4
|
+
It is a re-implementation of 'text_sentencer', which was originally written in pure Ruby, as a C extension for better performance.
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Add this line to your application's Gemfile:
|
9
|
+
|
10
|
+
```ruby
|
11
|
+
gem 'text_sentencer_c'
|
data/Rakefile
ADDED
data/bin/text_sentencer
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command line front end for TextSentencer: reads text from the files named on
# the command line (or from standard input) and prints either one sentence per
# line or the full annotation as JSON.
require 'json'
require 'optparse'
require 'text_sentencer_c'

rules_filename = nil
output_mode = :sentences

## command line option processing
optparse = OptionParser.new do |opts|
  opts.banner = "Usage: text_sentencer [options]"

  opts.on('-r', '--rules=rules_filename', 'specifies the rules JSON file.') do |c|
    rules_filename = c
  end

  opts.on('-j', '--json_output', 'outputs the result in JSON.') do
    output_mode = :json
  end

  opts.on('-h', '--help', 'displays this screen.') do
    puts opts
    exit
  end
end

optparse.parse!

# Load custom rules when a rules file was named and exists. A missing file is
# reported on stderr instead of being ignored silently; the default rules are
# still used so the command keeps working.
rules = nil
if rules_filename
  if File.file?(rules_filename)
    rules = JSON.parse(File.read(rules_filename))
  else
    warn "text_sentencer: rules file '#{rules_filename}' not found; using the default rules."
  end
end

sentencer = TextSentencer.new(rules || TextSentencer::DEFAULT_RULES)

text = ARGF.read

# segments = sentencer.segment(text)
# p segments
# segments.each do |b, e|
#   puts text[b...e]
# end

annotations = sentencer.annotate(text)

if output_mode == :json
  puts JSON.pretty_generate(annotations)
else
  # The annotate method of the C extension (ext/text_sentencer_c/text_sentencer.c)
  # keys its hashes with Symbols, while this script used to look up String keys
  # only. Accept both so the sentences are printed either way.
  blocks = annotations[:blocks] || annotations['blocks']
  blocks&.each do |block|
    span = block[:span] || block['span']
    first = span[:begin] || span['begin']
    last = span[:end] || span['end']
    puts text[first...last]
  end
end
|
@@ -0,0 +1,269 @@
|
|
1
|
+
|
2
|
+
SHELL = /bin/sh
|
3
|
+
|
4
|
+
# V=0 quiet, V=1 verbose. other values don't work.
|
5
|
+
V = 0
|
6
|
+
V0 = $(V:0=)
|
7
|
+
Q1 = $(V:1=)
|
8
|
+
Q = $(Q1:0=@)
|
9
|
+
ECHO1 = $(V:1=@ :)
|
10
|
+
ECHO = $(ECHO1:0=@ echo)
|
11
|
+
NULLCMD = :
|
12
|
+
|
13
|
+
#### Start of system configuration section. ####
|
14
|
+
|
15
|
+
srcdir = .
|
16
|
+
topdir = /home/jdkim/.rvm/rubies/ruby-3.3.4/include/ruby-3.3.0
|
17
|
+
hdrdir = $(topdir)
|
18
|
+
arch_hdrdir = /home/jdkim/.rvm/rubies/ruby-3.3.4/include/ruby-3.3.0/x86_64-linux
|
19
|
+
PATH_SEPARATOR = :
|
20
|
+
VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
|
21
|
+
prefix = $(DESTDIR)/home/jdkim/.rvm/rubies/ruby-3.3.4
|
22
|
+
rubysitearchprefix = $(rubylibprefix)/$(sitearch)
|
23
|
+
rubyarchprefix = $(rubylibprefix)/$(arch)
|
24
|
+
rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
|
25
|
+
exec_prefix = $(prefix)
|
26
|
+
vendorarchhdrdir = $(vendorhdrdir)/$(sitearch)
|
27
|
+
sitearchhdrdir = $(sitehdrdir)/$(sitearch)
|
28
|
+
rubyarchhdrdir = $(rubyhdrdir)/$(arch)
|
29
|
+
vendorhdrdir = $(rubyhdrdir)/vendor_ruby
|
30
|
+
sitehdrdir = $(rubyhdrdir)/site_ruby
|
31
|
+
rubyhdrdir = $(includedir)/$(RUBY_VERSION_NAME)
|
32
|
+
vendorarchdir = $(vendorlibdir)/$(sitearch)
|
33
|
+
vendorlibdir = $(vendordir)/$(ruby_version)
|
34
|
+
vendordir = $(rubylibprefix)/vendor_ruby
|
35
|
+
sitearchdir = $(sitelibdir)/$(sitearch)
|
36
|
+
sitelibdir = $(sitedir)/$(ruby_version)
|
37
|
+
sitedir = $(rubylibprefix)/site_ruby
|
38
|
+
rubyarchdir = $(rubylibdir)/$(arch)
|
39
|
+
rubylibdir = $(rubylibprefix)/$(ruby_version)
|
40
|
+
sitearchincludedir = $(includedir)/$(sitearch)
|
41
|
+
archincludedir = $(includedir)/$(arch)
|
42
|
+
sitearchlibdir = $(libdir)/$(sitearch)
|
43
|
+
archlibdir = $(libdir)/$(arch)
|
44
|
+
ridir = $(datarootdir)/$(RI_BASE_NAME)
|
45
|
+
mandir = $(datarootdir)/man
|
46
|
+
localedir = $(datarootdir)/locale
|
47
|
+
libdir = $(exec_prefix)/lib
|
48
|
+
psdir = $(docdir)
|
49
|
+
pdfdir = $(docdir)
|
50
|
+
dvidir = $(docdir)
|
51
|
+
htmldir = $(docdir)
|
52
|
+
infodir = $(datarootdir)/info
|
53
|
+
docdir = $(datarootdir)/doc/$(PACKAGE)
|
54
|
+
oldincludedir = $(DESTDIR)/usr/include
|
55
|
+
includedir = $(prefix)/include
|
56
|
+
runstatedir = $(localstatedir)/run
|
57
|
+
localstatedir = $(prefix)/var
|
58
|
+
sharedstatedir = $(prefix)/com
|
59
|
+
sysconfdir = $(prefix)/etc
|
60
|
+
datadir = $(datarootdir)
|
61
|
+
datarootdir = $(prefix)/share
|
62
|
+
libexecdir = $(exec_prefix)/libexec
|
63
|
+
sbindir = $(exec_prefix)/sbin
|
64
|
+
bindir = $(exec_prefix)/bin
|
65
|
+
archdir = $(rubyarchdir)
|
66
|
+
|
67
|
+
|
68
|
+
CC_WRAPPER =
|
69
|
+
CC = gcc
|
70
|
+
CXX = g++
|
71
|
+
LIBRUBY = $(LIBRUBY_SO)
|
72
|
+
LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
|
73
|
+
LIBRUBYARG_SHARED = -Wl,-rpath,$(libdir) -L$(libdir) -l$(RUBY_SO_NAME)
|
74
|
+
LIBRUBYARG_STATIC = -Wl,-rpath,$(libdir) -L$(libdir) -l$(RUBY_SO_NAME)-static $(MAINLIBS)
|
75
|
+
empty =
|
76
|
+
OUTFLAG = -o $(empty)
|
77
|
+
COUTFLAG = -o $(empty)
|
78
|
+
CSRCFLAG = $(empty)
|
79
|
+
|
80
|
+
RUBY_EXTCONF_H =
|
81
|
+
cflags = $(optflags) $(debugflags) $(warnflags)
|
82
|
+
cxxflags =
|
83
|
+
optflags = -O3 -fno-fast-math
|
84
|
+
debugflags = -ggdb3
|
85
|
+
warnflags = -Wall -Wextra -Wdeprecated-declarations -Wdiv-by-zero -Wduplicated-cond -Wimplicit-function-declaration -Wimplicit-int -Wpointer-arith -Wwrite-strings -Wold-style-definition -Wimplicit-fallthrough=0 -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-packed-bitfield-compat -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wsuggest-attribute=format -Wsuggest-attribute=noreturn -Wunused-variable -Wmisleading-indentation -Wundef
|
86
|
+
cppflags =
|
87
|
+
CCDLFLAGS = -fPIC
|
88
|
+
CFLAGS = $(CCDLFLAGS) $(cflags) -fPIC $(ARCH_FLAG)
|
89
|
+
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
|
90
|
+
DEFS =
|
91
|
+
CPPFLAGS = $(DEFS) $(cppflags)
|
92
|
+
CXXFLAGS = $(CCDLFLAGS) $(ARCH_FLAG)
|
93
|
+
ldflags = -L. -fstack-protector-strong -rdynamic -Wl,-export-dynamic -Wl,--no-as-needed
|
94
|
+
dldflags = -Wl,--compress-debug-sections=zlib
|
95
|
+
ARCH_FLAG =
|
96
|
+
DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
|
97
|
+
LDSHARED = $(CC) -shared
|
98
|
+
LDSHAREDXX = $(CXX) -shared
|
99
|
+
AR = gcc-ar
|
100
|
+
EXEEXT =
|
101
|
+
|
102
|
+
RUBY_INSTALL_NAME = $(RUBY_BASE_NAME)
|
103
|
+
RUBY_SO_NAME = ruby
|
104
|
+
RUBYW_INSTALL_NAME =
|
105
|
+
RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version)
|
106
|
+
RUBYW_BASE_NAME = rubyw
|
107
|
+
RUBY_BASE_NAME = ruby
|
108
|
+
|
109
|
+
arch = x86_64-linux
|
110
|
+
sitearch = $(arch)
|
111
|
+
ruby_version = 3.3.0
|
112
|
+
ruby = $(bindir)/$(RUBY_BASE_NAME)
|
113
|
+
RUBY = $(ruby)
|
114
|
+
BUILTRUBY = $(bindir)/$(RUBY_BASE_NAME)
|
115
|
+
ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
|
116
|
+
|
117
|
+
RM = rm -f
|
118
|
+
RM_RF = rm -fr
|
119
|
+
RMDIRS = rmdir --ignore-fail-on-non-empty -p
|
120
|
+
MAKEDIRS = /usr/bin/mkdir -p
|
121
|
+
INSTALL = /usr/bin/install -c
|
122
|
+
INSTALL_PROG = $(INSTALL) -m 0755
|
123
|
+
INSTALL_DATA = $(INSTALL) -m 644
|
124
|
+
COPY = cp
|
125
|
+
TOUCH = exit >
|
126
|
+
|
127
|
+
#### End of system configuration section. ####
|
128
|
+
|
129
|
+
preload =
|
130
|
+
libpath = . $(libdir)
|
131
|
+
LIBPATH = -L. -L$(libdir) -Wl,-rpath,$(libdir)
|
132
|
+
DEFFILE =
|
133
|
+
|
134
|
+
CLEANFILES = mkmf.log
|
135
|
+
DISTCLEANFILES =
|
136
|
+
DISTCLEANDIRS =
|
137
|
+
|
138
|
+
extout =
|
139
|
+
extout_prefix =
|
140
|
+
target_prefix = /text_sentencer_c
|
141
|
+
LOCAL_LIBS =
|
142
|
+
LIBS = $(LIBRUBYARG_SHARED) -lm -lpthread -lc
|
143
|
+
ORIG_SRCS = text_sentencer.c
|
144
|
+
SRCS = $(ORIG_SRCS)
|
145
|
+
OBJS = text_sentencer.o
|
146
|
+
HDRS = $(srcdir)/text_sentencer.h
|
147
|
+
LOCAL_HDRS =
|
148
|
+
TARGET = text_sentencer_c
|
149
|
+
TARGET_NAME = text_sentencer_c
|
150
|
+
TARGET_ENTRY = Init_$(TARGET_NAME)
|
151
|
+
DLLIB = $(TARGET).so
|
152
|
+
EXTSTATIC =
|
153
|
+
STATIC_LIB =
|
154
|
+
|
155
|
+
TIMESTAMP_DIR = .
|
156
|
+
BINDIR = $(bindir)
|
157
|
+
RUBYCOMMONDIR = $(sitedir)$(target_prefix)
|
158
|
+
RUBYLIBDIR = $(sitelibdir)$(target_prefix)
|
159
|
+
RUBYARCHDIR = $(sitearchdir)$(target_prefix)
|
160
|
+
HDRDIR = $(sitehdrdir)$(target_prefix)
|
161
|
+
ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
|
162
|
+
TARGET_SO_DIR =
|
163
|
+
TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
|
164
|
+
CLEANLIBS = $(TARGET_SO) false
|
165
|
+
CLEANOBJS = $(OBJS) *.bak
|
166
|
+
TARGET_SO_DIR_TIMESTAMP = $(TIMESTAMP_DIR)/.sitearchdir.-.text_sentencer_c.time
|
167
|
+
|
168
|
+
all: $(DLLIB)
|
169
|
+
static: $(STATIC_LIB)
|
170
|
+
.PHONY: all install static install-so install-rb
|
171
|
+
.PHONY: clean clean-so clean-static clean-rb
|
172
|
+
|
173
|
+
clean-static::
|
174
|
+
clean-rb-default::
|
175
|
+
clean-rb::
|
176
|
+
clean-so::
|
177
|
+
clean: clean-so clean-static clean-rb-default clean-rb
|
178
|
+
-$(Q)$(RM_RF) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
|
179
|
+
|
180
|
+
distclean-rb-default::
|
181
|
+
distclean-rb::
|
182
|
+
distclean-so::
|
183
|
+
distclean-static::
|
184
|
+
distclean: clean distclean-so distclean-static distclean-rb-default distclean-rb
|
185
|
+
-$(Q)$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
|
186
|
+
-$(Q)$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
|
187
|
+
-$(Q)$(RMDIRS) $(DISTCLEANDIRS) 2> /dev/null || true
|
188
|
+
|
189
|
+
realclean: distclean
|
190
|
+
install: install-so install-rb
|
191
|
+
|
192
|
+
install-so: $(DLLIB) $(TARGET_SO_DIR_TIMESTAMP)
|
193
|
+
$(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
|
194
|
+
clean-static::
|
195
|
+
-$(Q)$(RM) $(STATIC_LIB)
|
196
|
+
install-rb: pre-install-rb do-install-rb install-rb-default
|
197
|
+
install-rb-default: pre-install-rb-default do-install-rb-default
|
198
|
+
pre-install-rb: Makefile
|
199
|
+
pre-install-rb-default: Makefile
|
200
|
+
do-install-rb:
|
201
|
+
do-install-rb-default:
|
202
|
+
pre-install-rb-default:
|
203
|
+
@$(NULLCMD)
|
204
|
+
$(TARGET_SO_DIR_TIMESTAMP):
|
205
|
+
$(Q) $(MAKEDIRS) $(@D) $(RUBYARCHDIR)
|
206
|
+
$(Q) $(TOUCH) $@
|
207
|
+
|
208
|
+
site-install: site-install-so site-install-rb
|
209
|
+
site-install-so: install-so
|
210
|
+
site-install-rb: install-rb
|
211
|
+
|
212
|
+
.SUFFIXES: .c .m .cc .mm .cxx .cpp .o .S
|
213
|
+
|
214
|
+
.cc.o:
|
215
|
+
$(ECHO) compiling $(<)
|
216
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
217
|
+
|
218
|
+
.cc.S:
|
219
|
+
$(ECHO) translating $(<)
|
220
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
221
|
+
|
222
|
+
.mm.o:
|
223
|
+
$(ECHO) compiling $(<)
|
224
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
225
|
+
|
226
|
+
.mm.S:
|
227
|
+
$(ECHO) translating $(<)
|
228
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
229
|
+
|
230
|
+
.cxx.o:
|
231
|
+
$(ECHO) compiling $(<)
|
232
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
233
|
+
|
234
|
+
.cxx.S:
|
235
|
+
$(ECHO) translating $(<)
|
236
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
237
|
+
|
238
|
+
.cpp.o:
|
239
|
+
$(ECHO) compiling $(<)
|
240
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
241
|
+
|
242
|
+
.cpp.S:
|
243
|
+
$(ECHO) translating $(<)
|
244
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
245
|
+
|
246
|
+
.c.o:
|
247
|
+
$(ECHO) compiling $(<)
|
248
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
249
|
+
|
250
|
+
.c.S:
|
251
|
+
$(ECHO) translating $(<)
|
252
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
253
|
+
|
254
|
+
.m.o:
|
255
|
+
$(ECHO) compiling $(<)
|
256
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
257
|
+
|
258
|
+
.m.S:
|
259
|
+
$(ECHO) translating $(<)
|
260
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
261
|
+
|
262
|
+
$(TARGET_SO): $(OBJS) Makefile
|
263
|
+
$(ECHO) linking shared-object text_sentencer_c/$(DLLIB)
|
264
|
+
-$(Q)$(RM) $(@)
|
265
|
+
$(Q) $(LDSHARED) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
|
266
|
+
|
267
|
+
|
268
|
+
|
269
|
+
$(OBJS): $(HDRS) $(ruby_headers)
|
@@ -0,0 +1,600 @@
|
|
1
|
+
#include "text_sentencer.h"
|
2
|
+
#include <ruby/thread.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#include <stdlib.h>
|
5
|
+
|
6
|
+
// Define the TextSentencer class
|
7
|
+
VALUE rb_cTextSentencer;
|
8
|
+
|
9
|
+
static void text_sentencer_free(void *p) {
|
10
|
+
TextSentencer *ts = (TextSentencer *)p;
|
11
|
+
|
12
|
+
// Free each compiled regex
|
13
|
+
uregex_close(ts->break_pattern);
|
14
|
+
uregex_close(ts->candidate_pattern);
|
15
|
+
|
16
|
+
for (size_t i = 0; i < ts->num_positive_rules; i++) {
|
17
|
+
uregex_close(ts->positive_rules_pre[i]);
|
18
|
+
uregex_close(ts->positive_rules_post[i]);
|
19
|
+
}
|
20
|
+
for (size_t i = 0; i < ts->num_negative_rules; i++) {
|
21
|
+
uregex_close(ts->negative_rules_pre[i]);
|
22
|
+
uregex_close(ts->negative_rules_post[i]);
|
23
|
+
}
|
24
|
+
|
25
|
+
free(ts->positive_rules_pre);
|
26
|
+
free(ts->positive_rules_post);
|
27
|
+
free(ts->negative_rules_pre);
|
28
|
+
free(ts->negative_rules_post);
|
29
|
+
|
30
|
+
free(ts);
|
31
|
+
}
|
32
|
+
|
33
|
+
static size_t text_sentencer_memsize(const void *p) {
|
34
|
+
const TextSentencer *ts = (const TextSentencer *)p;
|
35
|
+
|
36
|
+
// Calculate the size of the TextSentencer structure itself
|
37
|
+
size_t size = sizeof(TextSentencer);
|
38
|
+
|
39
|
+
// Add the size of the arrays of pointers to URegularExpression
|
40
|
+
size += ts->num_positive_rules * sizeof(URegularExpression *);
|
41
|
+
size += ts->num_negative_rules * sizeof(URegularExpression *);
|
42
|
+
|
43
|
+
// Add the two main pointers for break_pattern and candidate_pattern
|
44
|
+
size += sizeof(URegularExpression *) * 2;
|
45
|
+
|
46
|
+
return size;
|
47
|
+
}
|
48
|
+
|
49
|
+
|
50
|
+
static const rb_data_type_t text_sentencer_type = {
|
51
|
+
"TextSentencer",
|
52
|
+
{
|
53
|
+
0, // Mark function (not needed here)
|
54
|
+
text_sentencer_free, // Free function
|
55
|
+
text_sentencer_memsize, // Size function
|
56
|
+
},
|
57
|
+
NULL, NULL, RUBY_TYPED_FREE_IMMEDIATELY,
|
58
|
+
};
|
59
|
+
|
60
|
+
/*
|
61
|
+
// Helper function to convert UTF-8 string to UTF-16
|
62
|
+
static UChar* utf8_to_utf16(const char* utf8, int32_t* utf16_len, UErrorCode* status) {
|
63
|
+
int32_t utf16_capacity = 0;
|
64
|
+
u_strFromUTF8(NULL, 0, &utf16_capacity, utf8, -1, status);
|
65
|
+
if (*status != U_BUFFER_OVERFLOW_ERROR) {
|
66
|
+
return NULL;
|
67
|
+
}
|
68
|
+
|
69
|
+
*status = U_ZERO_ERROR;
|
70
|
+
UChar* utf16 = (UChar*)malloc(utf16_capacity * sizeof(UChar));
|
71
|
+
u_strFromUTF8(utf16, utf16_capacity, utf16_len, utf8, -1, status);
|
72
|
+
return utf16;
|
73
|
+
}
|
74
|
+
*/
|
75
|
+
|
76
|
+
// Helper function to convert UTF-8 string to UTF-16
|
77
|
+
static UChar* utf8_to_utf16(const char* utf8, int32_t* utf16_len, UErrorCode* status) {
|
78
|
+
int32_t utf16_capacity = 0;
|
79
|
+
|
80
|
+
// First, get the required capacity for the UTF-16 string
|
81
|
+
u_strFromUTF8(NULL, 0, &utf16_capacity, utf8, -1, status);
|
82
|
+
if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
|
83
|
+
printf("utf8_to_utf16 returning NULL.\n");
|
84
|
+
return NULL; // Return if an error occurred that is not buffer overflow
|
85
|
+
}
|
86
|
+
|
87
|
+
*status = U_ZERO_ERROR; // Reset the status before the actual conversion
|
88
|
+
|
89
|
+
// Allocate memory for the UTF-16 string
|
90
|
+
UChar* utf16 = (UChar*)malloc(utf16_capacity * sizeof(UChar));
|
91
|
+
if (utf16 == NULL) {
|
92
|
+
*status = U_MEMORY_ALLOCATION_ERROR;
|
93
|
+
printf("utf8_to_utf16 returning NULL.\n");
|
94
|
+
return NULL;
|
95
|
+
}
|
96
|
+
|
97
|
+
// Perform the conversion from UTF-8 to UTF-16
|
98
|
+
u_strFromUTF8(utf16, utf16_capacity, utf16_len, utf8, -1, status);
|
99
|
+
if (U_FAILURE(*status)) {
|
100
|
+
free(utf16); // Free allocated memory if conversion fails
|
101
|
+
printf("utf8_to_utf16 returning NULL.\n");
|
102
|
+
return NULL;
|
103
|
+
}
|
104
|
+
|
105
|
+
return utf16;
|
106
|
+
}
|
107
|
+
|
108
|
+
/*
|
109
|
+
// Utility function to scan text with a regex pattern and return offsets using ICU
|
110
|
+
static long scan_offsets(const UChar *text, URegularExpression *pattern, long **span_starts, long **span_ends, UErrorCode *status) {
|
111
|
+
long allocated = 10;
|
112
|
+
long used = 0;
|
113
|
+
|
114
|
+
*span_starts = malloc(allocated * sizeof(long));
|
115
|
+
*span_ends = malloc(allocated * sizeof(long));
|
116
|
+
if (*span_starts == NULL || *span_ends == NULL) {
|
117
|
+
rb_raise(rb_eRuntimeError, "Memory allocation failed");
|
118
|
+
}
|
119
|
+
|
120
|
+
uregex_setText(pattern, text, -1, status);
|
121
|
+
if (U_FAILURE(*status)) {
|
122
|
+
rb_raise(rb_eRuntimeError, "Failed to set text for regex");
|
123
|
+
}
|
124
|
+
|
125
|
+
while (uregex_findNext(pattern, status)) {
|
126
|
+
if (used >= allocated) {
|
127
|
+
allocated *= 2;
|
128
|
+
*span_starts = realloc(*span_starts, allocated * sizeof(long));
|
129
|
+
*span_ends = realloc(*span_ends, allocated * sizeof(long));
|
130
|
+
if (*span_starts == NULL || *span_ends == NULL) {
|
131
|
+
rb_raise(rb_eRuntimeError, "Memory allocation failed");
|
132
|
+
}
|
133
|
+
}
|
134
|
+
|
135
|
+
(*span_starts)[used] = uregex_start(pattern, 0, status);
|
136
|
+
(*span_ends)[used] = uregex_end(pattern, 0, status);
|
137
|
+
if (U_FAILURE(*status)) {
|
138
|
+
rb_raise(rb_eRuntimeError, "Error during regex match: %s", u_errorName(*status));
|
139
|
+
}
|
140
|
+
|
141
|
+
used++;
|
142
|
+
}
|
143
|
+
|
144
|
+
return used;
|
145
|
+
}
|
146
|
+
*/
|
147
|
+
// Utility function to match a regex against a substring using ICU
|
148
|
+
static int match_regex(const UChar *text, long start, long end, URegularExpression *pattern, UErrorCode *status) {
|
149
|
+
uregex_setText(pattern, text + start, end - start, status);
|
150
|
+
if (U_FAILURE(*status)) {
|
151
|
+
rb_raise(rb_eRuntimeError, "Failed to set text for regex");
|
152
|
+
}
|
153
|
+
|
154
|
+
return uregex_find(pattern, 0, status);
|
155
|
+
}
|
156
|
+
|
157
|
+
// Debug helper: writes the code-unit count followed by every UTF-16 code unit
// as a four-digit hexadecimal number to standard output.
void print_utf16_hex(const UChar *utf16_text, int32_t utf16_len) {
    int32_t pos = 0;

    printf("UTF-16 Length: %d\n", utf16_len);
    fputs("UTF-16 Hex Values: ", stdout);

    while (pos < utf16_len) {
        printf("%04X ", utf16_text[pos]);
        pos++;
    }

    putchar('\n');
}
|
165
|
+
|
166
|
+
// Function to print UChar array after converting back to UTF-8
|
167
|
+
void print_utf16_as_utf8(const UChar *utf16_text, int32_t utf16_len) {
|
168
|
+
UErrorCode status = U_ZERO_ERROR;
|
169
|
+
int32_t utf8_len = 0;
|
170
|
+
|
171
|
+
// First, determine the length of the UTF-8 string
|
172
|
+
u_strToUTF8(NULL, 0, &utf8_len, utf16_text, utf16_len, &status);
|
173
|
+
if (status != U_BUFFER_OVERFLOW_ERROR) {
|
174
|
+
printf("Error in calculating UTF-8 length: %s\n", u_errorName(status));
|
175
|
+
return;
|
176
|
+
}
|
177
|
+
|
178
|
+
status = U_ZERO_ERROR; // Reset the error code
|
179
|
+
char *utf8_text = (char *)malloc(utf8_len + 1); // Allocate memory for the UTF-8 string
|
180
|
+
if (utf8_text == NULL) {
|
181
|
+
printf("Memory allocation failed for UTF-8 string\n");
|
182
|
+
return;
|
183
|
+
}
|
184
|
+
|
185
|
+
// Convert the UTF-16 string back to UTF-8
|
186
|
+
u_strToUTF8(utf8_text, utf8_len + 1, NULL, utf16_text, utf16_len, &status);
|
187
|
+
if (U_FAILURE(status)) {
|
188
|
+
printf("Error in converting UTF-16 to UTF-8: %s\n", u_errorName(status));
|
189
|
+
} else {
|
190
|
+
printf("UTF-8 Text: %s\n", utf8_text);
|
191
|
+
}
|
192
|
+
|
193
|
+
free(utf8_text); // Free the allocated memory for UTF-8 string
|
194
|
+
}
|
195
|
+
|
196
|
+
// Utility function to scan text with a regex pattern and return offsets using ICU
|
197
|
+
static long scan_offsets(const UChar *text, URegularExpression *pattern, long **span_starts, long **span_ends, UErrorCode *status) {
|
198
|
+
long allocated = 10;
|
199
|
+
long used = 0;
|
200
|
+
|
201
|
+
if (status == NULL || U_FAILURE(*status)) {
|
202
|
+
rb_raise(rb_eRuntimeError, "Invalid or uninitialized UErrorCode provided");
|
203
|
+
}
|
204
|
+
|
205
|
+
*span_starts = malloc(allocated * sizeof(long));
|
206
|
+
*span_ends = malloc(allocated * sizeof(long));
|
207
|
+
if (*span_starts == NULL || *span_ends == NULL) {
|
208
|
+
rb_raise(rb_eRuntimeError, "Memory allocation failed");
|
209
|
+
}
|
210
|
+
|
211
|
+
// Set the text to search with the regex
|
212
|
+
uregex_setText(pattern, text, -1, status);
|
213
|
+
if (U_FAILURE(*status)) {
|
214
|
+
rb_raise(rb_eRuntimeError, "Failed to set text for regex: %s", u_errorName(*status));
|
215
|
+
}
|
216
|
+
|
217
|
+
// Find matches and store offsets
|
218
|
+
while (uregex_findNext(pattern, status)) {
|
219
|
+
if (U_FAILURE(*status)) {
|
220
|
+
rb_raise(rb_eRuntimeError, "Error during regex match: %s", u_errorName(*status));
|
221
|
+
}
|
222
|
+
|
223
|
+
if (used >= allocated) {
|
224
|
+
allocated *= 2;
|
225
|
+
*span_starts = realloc(*span_starts, allocated * sizeof(long));
|
226
|
+
*span_ends = realloc(*span_ends, allocated * sizeof(long));
|
227
|
+
if (*span_starts == NULL || *span_ends == NULL) {
|
228
|
+
rb_raise(rb_eRuntimeError, "Memory allocation failed");
|
229
|
+
}
|
230
|
+
}
|
231
|
+
|
232
|
+
(*span_starts)[used] = uregex_start(pattern, 0, status);
|
233
|
+
(*span_ends)[used] = uregex_end(pattern, 0, status);
|
234
|
+
if (U_FAILURE(*status)) {
|
235
|
+
rb_raise(rb_eRuntimeError, "Error retrieving match offsets: %s", u_errorName(*status));
|
236
|
+
}
|
237
|
+
|
238
|
+
used++;
|
239
|
+
}
|
240
|
+
|
241
|
+
return used;
|
242
|
+
}
|
243
|
+
|
244
|
+
// Function to print UChar array as hexadecimal
|
245
|
+
struct segment_args {
|
246
|
+
// input
|
247
|
+
TextSentencer *ts;
|
248
|
+
const char *text;
|
249
|
+
|
250
|
+
// output
|
251
|
+
long *segment_starts;
|
252
|
+
long *segment_ends;
|
253
|
+
long num_segments;
|
254
|
+
};
|
255
|
+
|
256
|
+
static void *segment_without_gvl(void *arg) {
|
257
|
+
struct segment_args *args = (struct segment_args *)arg;
|
258
|
+
|
259
|
+
const TextSentencer *ts = args->ts;
|
260
|
+
const char *text = args->text;
|
261
|
+
|
262
|
+
UErrorCode status = U_ZERO_ERROR;
|
263
|
+
int32_t utf16_len;
|
264
|
+
UChar *utf16_text = utf8_to_utf16(text, &utf16_len, &status);
|
265
|
+
if (U_FAILURE(status)) {
|
266
|
+
rb_raise(rb_eRuntimeError, "Failed to convert text to UTF-16: %s", u_errorName(status));
|
267
|
+
}
|
268
|
+
|
269
|
+
// Scan for break patterns
|
270
|
+
long *break_starts, *break_ends;
|
271
|
+
long num_breaks = scan_offsets(utf16_text, ts->break_pattern, &break_starts, &break_ends, &status);
|
272
|
+
|
273
|
+
// Scan for candidate patterns
|
274
|
+
long *candidate_starts, *candidate_ends;
|
275
|
+
long num_candidates = scan_offsets(utf16_text, ts->candidate_pattern, &candidate_starts, &candidate_ends, &status);
|
276
|
+
|
277
|
+
// Filter candidates using positive rules
|
278
|
+
long *filtered_candidate_starts = malloc(num_candidates * sizeof(long));
|
279
|
+
long *filtered_candidate_ends = malloc(num_candidates * sizeof(long));
|
280
|
+
long num_filtered_candidates = 0;
|
281
|
+
|
282
|
+
for (long i = 0; i < num_candidates; ++i) {
|
283
|
+
long start = candidate_starts[i];
|
284
|
+
long end = candidate_ends[i];
|
285
|
+
|
286
|
+
int matched = 0;
|
287
|
+
for (size_t j = 0; j < ts->num_positive_rules; ++j) {
|
288
|
+
if ((start == 0 || match_regex(utf16_text, 0, start, ts->positive_rules_pre[j], &status)) &&
|
289
|
+
(end == utf16_len || match_regex(utf16_text, end, utf16_len, ts->positive_rules_post[j], &status))) {
|
290
|
+
matched = 1;
|
291
|
+
break;
|
292
|
+
}
|
293
|
+
}
|
294
|
+
|
295
|
+
if (matched) {
|
296
|
+
filtered_candidate_starts[num_filtered_candidates] = start;
|
297
|
+
filtered_candidate_ends[num_filtered_candidates] = end;
|
298
|
+
num_filtered_candidates++;
|
299
|
+
}
|
300
|
+
}
|
301
|
+
|
302
|
+
free(candidate_starts);
|
303
|
+
free(candidate_ends);
|
304
|
+
|
305
|
+
// Remove candidates using negative rules
|
306
|
+
long *final_candidate_starts = malloc(num_filtered_candidates * sizeof(long));
|
307
|
+
long *final_candidate_ends = malloc(num_filtered_candidates * sizeof(long));
|
308
|
+
long num_final_candidates = 0;
|
309
|
+
|
310
|
+
for (long i = 0; i < num_filtered_candidates; ++i) {
|
311
|
+
long start = filtered_candidate_starts[i];
|
312
|
+
long end = filtered_candidate_ends[i];
|
313
|
+
|
314
|
+
int reject = 0;
|
315
|
+
for (size_t j = 0; j < ts->num_negative_rules; ++j) {
|
316
|
+
if (match_regex(utf16_text, 0, start, ts->negative_rules_pre[j], &status) &&
|
317
|
+
match_regex(utf16_text, end, utf16_len, ts->negative_rules_post[j], &status)) {
|
318
|
+
reject = 1;
|
319
|
+
break;
|
320
|
+
}
|
321
|
+
}
|
322
|
+
|
323
|
+
if (!reject) {
|
324
|
+
final_candidate_starts[num_final_candidates] = start;
|
325
|
+
final_candidate_ends[num_final_candidates] = end;
|
326
|
+
num_final_candidates++;
|
327
|
+
}
|
328
|
+
}
|
329
|
+
|
330
|
+
free(filtered_candidate_starts);
|
331
|
+
free(filtered_candidate_ends);
|
332
|
+
|
333
|
+
// Merge the breaks and the filtered candidates
|
334
|
+
long total_breaks = num_breaks + num_final_candidates;
|
335
|
+
long *all_break_starts = malloc(total_breaks * sizeof(long));
|
336
|
+
long *all_break_ends = malloc(total_breaks * sizeof(long));
|
337
|
+
|
338
|
+
int i = 0, j = 0, k = 0;
|
339
|
+
while (i < num_breaks && j < num_final_candidates) {
|
340
|
+
if (break_starts[i] <= final_candidate_starts[j]) {
|
341
|
+
all_break_starts[k] = break_starts[i];
|
342
|
+
all_break_ends[k] = break_ends[i];
|
343
|
+
k++;
|
344
|
+
i++;
|
345
|
+
|
346
|
+
// To avoid duplicates
|
347
|
+
if (break_starts[i] == final_candidate_starts[j]) {
|
348
|
+
j++;
|
349
|
+
}
|
350
|
+
} else {
|
351
|
+
all_break_starts[k] = final_candidate_starts[j];
|
352
|
+
all_break_ends[k] = final_candidate_ends[j];
|
353
|
+
k++;
|
354
|
+
j++;
|
355
|
+
}
|
356
|
+
}
|
357
|
+
|
358
|
+
// Store remaining elements of the breaks
|
359
|
+
while (i < num_breaks) {
|
360
|
+
all_break_starts[k] = break_starts[i];
|
361
|
+
all_break_ends[k] = break_ends[i];
|
362
|
+
k++;
|
363
|
+
i++;
|
364
|
+
}
|
365
|
+
|
366
|
+
// Store remaining elements of the final candidates
|
367
|
+
while (j < num_final_candidates) {
|
368
|
+
all_break_starts[k] = final_candidate_starts[j];
|
369
|
+
all_break_ends[k] = final_candidate_ends[j];
|
370
|
+
k++;
|
371
|
+
j++;
|
372
|
+
}
|
373
|
+
|
374
|
+
free(break_starts);
|
375
|
+
free(break_ends);
|
376
|
+
free(final_candidate_starts);
|
377
|
+
free(final_candidate_ends);
|
378
|
+
|
379
|
+
// Convert breaks into sentence segments
|
380
|
+
long num_segments = 0;
|
381
|
+
long *segment_starts = malloc((k + 1) * sizeof(long));
|
382
|
+
long *segment_ends = malloc((k + 1) * sizeof(long));
|
383
|
+
|
384
|
+
long start = 0;
|
385
|
+
for (long i = 0; i < k; i++) {
|
386
|
+
if (all_break_starts[i] == 0) {
|
387
|
+
start = all_break_ends[i];
|
388
|
+
continue;
|
389
|
+
}
|
390
|
+
|
391
|
+
segment_starts[num_segments] = start;
|
392
|
+
segment_ends[num_segments] = all_break_starts[i];
|
393
|
+
num_segments++;
|
394
|
+
start = all_break_ends[i];
|
395
|
+
}
|
396
|
+
|
397
|
+
// Add the last segment
|
398
|
+
if (all_break_ends[k - 1] < utf16_len) {
|
399
|
+
segment_starts[num_segments] = start;
|
400
|
+
segment_ends[num_segments] = utf16_len;
|
401
|
+
num_segments++;
|
402
|
+
}
|
403
|
+
|
404
|
+
free(all_break_starts);
|
405
|
+
free(all_break_ends);
|
406
|
+
free(utf16_text); // Free the UTF-16 text after processing
|
407
|
+
|
408
|
+
// Pass the results back to the caller
|
409
|
+
args->segment_starts = segment_starts;
|
410
|
+
args->segment_ends = segment_ends;
|
411
|
+
args->num_segments = num_segments;
|
412
|
+
|
413
|
+
return NULL;
|
414
|
+
}
|
415
|
+
|
416
|
+
VALUE text_sentencer_segment(VALUE self, VALUE rb_text) {
|
417
|
+
TextSentencer *ts;
|
418
|
+
TypedData_Get_Struct(self, TextSentencer, &text_sentencer_type, ts);
|
419
|
+
|
420
|
+
Check_Type(rb_text, T_STRING);
|
421
|
+
const char *text = StringValueCStr(rb_text);
|
422
|
+
|
423
|
+
long *segment_starts = NULL;
|
424
|
+
long *segment_ends = NULL;
|
425
|
+
long num_segments = 0;
|
426
|
+
|
427
|
+
struct segment_args args = {ts, text, segment_starts, segment_ends, num_segments};
|
428
|
+
rb_thread_call_without_gvl(segment_without_gvl, &args, RUBY_UBF_IO, NULL);
|
429
|
+
|
430
|
+
VALUE segments = rb_ary_new();
|
431
|
+
for (long i = 0; i < args.num_segments; ++i) {
|
432
|
+
VALUE segment = rb_ary_new();
|
433
|
+
rb_ary_push(segment, LONG2NUM(args.segment_starts[i]));
|
434
|
+
rb_ary_push(segment, LONG2NUM(args.segment_ends[i]));
|
435
|
+
rb_ary_push(segments, segment);
|
436
|
+
}
|
437
|
+
|
438
|
+
free(args.segment_starts);
|
439
|
+
free(args.segment_ends);
|
440
|
+
|
441
|
+
return segments;
|
442
|
+
}
|
443
|
+
|
444
|
+
// Annotate method
|
445
|
+
VALUE text_sentencer_annotate(VALUE self, VALUE rb_text) {
|
446
|
+
Check_Type(rb_text, T_STRING);
|
447
|
+
|
448
|
+
VALUE segments = text_sentencer_segment(self, rb_text);
|
449
|
+
VALUE blocks = rb_ary_new();
|
450
|
+
|
451
|
+
for (long i = 0; i < RARRAY_LEN(segments); i++) {
|
452
|
+
VALUE span = rb_hash_new();
|
453
|
+
VALUE block = rb_hash_new();
|
454
|
+
|
455
|
+
VALUE segment = rb_ary_entry(segments, i);
|
456
|
+
long start = NUM2LONG(rb_ary_entry(segment, 0));
|
457
|
+
long end = NUM2LONG(rb_ary_entry(segment, 1));
|
458
|
+
|
459
|
+
rb_hash_aset(span, ID2SYM(rb_intern("begin")), LONG2NUM(start));
|
460
|
+
rb_hash_aset(span, ID2SYM(rb_intern("end")), LONG2NUM(end));
|
461
|
+
|
462
|
+
rb_hash_aset(block, ID2SYM(rb_intern("span")), span);
|
463
|
+
rb_hash_aset(block, ID2SYM(rb_intern("obj")), rb_str_new_cstr("Sentence"));
|
464
|
+
|
465
|
+
rb_ary_push(blocks, block);
|
466
|
+
}
|
467
|
+
|
468
|
+
VALUE result = rb_hash_new();
|
469
|
+
rb_hash_aset(result, ID2SYM(rb_intern("text")), rb_text);
|
470
|
+
rb_hash_aset(result, ID2SYM(rb_intern("blocks")), blocks);
|
471
|
+
|
472
|
+
return result;
|
473
|
+
}
|
474
|
+
|
475
|
+
// Allocate memory for TextSentencer structure
|
476
|
+
VALUE text_sentencer_allocate(VALUE klass) {
|
477
|
+
TextSentencer *ts;
|
478
|
+
VALUE obj = TypedData_Make_Struct(klass, TextSentencer, &text_sentencer_type, ts);
|
479
|
+
|
480
|
+
// Initialize fields to avoid uninitialized memory issues
|
481
|
+
ts->break_pattern = NULL;
|
482
|
+
ts->candidate_pattern = NULL;
|
483
|
+
ts->num_positive_rules = 0;
|
484
|
+
ts->positive_rules_pre = NULL;
|
485
|
+
ts->positive_rules_post = NULL;
|
486
|
+
ts->num_negative_rules = 0;
|
487
|
+
ts->negative_rules_pre = NULL;
|
488
|
+
ts->negative_rules_post = NULL;
|
489
|
+
|
490
|
+
return obj;
|
491
|
+
}
|
492
|
+
|
493
|
+
// Helper function to compile ICU regex
|
494
|
+
URegularExpression* compile_pattern(VALUE rb_pattern) {
|
495
|
+
Check_Type(rb_pattern, T_STRING);
|
496
|
+
const char *pattern_str = StringValueCStr(rb_pattern);
|
497
|
+
|
498
|
+
if (strlen(pattern_str) == 0) {
|
499
|
+
rb_raise(rb_eArgError, "Empty regex pattern is not allowed");
|
500
|
+
}
|
501
|
+
|
502
|
+
UErrorCode status = U_ZERO_ERROR;
|
503
|
+
|
504
|
+
// Convert the UTF-8 pattern to UTF-16
|
505
|
+
int32_t pattern_utf16_len = 0;
|
506
|
+
UChar *pattern_utf16 = utf8_to_utf16(pattern_str, &pattern_utf16_len, &status);
|
507
|
+
|
508
|
+
if (U_FAILURE(status)) {
|
509
|
+
rb_raise(rb_eRuntimeError, "Failed to convert regex pattern to UTF-16: %s", u_errorName(status));
|
510
|
+
}
|
511
|
+
|
512
|
+
// Compile the regular expression
|
513
|
+
UParseError parse_error;
|
514
|
+
URegularExpression *regex = uregex_open(pattern_utf16, pattern_utf16_len, 0, &parse_error, &status);
|
515
|
+
free(pattern_utf16); // Free the UTF-16 pattern after compilation
|
516
|
+
|
517
|
+
if (U_FAILURE(status)) {
|
518
|
+
char error_message[256];
|
519
|
+
snprintf(error_message, sizeof(error_message), "ICU regex compilation failed: %s", u_errorName(status));
|
520
|
+
rb_raise(rb_eRuntimeError, "%s", error_message);
|
521
|
+
}
|
522
|
+
|
523
|
+
return regex;
|
524
|
+
}
|
525
|
+
|
526
|
+
VALUE text_sentencer_initialize(VALUE self, VALUE rules) {
|
527
|
+
TextSentencer *ts;
|
528
|
+
TypedData_Get_Struct(self, TextSentencer, &text_sentencer_type, ts);
|
529
|
+
|
530
|
+
// Extract and compile the break pattern from the rules hash
|
531
|
+
VALUE rb_break_pattern = rb_hash_aref(rules, ID2SYM(rb_intern("break_pattern")));
|
532
|
+
|
533
|
+
if (!NIL_P(rb_break_pattern)) {
|
534
|
+
ts->break_pattern = compile_pattern(rb_break_pattern);
|
535
|
+
} else {
|
536
|
+
rb_raise(rb_eArgError, "break_pattern is required in rules");
|
537
|
+
}
|
538
|
+
|
539
|
+
// Extract and compile the candidate pattern from the rules hash
|
540
|
+
VALUE rb_candidate_pattern = rb_hash_aref(rules, ID2SYM(rb_intern("candidate_pattern")));
|
541
|
+
|
542
|
+
if (!NIL_P(rb_candidate_pattern)) {
|
543
|
+
ts->candidate_pattern = compile_pattern(rb_candidate_pattern);
|
544
|
+
} else {
|
545
|
+
rb_raise(rb_eArgError, "candidate_pattern is required in rules");
|
546
|
+
}
|
547
|
+
|
548
|
+
// Compile positive rules
|
549
|
+
VALUE rb_positive_rules = rb_hash_aref(rules, ID2SYM(rb_intern("positive_rules")));
|
550
|
+
if (NIL_P(rb_positive_rules) || TYPE(rb_positive_rules) != T_ARRAY) {
|
551
|
+
rb_raise(rb_eArgError, "positive_rules must be an array");
|
552
|
+
}
|
553
|
+
|
554
|
+
ts->num_positive_rules = RARRAY_LEN(rb_positive_rules);
|
555
|
+
ts->positive_rules_pre = malloc(ts->num_positive_rules * sizeof(URegularExpression *));
|
556
|
+
ts->positive_rules_post = malloc(ts->num_positive_rules * sizeof(URegularExpression *));
|
557
|
+
if ((ts->positive_rules_pre == NULL) || (ts->positive_rules_post == NULL)) {
|
558
|
+
rb_raise(rb_eNoMemError, "Failed to allocate memory for positive rules");
|
559
|
+
}
|
560
|
+
|
561
|
+
for (size_t i = 0; i < ts->num_positive_rules; i++) {
|
562
|
+
VALUE rb_rule = rb_ary_entry(rb_positive_rules, i);
|
563
|
+
VALUE rb_rule_pre = rb_ary_entry(rb_rule, 0);
|
564
|
+
VALUE rb_rule_post = rb_ary_entry(rb_rule, 1);
|
565
|
+
ts->positive_rules_pre[i] = compile_pattern(rb_rule_pre);
|
566
|
+
ts->positive_rules_post[i] = compile_pattern(rb_rule_post);
|
567
|
+
}
|
568
|
+
|
569
|
+
// Compile negative rules
|
570
|
+
VALUE rb_negative_rules = rb_hash_aref(rules, ID2SYM(rb_intern("negative_rules")));
|
571
|
+
if (NIL_P(rb_negative_rules) || TYPE(rb_negative_rules) != T_ARRAY) {
|
572
|
+
rb_raise(rb_eArgError, "negative_rules must be an array");
|
573
|
+
}
|
574
|
+
|
575
|
+
ts->num_negative_rules = RARRAY_LEN(rb_negative_rules);
|
576
|
+
ts->negative_rules_pre = malloc(ts->num_negative_rules * sizeof(URegularExpression *));
|
577
|
+
ts->negative_rules_post = malloc(ts->num_negative_rules * sizeof(URegularExpression *));
|
578
|
+
if ((ts->negative_rules_pre == NULL) || (ts->negative_rules_post == NULL)) {
|
579
|
+
rb_raise(rb_eNoMemError, "Failed to allocate memory for negative rules");
|
580
|
+
}
|
581
|
+
|
582
|
+
for (size_t i = 0; i < ts->num_negative_rules; i++) {
|
583
|
+
VALUE rb_rule = rb_ary_entry(rb_negative_rules, i);
|
584
|
+
VALUE rb_rule_pre = rb_ary_entry(rb_rule, 0);
|
585
|
+
VALUE rb_rule_post = rb_ary_entry(rb_rule, 1);
|
586
|
+
ts->negative_rules_pre[i] = compile_pattern(rb_rule_pre);
|
587
|
+
ts->negative_rules_post[i] = compile_pattern(rb_rule_post);
|
588
|
+
}
|
589
|
+
|
590
|
+
return self;
|
591
|
+
}
|
592
|
+
|
593
|
+
// Module initialization
|
594
|
+
void Init_text_sentencer_c(void) {
|
595
|
+
rb_cTextSentencer = rb_define_class("TextSentencer", rb_cObject);
|
596
|
+
rb_define_alloc_func(rb_cTextSentencer, text_sentencer_allocate);
|
597
|
+
rb_define_method(rb_cTextSentencer, "initialize", text_sentencer_initialize, 1);
|
598
|
+
rb_define_method(rb_cTextSentencer, "segment", text_sentencer_segment, 1);
|
599
|
+
rb_define_method(rb_cTextSentencer, "annotate", text_sentencer_annotate, 1);
|
600
|
+
}
|
@@ -0,0 +1,31 @@
|
|
1
|
+
#ifndef TEXT_SENTENCER_H
|
2
|
+
#define TEXT_SENTENCER_H
|
3
|
+
|
4
|
+
#include "ruby.h"
|
5
|
+
#include <unicode/uregex.h>
|
6
|
+
#include <unicode/ustring.h>
|
7
|
+
#include <unicode/utypes.h>
|
8
|
+
#include <unicode/utext.h>
|
9
|
+
|
10
|
+
|
11
|
+
/*
 * Compiled rule set behind a Ruby TextSentencer object.
 *
 * text_sentencer_allocate creates it with every pointer NULL and both counts
 * zero; text_sentencer_initialize fills it in from the rules hash. Each rule
 * is a pre/post pattern pair: entry i of a *_pre array and entry i of the
 * matching *_post array come from the same rule.
 */
typedef struct {
|
12
|
+
URegularExpression *break_pattern; // a single pattern, compiled from rules[:break_pattern]
|
13
|
+
URegularExpression *candidate_pattern; // a single pattern, compiled from rules[:candidate_pattern]
|
14
|
+
|
15
|
+
size_t num_positive_rules; // number of entries in each positive_rules_* array
|
16
|
+
URegularExpression **positive_rules_pre; // an array of pre-patterns (element 0 of each positive rule)
|
17
|
+
URegularExpression **positive_rules_post; // an array of post-patterns (element 1 of each positive rule)
|
18
|
+
|
19
|
+
size_t num_negative_rules; // number of entries in each negative_rules_* array
|
20
|
+
URegularExpression **negative_rules_pre; // an array of pre-patterns (element 0 of each negative rule)
|
21
|
+
URegularExpression **negative_rules_post; // an array of post-patterns (element 1 of each negative rule)
|
22
|
+
} TextSentencer;
|
23
|
+
|
24
|
+
// Function prototypes
|
25
|
+
void Init_text_sentencer_c(void);
|
26
|
+
VALUE text_sentencer_allocate(VALUE klass);
|
27
|
+
VALUE text_sentencer_initialize(VALUE self, VALUE rules);
|
28
|
+
VALUE text_sentencer_segment(VALUE self, VALUE text);
|
29
|
+
VALUE text_sentencer_annotate(VALUE self, VALUE text);
|
30
|
+
|
31
|
+
#endif // TEXT_SENTENCER_H
|
Binary file
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'text_sentencer_c/text_sentencer_c'
|
2
|
+
|
3
|
+
# Ruby-side additions to the TextSentencer class: the default rule set and a
# helper for deriving customised rule sets from it.
class TextSentencer
  # Default segmentation rules. Each positive or negative rule is a
  # [pre, post] pair of regular expression strings.
  DEFAULT_RULES = {
    break_pattern: "([ \t]*\n+)+[ \t]*", # one or more consecutive blank lines
    candidate_pattern: "[ \t]+",
    positive_rules: [
      ["[.!?]$", "^[0-9A-Z]"],
      [":$", "^[0-9]"],
      [":$", "^[A-Z][a-z]"]
    ],
    negative_rules: [
      ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.$', '^[A-Z][a-z]'],
      ['(Sr|Jr)\.$', '^[A-Z][a-z]'],
      ['\b[A-Z][a-z]*\.$', '^[0-9A-Z]'],
      ['(cf|vs)\.$', '^.'],
      ['e\.g\.$', '^.'],
      ['i\.e\.$', '^.'],
      ['(Sec|Chap|Fig|Eq)\.$', '^[0-9A-Z]']
    ]
  }

  # Returns a new rules hash made of DEFAULT_RULES overridden by +rules+.
  #
  # Keys are normalised to symbols, so overrides with string keys (for
  # example a hash parsed from JSON) replace the matching default instead of
  # being added next to it under a key nothing looks up. +nil+ means "no
  # overrides". DEFAULT_RULES itself is never modified.
  #
  # @param rules [Hash, nil] overrides keyed by symbol or string
  # @return [Hash] merged rules with symbol keys
  # @raise [TypeError] if +rules+ cannot be converted to a Hash
  def self.update_rules(rules)
    overrides = Hash(rules).each_with_object({}) do |(key, value), normalized|
      normalized[key.respond_to?(:to_sym) ? key.to_sym : key] = value
    end

    DEFAULT_RULES.merge(overrides)
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
Gem::Specification.new do |spec|
|
2
|
+
spec.name = "text_sentencer_c"
|
3
|
+
spec.version = "0.1.0"
|
4
|
+
spec.authors = ["Jin-Dong Kim"]
|
5
|
+
spec.email = ["jindong.kim@gmail.com"]
|
6
|
+
spec.summary = "Text segmentation into sentences"
|
7
|
+
spec.description = "It is a reimplementation text_sentencer, which is originally written in ruby, using C extension for a better performance. It is a preliminary version, and may not be fully functional."
|
8
|
+
spec.homepage = "https://github.com/jdkim/text_sentencer_c"
|
9
|
+
spec.license = "MIT"
|
10
|
+
|
11
|
+
spec.files = Dir["{lib,ext}/**/*"] + ["text_sentencer_c.gemspec", "Rakefile", "README.md"]
|
12
|
+
spec.executables = ['text_sentencer']
|
13
|
+
spec.require_paths = ["lib", "ext"]
|
14
|
+
|
15
|
+
spec.extensions = ["ext/text_sentencer_c/extconf.rb"]
|
16
|
+
end
|
metadata
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: text_sentencer_c
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jin-Dong Kim
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2024-08-28 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: It is a reimplementation text_sentencer, which is originally written
|
14
|
+
in ruby, using C extension for a better performance. It is a preliminary version,
|
15
|
+
and may not be fully functional.
|
16
|
+
email:
|
17
|
+
- jindong.kim@gmail.com
|
18
|
+
executables:
|
19
|
+
- text_sentencer
|
20
|
+
extensions:
|
21
|
+
- ext/text_sentencer_c/extconf.rb
|
22
|
+
extra_rdoc_files: []
|
23
|
+
files:
|
24
|
+
- README.md
|
25
|
+
- Rakefile
|
26
|
+
- bin/text_sentencer
|
27
|
+
- ext/text_sentencer_c/Makefile
|
28
|
+
- ext/text_sentencer_c/extconf.rb
|
29
|
+
- ext/text_sentencer_c/text_sentencer.c
|
30
|
+
- ext/text_sentencer_c/text_sentencer.h
|
31
|
+
- lib/text_sentencer_c.rb
|
32
|
+
- lib/text_sentencer_c/text_sentencer_c.so
|
33
|
+
- text_sentencer_c.gemspec
|
34
|
+
homepage: https://github.com/jdkim/text_sentencer_c
|
35
|
+
licenses:
|
36
|
+
- MIT
|
37
|
+
metadata: {}
|
38
|
+
post_install_message:
|
39
|
+
rdoc_options: []
|
40
|
+
require_paths:
|
41
|
+
- lib
|
42
|
+
- ext
|
43
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
requirements: []
|
54
|
+
rubygems_version: 3.5.11
|
55
|
+
signing_key:
|
56
|
+
specification_version: 4
|
57
|
+
summary: Text segmentation into sentences
|
58
|
+
test_files: []
|