sentence_it 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 28527ce7fb8f1678ce0f66e885a244c84aea773d1e904596248d9274b602e87d
4
+ data.tar.gz: 28930eab68014759d472c5d5714c7c93715732fbf2d7af1d8be9c581bb39e25e
5
+ SHA512:
6
+ metadata.gz: 4a8622f46efb4c2b06b888773107522642a91525fb65156277a1d9d2cf4d2159ce9eb58cc2321613faba682e0fa955b7db18bc3935dd6c686eb8f5c96e5bcca6
7
+ data.tar.gz: 62cf0ac8a791d6120fdb9cbc59c8cf437e5d8abdf15135dc55827ff0b51670d02cfef5ed70b473ef5fcd13a7f6a05248afbbc270782e866b41d747fcfa30272c
data/README.md ADDED
@@ -0,0 +1,11 @@
1
+ # SentenceIt
2
+
3
+ A Ruby gem for text segmentation using a C extension.
4
+ It is a re-implementation of 'text_sentencer', which was originally written in pure Ruby, as a C extension for better performance.
5
+
6
+ ## Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ ```ruby
11
+ gem 'sentence_it'
data/bin/sentence_it ADDED
@@ -0,0 +1,46 @@
#!/usr/bin/env ruby
# Command-line front end for SentenceIt.
#
# Reads text from the files named on the command line (or from standard
# input when none are given), segments it into sentences and prints either
# one sentence per line or the full annotation as JSON.
require 'json'
require 'optparse'
require 'sentence_it'

rules_filename = nil
output_mode = :sentences

## command line option processing
optparse = OptionParser.new do |opts|
  opts.banner = 'Usage: sentence_it [options] [file ...]'

  opts.on('-r', '--rules=rules_filename', 'specifies the rules JSON file.') do |c|
    rules_filename = c
  end

  opts.on('-j', '--json_output', 'outputs the result in JSON.') do
    output_mode = :json
  end

  opts.on('-h', '--help', 'displays this screen.') do
    puts opts
    exit
  end
end

optparse.parse!

rules = SentenceIt::DEFAULT_RULES
if rules_filename
  if File.file?(rules_filename)
    begin
      # The extension looks rules up by symbol key, so the JSON object keys
      # must be symbolized. Merging over the defaults keeps a partial rules
      # file from leaving the remaining rules undefined.
      custom_rules = JSON.parse(File.read(rules_filename), symbolize_names: true)
    rescue JSON::ParserError => e
      abort "sentence_it: cannot parse rules file #{rules_filename}: #{e.message}"
    end
    rules = rules.merge(custom_rules)
  else
    # Keep the original fall-back to the default rules, but say so instead of
    # silently ignoring the option.
    warn "sentence_it: rules file not found: #{rules_filename}; using the default rules"
  end
end

sentencer = SentenceIt.new(rules)

text = ARGF.read

annotations = sentencer.annotate(text)

if output_mode == :json
  puts JSON.pretty_generate(annotations)
else
  annotations['blocks']&.each do |block|
    span = block['span']
    puts text[span['begin']...span['end']]
  end
end
@@ -0,0 +1,4 @@
# extconf.rb
#
# Generates the Makefile that builds the native extension. The target name
# must match the path the library loads with `require 'sentence_it/sentence_it'`.
require 'mkmf'

create_makefile('sentence_it/sentence_it')
@@ -0,0 +1,196 @@
1
+ #include "sentence_it.h"
2
+ #include <string.h>
3
+ #include <regex.h>
4
+ #include <stdio.h>
5
+
6
+ // Define the default rules
7
+ VALUE default_rules;
8
+
9
+ // Allocate memory for sentence_it_t structure
10
+ VALUE sentence_it_allocate(VALUE klass) {
11
+ sentence_it_t *si;
12
+ return Data_Make_Struct(klass, sentence_it_t, NULL, RUBY_DEFAULT_FREE, si);
13
+ }
14
+
15
+ // Initialize the Ruby module and class
16
+ void Init_sentence_it() {
17
+ VALUE cSentenceIt = rb_define_class("SentenceIt", rb_cObject);
18
+ rb_define_alloc_func(cSentenceIt, sentence_it_allocate);
19
+ rb_define_method(cSentenceIt, "initialize", sentence_it_initialize, 1);
20
+ rb_define_method(cSentenceIt, "annotate", sentence_it_annotate, 1);
21
+ rb_define_method(cSentenceIt, "segment", sentence_it_segment, 1);
22
+ }
23
+
24
+ // Initialize the SentenceIt object with rules
25
+ VALUE sentence_it_initialize(VALUE self, VALUE rules) {
26
+ sentence_it_t *si;
27
+ Data_Get_Struct(self, sentence_it_t, si);
28
+
29
+ si->break_pattern = rb_hash_aref(rules, ID2SYM(rb_intern("break_pattern")));
30
+ si->candidate_pattern = rb_hash_aref(rules, ID2SYM(rb_intern("candidate_pattern")));
31
+ si->positive_rules = rb_hash_aref(rules, ID2SYM(rb_intern("positive_rules")));
32
+ si->negative_rules = rb_hash_aref(rules, ID2SYM(rb_intern("negative_rules")));
33
+
34
+ return self;
35
+ }
36
+
37
+ // Utility function to scan text with a regex pattern and return offsets
38
+ static VALUE scan_offsets(VALUE text, const char *pattern) {
39
+ regex_t regex;
40
+ regmatch_t pmatch[1];
41
+ int start = 0;
42
+ VALUE offsets = rb_ary_new();
43
+
44
+ if (regcomp(&regex, pattern, REG_EXTENDED)) {
45
+ rb_raise(rb_eRuntimeError, "Could not compile regex");
46
+ }
47
+
48
+ while (!regexec(&regex, RSTRING_PTR(text) + start, 1, pmatch, 0)) {
49
+ VALUE offset = rb_ary_new();
50
+ rb_ary_push(offset, INT2NUM(start + pmatch[0].rm_so));
51
+ rb_ary_push(offset, INT2NUM(start + pmatch[0].rm_eo));
52
+ rb_ary_push(offsets, offset);
53
+ start += pmatch[0].rm_eo;
54
+ }
55
+
56
+ regfree(&regex);
57
+ return offsets;
58
+ }
59
+
60
+ // Implement the segment function
61
+ VALUE sentence_it_segment(VALUE self, VALUE text) {
62
+ sentence_it_t *si;
63
+ Data_Get_Struct(self, sentence_it_t, si);
64
+
65
+ VALUE breaks;
66
+ if (RSTRING_LEN(si->break_pattern) == 0) {
67
+ breaks = rb_ary_new();
68
+ } else {
69
+ breaks = scan_offsets(text, RSTRING_PTR(si->break_pattern));
70
+ }
71
+
72
+ VALUE candidates;
73
+ if (RSTRING_LEN(si->candidate_pattern) == 0) {
74
+ candidates = rb_ary_new();
75
+ } else {
76
+ candidates = scan_offsets(text, RSTRING_PTR(si->candidate_pattern));
77
+ }
78
+
79
+ // Remove candidates that are already in breaks
80
+ for (int i = 0; i < RARRAY_LEN(breaks); i++) {
81
+ VALUE break_offset = rb_ary_entry(breaks, i);
82
+ for (int j = 0; j < RARRAY_LEN(candidates); j++) {
83
+ VALUE candidate_offset = rb_ary_entry(candidates, j);
84
+ if (rb_equal(break_offset, candidate_offset)) {
85
+ rb_ary_delete_at(candidates, j);
86
+ break;
87
+ }
88
+ }
89
+ }
90
+
91
+ // Process candidates
92
+ for (int i = 0; i < RARRAY_LEN(candidates); i++) {
93
+ VALUE candidate = rb_ary_entry(candidates, i);
94
+ int last_end = NUM2INT(rb_ary_entry(candidate, 0));
95
+ int next_begin = NUM2INT(rb_ary_entry(candidate, 1));
96
+
97
+ if (last_end == 0 || next_begin == RSTRING_LEN(text)) {
98
+ rb_ary_push(breaks, candidate);
99
+ continue;
100
+ }
101
+
102
+ VALUE last_text = rb_str_substr(text, 0, last_end);
103
+ VALUE next_text = rb_str_substr(text, next_begin, RSTRING_LEN(text) - next_begin);
104
+
105
+ for (int j = 0; j < RARRAY_LEN(si->positive_rules); j++) {
106
+ VALUE rule = rb_ary_entry(si->positive_rules, j);
107
+ const char *pattern1 = RSTRING_PTR(rb_ary_entry(rule, 0));
108
+ const char *pattern2 = RSTRING_PTR(rb_ary_entry(rule, 1));
109
+
110
+ regex_t regex1, regex2;
111
+ regcomp(&regex1, pattern1, REG_EXTENDED | REG_NOSUB);
112
+ regcomp(&regex2, pattern2, REG_EXTENDED | REG_NOSUB);
113
+
114
+ int match1 = !regexec(&regex1, RSTRING_PTR(last_text), 0, NULL, 0);
115
+ int match2 = !regexec(&regex2, RSTRING_PTR(next_text), 0, NULL, 0);
116
+
117
+ regfree(&regex1);
118
+ regfree(&regex2);
119
+
120
+ if (match1 && match2) {
121
+ int break_p = 1;
122
+ for (int k = 0; k < RARRAY_LEN(si->negative_rules); k++) {
123
+ VALUE neg_rule = rb_ary_entry(si->negative_rules, k);
124
+ const char *neg_pattern1 = RSTRING_PTR(rb_ary_entry(neg_rule, 0));
125
+ const char *neg_pattern2 = RSTRING_PTR(rb_ary_entry(neg_rule, 1));
126
+
127
+ regex_t neg_regex1, neg_regex2;
128
+ regcomp(&neg_regex1, neg_pattern1, REG_EXTENDED | REG_NOSUB);
129
+ regcomp(&neg_regex2, neg_pattern2, REG_EXTENDED | REG_NOSUB);
130
+
131
+ int neg_match1 = !regexec(&neg_regex1, RSTRING_PTR(last_text), 0, NULL, 0);
132
+ int neg_match2 = !regexec(&neg_regex2, RSTRING_PTR(next_text), 0, NULL, 0);
133
+
134
+ regfree(&neg_regex1);
135
+ regfree(&neg_regex2);
136
+
137
+ if (neg_match1 && neg_match2) {
138
+ break_p = 0;
139
+ break;
140
+ }
141
+ }
142
+ if (break_p) {
143
+ rb_ary_push(breaks, candidate);
144
+ break;
145
+ }
146
+ }
147
+ }
148
+ }
149
+
150
+ // Sort breaks
151
+ rb_funcall(breaks, rb_intern("sort!"), 0);
152
+
153
+ VALUE sentences = rb_ary_new();
154
+ int last_break = 0;
155
+ for (int i = 0; i < RARRAY_LEN(breaks); i++) {
156
+ VALUE b = rb_ary_entry(breaks, i);
157
+ int begin = NUM2INT(rb_ary_entry(b, 0));
158
+ if (begin > last_break) {
159
+ VALUE sentence = rb_ary_new3(2, INT2NUM(last_break), INT2NUM(begin));
160
+ rb_ary_push(sentences, sentence);
161
+ }
162
+ last_break = NUM2INT(rb_ary_entry(b, 1));
163
+ }
164
+ if (last_break < RSTRING_LEN(text)) {
165
+ VALUE sentence = rb_ary_new3(2, INT2NUM(last_break), INT2NUM(RSTRING_LEN(text)));
166
+ rb_ary_push(sentences, sentence);
167
+ }
168
+
169
+ return sentences;
170
+ }
171
+
172
+
173
+ // Implement the annotate function
174
+ VALUE sentence_it_annotate(VALUE self, VALUE text) {
175
+ VALUE segments = sentence_it_segment(self, text);
176
+ VALUE blocks = rb_ary_new();
177
+
178
+ for (int i = 0; i < RARRAY_LEN(segments); i++) {
179
+ VALUE segment = rb_ary_entry(segments, i);
180
+ VALUE span = rb_hash_new();
181
+ rb_hash_aset(span, rb_str_new_cstr("begin"), rb_ary_entry(segment, 0));
182
+ rb_hash_aset(span, rb_str_new_cstr("end"), rb_ary_entry(segment, 1));
183
+
184
+ VALUE block = rb_hash_new();
185
+ rb_hash_aset(block, rb_str_new_cstr("span"), span);
186
+ rb_hash_aset(block, rb_str_new_cstr("obj"), rb_str_new_cstr("Sentence"));
187
+
188
+ rb_ary_push(blocks, block);
189
+ }
190
+
191
+ VALUE result = rb_hash_new();
192
+ rb_hash_aset(result, rb_str_new_cstr("text"), text);
193
+ rb_hash_aset(result, rb_str_new_cstr("blocks"), blocks);
194
+
195
+ return result;
196
+ }
@@ -0,0 +1,21 @@
1
+ #ifndef SENTENCE_IT_H
2
+ #define SENTENCE_IT_H
3
+
4
+ #include "ruby.h"
5
+
6
+ // Define a structure to hold the rules
7
+ typedef struct {
8
+ VALUE break_pattern;
9
+ VALUE candidate_pattern;
10
+ VALUE positive_rules;
11
+ VALUE negative_rules;
12
+ } sentence_it_t;
13
+
14
+ // Function prototypes
15
+ void Init_sentence_it();
16
+ VALUE sentence_it_allocate(VALUE klass);
17
+ VALUE sentence_it_initialize(VALUE self, VALUE rules);
18
+ VALUE sentence_it_annotate(VALUE self, VALUE text);
19
+ VALUE sentence_it_segment(VALUE self, VALUE text);
20
+
21
+ #endif // SENTENCE_IT_H
Binary file
@@ -0,0 +1,26 @@
1
+ require 'sentence_it/sentence_it'
2
+
# Rule-based sentence segmenter. The segmentation itself is implemented by the
# native extension; this file contributes the default rules.
class SentenceIt
  # Default segmentation rules.
  #
  # * +break_pattern+     - every match is a sentence break.
  # * +candidate_pattern+ - every match is a possible break, decided by the rules.
  # * +positive_rules+    - pairs of patterns (text before, text after) that
  #   allow a break at a candidate.
  # * +negative_rules+    - pairs of patterns (text before, text after) that
  #   forbid a break even though a positive rule matched.
  #
  # NOTE(review): the patterns are handed to the platform's POSIX regex
  # engine, where "\b" is an extension rather than standard syntax - confirm
  # it is supported on every target platform.
  DEFAULT_RULES = {
    break_pattern: "([ \t]*\n+)+[ \t]*",
    candidate_pattern: "[ \t]+",
    positive_rules: [
      ['[.!?]', '[0-9A-Z]'],
      [':', '[0-9]'],
      [':', '[A-Z][a-z]']
    ],
    negative_rules: [
      ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
      ['(Sr|Jr)\.', '[A-Z][a-z]'],
      ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
      ['(cf|vs)\.', ''],
      ['e\.g\.', ''],
      ['i\.e\.', ''],
      ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
    ]
  }.freeze

  # Returns a new rule set: the defaults with the given entries replaced.
  # DEFAULT_RULES itself is not modified.
  #
  # @param rules [Hash] entries to override; keys may be symbols or strings
  #   (for example the result of +JSON.parse+)
  # @return [Hash] a symbol-keyed rule set suitable for +SentenceIt.new+
  def self.update_rules(rules)
    overrides = rules.map { |name, value| [name.to_sym, value] }.to_h
    DEFAULT_RULES.merge(overrides)
  end
end
metadata ADDED
@@ -0,0 +1,56 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sentence_it
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Jin-Dong Kim
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2024-08-02 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: It is a reimplementation of text_sentencer, which was originally written
14
+ in Ruby, using a C extension for better performance.
15
+ email:
16
+ - jindong.kim@gmail.com
17
+ executables:
18
+ - sentence_it
19
+ extensions:
20
+ - ext/sentence_it/extconf.rb
21
+ extra_rdoc_files: []
22
+ files:
23
+ - README.md
24
+ - bin/sentence_it
25
+ - ext/sentence_it/extconf.rb
26
+ - ext/sentence_it/sentence_it.c
27
+ - ext/sentence_it/sentence_it.h
28
+ - lib/sentence_it.rb
29
+ - lib/sentence_it/sentence_it.so
30
+ homepage: https://github.com/jdkim/sentence_it
31
+ licenses:
32
+ - MIT
33
+ metadata:
34
+ homepage_uri: https://github.com/jdkim/sentence_it
35
+ source_code_uri: https://github.com/jdkim/sentence_it
36
+ post_install_message:
37
+ rdoc_options: []
38
+ require_paths:
39
+ - lib
40
+ - ext/sentence_it
41
+ required_ruby_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ required_rubygems_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - ">="
49
+ - !ruby/object:Gem::Version
50
+ version: '0'
51
+ requirements: []
52
+ rubygems_version: 3.5.11
53
+ signing_key:
54
+ specification_version: 4
55
+ summary: A Ruby gem for text segmentation using a C extension
56
+ test_files: []