sentence_it 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +11 -0
- data/bin/sentence_it +46 -0
- data/ext/sentence_it/extconf.rb +4 -0
- data/ext/sentence_it/sentence_it.c +196 -0
- data/ext/sentence_it/sentence_it.h +21 -0
- data/lib/sentence_it/sentence_it.so +0 -0
- data/lib/sentence_it.rb +26 -0
- metadata +56 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 28527ce7fb8f1678ce0f66e885a244c84aea773d1e904596248d9274b602e87d
|
4
|
+
data.tar.gz: 28930eab68014759d472c5d5714c7c93715732fbf2d7af1d8be9c581bb39e25e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 4a8622f46efb4c2b06b888773107522642a91525fb65156277a1d9d2cf4d2159ce9eb58cc2321613faba682e0fa955b7db18bc3935dd6c686eb8f5c96e5bcca6
|
7
|
+
data.tar.gz: 62cf0ac8a791d6120fdb9cbc59c8cf437e5d8abdf15135dc55827ff0b51670d02cfef5ed70b473ef5fcd13a7f6a05248afbbc270782e866b41d747fcfa30272c
|
data/README.md
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# SentenceIt
|
2
|
+
|
3
|
+
A Ruby gem for text segmentation using a C extension.
|
4
|
+
It is a re-implementation of 'text_sentencer', which was originally implemented in Ruby, as a C extension for better performance.
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Add this line to your application's Gemfile:
|
9
|
+
|
10
|
+
```ruby
|
11
|
+
gem 'sentence_it'
|
data/bin/sentence_it
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line front end for SentenceIt.
#
# Reads text from the files named on the command line (or from standard
# input), segments it, and prints either one sentence per line or the full
# annotation hash as JSON.
require 'json'
require 'optparse'
require 'sentence_it'

rules_filename = nil
output_mode = :sentences

## command line option processing
optparse = OptionParser.new do |opts|
  opts.banner = "Usage: sentence_it [options]"

  opts.on('-r', '--rules=rules_filename', 'specifies the rules JSON file.') do |c|
    rules_filename = c
  end

  opts.on('-j', '--json_output', 'outputs the result in JSON.') do
    output_mode = :json
  end

  opts.on('-h', '--help', 'displays this screen.') do
    puts opts
    exit
  end
end

optparse.parse!

rules = nil
if rules_filename
  if File.file?(rules_filename)
    # SentenceIt#initialize looks the rules up by symbol key
    # (:break_pattern, ...), so the top-level JSON keys must be symbolized.
    rules = JSON.parse(File.read(rules_filename), symbolize_names: true)
  else
    # Keep the historical fallback to the default rules, but say so.
    warn "sentence_it: rules file '#{rules_filename}' not found; using the default rules."
  end
end

sentencer = SentenceIt.new(rules || SentenceIt::DEFAULT_RULES)

text = ARGF.read

annotations = sentencer.annotate(text)

if output_mode == :json
  puts JSON.pretty_generate(annotations)
else
  annotations['blocks']&.each do |d|
    span = d['span']
    puts text[span['begin']...span['end']]
  end
end
|
@@ -0,0 +1,196 @@
|
|
1
|
+
#include "sentence_it.h"
|
2
|
+
#include <string.h>
|
3
|
+
#include <regex.h>
|
4
|
+
#include <stdio.h>
|
5
|
+
|
6
|
+
// Define the default rules
|
7
|
+
VALUE default_rules;
|
8
|
+
|
9
|
+
// Allocate memory for sentence_it_t structure
|
10
|
+
VALUE sentence_it_allocate(VALUE klass) {
|
11
|
+
sentence_it_t *si;
|
12
|
+
return Data_Make_Struct(klass, sentence_it_t, NULL, RUBY_DEFAULT_FREE, si);
|
13
|
+
}
|
14
|
+
|
15
|
+
// Initialize the Ruby module and class
|
16
|
+
void Init_sentence_it() {
|
17
|
+
VALUE cSentenceIt = rb_define_class("SentenceIt", rb_cObject);
|
18
|
+
rb_define_alloc_func(cSentenceIt, sentence_it_allocate);
|
19
|
+
rb_define_method(cSentenceIt, "initialize", sentence_it_initialize, 1);
|
20
|
+
rb_define_method(cSentenceIt, "annotate", sentence_it_annotate, 1);
|
21
|
+
rb_define_method(cSentenceIt, "segment", sentence_it_segment, 1);
|
22
|
+
}
|
23
|
+
|
24
|
+
// Initialize the SentenceIt object with rules
|
25
|
+
VALUE sentence_it_initialize(VALUE self, VALUE rules) {
|
26
|
+
sentence_it_t *si;
|
27
|
+
Data_Get_Struct(self, sentence_it_t, si);
|
28
|
+
|
29
|
+
si->break_pattern = rb_hash_aref(rules, ID2SYM(rb_intern("break_pattern")));
|
30
|
+
si->candidate_pattern = rb_hash_aref(rules, ID2SYM(rb_intern("candidate_pattern")));
|
31
|
+
si->positive_rules = rb_hash_aref(rules, ID2SYM(rb_intern("positive_rules")));
|
32
|
+
si->negative_rules = rb_hash_aref(rules, ID2SYM(rb_intern("negative_rules")));
|
33
|
+
|
34
|
+
return self;
|
35
|
+
}
|
36
|
+
|
37
|
+
// Utility function to scan text with a regex pattern and return offsets
|
38
|
+
static VALUE scan_offsets(VALUE text, const char *pattern) {
|
39
|
+
regex_t regex;
|
40
|
+
regmatch_t pmatch[1];
|
41
|
+
int start = 0;
|
42
|
+
VALUE offsets = rb_ary_new();
|
43
|
+
|
44
|
+
if (regcomp(®ex, pattern, REG_EXTENDED)) {
|
45
|
+
rb_raise(rb_eRuntimeError, "Could not compile regex");
|
46
|
+
}
|
47
|
+
|
48
|
+
while (!regexec(®ex, RSTRING_PTR(text) + start, 1, pmatch, 0)) {
|
49
|
+
VALUE offset = rb_ary_new();
|
50
|
+
rb_ary_push(offset, INT2NUM(start + pmatch[0].rm_so));
|
51
|
+
rb_ary_push(offset, INT2NUM(start + pmatch[0].rm_eo));
|
52
|
+
rb_ary_push(offsets, offset);
|
53
|
+
start += pmatch[0].rm_eo;
|
54
|
+
}
|
55
|
+
|
56
|
+
regfree(®ex);
|
57
|
+
return offsets;
|
58
|
+
}
|
59
|
+
|
60
|
+
// Implement the segment function
|
61
|
+
VALUE sentence_it_segment(VALUE self, VALUE text) {
|
62
|
+
sentence_it_t *si;
|
63
|
+
Data_Get_Struct(self, sentence_it_t, si);
|
64
|
+
|
65
|
+
VALUE breaks;
|
66
|
+
if (RSTRING_LEN(si->break_pattern) == 0) {
|
67
|
+
breaks = rb_ary_new();
|
68
|
+
} else {
|
69
|
+
breaks = scan_offsets(text, RSTRING_PTR(si->break_pattern));
|
70
|
+
}
|
71
|
+
|
72
|
+
VALUE candidates;
|
73
|
+
if (RSTRING_LEN(si->candidate_pattern) == 0) {
|
74
|
+
candidates = rb_ary_new();
|
75
|
+
} else {
|
76
|
+
candidates = scan_offsets(text, RSTRING_PTR(si->candidate_pattern));
|
77
|
+
}
|
78
|
+
|
79
|
+
// Remove candidates that are already in breaks
|
80
|
+
for (int i = 0; i < RARRAY_LEN(breaks); i++) {
|
81
|
+
VALUE break_offset = rb_ary_entry(breaks, i);
|
82
|
+
for (int j = 0; j < RARRAY_LEN(candidates); j++) {
|
83
|
+
VALUE candidate_offset = rb_ary_entry(candidates, j);
|
84
|
+
if (rb_equal(break_offset, candidate_offset)) {
|
85
|
+
rb_ary_delete_at(candidates, j);
|
86
|
+
break;
|
87
|
+
}
|
88
|
+
}
|
89
|
+
}
|
90
|
+
|
91
|
+
// Process candidates
|
92
|
+
for (int i = 0; i < RARRAY_LEN(candidates); i++) {
|
93
|
+
VALUE candidate = rb_ary_entry(candidates, i);
|
94
|
+
int last_end = NUM2INT(rb_ary_entry(candidate, 0));
|
95
|
+
int next_begin = NUM2INT(rb_ary_entry(candidate, 1));
|
96
|
+
|
97
|
+
if (last_end == 0 || next_begin == RSTRING_LEN(text)) {
|
98
|
+
rb_ary_push(breaks, candidate);
|
99
|
+
continue;
|
100
|
+
}
|
101
|
+
|
102
|
+
VALUE last_text = rb_str_substr(text, 0, last_end);
|
103
|
+
VALUE next_text = rb_str_substr(text, next_begin, RSTRING_LEN(text) - next_begin);
|
104
|
+
|
105
|
+
for (int j = 0; j < RARRAY_LEN(si->positive_rules); j++) {
|
106
|
+
VALUE rule = rb_ary_entry(si->positive_rules, j);
|
107
|
+
const char *pattern1 = RSTRING_PTR(rb_ary_entry(rule, 0));
|
108
|
+
const char *pattern2 = RSTRING_PTR(rb_ary_entry(rule, 1));
|
109
|
+
|
110
|
+
regex_t regex1, regex2;
|
111
|
+
regcomp(®ex1, pattern1, REG_EXTENDED | REG_NOSUB);
|
112
|
+
regcomp(®ex2, pattern2, REG_EXTENDED | REG_NOSUB);
|
113
|
+
|
114
|
+
int match1 = !regexec(®ex1, RSTRING_PTR(last_text), 0, NULL, 0);
|
115
|
+
int match2 = !regexec(®ex2, RSTRING_PTR(next_text), 0, NULL, 0);
|
116
|
+
|
117
|
+
regfree(®ex1);
|
118
|
+
regfree(®ex2);
|
119
|
+
|
120
|
+
if (match1 && match2) {
|
121
|
+
int break_p = 1;
|
122
|
+
for (int k = 0; k < RARRAY_LEN(si->negative_rules); k++) {
|
123
|
+
VALUE neg_rule = rb_ary_entry(si->negative_rules, k);
|
124
|
+
const char *neg_pattern1 = RSTRING_PTR(rb_ary_entry(neg_rule, 0));
|
125
|
+
const char *neg_pattern2 = RSTRING_PTR(rb_ary_entry(neg_rule, 1));
|
126
|
+
|
127
|
+
regex_t neg_regex1, neg_regex2;
|
128
|
+
regcomp(&neg_regex1, neg_pattern1, REG_EXTENDED | REG_NOSUB);
|
129
|
+
regcomp(&neg_regex2, neg_pattern2, REG_EXTENDED | REG_NOSUB);
|
130
|
+
|
131
|
+
int neg_match1 = !regexec(&neg_regex1, RSTRING_PTR(last_text), 0, NULL, 0);
|
132
|
+
int neg_match2 = !regexec(&neg_regex2, RSTRING_PTR(next_text), 0, NULL, 0);
|
133
|
+
|
134
|
+
regfree(&neg_regex1);
|
135
|
+
regfree(&neg_regex2);
|
136
|
+
|
137
|
+
if (neg_match1 && neg_match2) {
|
138
|
+
break_p = 0;
|
139
|
+
break;
|
140
|
+
}
|
141
|
+
}
|
142
|
+
if (break_p) {
|
143
|
+
rb_ary_push(breaks, candidate);
|
144
|
+
break;
|
145
|
+
}
|
146
|
+
}
|
147
|
+
}
|
148
|
+
}
|
149
|
+
|
150
|
+
// Sort breaks
|
151
|
+
rb_funcall(breaks, rb_intern("sort!"), 0);
|
152
|
+
|
153
|
+
VALUE sentences = rb_ary_new();
|
154
|
+
int last_break = 0;
|
155
|
+
for (int i = 0; i < RARRAY_LEN(breaks); i++) {
|
156
|
+
VALUE b = rb_ary_entry(breaks, i);
|
157
|
+
int begin = NUM2INT(rb_ary_entry(b, 0));
|
158
|
+
if (begin > last_break) {
|
159
|
+
VALUE sentence = rb_ary_new3(2, INT2NUM(last_break), INT2NUM(begin));
|
160
|
+
rb_ary_push(sentences, sentence);
|
161
|
+
}
|
162
|
+
last_break = NUM2INT(rb_ary_entry(b, 1));
|
163
|
+
}
|
164
|
+
if (last_break < RSTRING_LEN(text)) {
|
165
|
+
VALUE sentence = rb_ary_new3(2, INT2NUM(last_break), INT2NUM(RSTRING_LEN(text)));
|
166
|
+
rb_ary_push(sentences, sentence);
|
167
|
+
}
|
168
|
+
|
169
|
+
return sentences;
|
170
|
+
}
|
171
|
+
|
172
|
+
|
173
|
+
// Implement the annotate function
|
174
|
+
VALUE sentence_it_annotate(VALUE self, VALUE text) {
|
175
|
+
VALUE segments = sentence_it_segment(self, text);
|
176
|
+
VALUE blocks = rb_ary_new();
|
177
|
+
|
178
|
+
for (int i = 0; i < RARRAY_LEN(segments); i++) {
|
179
|
+
VALUE segment = rb_ary_entry(segments, i);
|
180
|
+
VALUE span = rb_hash_new();
|
181
|
+
rb_hash_aset(span, rb_str_new_cstr("begin"), rb_ary_entry(segment, 0));
|
182
|
+
rb_hash_aset(span, rb_str_new_cstr("end"), rb_ary_entry(segment, 1));
|
183
|
+
|
184
|
+
VALUE block = rb_hash_new();
|
185
|
+
rb_hash_aset(block, rb_str_new_cstr("span"), span);
|
186
|
+
rb_hash_aset(block, rb_str_new_cstr("obj"), rb_str_new_cstr("Sentence"));
|
187
|
+
|
188
|
+
rb_ary_push(blocks, block);
|
189
|
+
}
|
190
|
+
|
191
|
+
VALUE result = rb_hash_new();
|
192
|
+
rb_hash_aset(result, rb_str_new_cstr("text"), text);
|
193
|
+
rb_hash_aset(result, rb_str_new_cstr("blocks"), blocks);
|
194
|
+
|
195
|
+
return result;
|
196
|
+
}
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#ifndef SENTENCE_IT_H
#define SENTENCE_IT_H

#include "ruby.h"

/*
 * Per-instance state of a SentenceIt object: the four entries taken from
 * the rules hash passed to SentenceIt#initialize.
 */
typedef struct {
    VALUE break_pattern;     /* rules entry "break_pattern" */
    VALUE candidate_pattern; /* rules entry "candidate_pattern" */
    VALUE positive_rules;    /* rules entry "positive_rules" */
    VALUE negative_rules;    /* rules entry "negative_rules" */
} sentence_it_t;

/* Extension entry point: defines the SentenceIt class and its methods. */
void Init_sentence_it(void);

/* Allocator registered for the SentenceIt class. */
VALUE sentence_it_allocate(VALUE klass);

/* SentenceIt#initialize(rules) */
VALUE sentence_it_initialize(VALUE self, VALUE rules);

/* SentenceIt#annotate(text) */
VALUE sentence_it_annotate(VALUE self, VALUE text);

/* SentenceIt#segment(text) */
VALUE sentence_it_segment(VALUE self, VALUE text);

#endif /* SENTENCE_IT_H */
|
Binary file
|
data/lib/sentence_it.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'sentence_it/sentence_it'
|
2
|
+
|
3
|
+
# Ruby-side part of SentenceIt: the default rule set and a helper for
# deriving customized rule sets from it.
class SentenceIt
  # Rule set used when no custom rules are given.
  #
  # break_pattern     - pattern whose matches are always breaks
  # candidate_pattern - pattern whose matches may be breaks
  # positive_rules    - [before, after] pattern pairs that accept a candidate
  # negative_rules    - [before, after] pattern pairs that reject a candidate
  DEFAULT_RULES = {
    break_pattern: "([ \t]*\n+)+[ \t]*",
    candidate_pattern: "[ \t]+",
    positive_rules: [
      ['[.!?]', '[0-9A-Z]'],
      [':', '[0-9]'],
      [':', '[A-Z][a-z]']
    ],
    negative_rules: [
      ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
      ['(Sr|Jr)\.', '[A-Z][a-z]'],
      ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
      ['(cf|vs)\.', ''],
      ['e\.g\.', ''],
      ['i\.e\.', ''],
      ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
    ]
  }

  # Returns a new rules hash: DEFAULT_RULES with the entries of +rules+
  # merged on top. DEFAULT_RULES itself is not modified.
  def self.update_rules(rules)
    DEFAULT_RULES.merge(rules)
  end
end
|
metadata
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: sentence_it
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jin-Dong Kim
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2024-08-02 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: It is a reimplementation text_sentencer, which is originally written
|
14
|
+
in ruby, using C extension for a better performance.
|
15
|
+
email:
|
16
|
+
- jindong.kim@gmail.com
|
17
|
+
executables:
|
18
|
+
- sentence_it
|
19
|
+
extensions:
|
20
|
+
- ext/sentence_it/extconf.rb
|
21
|
+
extra_rdoc_files: []
|
22
|
+
files:
|
23
|
+
- README.md
|
24
|
+
- bin/sentence_it
|
25
|
+
- ext/sentence_it/extconf.rb
|
26
|
+
- ext/sentence_it/sentence_it.c
|
27
|
+
- ext/sentence_it/sentence_it.h
|
28
|
+
- lib/sentence_it.rb
|
29
|
+
- lib/sentence_it/sentence_it.so
|
30
|
+
homepage: https://github.com/jdkim/sentence_it
|
31
|
+
licenses:
|
32
|
+
- MIT
|
33
|
+
metadata:
|
34
|
+
homepage_uri: https://github.com/jdkim/sentence_it
|
35
|
+
source_code_uri: https://github.com/jdkim/sentence_it
|
36
|
+
post_install_message:
|
37
|
+
rdoc_options: []
|
38
|
+
require_paths:
|
39
|
+
- lib
|
40
|
+
- ext/sentence_it
|
41
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
42
|
+
requirements:
|
43
|
+
- - ">="
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
47
|
+
requirements:
|
48
|
+
- - ">="
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: '0'
|
51
|
+
requirements: []
|
52
|
+
rubygems_version: 3.5.11
|
53
|
+
signing_key:
|
54
|
+
specification_version: 4
|
55
|
+
summary: A Ruby gem for text segmentation using a C extension
|
56
|
+
test_files: []
|