text_sentencer 1.0.2 → 3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/bin/text_sentencer +21 -19
- data/lib/text_sentencer/string_scan_offset.rb +7 -7
- data/lib/text_sentencer/text_sentencer.rb +101 -111
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 13537fd663911db81cc5d4068c4c3d8439d332c4742e9d027c6687031f44f32f
|
4
|
+
data.tar.gz: dbd6aacd4e7dfca434c27608fa747bb7fbd8d43bf62af682f0e55784fcccf748
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9758707a8204ebe212e699f2f3dd2011d3ad0fdf355cde4c85df5521d17aa6763df593915b88890aad9c29c10903bbada40be9ec66cc082305fdf5b614c252bb
|
7
|
+
data.tar.gz: '09938a12984f7d33c6337adcaf59af9087ce2969d5d98b37b3e73d833d015a49950b6d7b65bc845dc0820727ea4d2b5af61a8bb0af7ce5201b7531748f8772a8'
|
data/bin/text_sentencer
CHANGED
@@ -8,20 +8,20 @@ output_mode = :sentences
|
|
8
8
|
## command line option processing
|
9
9
|
require 'optparse'
|
10
10
|
optparse = OptionParser.new do |opts|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
11
|
+
opts.banner = "Usage: text_sentencer [options]"
|
12
|
+
|
13
|
+
opts.on('-c', '--config=config_filename', 'specifies the configuration JSON file.') do |c|
|
14
|
+
config_filename = c
|
15
|
+
end
|
16
|
+
|
17
|
+
opts.on('-j', '--json_output', 'outputs the result in JSON.') do
|
18
|
+
output_mode = :json
|
19
|
+
end
|
20
|
+
|
21
|
+
opts.on('-h', '--help', 'displays this screen.') do
|
22
|
+
puts opts
|
23
|
+
exit
|
24
|
+
end
|
25
25
|
end
|
26
26
|
|
27
27
|
optparse.parse!
|
@@ -36,10 +36,12 @@ text = ARGF.read
|
|
36
36
|
annotations = sentencer.annotate(text)
|
37
37
|
|
38
38
|
if output_mode == :json
|
39
|
-
|
39
|
+
puts JSON.pretty_generate(annotations)
|
40
40
|
else
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
41
|
+
if annotations.has_key?(:blocks)
|
42
|
+
annotations[:blocks].each do |d|
|
43
|
+
span = d[:span]
|
44
|
+
puts '[' + text[span[:begin]...span[:end]] + ']'
|
45
|
+
end
|
46
|
+
end
|
45
47
|
end
|
@@ -3,115 +3,105 @@ require 'text_sentencer/string_scan_offset'
|
|
3
3
|
require 'pp'
|
4
4
|
|
5
5
|
class TextSentencer
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
sentences = []
|
108
|
-
lastbreak = 0
|
109
|
-
breaks.each do |b|
|
110
|
-
sentences << [lastbreak, b[0]] if b[0] > lastbreak
|
111
|
-
lastbreak = b[1]
|
112
|
-
end
|
113
|
-
sentences << [lastbreak, text.length] if lastbreak < text.length
|
114
|
-
|
115
|
-
sentences
|
116
|
-
end
|
6
|
+
## default rules
|
7
|
+
|
8
|
+
# Built-in rule set, used when no custom rules are given to the constructor.
#
# :break_pattern     - regexp whose matches are always sentence breaks.
# :candidate_pattern - regexp whose matches are candidate sentence breaks.
# :positive_rules    - [left, right] regexp pairs; a candidate becomes a break
#                      when the text before it matches +left+ and the text
#                      after it matches +right+.
# :negative_rules    - [left, right] regexp pairs, matched the same way, that
#                      cancel a break accepted by the positive rules.
#
# The hash itself is frozen so that the shared defaults cannot be reassigned
# by accident.
DEFAULT_RULES = {
  # Every run of whitespace that contains a newline is always a sentence break.
  break_pattern: /([ \t]*\n+)+[ \t]*/,

  # Every run of space, tab, and newline characters is a candidate sentence break.
  candidate_pattern: /[ \t\n]+/,

  # First, positive rules are applied to the break candidates to make the initial segmentation.
  positive_rules: [
    [/[.!?]\z/, /\A[0-9A-Z]/],
    [/[:]\z/, /\A[0-9]/],
    [/[:]\z/, /\A[A-Z][a-z]/]
  ],

  # Then, negative rules are applied to cancel some of the initial segmentation.
  negative_rules: [
    # Titles before names
    [/(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.\z/, /\A[A-Z][a-z]/],

    # Titles usually before names, but ..
    [/(Sr|Jr)\.\z/, /\A[A-Z][a-z]/],

    # Single letter abbreviations, e.g. middle name
    # [/\b[A-Z]\.\z/, /\A[A-Z][a-z]/],

    # Abbreviations, e.g. middle name
    [/\b[A-Z][a-z]*\.\z/, /\A[0-9A-Z]/],

    # Frequent abbreviations that will never appear at the end of a sentence
    [/(cf|vs)\.\z/, //],
    [/e\.g\.\z/, //],
    [/i\.e\.\z/, //],

    # Others
    [/(Sec|Chap|Fig|Eq)\.\z/, /\A[0-9A-Z]/]
  ]
}.freeze
|
45
|
+
|
46
|
+
# Creates a sentencer from a rule set.
#
# @param rules [Hash, nil] rule set; keys may be symbols or anything that
#   responds to +to_sym+ (e.g. strings). When nil, DEFAULT_RULES is used.
def initialize(rules = nil)
  rules ||= DEFAULT_RULES

  # Copy into a fresh hash with symbol keys, so the caller's object is never
  # modified and string-keyed rule sets are accepted.
  @rules = rules.each_with_object({}) do |(key, value), normalized|
    normalized[key.to_sym] = value
  end

  # Guarantee that every expected key exists. A nil pattern disables the
  # corresponding scan; a missing rule list is treated as empty.
  {
    break_pattern: nil,
    candidate_pattern: nil,
    positive_rules: [],
    negative_rules: []
  }.each do |key, default|
    @rules[key] ||= default
  end
end
|
54
|
+
|
55
|
+
# Segments +text+ and returns the result as an annotation hash.
#
# @param text [String] the text to segment
# @return [Hash] +{text: text, blocks: [...]}+, where each block has the form
#   +{span: {begin: Integer, end: Integer}, obj: 'Sentence'}+ and the span
#   holds the offsets returned by #segment for one sentence.
def annotate(text)
  blocks = segment(text).map do |(first, last)|
    { span: { begin: first, end: last }, obj: 'Sentence' }
  end

  { text: text, blocks: blocks }
end
|
60
|
+
|
61
|
+
# Splits +text+ into sentences according to the configured rules.
#
# @param text [String] the text to segment
# @return [Array<Array(Integer, Integer)>] one +[begin, end]+ pair per
#   sentence; +end+ is exclusive, i.e. the sentence is +text[begin...end]+.
def segment(text)
  # Offsets of every match of +pattern+ in the text; a nil pattern disables the scan.
  # String#scan_offset comes from text_sentencer/string_scan_offset; its
  # elements must respond to #offset (presumably MatchData objects).
  offsets_of = lambda do |pattern|
    pattern.nil? ? [] : text.scan_offset(pattern).map { |m| m.offset(0) }
  end

  # A rule is a [left, right] regexp pair. It applies to a candidate when the
  # text before the candidate matches +left+ and the text after it matches +right+.
  applies = lambda do |rule, candidate|
    left, right = rule
    # The right context must run to the very end of the text (`..-1`).
    # An exclusive range (`...-1`) would drop the last character, so a break
    # directly before a one-character final token could never be recognized.
    (text[0...candidate[0]] =~ left) && (text[candidate[1]..-1] =~ right)
  end

  # hard sentence breaks
  breaks = offsets_of.call(@rules[:break_pattern])

  # candidates of sentence breaks; hard sentence breaks are already in
  candidates = offsets_of.call(@rules[:candidate_pattern]) - breaks

  # filter using the positive rules. The first and last breaks remain.
  candidates = candidates.select do |c|
    c[0] == 0 ||
      c[1] == text.length ||
      @rules[:positive_rules].any? { |r| applies.call(r, c) }
  end

  # filter using the negative rules
  candidates = candidates.reject do |c|
    @rules[:negative_rules].any? { |r| applies.call(r, c) }
  end

  # add all the filtered candidates to the sentence breaks
  breaks = (breaks + candidates).uniq.sort

  # add the initial and final breaks unless they already exist
  breaks.unshift([0, 0]) if breaks.empty? || breaks[0][0] != 0
  breaks.push([text.length, text.length]) unless breaks[-1][1] == text.length

  # convert the breaks into sentences: each sentence spans from the end of one
  # break to the beginning of the next
  breaks.each_cons(2).map { |before, after| [before[1], after[0]] }
end
|
117
107
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: text_sentencer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 3.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jin-Dong Kim
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-08-03 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: TextSentencer is a simple rule-based system for segmenting text into
|
14
14
|
sentences.
|
@@ -41,8 +41,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
41
41
|
- !ruby/object:Gem::Version
|
42
42
|
version: '0'
|
43
43
|
requirements: []
|
44
|
-
|
45
|
-
rubygems_version: 2.4.8
|
44
|
+
rubygems_version: 3.5.11
|
46
45
|
signing_key:
|
47
46
|
specification_version: 4
|
48
47
|
summary: A simple, rule-based script to find sentence boundaries in text.
|