llt-segmenter 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/llt/segmenter/version.rb +1 -1
- data/lib/llt/segmenter.rb +64 -2
- data/spec/lib/llt/segmenter_spec.rb +31 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 201cbc98ca192041429641ef0e3c0fcf128a3654
|
4
|
+
data.tar.gz: 1f61cb110d27d4b60427f16b1c7eb58576c67eb0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 76017f1fc143d0d6d190b341218c3272f71ad230a1c81496ab9813a94a285262f3c15e73ef73570fc8a5b5d07ceb726b7e2ad9e046f9c54a8d61576017c0aac8
|
7
|
+
data.tar.gz: 0915e76fac1f68d23b9454c010fc5820b021457ff4b53cf8bb0bf98a05b73cffc21b888a0169d14d331a571812c5027e26e7dd56a07dc6936be7538160dff099
|
data/lib/llt/segmenter.rb
CHANGED
@@ -24,17 +24,21 @@ module LLT
|
|
24
24
|
# so we have to change things as long as this is not fixed.
|
25
25
|
#
|
26
26
|
# (?<=\s|^) can be just \b in MRI 2.0 and upwards
|
27
|
-
|
27
|
+
#
|
28
|
+
# Added > to the regex on Feb 11 2014 to treat a closing chevron as a kind
|
29
|
+
# of word boundary.
|
30
|
+
AWB = ALL_ABBRS_PIPED.split('|').map { |abbr| "(?<=\\s|^|>)#{abbr}" }.join('|')
|
28
31
|
# the xml escaped characters cannot be refactored to something along
|
29
32
|
# &(?:amp|quot); - it's an invalid pattern in the look-behind
|
30
33
|
SENTENCE_CLOSER = /(?<!#{AWB})\.(?!\.)|[\?!:]|((?<!&|"|&apos|<|>);)/
|
31
34
|
DIRECT_SPEECH_DELIMITER = /['"”]|&(?:apos|quot);/
|
32
|
-
TRAILERS = /\)/
|
35
|
+
TRAILERS = /\)|\s*<\/.*?>/
|
33
36
|
|
34
37
|
def segment(string, add_to: nil, **options)
|
35
38
|
setup(options)
|
36
39
|
# dump whitespace at the beginning and end!
|
37
40
|
string.strip!
|
41
|
+
string = normalize_whitespace(string)
|
38
42
|
sentences = scan_through_string(StringScanner.new(string))
|
39
43
|
add_to << sentences if add_to.respond_to?(:<<)
|
40
44
|
sentences
|
@@ -51,6 +55,64 @@ module LLT
|
|
51
55
|
@sentence_closer = Regexp.union(SENTENCE_CLOSER, /\n{#{nl_boundary}}/)
|
52
56
|
end
|
53
57
|
|
58
|
+
# Used to normalize wonky whitespace in front of or behind direct speech
|
59
|
+
# delimiters like " (currently the only one supported).
|
60
|
+
def normalize_whitespace(string)
|
61
|
+
# in most cases there is nothing to do, then leave immediately
|
62
|
+
return string unless string.match(/\s"\s/)
|
63
|
+
|
64
|
+
scanner = StringScanner.new(string)
|
65
|
+
reset_direct_speech_status
|
66
|
+
string_with_normalized_whitespace(scanner)
|
67
|
+
end
|
68
|
+
|
69
|
+
def string_with_normalized_whitespace(scanner)
|
70
|
+
new_string = ''
|
71
|
+
until scanner.eos?
|
72
|
+
if match = scanner.scan_until(/"/)
|
73
|
+
new_string << normalized_match(scanner, match)
|
74
|
+
toggle_direct_speech_status
|
75
|
+
else
|
76
|
+
new_string << scanner.rest
|
77
|
+
break
|
78
|
+
end
|
79
|
+
end
|
80
|
+
new_string
|
81
|
+
end
|
82
|
+
|
83
|
+
def surrounded_by_whitespace?(scanner)
|
84
|
+
pos_before = scanner.pre_match[-1]
|
85
|
+
pos_behind = scanner.post_match[0]
|
86
|
+
pos_before == ' ' && (pos_behind == ' ' || pos_behind == nil) # end of string
|
87
|
+
end
|
88
|
+
|
89
|
+
def normalized_match(scanner, match)
|
90
|
+
if surrounded_by_whitespace?(scanner)
|
91
|
+
if direct_speech_open?
|
92
|
+
# eliminate the whitespace in front of "
|
93
|
+
match[0..-3] << '"'
|
94
|
+
else
|
95
|
+
# hop over the whitespace behind "
|
96
|
+
scanner.pos = scanner.pos + 1
|
97
|
+
match
|
98
|
+
end
|
99
|
+
else
|
100
|
+
match
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
def direct_speech_open?
|
105
|
+
@direct_speech
|
106
|
+
end
|
107
|
+
|
108
|
+
def reset_direct_speech_status
|
109
|
+
@direct_speech = false
|
110
|
+
end
|
111
|
+
|
112
|
+
def toggle_direct_speech_status
|
113
|
+
@direct_speech = (@direct_speech ? false : true)
|
114
|
+
end
|
115
|
+
|
54
116
|
def scan_through_string(scanner, sentences = [])
|
55
117
|
while scanner.rest?
|
56
118
|
sentence = scan_until_next_sentence(scanner, sentences)
|
@@ -121,6 +121,25 @@ describe LLT::Segmenter do
|
|
121
121
|
sentences.should have(2).items
|
122
122
|
sentences[1].to_s.should == 'text 2.'
|
123
123
|
end
|
124
|
+
|
125
|
+
it "doesn't break when a random newline leads the last tag" do
|
126
|
+
txt = "<grc> text.\n</grc>"
|
127
|
+
sentences = segmenter.segment(txt, xml: true)
|
128
|
+
sentences.should have(1).item
|
129
|
+
end
|
130
|
+
|
131
|
+
it "handles abbreviation of Marcus (M.) at the beginning of a new paragraph" do
|
132
|
+
txt = "<p>qui facere poterat.</p>\n<p>\n<milestone/>\nM. Cicero inter Catilinas detestatur!"
|
133
|
+
sentences = segmenter.segment(txt, xml: true)
|
134
|
+
sentences.should have(2).items
|
135
|
+
end
|
136
|
+
|
137
|
+
it "treats an xml tag like a word boundary" do
|
138
|
+
# M. would not be recognized as abbreviation otherwise
|
139
|
+
txt = "<p>M. Cicero est.</p>"
|
140
|
+
sentences = segmenter.segment(txt, xml: true)
|
141
|
+
sentences.should have(1).item
|
142
|
+
end
|
124
143
|
end
|
125
144
|
|
126
145
|
context "with xml escaped characters" do
|
@@ -288,6 +307,18 @@ describe LLT::Segmenter do
|
|
288
307
|
end
|
289
308
|
end
|
290
309
|
|
310
|
+
context "with badly whitespaced direct speech delimiters" do
|
311
|
+
it "normalizes whitespace and knows to which sentence a \" belongs" do
|
312
|
+
txt = '"Marcus est. " Cicero est. " Iulius est. "'
|
313
|
+
sentences = segmenter.segment(txt)
|
314
|
+
#sentences.should have(3).items
|
315
|
+
sentences.map!(&:to_s)
|
316
|
+
sentences[0].should == '"Marcus est."'
|
317
|
+
sentences[1].should == 'Cicero est.'
|
318
|
+
sentences[2].should == '"Iulius est."'
|
319
|
+
end
|
320
|
+
end
|
321
|
+
|
291
322
|
describe "takes an optional keyword argument add_to" do
|
292
323
|
class ParagraphDummy
|
293
324
|
attr_reader :sentences
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llt-segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gernot Höflechner, Robert Lichstensteiner, Christof Sirk
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-02-
|
11
|
+
date: 2014-02-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|