llt-segmenter 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/llt/segmenter/version.rb +1 -1
- data/lib/llt/segmenter.rb +64 -2
- data/spec/lib/llt/segmenter_spec.rb +31 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 201cbc98ca192041429641ef0e3c0fcf128a3654
|
4
|
+
data.tar.gz: 1f61cb110d27d4b60427f16b1c7eb58576c67eb0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 76017f1fc143d0d6d190b341218c3272f71ad230a1c81496ab9813a94a285262f3c15e73ef73570fc8a5b5d07ceb726b7e2ad9e046f9c54a8d61576017c0aac8
|
7
|
+
data.tar.gz: 0915e76fac1f68d23b9454c010fc5820b021457ff4b53cf8bb0bf98a05b73cffc21b888a0169d14d331a571812c5027e26e7dd56a07dc6936be7538160dff099
|
data/lib/llt/segmenter.rb
CHANGED
@@ -24,17 +24,21 @@ module LLT
|
|
24
24
|
# so we have to change things as long as this is not fixed.
|
25
25
|
#
|
26
26
|
# (?<=\s|^) can be just \b in MRI 2.0 and upwards
|
27
|
-
|
27
|
+
#
|
28
|
+
# Added > to the regex on Feb 11 2014 to treat a closing chevron as a kind
|
29
|
+
# of word boundary.
|
30
|
+
AWB = ALL_ABBRS_PIPED.split('|').map { |abbr| "(?<=\\s|^|>)#{abbr}" }.join('|')
|
28
31
|
# the xml escaped characters cannot be refactored to something along
|
29
32
|
# &(?:amp|quot); - it's an invalid pattern in the look-behind
|
30
33
|
SENTENCE_CLOSER = /(?<!#{AWB})\.(?!\.)|[\?!:]|((?<!&|"|&apos|<|>);)/
|
31
34
|
DIRECT_SPEECH_DELIMITER = /['"”]|&(?:apos|quot);/
|
32
|
-
TRAILERS = /\)
|
35
|
+
TRAILERS = /\)|\s*<\/.*?>/
|
33
36
|
|
34
37
|
def segment(string, add_to: nil, **options)
|
35
38
|
setup(options)
|
36
39
|
# dump whitespace at the beginning and end!
|
37
40
|
string.strip!
|
41
|
+
string = normalize_whitespace(string)
|
38
42
|
sentences = scan_through_string(StringScanner.new(string))
|
39
43
|
add_to << sentences if add_to.respond_to?(:<<)
|
40
44
|
sentences
|
@@ -51,6 +55,64 @@ module LLT
|
|
51
55
|
@sentence_closer = Regexp.union(SENTENCE_CLOSER, /\n{#{nl_boundary}}/)
|
52
56
|
end
|
53
57
|
|
58
|
+
# Used to normalize wonky whitespace in front of or behind direct speech
|
59
|
+
# delimiters like " (currently the only one supported).
|
60
|
+
def normalize_whitespace(string)
|
61
|
+
# in most cases there is nothing to do, then leave immediately
|
62
|
+
return string unless string.match(/\s"\s/)
|
63
|
+
|
64
|
+
scanner = StringScanner.new(string)
|
65
|
+
reset_direct_speech_status
|
66
|
+
string_with_normalized_whitespace(scanner)
|
67
|
+
end
|
68
|
+
|
69
|
+
def string_with_normalized_whitespace(scanner)
|
70
|
+
new_string = ''
|
71
|
+
until scanner.eos?
|
72
|
+
if match = scanner.scan_until(/"/)
|
73
|
+
new_string << normalized_match(scanner, match)
|
74
|
+
toggle_direct_speech_status
|
75
|
+
else
|
76
|
+
new_string << scanner.rest
|
77
|
+
break
|
78
|
+
end
|
79
|
+
end
|
80
|
+
new_string
|
81
|
+
end
|
82
|
+
|
83
|
+
def surrounded_by_whitespace?(scanner)
|
84
|
+
pos_before = scanner.pre_match[-1]
|
85
|
+
pos_behind = scanner.post_match[0]
|
86
|
+
pos_before == ' ' && (pos_behind == ' ' || pos_behind == nil) # end of string
|
87
|
+
end
|
88
|
+
|
89
|
+
def normalized_match(scanner, match)
|
90
|
+
if surrounded_by_whitespace?(scanner)
|
91
|
+
if direct_speech_open?
|
92
|
+
# eliminate the whitespace in front of "
|
93
|
+
match[0..-3] << '"'
|
94
|
+
else
|
95
|
+
# hop over the whitespace behind "
|
96
|
+
scanner.pos = scanner.pos + 1
|
97
|
+
match
|
98
|
+
end
|
99
|
+
else
|
100
|
+
match
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
def direct_speech_open?
|
105
|
+
@direct_speech
|
106
|
+
end
|
107
|
+
|
108
|
+
def reset_direct_speech_status
|
109
|
+
@direct_speech = false
|
110
|
+
end
|
111
|
+
|
112
|
+
def toggle_direct_speech_status
|
113
|
+
@direct_speech = (@direct_speech ? false : true)
|
114
|
+
end
|
115
|
+
|
54
116
|
def scan_through_string(scanner, sentences = [])
|
55
117
|
while scanner.rest?
|
56
118
|
sentence = scan_until_next_sentence(scanner, sentences)
|
@@ -121,6 +121,25 @@ describe LLT::Segmenter do
|
|
121
121
|
sentences.should have(2).items
|
122
122
|
sentences[1].to_s.should == 'text 2.'
|
123
123
|
end
|
124
|
+
|
125
|
+
it "doesn't break when a random newline leads the last tag" do
|
126
|
+
txt = "<grc> text.\n</grc>"
|
127
|
+
sentences = segmenter.segment(txt, xml: true)
|
128
|
+
sentences.should have(1).item
|
129
|
+
end
|
130
|
+
|
131
|
+
it "handles abbreviation of Marcus (M.) at the beginning of a new paragraph" do
|
132
|
+
txt = "<p>qui facere poterat.</p>\n<p>\n<milestone/>\nM. Cicero inter Catilinas detestatur!"
|
133
|
+
sentences = segmenter.segment(txt, xml: true)
|
134
|
+
sentences.should have(2).items
|
135
|
+
end
|
136
|
+
|
137
|
+
it "treats an xml tag like a word boundary" do
|
138
|
+
# M. would not be recognized as abbreviation otherwise
|
139
|
+
txt = "<p>M. Cicero est.</p>"
|
140
|
+
sentences = segmenter.segment(txt, xml: true)
|
141
|
+
sentences.should have(1).item
|
142
|
+
end
|
124
143
|
end
|
125
144
|
|
126
145
|
context "with xml escaped characters" do
|
@@ -288,6 +307,18 @@ describe LLT::Segmenter do
|
|
288
307
|
end
|
289
308
|
end
|
290
309
|
|
310
|
+
context "with badly whitespaced direct speech delimiters" do
|
311
|
+
it "normalizes whitespace and knows to which sentence a \" belongs" do
|
312
|
+
txt = '"Marcus est. " Cicero est. " Iulius est. "'
|
313
|
+
sentences = segmenter.segment(txt)
|
314
|
+
#sentences.should have(3).items
|
315
|
+
sentences.map!(&:to_s)
|
316
|
+
sentences[0].should == '"Marcus est."'
|
317
|
+
sentences[1].should == 'Cicero est.'
|
318
|
+
sentences[2].should == '"Iulius est."'
|
319
|
+
end
|
320
|
+
end
|
321
|
+
|
291
322
|
describe "takes an optional keyword argument add_to" do
|
292
323
|
class ParagraphDummy
|
293
324
|
attr_reader :sentences
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llt-segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gernot Höflechner, Robert Lichstensteiner, Christof Sirk
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-02-
|
11
|
+
date: 2014-02-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|