llt-segmenter 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7fbc7cac188b6e8e63674495586e2bd89511f18a
4
- data.tar.gz: 3766f1b4346d1fdaae057f6e75efaf78e4d1d318
3
+ metadata.gz: 201cbc98ca192041429641ef0e3c0fcf128a3654
4
+ data.tar.gz: 1f61cb110d27d4b60427f16b1c7eb58576c67eb0
5
5
  SHA512:
6
- metadata.gz: dd674f244cb2b773fa3431fa88b6a9861e16d01e2e6b58d428f42f8d41f4c2117bca582481e9f28cce5a9166dd2715dd6d8279c048a322ffdfec92201fe9a097
7
- data.tar.gz: 6adf68ff06205ce2c21b035b4a5a07cdad2f718296383a2de3ad3b194f22d8caee9b628d0ff3b8527ab49653833c78c0ce39170cbac3296c2b7ffaac5799feff
6
+ metadata.gz: 76017f1fc143d0d6d190b341218c3272f71ad230a1c81496ab9813a94a285262f3c15e73ef73570fc8a5b5d07ceb726b7e2ad9e046f9c54a8d61576017c0aac8
7
+ data.tar.gz: 0915e76fac1f68d23b9454c010fc5820b021457ff4b53cf8bb0bf98a05b73cffc21b888a0169d14d331a571812c5027e26e7dd56a07dc6936be7538160dff099
@@ -1,5 +1,5 @@
1
1
  module LLT
2
2
  class Segmenter
3
- VERSION = "0.0.3"
3
+ VERSION = "0.0.4"
4
4
  end
5
5
  end
data/lib/llt/segmenter.rb CHANGED
@@ -24,17 +24,21 @@ module LLT
24
24
  # so we have to change things as long as this is not fixed.
25
25
  #
26
26
  # (?<=\s|^) can be just \b in MRI 2.0 and upwards
27
- AWB = ALL_ABBRS_PIPED.split('|').map { |abbr| "(?<=\\s|^)#{abbr}" }.join('|')
27
+ #
28
+ # Added > to the regex on Feb 11 2014 to treat a closing chevron as a kind
29
+ # of word boundary.
30
+ AWB = ALL_ABBRS_PIPED.split('|').map { |abbr| "(?<=\\s|^|>)#{abbr}" }.join('|')
28
31
  # the xml escaped characters cannot be refactored to something along
29
32
  # &(?:amp|quot); - it's an invalid pattern in the look-behind
30
33
  SENTENCE_CLOSER = /(?<!#{AWB})\.(?!\.)|[\?!:]|((?<!&amp|&quot|&apos|&lt|&gt);)/
31
34
  DIRECT_SPEECH_DELIMITER = /['"”]|&(?:apos|quot);/
32
- TRAILERS = /\)|<\/.*?>/
35
+ TRAILERS = /\)|\s*<\/.*?>/
33
36
 
34
37
  def segment(string, add_to: nil, **options)
35
38
  setup(options)
36
39
  # dump whitespace at the beginning and end!
37
40
  string.strip!
41
+ string = normalize_whitespace(string)
38
42
  sentences = scan_through_string(StringScanner.new(string))
39
43
  add_to << sentences if add_to.respond_to?(:<<)
40
44
  sentences
@@ -51,6 +55,64 @@ module LLT
51
55
  @sentence_closer = Regexp.union(SENTENCE_CLOSER, /\n{#{nl_boundary}}/)
52
56
  end
53
57
 
58
+ # Used to normalize wonky whitespace in front of or behind direct speech
59
+ # delimiters like " (currently the only one supported).
60
+ def normalize_whitespace(string)
61
+ # in most cases there is nothing to do, then leave immediately
62
+ return string unless string.match(/\s"\s/)
63
+
64
+ scanner = StringScanner.new(string)
65
+ reset_direct_speech_status
66
+ string_with_normalized_whitespace(scanner)
67
+ end
68
+
69
+ def string_with_normalized_whitespace(scanner)
70
+ new_string = ''
71
+ until scanner.eos?
72
+ if match = scanner.scan_until(/"/)
73
+ new_string << normalized_match(scanner, match)
74
+ toggle_direct_speech_status
75
+ else
76
+ new_string << scanner.rest
77
+ break
78
+ end
79
+ end
80
+ new_string
81
+ end
82
+
83
+ def surrounded_by_whitespace?(scanner)
84
+ pos_before = scanner.pre_match[-1]
85
+ pos_behind = scanner.post_match[0]
86
+ pos_before == ' ' && (pos_behind == ' ' || pos_behind == nil) # end of string
87
+ end
88
+
89
+ def normalized_match(scanner, match)
90
+ if surrounded_by_whitespace?(scanner)
91
+ if direct_speech_open?
92
+ # eliminate the whitespace in front of "
93
+ match[0..-3] << '"'
94
+ else
95
+ # hop over the whitespace behind "
96
+ scanner.pos = scanner.pos + 1
97
+ match
98
+ end
99
+ else
100
+ match
101
+ end
102
+ end
103
+
104
+ def direct_speech_open?
105
+ @direct_speech
106
+ end
107
+
108
+ def reset_direct_speech_status
109
+ @direct_speech = false
110
+ end
111
+
112
+ def toggle_direct_speech_status
113
+ @direct_speech = (@direct_speech ? false : true)
114
+ end
115
+
54
116
  def scan_through_string(scanner, sentences = [])
55
117
  while scanner.rest?
56
118
  sentence = scan_until_next_sentence(scanner, sentences)
@@ -121,6 +121,25 @@ describe LLT::Segmenter do
121
121
  sentences.should have(2).items
122
122
  sentences[1].to_s.should == 'text 2.'
123
123
  end
124
+
125
+ it "doesn't break when a random newline leads the last tag" do
126
+ txt = "<grc> text.\n</grc>"
127
+ sentences = segmenter.segment(txt, xml: true)
128
+ sentences.should have(1).item
129
+ end
130
+
131
+ it "handles abbreviation of Marcus (M.) at the beginning of a new paragraph" do
132
+ txt = "<p>qui facere poterat.</p>\n<p>\n<milestone/>\nM. Cicero inter Catilinas detestatur!"
133
+ sentences = segmenter.segment(txt, xml: true)
134
+ sentences.should have(2).items
135
+ end
136
+
137
+ it "treats an xml tag like a word boundary" do
138
+ # M. would not be recognized as abbreviation otherwise
139
+ txt = "<p>M. Cicero est.</p>"
140
+ sentences = segmenter.segment(txt, xml: true)
141
+ sentences.should have(1).item
142
+ end
124
143
  end
125
144
 
126
145
  context "with xml escaped characters" do
@@ -288,6 +307,18 @@ describe LLT::Segmenter do
288
307
  end
289
308
  end
290
309
 
310
+ context "with badly whitespaced direct speech delimiters" do
311
+ it "normalizes whitespace and knows to which sentence a \" belongs" do
312
+ txt = '"Marcus est. " Cicero est. " Iulius est. "'
313
+ sentences = segmenter.segment(txt)
314
+ #sentences.should have(3).items
315
+ sentences.map!(&:to_s)
316
+ sentences[0].should == '"Marcus est."'
317
+ sentences[1].should == 'Cicero est.'
318
+ sentences[2].should == '"Iulius est."'
319
+ end
320
+ end
321
+
291
322
  describe "takes an optional keyword argument add_to" do
292
323
  class ParagraphDummy
293
324
  attr_reader :sentences
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llt-segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gernot Höflechner, Robert Lichstensteiner, Christof Sirk
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-02-05 00:00:00.000000000 Z
11
+ date: 2014-02-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler