llt-segmenter 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7fbc7cac188b6e8e63674495586e2bd89511f18a
4
- data.tar.gz: 3766f1b4346d1fdaae057f6e75efaf78e4d1d318
3
+ metadata.gz: 201cbc98ca192041429641ef0e3c0fcf128a3654
4
+ data.tar.gz: 1f61cb110d27d4b60427f16b1c7eb58576c67eb0
5
5
  SHA512:
6
- metadata.gz: dd674f244cb2b773fa3431fa88b6a9861e16d01e2e6b58d428f42f8d41f4c2117bca582481e9f28cce5a9166dd2715dd6d8279c048a322ffdfec92201fe9a097
7
- data.tar.gz: 6adf68ff06205ce2c21b035b4a5a07cdad2f718296383a2de3ad3b194f22d8caee9b628d0ff3b8527ab49653833c78c0ce39170cbac3296c2b7ffaac5799feff
6
+ metadata.gz: 76017f1fc143d0d6d190b341218c3272f71ad230a1c81496ab9813a94a285262f3c15e73ef73570fc8a5b5d07ceb726b7e2ad9e046f9c54a8d61576017c0aac8
7
+ data.tar.gz: 0915e76fac1f68d23b9454c010fc5820b021457ff4b53cf8bb0bf98a05b73cffc21b888a0169d14d331a571812c5027e26e7dd56a07dc6936be7538160dff099
@@ -1,5 +1,5 @@
1
1
  module LLT
2
2
  class Segmenter
3
- VERSION = "0.0.3"
3
+ VERSION = "0.0.4"
4
4
  end
5
5
  end
data/lib/llt/segmenter.rb CHANGED
@@ -24,17 +24,21 @@ module LLT
24
24
  # so we have to change things as long as this is not fixed.
25
25
  #
26
26
  # (?<=\s|^) can be just \b in MRI 2.0 and upwards
27
- AWB = ALL_ABBRS_PIPED.split('|').map { |abbr| "(?<=\\s|^)#{abbr}" }.join('|')
27
+ #
28
+ # Added > to the regex on Feb 11 2014 to treat a closing chevron as a kind
29
+ # of word boundary.
30
+ AWB = ALL_ABBRS_PIPED.split('|').map { |abbr| "(?<=\\s|^|>)#{abbr}" }.join('|')
28
31
  # the xml escaped characters cannot be refactored to something along
29
32
  # &(?:amp|quot); - it's an invalid pattern in the look-behind
30
33
  SENTENCE_CLOSER = /(?<!#{AWB})\.(?!\.)|[\?!:]|((?<!&amp|&quot|&apos|&lt|&gt);)/
31
34
  DIRECT_SPEECH_DELIMITER = /['"”]|&(?:apos|quot);/
32
- TRAILERS = /\)|<\/.*?>/
35
+ TRAILERS = /\)|\s*<\/.*?>/
33
36
 
34
37
  def segment(string, add_to: nil, **options)
35
38
  setup(options)
36
39
  # dump whitespace at the beginning and end!
37
40
  string.strip!
41
+ string = normalize_whitespace(string)
38
42
  sentences = scan_through_string(StringScanner.new(string))
39
43
  add_to << sentences if add_to.respond_to?(:<<)
40
44
  sentences
@@ -51,6 +55,64 @@ module LLT
51
55
  @sentence_closer = Regexp.union(SENTENCE_CLOSER, /\n{#{nl_boundary}}/)
52
56
  end
53
57
 
58
+ # Used to normalize wonky whitespace in front of or behind direct speech
59
+ # delimiters like " (currently the only one supported).
60
+ def normalize_whitespace(string)
61
+ # in most cases there is nothing to do, so leave immediately
62
+ return string unless string.match(/\s"\s/)
63
+
64
+ scanner = StringScanner.new(string)
65
+ reset_direct_speech_status
66
+ string_with_normalized_whitespace(scanner)
67
+ end
68
+
69
+ def string_with_normalized_whitespace(scanner)
70
+ new_string = ''
71
+ until scanner.eos?
72
+ if match = scanner.scan_until(/"/)
73
+ new_string << normalized_match(scanner, match)
74
+ toggle_direct_speech_status
75
+ else
76
+ new_string << scanner.rest
77
+ break
78
+ end
79
+ end
80
+ new_string
81
+ end
82
+
83
+ def surrounded_by_whitespace?(scanner)
84
+ pos_before = scanner.pre_match[-1]
85
+ pos_behind = scanner.post_match[0]
86
+ pos_before == ' ' && (pos_behind == ' ' || pos_behind == nil) # end of string
87
+ end
88
+
89
+ def normalized_match(scanner, match)
90
+ if surrounded_by_whitespace?(scanner)
91
+ if direct_speech_open?
92
+ # eliminate the whitespace in front of "
93
+ match[0..-3] << '"'
94
+ else
95
+ # hop over the whitespace behind "
96
+ scanner.pos = scanner.pos + 1
97
+ match
98
+ end
99
+ else
100
+ match
101
+ end
102
+ end
103
+
104
+ def direct_speech_open?
105
+ @direct_speech
106
+ end
107
+
108
+ def reset_direct_speech_status
109
+ @direct_speech = false
110
+ end
111
+
112
+ def toggle_direct_speech_status
113
+ @direct_speech = (@direct_speech ? false : true)
114
+ end
115
+
54
116
  def scan_through_string(scanner, sentences = [])
55
117
  while scanner.rest?
56
118
  sentence = scan_until_next_sentence(scanner, sentences)
@@ -121,6 +121,25 @@ describe LLT::Segmenter do
121
121
  sentences.should have(2).items
122
122
  sentences[1].to_s.should == 'text 2.'
123
123
  end
124
+
125
+ it "doesn't break when a random newline leads the last tag" do
126
+ txt = "<grc> text.\n</grc>"
127
+ sentences = segmenter.segment(txt, xml: true)
128
+ sentences.should have(1).item
129
+ end
130
+
131
+ it "handles abbreviation of Marcus (M.) at the beginning of a new paragraph" do
132
+ txt = "<p>qui facere poterat.</p>\n<p>\n<milestone/>\nM. Cicero inter Catilinas detestatur!"
133
+ sentences = segmenter.segment(txt, xml: true)
134
+ sentences.should have(2).items
135
+ end
136
+
137
+ it "treats an xml tag like a word boundary" do
138
+ # M. would not be recognized as abbreviation otherwise
139
+ txt = "<p>M. Cicero est.</p>"
140
+ sentences = segmenter.segment(txt, xml: true)
141
+ sentences.should have(1).item
142
+ end
124
143
  end
125
144
 
126
145
  context "with xml escaped characters" do
@@ -288,6 +307,18 @@ describe LLT::Segmenter do
288
307
  end
289
308
  end
290
309
 
310
+ context "with badly whitespaced direct speech delimiters" do
311
+ it "normalizes whitespace and knows to which sentence a \" belongs" do
312
+ txt = '"Marcus est. " Cicero est. " Iulius est. "'
313
+ sentences = segmenter.segment(txt)
314
+ #sentences.should have(3).items
315
+ sentences.map!(&:to_s)
316
+ sentences[0].should == '"Marcus est."'
317
+ sentences[1].should == 'Cicero est.'
318
+ sentences[2].should == '"Iulius est."'
319
+ end
320
+ end
321
+
291
322
  describe "takes an optional keyword argument add_to" do
292
323
  class ParagraphDummy
293
324
  attr_reader :sentences
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llt-segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gernot Höflechner, Robert Lichstensteiner, Christof Sirk
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-02-05 00:00:00.000000000 Z
11
+ date: 2014-02-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler