llt-segmenter 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 281e390815b02c98e23d91569c7fa99af1c7c0c4
4
- data.tar.gz: fae604f9cd60978890c7bd2d7c87133e98d4b666
3
+ metadata.gz: 7fbc7cac188b6e8e63674495586e2bd89511f18a
4
+ data.tar.gz: 3766f1b4346d1fdaae057f6e75efaf78e4d1d318
5
5
  SHA512:
6
- metadata.gz: a4273667e0f61109795552b9ab71c61fb5b9f5ec92497ce44c6ac471ed1d15babef116213e92851f9f270a60775329ae0308113deaba93c6b1022c03c188e0a9
7
- data.tar.gz: d3d2c7720f9cb445f45f2b679b8bde1e487bf348fe80a3172f1239078e3caa9317ddcfd6ca7f1aa4e61c1a2dbf9cd0c3defe0a1295d203337c6ebcc3e1ea82c2
6
+ metadata.gz: dd674f244cb2b773fa3431fa88b6a9861e16d01e2e6b58d428f42f8d41f4c2117bca582481e9f28cce5a9166dd2715dd6d8279c048a322ffdfec92201fe9a097
7
+ data.tar.gz: 6adf68ff06205ce2c21b035b4a5a07cdad2f718296383a2de3ad3b194f22d8caee9b628d0ff3b8527ab49653833c78c0ce39170cbac3296c2b7ffaac5799feff
data/.travis.yml CHANGED
@@ -3,4 +3,4 @@ before_script:
3
3
  - "export JRUBY_OPTS=--2.0"
4
4
  rvm:
5
5
  - 2.0.0
6
- - jruby-20mode
6
+ - jruby-1.7.8
data/README.md CHANGED
@@ -26,16 +26,6 @@ Or install it yourself as:
26
26
 
27
27
  TODO: Write usage instructions here
28
28
 
29
- ## API
30
- This currently is a list of requirements and will transform into an API documentation.
31
-
32
- Input:
33
- - Text or (URI)
34
- - Black-/Whitelist for separators.
35
-
36
- Output:
37
- - XML (TEI) or JSON
38
-
39
29
  ## Contributing
40
30
 
41
31
  1. Fork it
@@ -1,5 +1,5 @@
1
1
  module LLT
2
2
  class Segmenter
3
- VERSION = "0.0.2"
3
+ VERSION = "0.0.3"
4
4
  end
5
5
  end
data/lib/llt/segmenter.rb CHANGED
@@ -13,7 +13,8 @@ module LLT
13
13
  def self.default_options
14
14
  {
15
15
  indexing: true,
16
- newline_boundary: 2
16
+ newline_boundary: 2,
17
+ xml: false
17
18
  }
18
19
  end
19
20
 
@@ -24,8 +25,10 @@ module LLT
24
25
  #
25
26
  # (?<=\s|^) can be just \b in MRI 2.0 and upwards
26
27
  AWB = ALL_ABBRS_PIPED.split('|').map { |abbr| "(?<=\\s|^)#{abbr}" }.join('|')
27
- SENTENCE_CLOSER = /(?<!#{AWB})\.(?!\.)|[;\?!:]/
28
- DIRECT_SPEECH_DELIMITER = /['"”]/
28
+ # the xml escaped characters cannot be refactored to something along
29
+ # &(?:amp|quot); - it's an invalid pattern in the look-behind
30
+ SENTENCE_CLOSER = /(?<!#{AWB})\.(?!\.)|[\?!:]|((?<!&amp|&quot|&apos|&lt|&gt);)/
31
+ DIRECT_SPEECH_DELIMITER = /['"”]|&(?:apos|quot);/
29
32
  TRAILERS = /\)|<\/.*?>/
30
33
 
31
34
  def segment(string, add_to: nil, **options)
@@ -40,6 +43,7 @@ module LLT
40
43
  private
41
44
 
42
45
  def setup(options)
46
+ @xml = parse_option(:xml, options)
43
47
  @indexing = parse_option(:indexing, options)
44
48
  @id = 0 if @indexing
45
49
 
@@ -49,8 +53,9 @@ module LLT
49
53
 
50
54
  def scan_through_string(scanner, sentences = [])
51
55
  while scanner.rest?
52
- sentence = scanner.scan_until(@sentence_closer) ||
53
- rescue_no_delimiters(sentences, scanner)
56
+ sentence = scan_until_next_sentence(scanner, sentences)
57
+
58
+ rebuild_xml_tags(scanner, sentence, sentences) if @xml
54
59
  sentence << trailing_delimiters(scanner)
55
60
 
56
61
  sentence.strip!
@@ -63,12 +68,36 @@ module LLT
63
68
  sentences
64
69
  end
65
70
 
71
+ def scan_until_next_sentence(scanner, sentences)
72
+ scanner.scan_until(@sentence_closer) ||
73
+ rescue_no_delimiters(sentences, scanner)
74
+ end
75
+
66
76
  def id
67
77
  if @indexing
68
78
  @id += 1
69
79
  end
70
80
  end
71
81
 
82
+ # this is only needed when there is punctuation inside of xml tags
83
+ def rebuild_xml_tags(scanner, sentence, sentences)
84
+ if has_open_chevron?(sentence)
85
+ sentence << scanner.scan_until(/>/)
86
+ if inside_a_running_sentence?(sentence)
87
+ sentence << scan_until_next_sentence(scanner, sentences)
88
+ end
89
+ rebuild_xml_tags(scanner, sentence, sentences)
90
+ end
91
+ end
92
+
93
+ def has_open_chevron?(sentence)
94
+ sentence.count('<') > sentence.count('>')
95
+ end
96
+
97
+ def inside_a_running_sentence?(sentence)
98
+ ! sentence.match(/#{@sentence_closer}\s*<.*?>$/)
99
+ end
100
+
72
101
  def rescue_no_delimiters(sentences, scanner)
73
102
  if sentences.any?
74
103
  # broken off texts
@@ -89,9 +89,61 @@ describe LLT::Segmenter do
89
89
  context "with embedded xml" do
90
90
  it "doesn't break up before xml closing tags" do
91
91
  txt = '<grc> text.</grc>'
92
- sentences = segmenter.segment(txt)
92
+ sentences = segmenter.segment(txt, xml: true)
93
+ sentences.should have(1).item
94
+ end
95
+
96
+ it "doesn't break with punctuation in element names I" do
97
+ txt = '<grc.test>text.</grc.test>'
98
+ sentences = segmenter.segment(txt, xml: true)
93
99
  sentences.should have(1).item
94
100
  end
101
+
102
+ it "doesn't break with punctuation in element names II" do
103
+ txt = '<grc.test>text.</grc.test> text 2.'
104
+ sentences = segmenter.segment(txt, xml: true)
105
+ sentences.should have(2).items
106
+ sentences[0].to_s.should == '<grc.test>text.</grc.test>'
107
+ sentences[1].to_s.should == 'text 2.'
108
+ end
109
+
110
+ it "doesn't break with punctuation in element names III" do
111
+ txt = '<grc.test>text</grc.test> resumed. text 2.'
112
+ sentences = segmenter.segment(txt, xml: true)
113
+ sentences.should have(2).items
114
+ sentences[0].to_s.should == '<grc.test>text</grc.test> resumed.'
115
+ sentences[1].to_s.should == 'text 2.'
116
+ end
117
+
118
+ it "doesn't break with attribute values containing punctuation" do
119
+ txt = '<grc no="1.1"> text.</grc> text 2.'
120
+ sentences = segmenter.segment(txt, xml: true)
121
+ sentences.should have(2).items
122
+ sentences[1].to_s.should == 'text 2.'
123
+ end
124
+ end
125
+
126
+ context "with xml escaped characters" do
127
+ it "doesn't split when it shouldn't" do
128
+ txt = '&quot;text&quot; resumed. success.'
129
+ sentences = segmenter.segment(txt)
130
+ sentences.should have(2).item
131
+ sentences[1].to_s.should == 'success.'
132
+ end
133
+
134
+ it "acknowledges &quot; as potentially trailing delimiter" do
135
+ txt = '&quot;text.&quot; success.'
136
+ sentences = segmenter.segment(txt)
137
+ sentences.should have(2).item
138
+ sentences[1].to_s.should == 'success.'
139
+ end
140
+
141
+ it "acknowledges &apos; as potentially trailing delimiter" do
142
+ txt = '&apos;text.&apos; success.'
143
+ sentences = segmenter.segment(txt)
144
+ sentences.should have(2).item
145
+ sentences[1].to_s.should == 'success.'
146
+ end
95
147
  end
96
148
 
97
149
  context "newline (\\n) handling" do
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llt-segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gernot Höflechner, Robert Lichstensteiner, Christof Sirk
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-12-09 00:00:00.000000000 Z
11
+ date: 2014-02-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -152,7 +152,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
152
152
  version: '0'
153
153
  requirements: []
154
154
  rubyforge_project:
155
- rubygems_version: 2.1.5
155
+ rubygems_version: 2.2.0
156
156
  signing_key:
157
157
  specification_version: 4
158
158
  summary: Segments text into sentences