llt-segmenter 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 281e390815b02c98e23d91569c7fa99af1c7c0c4
4
- data.tar.gz: fae604f9cd60978890c7bd2d7c87133e98d4b666
3
+ metadata.gz: 7fbc7cac188b6e8e63674495586e2bd89511f18a
4
+ data.tar.gz: 3766f1b4346d1fdaae057f6e75efaf78e4d1d318
5
5
  SHA512:
6
- metadata.gz: a4273667e0f61109795552b9ab71c61fb5b9f5ec92497ce44c6ac471ed1d15babef116213e92851f9f270a60775329ae0308113deaba93c6b1022c03c188e0a9
7
- data.tar.gz: d3d2c7720f9cb445f45f2b679b8bde1e487bf348fe80a3172f1239078e3caa9317ddcfd6ca7f1aa4e61c1a2dbf9cd0c3defe0a1295d203337c6ebcc3e1ea82c2
6
+ metadata.gz: dd674f244cb2b773fa3431fa88b6a9861e16d01e2e6b58d428f42f8d41f4c2117bca582481e9f28cce5a9166dd2715dd6d8279c048a322ffdfec92201fe9a097
7
+ data.tar.gz: 6adf68ff06205ce2c21b035b4a5a07cdad2f718296383a2de3ad3b194f22d8caee9b628d0ff3b8527ab49653833c78c0ce39170cbac3296c2b7ffaac5799feff
data/.travis.yml CHANGED
@@ -3,4 +3,4 @@ before_script:
3
3
  - "export JRUBY_OPTS=--2.0"
4
4
  rvm:
5
5
  - 2.0.0
6
- - jruby-20mode
6
+ - jruby-1.7.8
data/README.md CHANGED
@@ -26,16 +26,6 @@ Or install it yourself as:
26
26
 
27
27
  TODO: Write usage instructions here
28
28
 
29
- ## API
30
- This currently is a list of requirements and will transform into an API documentation.
31
-
32
- Input:
33
- - Text or (URI)
34
- - Black-/Whitelist for separators.
35
-
36
- Output:
37
- - XML (TEI) or JSON
38
-
39
29
  ## Contributing
40
30
 
41
31
  1. Fork it
@@ -1,5 +1,5 @@
1
1
  module LLT
2
2
  class Segmenter
3
- VERSION = "0.0.2"
3
+ VERSION = "0.0.3"
4
4
  end
5
5
  end
data/lib/llt/segmenter.rb CHANGED
@@ -13,7 +13,8 @@ module LLT
13
13
  def self.default_options
14
14
  {
15
15
  indexing: true,
16
- newline_boundary: 2
16
+ newline_boundary: 2,
17
+ xml: false
17
18
  }
18
19
  end
19
20
 
@@ -24,8 +25,10 @@ module LLT
24
25
  #
25
26
  # (?<=\s|^) can be just \b in MRI 2.0 and upwards
26
27
  AWB = ALL_ABBRS_PIPED.split('|').map { |abbr| "(?<=\\s|^)#{abbr}" }.join('|')
27
- SENTENCE_CLOSER = /(?<!#{AWB})\.(?!\.)|[;\?!:]/
28
- DIRECT_SPEECH_DELIMITER = /['"”]/
28
+ # the xml escaped characters cannot be refactored to something along the lines of
29
+ # &(?:amp|quot); - a nested alternation of differing lengths is an invalid pattern in the look-behind
30
+ SENTENCE_CLOSER = /(?<!#{AWB})\.(?!\.)|[\?!:]|((?<!&amp|&quot|&apos|&lt|&gt);)/
31
+ DIRECT_SPEECH_DELIMITER = /['"”]|&(?:apos|quot);/
29
32
  TRAILERS = /\)|<\/.*?>/
30
33
 
31
34
  def segment(string, add_to: nil, **options)
@@ -40,6 +43,7 @@ module LLT
40
43
  private
41
44
 
42
45
  def setup(options)
46
+ @xml = parse_option(:xml, options)
43
47
  @indexing = parse_option(:indexing, options)
44
48
  @id = 0 if @indexing
45
49
 
@@ -49,8 +53,9 @@ module LLT
49
53
 
50
54
  def scan_through_string(scanner, sentences = [])
51
55
  while scanner.rest?
52
- sentence = scanner.scan_until(@sentence_closer) ||
53
- rescue_no_delimiters(sentences, scanner)
56
+ sentence = scan_until_next_sentence(scanner, sentences)
57
+
58
+ rebuild_xml_tags(scanner, sentence, sentences) if @xml
54
59
  sentence << trailing_delimiters(scanner)
55
60
 
56
61
  sentence.strip!
@@ -63,12 +68,36 @@ module LLT
63
68
  sentences
64
69
  end
65
70
 
71
+ def scan_until_next_sentence(scanner, sentences)
72
+ scanner.scan_until(@sentence_closer) ||
73
+ rescue_no_delimiters(sentences, scanner)
74
+ end
75
+
66
76
  def id
67
77
  if @indexing
68
78
  @id += 1
69
79
  end
70
80
  end
71
81
 
82
+ # this is only needed when there is punctuation inside of xml tags
83
+ def rebuild_xml_tags(scanner, sentence, sentences)
84
+ if has_open_chevron?(sentence)
85
+ sentence << scanner.scan_until(/>/)
86
+ if inside_a_running_sentence?(sentence)
87
+ sentence << scan_until_next_sentence(scanner, sentences)
88
+ end
89
+ rebuild_xml_tags(scanner, sentence, sentences)
90
+ end
91
+ end
92
+
93
+ def has_open_chevron?(sentence)
94
+ sentence.count('<') > sentence.count('>')
95
+ end
96
+
97
+ def inside_a_running_sentence?(sentence)
98
+ ! sentence.match(/#{@sentence_closer}\s*<.*?>$/)
99
+ end
100
+
72
101
  def rescue_no_delimiters(sentences, scanner)
73
102
  if sentences.any?
74
103
  # broken off texts
@@ -89,9 +89,61 @@ describe LLT::Segmenter do
89
89
  context "with embedded xml" do
90
90
  it "doesn't break up before xml closing tags" do
91
91
  txt = '<grc> text.</grc>'
92
- sentences = segmenter.segment(txt)
92
+ sentences = segmenter.segment(txt, xml: true)
93
+ sentences.should have(1).item
94
+ end
95
+
96
+ it "doesn't break with punctuation in element names I" do
97
+ txt = '<grc.test>text.</grc.test>'
98
+ sentences = segmenter.segment(txt, xml: true)
93
99
  sentences.should have(1).item
94
100
  end
101
+
102
+ it "doesn't break with punctuation in element names II" do
103
+ txt = '<grc.test>text.</grc.test> text 2.'
104
+ sentences = segmenter.segment(txt, xml: true)
105
+ sentences.should have(2).items
106
+ sentences[0].to_s.should == '<grc.test>text.</grc.test>'
107
+ sentences[1].to_s.should == 'text 2.'
108
+ end
109
+
110
+ it "doesn't break with punctuation in element names III" do
111
+ txt = '<grc.test>text</grc.test> resumed. text 2.'
112
+ sentences = segmenter.segment(txt, xml: true)
113
+ sentences.should have(2).items
114
+ sentences[0].to_s.should == '<grc.test>text</grc.test> resumed.'
115
+ sentences[1].to_s.should == 'text 2.'
116
+ end
117
+
118
+ it "doesn't break with attribute values containing punctuation" do
119
+ txt = '<grc no="1.1"> text.</grc> text 2.'
120
+ sentences = segmenter.segment(txt, xml: true)
121
+ sentences.should have(2).items
122
+ sentences[1].to_s.should == 'text 2.'
123
+ end
124
+ end
125
+
126
+ context "with xml escaped characters" do
127
+ it "doesn't split when it shouldn't" do
128
+ txt = '&quot;text&quot; resumed. success.'
129
+ sentences = segmenter.segment(txt)
130
+ sentences.should have(2).item
131
+ sentences[1].to_s.should == 'success.'
132
+ end
133
+
134
+ it "acknowledges &quot; as potentially trailing delimiter" do
135
+ txt = '&quot;text.&quot; success.'
136
+ sentences = segmenter.segment(txt)
137
+ sentences.should have(2).item
138
+ sentences[1].to_s.should == 'success.'
139
+ end
140
+
141
+ it "acknowledges &apos; as potentially trailing delimiter" do
142
+ txt = '&apos;text.&apos; success.'
143
+ sentences = segmenter.segment(txt)
144
+ sentences.should have(2).item
145
+ sentences[1].to_s.should == 'success.'
146
+ end
95
147
  end
96
148
 
97
149
  context "newline (\\n) handling" do
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llt-segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gernot Höflechner, Robert Lichstensteiner, Christof Sirk
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-12-09 00:00:00.000000000 Z
11
+ date: 2014-02-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -152,7 +152,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
152
152
  version: '0'
153
153
  requirements: []
154
154
  rubyforge_project:
155
- rubygems_version: 2.1.5
155
+ rubygems_version: 2.2.0
156
156
  signing_key:
157
157
  specification_version: 4
158
158
  summary: Segments text into sentences