llt-segmenter 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -1
- data/README.md +0 -10
- data/lib/llt/segmenter/version.rb +1 -1
- data/lib/llt/segmenter.rb +34 -5
- data/spec/lib/llt/segmenter_spec.rb +53 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7fbc7cac188b6e8e63674495586e2bd89511f18a
|
4
|
+
data.tar.gz: 3766f1b4346d1fdaae057f6e75efaf78e4d1d318
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dd674f244cb2b773fa3431fa88b6a9861e16d01e2e6b58d428f42f8d41f4c2117bca582481e9f28cce5a9166dd2715dd6d8279c048a322ffdfec92201fe9a097
|
7
|
+
data.tar.gz: 6adf68ff06205ce2c21b035b4a5a07cdad2f718296383a2de3ad3b194f22d8caee9b628d0ff3b8527ab49653833c78c0ce39170cbac3296c2b7ffaac5799feff
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -26,16 +26,6 @@ Or install it yourself as:
|
|
26
26
|
|
27
27
|
TODO: Write usage instructions here
|
28
28
|
|
29
|
-
## API
|
30
|
-
This currently is a list of requirements and will transform into an API documentation.
|
31
|
-
|
32
|
-
Input:
|
33
|
-
- Text or (URI)
|
34
|
-
- Black-/Whitelist for separators.
|
35
|
-
|
36
|
-
Output:
|
37
|
-
- XML (TEI) or JSON
|
38
|
-
|
39
29
|
## Contributing
|
40
30
|
|
41
31
|
1. Fork it
|
data/lib/llt/segmenter.rb
CHANGED
@@ -13,7 +13,8 @@ module LLT
|
|
13
13
|
def self.default_options
|
14
14
|
{
|
15
15
|
indexing: true,
|
16
|
-
newline_boundary: 2
|
16
|
+
newline_boundary: 2,
|
17
|
+
xml: false
|
17
18
|
}
|
18
19
|
end
|
19
20
|
|
@@ -24,8 +25,10 @@ module LLT
|
|
24
25
|
#
|
25
26
|
# (?<=\s|^) can be just \b in MRI 2.0 and upwards
|
26
27
|
AWB = ALL_ABBRS_PIPED.split('|').map { |abbr| "(?<=\\s|^)#{abbr}" }.join('|')
|
27
|
-
|
28
|
-
|
28
|
+
# the xml escaped characters cannot be refactored to something along
|
29
|
+
# &(?:amp|quot); - it's an invalid pattern in the look-behind
|
30
|
+
SENTENCE_CLOSER = /(?<!#{AWB})\.(?!\.)|[\?!:]|((?<!&amp|&quot|&apos|&lt|&gt);)/
|
31
|
+
DIRECT_SPEECH_DELIMITER = /['"”]|&(?:apos|quot);/
|
29
32
|
TRAILERS = /\)|<\/.*?>/
|
30
33
|
|
31
34
|
def segment(string, add_to: nil, **options)
|
@@ -40,6 +43,7 @@ module LLT
|
|
40
43
|
private
|
41
44
|
|
42
45
|
def setup(options)
|
46
|
+
@xml = parse_option(:xml, options)
|
43
47
|
@indexing = parse_option(:indexing, options)
|
44
48
|
@id = 0 if @indexing
|
45
49
|
|
@@ -49,8 +53,9 @@ module LLT
|
|
49
53
|
|
50
54
|
def scan_through_string(scanner, sentences = [])
|
51
55
|
while scanner.rest?
|
52
|
-
sentence = scanner
|
53
|
-
|
56
|
+
sentence = scan_until_next_sentence(scanner, sentences)
|
57
|
+
|
58
|
+
rebuild_xml_tags(scanner, sentence, sentences) if @xml
|
54
59
|
sentence << trailing_delimiters(scanner)
|
55
60
|
|
56
61
|
sentence.strip!
|
@@ -63,12 +68,36 @@ module LLT
|
|
63
68
|
sentences
|
64
69
|
end
|
65
70
|
|
71
|
+
def scan_until_next_sentence(scanner, sentences)
|
72
|
+
scanner.scan_until(@sentence_closer) ||
|
73
|
+
rescue_no_delimiters(sentences, scanner)
|
74
|
+
end
|
75
|
+
|
66
76
|
def id
|
67
77
|
if @indexing
|
68
78
|
@id += 1
|
69
79
|
end
|
70
80
|
end
|
71
81
|
|
82
|
+
# this is only needed when there is punctuation inside of xml tags
|
83
|
+
def rebuild_xml_tags(scanner, sentence, sentences)
|
84
|
+
if has_open_chevron?(sentence)
|
85
|
+
sentence << scanner.scan_until(/>/)
|
86
|
+
if inside_a_running_sentence?(sentence)
|
87
|
+
sentence << scan_until_next_sentence(scanner, sentences)
|
88
|
+
end
|
89
|
+
rebuild_xml_tags(scanner, sentence, sentences)
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
def has_open_chevron?(sentence)
|
94
|
+
sentence.count('<') > sentence.count('>')
|
95
|
+
end
|
96
|
+
|
97
|
+
def inside_a_running_sentence?(sentence)
|
98
|
+
! sentence.match(/#{@sentence_closer}\s*<.*?>$/)
|
99
|
+
end
|
100
|
+
|
72
101
|
def rescue_no_delimiters(sentences, scanner)
|
73
102
|
if sentences.any?
|
74
103
|
# broken off texts
|
@@ -89,9 +89,61 @@ describe LLT::Segmenter do
|
|
89
89
|
context "with embedded xml" do
|
90
90
|
it "doesn't break up before xml closing tags" do
|
91
91
|
txt = '<grc> text.</grc>'
|
92
|
-
sentences = segmenter.segment(txt)
|
92
|
+
sentences = segmenter.segment(txt, xml: true)
|
93
|
+
sentences.should have(1).item
|
94
|
+
end
|
95
|
+
|
96
|
+
it "doesn't break with punctuation in element names I" do
|
97
|
+
txt = '<grc.test>text.</grc.test>'
|
98
|
+
sentences = segmenter.segment(txt, xml: true)
|
93
99
|
sentences.should have(1).item
|
94
100
|
end
|
101
|
+
|
102
|
+
it "doesn't break with punctuation in element names II" do
|
103
|
+
txt = '<grc.test>text.</grc.test> text 2.'
|
104
|
+
sentences = segmenter.segment(txt, xml: true)
|
105
|
+
sentences.should have(2).items
|
106
|
+
sentences[0].to_s.should == '<grc.test>text.</grc.test>'
|
107
|
+
sentences[1].to_s.should == 'text 2.'
|
108
|
+
end
|
109
|
+
|
110
|
+
it "doesn't break with punctuation in element names III" do
|
111
|
+
txt = '<grc.test>text</grc.test> resumed. text 2.'
|
112
|
+
sentences = segmenter.segment(txt, xml: true)
|
113
|
+
sentences.should have(2).items
|
114
|
+
sentences[0].to_s.should == '<grc.test>text</grc.test> resumed.'
|
115
|
+
sentences[1].to_s.should == 'text 2.'
|
116
|
+
end
|
117
|
+
|
118
|
+
it "doesn't break with attribute values containing punctuation" do
|
119
|
+
txt = '<grc no="1.1"> text.</grc> text 2.'
|
120
|
+
sentences = segmenter.segment(txt, xml: true)
|
121
|
+
sentences.should have(2).items
|
122
|
+
sentences[1].to_s.should == 'text 2.'
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
context "with xml escaped characters" do
|
127
|
+
it "doesn't split when it shouldn't" do
|
128
|
+
txt = '&quot;text&quot; resumed. success.'
|
129
|
+
sentences = segmenter.segment(txt)
|
130
|
+
sentences.should have(2).item
|
131
|
+
sentences[1].to_s.should == 'success.'
|
132
|
+
end
|
133
|
+
|
134
|
+
it "acknowledges &quot; as potentially trailing delimiter" do
|
135
|
+
txt = '&quot;text.&quot; success.'
|
136
|
+
sentences = segmenter.segment(txt)
|
137
|
+
sentences.should have(2).item
|
138
|
+
sentences[1].to_s.should == 'success.'
|
139
|
+
end
|
140
|
+
|
141
|
+
it "acknowledges &apos; as potentially trailing delimiter" do
|
142
|
+
txt = '&apos;text.&apos; success.'
|
143
|
+
sentences = segmenter.segment(txt)
|
144
|
+
sentences.should have(2).item
|
145
|
+
sentences[1].to_s.should == 'success.'
|
146
|
+
end
|
95
147
|
end
|
96
148
|
|
97
149
|
context "newline (\\n) handling" do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llt-segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gernot Höflechner, Robert Lichstensteiner, Christof Sirk
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2014-02-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -152,7 +152,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
152
152
|
version: '0'
|
153
153
|
requirements: []
|
154
154
|
rubyforge_project:
|
155
|
-
rubygems_version: 2.
|
155
|
+
rubygems_version: 2.2.0
|
156
156
|
signing_key:
|
157
157
|
specification_version: 4
|
158
158
|
summary: Segments text into sentences
|