llt-segmenter 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -1
- data/README.md +0 -10
- data/lib/llt/segmenter/version.rb +1 -1
- data/lib/llt/segmenter.rb +34 -5
- data/spec/lib/llt/segmenter_spec.rb +53 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7fbc7cac188b6e8e63674495586e2bd89511f18a
|
4
|
+
data.tar.gz: 3766f1b4346d1fdaae057f6e75efaf78e4d1d318
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dd674f244cb2b773fa3431fa88b6a9861e16d01e2e6b58d428f42f8d41f4c2117bca582481e9f28cce5a9166dd2715dd6d8279c048a322ffdfec92201fe9a097
|
7
|
+
data.tar.gz: 6adf68ff06205ce2c21b035b4a5a07cdad2f718296383a2de3ad3b194f22d8caee9b628d0ff3b8527ab49653833c78c0ce39170cbac3296c2b7ffaac5799feff
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -26,16 +26,6 @@ Or install it yourself as:
|
|
26
26
|
|
27
27
|
TODO: Write usage instructions here
|
28
28
|
|
29
|
-
## API
|
30
|
-
This currently is a list of requirements and will transform into an API documentation.
|
31
|
-
|
32
|
-
Input:
|
33
|
-
- Text or (URI)
|
34
|
-
- Black-/Whitelist for separators.
|
35
|
-
|
36
|
-
Output:
|
37
|
-
- XML (TEI) or JSON
|
38
|
-
|
39
29
|
## Contributing
|
40
30
|
|
41
31
|
1. Fork it
|
data/lib/llt/segmenter.rb
CHANGED
@@ -13,7 +13,8 @@ module LLT
|
|
13
13
|
def self.default_options
|
14
14
|
{
|
15
15
|
indexing: true,
|
16
|
-
newline_boundary: 2
|
16
|
+
newline_boundary: 2,
|
17
|
+
xml: false
|
17
18
|
}
|
18
19
|
end
|
19
20
|
|
@@ -24,8 +25,10 @@ module LLT
|
|
24
25
|
#
|
25
26
|
# (?<=\s|^) can be just \b in MRI 2.0 and upwards
|
26
27
|
AWB = ALL_ABBRS_PIPED.split('|').map { |abbr| "(?<=\\s|^)#{abbr}" }.join('|')
|
27
|
-
|
28
|
-
|
28
|
+
# the xml escaped characters cannot be refactored to something along
|
29
|
+
# &(?:amp|quot); - it's an invalid pattern in the look-behind
|
30
|
+
SENTENCE_CLOSER = /(?<!#{AWB})\.(?!\.)|[\?!:]|((?<!&amp|&quot|&apos|&lt|&gt);)/
|
31
|
+
DIRECT_SPEECH_DELIMITER = /['"”]|&(?:apos|quot);/
|
29
32
|
TRAILERS = /\)|<\/.*?>/
|
30
33
|
|
31
34
|
def segment(string, add_to: nil, **options)
|
@@ -40,6 +43,7 @@ module LLT
|
|
40
43
|
private
|
41
44
|
|
42
45
|
def setup(options)
|
46
|
+
@xml = parse_option(:xml, options)
|
43
47
|
@indexing = parse_option(:indexing, options)
|
44
48
|
@id = 0 if @indexing
|
45
49
|
|
@@ -49,8 +53,9 @@ module LLT
|
|
49
53
|
|
50
54
|
def scan_through_string(scanner, sentences = [])
|
51
55
|
while scanner.rest?
|
52
|
-
sentence = scanner
|
53
|
-
|
56
|
+
sentence = scan_until_next_sentence(scanner, sentences)
|
57
|
+
|
58
|
+
rebuild_xml_tags(scanner, sentence, sentences) if @xml
|
54
59
|
sentence << trailing_delimiters(scanner)
|
55
60
|
|
56
61
|
sentence.strip!
|
@@ -63,12 +68,36 @@ module LLT
|
|
63
68
|
sentences
|
64
69
|
end
|
65
70
|
|
71
|
+
def scan_until_next_sentence(scanner, sentences)
|
72
|
+
scanner.scan_until(@sentence_closer) ||
|
73
|
+
rescue_no_delimiters(sentences, scanner)
|
74
|
+
end
|
75
|
+
|
66
76
|
def id
|
67
77
|
if @indexing
|
68
78
|
@id += 1
|
69
79
|
end
|
70
80
|
end
|
71
81
|
|
82
|
+
# this is only needed when there is punctuation inside of xml tags
|
83
|
+
def rebuild_xml_tags(scanner, sentence, sentences)
|
84
|
+
if has_open_chevron?(sentence)
|
85
|
+
sentence << scanner.scan_until(/>/)
|
86
|
+
if inside_a_running_sentence?(sentence)
|
87
|
+
sentence << scan_until_next_sentence(scanner, sentences)
|
88
|
+
end
|
89
|
+
rebuild_xml_tags(scanner, sentence, sentences)
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
def has_open_chevron?(sentence)
|
94
|
+
sentence.count('<') > sentence.count('>')
|
95
|
+
end
|
96
|
+
|
97
|
+
def inside_a_running_sentence?(sentence)
|
98
|
+
! sentence.match(/#{@sentence_closer}\s*<.*?>$/)
|
99
|
+
end
|
100
|
+
|
72
101
|
def rescue_no_delimiters(sentences, scanner)
|
73
102
|
if sentences.any?
|
74
103
|
# broken off texts
|
@@ -89,9 +89,61 @@ describe LLT::Segmenter do
|
|
89
89
|
context "with embedded xml" do
|
90
90
|
it "doesn't break up before xml closing tags" do
|
91
91
|
txt = '<grc> text.</grc>'
|
92
|
-
sentences = segmenter.segment(txt)
|
92
|
+
sentences = segmenter.segment(txt, xml: true)
|
93
|
+
sentences.should have(1).item
|
94
|
+
end
|
95
|
+
|
96
|
+
it "doesn't break with punctuation in element names I" do
|
97
|
+
txt = '<grc.test>text.</grc.test>'
|
98
|
+
sentences = segmenter.segment(txt, xml: true)
|
93
99
|
sentences.should have(1).item
|
94
100
|
end
|
101
|
+
|
102
|
+
it "doesn't break with punctuation in element names II" do
|
103
|
+
txt = '<grc.test>text.</grc.test> text 2.'
|
104
|
+
sentences = segmenter.segment(txt, xml: true)
|
105
|
+
sentences.should have(2).items
|
106
|
+
sentences[0].to_s.should == '<grc.test>text.</grc.test>'
|
107
|
+
sentences[1].to_s.should == 'text 2.'
|
108
|
+
end
|
109
|
+
|
110
|
+
it "doesn't break with punctuation in element names III" do
|
111
|
+
txt = '<grc.test>text</grc.test> resumed. text 2.'
|
112
|
+
sentences = segmenter.segment(txt, xml: true)
|
113
|
+
sentences.should have(2).items
|
114
|
+
sentences[0].to_s.should == '<grc.test>text</grc.test> resumed.'
|
115
|
+
sentences[1].to_s.should == 'text 2.'
|
116
|
+
end
|
117
|
+
|
118
|
+
it "doesn't break with attribute values containing punctuation" do
|
119
|
+
txt = '<grc no="1.1"> text.</grc> text 2.'
|
120
|
+
sentences = segmenter.segment(txt, xml: true)
|
121
|
+
sentences.should have(2).items
|
122
|
+
sentences[1].to_s.should == 'text 2.'
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
context "with xml escaped characters" do
|
127
|
+
it "doesn't split when it shouldn't" do
|
128
|
+
txt = '&quot;text&quot; resumed. success.'
|
129
|
+
sentences = segmenter.segment(txt)
|
130
|
+
sentences.should have(2).item
|
131
|
+
sentences[1].to_s.should == 'success.'
|
132
|
+
end
|
133
|
+
|
134
|
+
it "acknowledges &quot; as potentially trailing delimiter" do
|
135
|
+
txt = '&quot;text.&quot; success.'
|
136
|
+
sentences = segmenter.segment(txt)
|
137
|
+
sentences.should have(2).item
|
138
|
+
sentences[1].to_s.should == 'success.'
|
139
|
+
end
|
140
|
+
|
141
|
+
it "acknowledges &apos; as potentially trailing delimiter" do
|
142
|
+
txt = '&apos;text.&apos; success.'
|
143
|
+
sentences = segmenter.segment(txt)
|
144
|
+
sentences.should have(2).item
|
145
|
+
sentences[1].to_s.should == 'success.'
|
146
|
+
end
|
95
147
|
end
|
96
148
|
|
97
149
|
context "newline (\\n) handling" do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llt-segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gernot Höflechner, Robert Lichstensteiner, Christof Sirk
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2014-02-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -152,7 +152,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
152
152
|
version: '0'
|
153
153
|
requirements: []
|
154
154
|
rubyforge_project:
|
155
|
-
rubygems_version: 2.
|
155
|
+
rubygems_version: 2.2.0
|
156
156
|
signing_key:
|
157
157
|
specification_version: 4
|
158
158
|
summary: Segments text into sentences
|