pragmatic_segmenter 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +1 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +730 -0
- data/Rakefile +4 -0
- data/lib/pragmatic_segmenter.rb +2 -0
- data/lib/pragmatic_segmenter/abbreviation.rb +22 -0
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +149 -0
- data/lib/pragmatic_segmenter/between_punctuation.rb +78 -0
- data/lib/pragmatic_segmenter/cleaner.rb +141 -0
- data/lib/pragmatic_segmenter/ellipsis.rb +36 -0
- data/lib/pragmatic_segmenter/exclamation_words.rb +19 -0
- data/lib/pragmatic_segmenter/languages/amharic.rb +33 -0
- data/lib/pragmatic_segmenter/languages/arabic.rb +83 -0
- data/lib/pragmatic_segmenter/languages/armenian.rb +33 -0
- data/lib/pragmatic_segmenter/languages/burmese.rb +33 -0
- data/lib/pragmatic_segmenter/languages/deutsch.rb +132 -0
- data/lib/pragmatic_segmenter/languages/english.rb +44 -0
- data/lib/pragmatic_segmenter/languages/french.rb +29 -0
- data/lib/pragmatic_segmenter/languages/greek.rb +29 -0
- data/lib/pragmatic_segmenter/languages/hindi.rb +33 -0
- data/lib/pragmatic_segmenter/languages/italian.rb +39 -0
- data/lib/pragmatic_segmenter/languages/japanese.rb +58 -0
- data/lib/pragmatic_segmenter/languages/persian.rb +56 -0
- data/lib/pragmatic_segmenter/languages/russian.rb +60 -0
- data/lib/pragmatic_segmenter/languages/spanish.rb +39 -0
- data/lib/pragmatic_segmenter/languages/urdu.rb +33 -0
- data/lib/pragmatic_segmenter/list.rb +169 -0
- data/lib/pragmatic_segmenter/number.rb +35 -0
- data/lib/pragmatic_segmenter/process.rb +126 -0
- data/lib/pragmatic_segmenter/punctuation.rb +12 -0
- data/lib/pragmatic_segmenter/punctuation_replacer.rb +62 -0
- data/lib/pragmatic_segmenter/rules.rb +38 -0
- data/lib/pragmatic_segmenter/segmenter.rb +81 -0
- data/lib/pragmatic_segmenter/sentence_boundary_punctuation.rb +17 -0
- data/lib/pragmatic_segmenter/single_letter_abbreviation.rb +37 -0
- data/lib/pragmatic_segmenter/types.rb +12 -0
- data/lib/pragmatic_segmenter/version.rb +3 -0
- data/pragmatic_segmenter.gemspec +25 -0
- data/spec/performance_spec.rb +24 -0
- data/spec/pragmatic_segmenter_spec.rb +1906 -0
- data/spec/spec_helper.rb +1 -0
- metadata +150 -0
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
require 'benchmark'
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
RSpec.describe PragmaticSegmenter::Segmenter do
|
6
|
+
|
7
|
+
# Speed benchmark tests
|
8
|
+
|
9
|
+
# it 'is fast' do
|
10
|
+
# string = "Hello World. My name is Jonas. What is your name? My name is Jonas. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . 
. .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ."
|
11
|
+
# benchmark do
|
12
|
+
# 100.times do
|
13
|
+
# PragmaticSegmenter::Segmenter.new(text: string, language: 'en').segment
|
14
|
+
# end
|
15
|
+
# end
|
16
|
+
# end
|
17
|
+
|
18
|
+
end
|
19
|
+
|
20
|
+
def benchmark(&block) # Runs the given block twice and prints the wall-clock time of the second run.
|
21
|
+
block.call # un-timed first run; NOTE(review): presumably a warm-up pass -- confirm intent
|
22
|
+
time = Benchmark.realtime { block.call } # elapsed real time of the second run, as reported by Benchmark.realtime
|
23
|
+
puts "RUNTIME: #{time}" # report the measurement on standard output
|
24
|
+
end
|
@@ -0,0 +1,1906 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
RSpec.describe PragmaticSegmenter::Segmenter do
|
4
|
+
context "Golden Rules (English)" do
|
5
|
+
it "Simple period to end sentence #001" do
|
6
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Hello World. My name is Jonas.", language: "en")
|
7
|
+
expect(ps.segment).to eq(["Hello World.", "My name is Jonas."])
|
8
|
+
end
|
9
|
+
|
10
|
+
it "Question mark to end sentence #002" do
|
11
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "What is your name? My name is Jonas.", language: "en")
|
12
|
+
expect(ps.segment).to eq(["What is your name?", "My name is Jonas."])
|
13
|
+
end
|
14
|
+
|
15
|
+
it "Exclamation point to end sentence #003" do
|
16
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "There it is! I found it.", language: "en")
|
17
|
+
expect(ps.segment).to eq(["There it is!", "I found it."])
|
18
|
+
end
|
19
|
+
|
20
|
+
it "One letter upper case abbreviations #004" do
|
21
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "My name is Jonas E. Smith.", language: "en")
|
22
|
+
expect(ps.segment).to eq(["My name is Jonas E. Smith."])
|
23
|
+
end
|
24
|
+
|
25
|
+
it "One letter lower case abbreviations #005" do
|
26
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Please turn to p. 55.", language: "en")
|
27
|
+
expect(ps.segment).to eq(["Please turn to p. 55."])
|
28
|
+
end
|
29
|
+
|
30
|
+
it "Two letter lower case abbreviations in the middle of a sentence #006" do
|
31
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Were Jane and co. at the party?", language: "en")
|
32
|
+
expect(ps.segment).to eq(["Were Jane and co. at the party?"])
|
33
|
+
end
|
34
|
+
|
35
|
+
it "Two letter upper case abbreviations in the middle of a sentence #007" do
|
36
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "They closed the deal with Pitt, Briggs & Co. at noon.", language: "en")
|
37
|
+
expect(ps.segment).to eq(["They closed the deal with Pitt, Briggs & Co. at noon."])
|
38
|
+
end
|
39
|
+
|
40
|
+
it "Two letter lower case abbreviations at the end of a sentence #008" do
|
41
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Let's ask Jane and co. They should know.", language: "en")
|
42
|
+
expect(ps.segment).to eq(["Let's ask Jane and co.", "They should know."])
|
43
|
+
end
|
44
|
+
|
45
|
+
it "Two letter upper case abbreviations at the end of a sentence #009" do
|
46
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "They closed the deal with Pitt, Briggs & Co. It closed yesterday.", language: "en")
|
47
|
+
expect(ps.segment).to eq(["They closed the deal with Pitt, Briggs & Co.", "It closed yesterday."])
|
48
|
+
end
|
49
|
+
|
50
|
+
it "Two letter (prepositive) abbreviations #010" do
|
51
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "I can see Mt. Fuji from here.", language: "en")
|
52
|
+
expect(ps.segment).to eq(["I can see Mt. Fuji from here."])
|
53
|
+
end
|
54
|
+
|
55
|
+
it "Two letter (prepositive & postpositive) abbreviations #011" do
|
56
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "St. Michael's Church is on 5th st. near the light.", language: "en")
|
57
|
+
expect(ps.segment).to eq(["St. Michael's Church is on 5th st. near the light."])
|
58
|
+
end
|
59
|
+
|
60
|
+
it "Possessive two letter abbreviations #012" do
|
61
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "That is JFK Jr.'s book.", language: "en")
|
62
|
+
expect(ps.segment).to eq(["That is JFK Jr.'s book."])
|
63
|
+
end
|
64
|
+
|
65
|
+
it "Multi-period abbreviations in the middle of a sentence #013" do
|
66
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "I visited the U.S.A. last year.", language: "en")
|
67
|
+
expect(ps.segment).to eq(["I visited the U.S.A. last year."])
|
68
|
+
end
|
69
|
+
|
70
|
+
it "Multi-period abbreviations at the end of a sentence #014" do
|
71
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "I live in the E.U. How about you?", language: "en")
|
72
|
+
expect(ps.segment).to eq(["I live in the E.U.", "How about you?"])
|
73
|
+
end
|
74
|
+
|
75
|
+
it "U.S. as sentence boundary #015" do
|
76
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "I live in the U.S. How about you?", language: "en")
|
77
|
+
expect(ps.segment).to eq(["I live in the U.S.", "How about you?"])
|
78
|
+
end
|
79
|
+
|
80
|
+
it "U.S. as non sentence boundary with next word capitalized #016" do
|
81
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "I work for the U.S. Government in Virginia.", language: "en")
|
82
|
+
expect(ps.segment).to eq(["I work for the U.S. Government in Virginia."])
|
83
|
+
end
|
84
|
+
|
85
|
+
it "U.S. as non sentence boundary #017" do
|
86
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "I have lived in the U.S. for 20 years.", language: "en")
|
87
|
+
expect(ps.segment).to eq(["I have lived in the U.S. for 20 years."])
|
88
|
+
end
|
89
|
+
|
90
|
+
xdescribe "not yet implemented" do
|
91
|
+
it "A.M. / P.M. as non sentence boundary and sentence boundary #018" do
|
92
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "At 5 a.m. Mr. Smith went to the bank. He left the bank at 6 P.M. Mr. Smith then went to the store.", language: "en")
|
93
|
+
expect(ps.segment).to eq(["At 5 a.m. Mr. Smith went to the bank.", "He left the bank at 6 P.M.", "Mr. Smith then went to the store."])
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
it "Number as non sentence boundary #019" do
|
98
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "She has $100.00 in her bag.", language: "en")
|
99
|
+
expect(ps.segment).to eq(["She has $100.00 in her bag."])
|
100
|
+
end
|
101
|
+
|
102
|
+
it "Number as sentence boundary #020" do
|
103
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "She has $100.00. It is in her bag.", language: "en")
|
104
|
+
expect(ps.segment).to eq(["She has $100.00.", "It is in her bag."])
|
105
|
+
end
|
106
|
+
|
107
|
+
it "Parenthetical inside sentence #021" do
|
108
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "He teaches science (He previously worked for 5 years as an engineer.) at the local University.", language: "en")
|
109
|
+
expect(ps.segment).to eq(["He teaches science (He previously worked for 5 years as an engineer.) at the local University."])
|
110
|
+
end
|
111
|
+
|
112
|
+
it "Email addresses #022" do
|
113
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Her email is Jane.Doe@example.com. I sent her an email.", language: "en")
|
114
|
+
expect(ps.segment).to eq(["Her email is Jane.Doe@example.com.", "I sent her an email."])
|
115
|
+
end
|
116
|
+
|
117
|
+
it "Web addresses #023" do
|
118
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.", language: "en")
|
119
|
+
expect(ps.segment).to eq(["The site is: https://www.example.50.com/new-site/awesome_content.html.", "Please check it out."])
|
120
|
+
end
|
121
|
+
|
122
|
+
it "Single quotations inside sentence #024" do
|
123
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "She turned to him, 'This is great.' she said.", language: "en")
|
124
|
+
expect(ps.segment).to eq(["She turned to him, 'This is great.' she said."])
|
125
|
+
end
|
126
|
+
|
127
|
+
it "Double quotations inside sentence #025" do
|
128
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "She turned to him, \"This is great.\" she said.", language: "en")
|
129
|
+
expect(ps.segment).to eq(["She turned to him, \"This is great.\" she said."])
|
130
|
+
end
|
131
|
+
|
132
|
+
it "Double quotations at the end of a sentence #026" do
|
133
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "She turned to him, \"This is great.\" She held the book out to show him.", language: "en")
|
134
|
+
expect(ps.segment).to eq(["She turned to him, \"This is great.\"", "She held the book out to show him."])
|
135
|
+
end
|
136
|
+
|
137
|
+
it "Double punctuation (exclamation point) #027" do
|
138
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Hello!! Long time no see.", language: "en")
|
139
|
+
expect(ps.segment).to eq(["Hello!!", "Long time no see."])
|
140
|
+
end
|
141
|
+
|
142
|
+
it "Double punctuation (question mark) #028" do
|
143
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Hello?? Who is there?", language: "en")
|
144
|
+
expect(ps.segment).to eq(["Hello??", "Who is there?"])
|
145
|
+
end
|
146
|
+
|
147
|
+
it "Double punctuation (exclamation point / question mark) #029" do
|
148
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Hello!? Is that you?", language: "en")
|
149
|
+
expect(ps.segment).to eq(["Hello!?", "Is that you?"])
|
150
|
+
end
|
151
|
+
|
152
|
+
it "Double punctuation (question mark / exclamation point) #030" do
|
153
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Hello?! Is that you?", language: "en")
|
154
|
+
expect(ps.segment).to eq(["Hello?!", "Is that you?"])
|
155
|
+
end
|
156
|
+
|
157
|
+
it "List (period followed by parens and no period to end item) #031" do
|
158
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "1.) The first item 2.) The second item", language: "en")
|
159
|
+
expect(ps.segment).to eq(["1.) The first item", "2.) The second item"])
|
160
|
+
end
|
161
|
+
|
162
|
+
it "List (period followed by parens and period to end item) #032" do
|
163
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "1.) The first item. 2.) The second item.", language: "en")
|
164
|
+
expect(ps.segment).to eq(["1.) The first item.", "2.) The second item."])
|
165
|
+
end
|
166
|
+
|
167
|
+
it "List (parens and no period to end item) #033" do
|
168
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "1) The first item 2) The second item", language: "en")
|
169
|
+
expect(ps.segment).to eq(["1) The first item", "2) The second item"])
|
170
|
+
end
|
171
|
+
|
172
|
+
it "List (parens and period to end item) #034" do
|
173
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "1) The first item. 2) The second item.", language: "en")
|
174
|
+
expect(ps.segment).to eq(["1) The first item.", "2) The second item."])
|
175
|
+
end
|
176
|
+
|
177
|
+
it "List (period to mark list and no period to end item) #035" do
|
178
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "1. The first item 2. The second item", language: "en")
|
179
|
+
expect(ps.segment).to eq(["1. The first item", "2. The second item"])
|
180
|
+
end
|
181
|
+
|
182
|
+
it "List (period to mark list and period to end item) #036" do
|
183
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "1. The first item. 2. The second item.", language: "en")
|
184
|
+
expect(ps.segment).to eq(["1. The first item.", "2. The second item."])
|
185
|
+
end
|
186
|
+
|
187
|
+
it "List with bullet #037" do
|
188
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "• 9. The first item • 10. The second item", language: "en")
|
189
|
+
expect(ps.segment).to eq(["• 9. The first item", "• 10. The second item"])
|
190
|
+
end
|
191
|
+
|
192
|
+
it "List with hyphen #038" do
|
193
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "⁃9. The first item ⁃10. The second item", language: "en")
|
194
|
+
expect(ps.segment).to eq(["⁃9. The first item", "⁃10. The second item"])
|
195
|
+
end
|
196
|
+
|
197
|
+
it "Alphabetical list #039" do
|
198
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "a. The first item b. The second item c. The third list item", language: "en")
|
199
|
+
expect(ps.segment).to eq(["a. The first item", "b. The second item", "c. The third list item"])
|
200
|
+
end
|
201
|
+
|
202
|
+
it "Errant newlines in the middle of sentences (PDF) #040" do
|
203
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "This is a sentence\ncut off in the middle because pdf.", language: "en", doc_type: "pdf")
|
204
|
+
expect(ps.segment).to eq(["This is a sentence cut off in the middle because pdf."])
|
205
|
+
end
|
206
|
+
|
207
|
+
it "Errant newlines in the middle of sentences #041" do
|
208
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "It was a cold \nnight in the city.", language: "en")
|
209
|
+
expect(ps.segment).to eq(["It was a cold night in the city."])
|
210
|
+
end
|
211
|
+
|
212
|
+
it "Lower case list separated by newline #042" do
|
213
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "features\ncontact manager\nevents, activities\n", language: "en")
|
214
|
+
expect(ps.segment).to eq(["features", "contact manager", "events, activities"])
|
215
|
+
end
|
216
|
+
|
217
|
+
it "Geo Coordinates #043" do
|
218
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "You can find it at N°. 1026.253.553. That is where the treasure is.", language: "en")
|
219
|
+
expect(ps.segment).to eq(["You can find it at N°. 1026.253.553.", "That is where the treasure is."])
|
220
|
+
end
|
221
|
+
|
222
|
+
it "Named entities with an exclamation point #044" do
|
223
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "She works at Yahoo! in the accounting department.", language: "en")
|
224
|
+
expect(ps.segment).to eq(["She works at Yahoo! in the accounting department."])
|
225
|
+
end
|
226
|
+
|
227
|
+
it "I as a sentence boundary and I as an abbreviation #045" do
|
228
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "We make a good team, you and I. Did you see Albert I. Jones yesterday?", language: "en")
|
229
|
+
expect(ps.segment).to eq(["We make a good team, you and I.", "Did you see Albert I. Jones yesterday?"])
|
230
|
+
end
|
231
|
+
|
232
|
+
it "Ellipsis at end of quotation #046" do
|
233
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”", language: "en")
|
234
|
+
expect(ps.segment).to eq(["Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”"])
|
235
|
+
end
|
236
|
+
|
237
|
+
it "Ellipsis with square brackets #047" do
|
238
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55).", language: "en")
|
239
|
+
expect(ps.segment).to eq(["\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55)."])
|
240
|
+
end
|
241
|
+
|
242
|
+
it "Ellipsis as sentence boundary (standard ellipsis rules) #048" do
|
243
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.", language: "en")
|
244
|
+
expect(ps.segment).to eq(["If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .", "Next sentence."])
|
245
|
+
end
|
246
|
+
|
247
|
+
it "Ellipsis as sentence boundary (non-standard ellipsis rules) #049" do
|
248
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "I never meant that.... She left the store.", language: "en")
|
249
|
+
expect(ps.segment).to eq(["I never meant that....", "She left the store."])
|
250
|
+
end
|
251
|
+
|
252
|
+
it "Ellipsis as non sentence boundary #050" do
|
253
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.", language: "en")
|
254
|
+
expect(ps.segment).to eq(["I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it."])
|
255
|
+
end
|
256
|
+
|
257
|
+
it "4-dot ellipsis #051" do
|
258
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .", language: "en")
|
259
|
+
expect(ps.segment).to eq(["One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.", ". . . The practice was not abandoned. . . ."])
|
260
|
+
end
|
261
|
+
end
|
262
|
+
|
263
|
+
context "Golden Rules (languages other than English)" do
|
264
|
+
context "Golden Rules (German)" do
|
265
|
+
it "Quotation at end of sentence #001" do
|
266
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "„Ich habe heute keine Zeit“, sagte die Frau und flüsterte leise: „Und auch keine Lust.“ Wir haben 1.000.000 Euro.", language: "de")
|
267
|
+
expect(ps.segment).to eq(["„Ich habe heute keine Zeit“, sagte die Frau und flüsterte leise: „Und auch keine Lust.“", "Wir haben 1.000.000 Euro."])
|
268
|
+
end
|
269
|
+
|
270
|
+
it "Abbreviations #002" do
|
271
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Es gibt jedoch einige Vorsichtsmaßnahmen, die Du ergreifen kannst, z. B. ist es sehr empfehlenswert, dass Du Dein Zuhause von allem Junkfood befreist.", language: "de")
|
272
|
+
expect(ps.segment).to eq(["Es gibt jedoch einige Vorsichtsmaßnahmen, die Du ergreifen kannst, z. B. ist es sehr empfehlenswert, dass Du Dein Zuhause von allem Junkfood befreist."])
|
273
|
+
end
|
274
|
+
|
275
|
+
it "Numbers #003" do
|
276
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Was sind die Konsequenzen der Abstimmung vom 12. Juni?", language: "de")
|
277
|
+
expect(ps.segment).to eq(["Was sind die Konsequenzen der Abstimmung vom 12. Juni?"])
|
278
|
+
end
|
279
|
+
end
|
280
|
+
|
281
|
+
context "Golden Rules (Japanese)" do
|
282
|
+
it "Simple period to end sentence #001" do
|
283
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "これはペンです。それはマーカーです。", language: "ja")
|
284
|
+
expect(ps.segment).to eq(["これはペンです。", "それはマーカーです。"])
|
285
|
+
end
|
286
|
+
|
287
|
+
it "Question mark to end sentence #002" do
|
288
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "それは何ですか?ペンですか?", language: "ja")
|
289
|
+
expect(ps.segment).to eq(["それは何ですか?", "ペンですか?"])
|
290
|
+
end
|
291
|
+
|
292
|
+
it "Exclamation point to end sentence #003" do
|
293
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "良かったね!すごい!", language: "ja")
|
294
|
+
expect(ps.segment).to eq(["良かったね!", "すごい!"])
|
295
|
+
end
|
296
|
+
|
297
|
+
it "Quotation #004" do
|
298
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "自民党税制調査会の幹部は、「引き下げ幅は3.29%以上を目指すことになる」と指摘していて、今後、公明党と合意したうえで、30日に決定する与党税制改正大綱に盛り込むことにしています。2%台後半を目指すとする方向で最終調整に入りました。", language: "ja")
|
299
|
+
expect(ps.segment).to eq(["自民党税制調査会の幹部は、「引き下げ幅は3.29%以上を目指すことになる」と指摘していて、今後、公明党と合意したうえで、30日に決定する与党税制改正大綱に盛り込むことにしています。", "2%台後半を目指すとする方向で最終調整に入りました。"])
|
300
|
+
end
|
301
|
+
|
302
|
+
it "Errant newlines in the middle of sentences #005" do
|
303
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "これは父の\n家です。", language: "ja")
|
304
|
+
expect(ps.segment).to eq(["これは父の家です。"])
|
305
|
+
end
|
306
|
+
end
|
307
|
+
|
308
|
+
context "Golden Rules (Arabic)" do
|
309
|
+
it "Regular punctuation #001" do
|
310
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "سؤال وجواب: ماذا حدث بعد الانتخابات الايرانية؟ طرح الكثير من التساؤلات غداة ظهور نتائج الانتخابات الرئاسية الايرانية التي أججت مظاهرات واسعة واعمال عنف بين المحتجين على النتائج ورجال الامن. يقول معارضو الرئيس الإيراني إن الطريقة التي اعلنت بها النتائج كانت مثيرة للاستغراب.", language: "ar")
|
311
|
+
expect(ps.segment).to eq(["سؤال وجواب:", "ماذا حدث بعد الانتخابات الايرانية؟", "طرح الكثير من التساؤلات غداة ظهور نتائج الانتخابات الرئاسية الايرانية التي أججت مظاهرات واسعة واعمال عنف بين المحتجين على النتائج ورجال الامن.", "يقول معارضو الرئيس الإيراني إن الطريقة التي اعلنت بها النتائج كانت مثيرة للاستغراب."])
|
312
|
+
end
|
313
|
+
|
314
|
+
it "Abbreviations #002" do
|
315
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "وقال د. ديفيد ريدي و الأطباء الذين كانوا يعالجونها في مستشفى برمنجهام إنها كانت تعاني من أمراض أخرى. وليس معروفا ما اذا كانت قد توفيت بسبب اصابتها بأنفلونزا الخنازير.", language: "ar")
|
316
|
+
expect(ps.segment).to eq(["وقال د. ديفيد ريدي و الأطباء الذين كانوا يعالجونها في مستشفى برمنجهام إنها كانت تعاني من أمراض أخرى.", "وليس معروفا ما اذا كانت قد توفيت بسبب اصابتها بأنفلونزا الخنازير."])
|
317
|
+
end
|
318
|
+
|
319
|
+
it "Numbers and Dates #003" do
|
320
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "ومن المنتظر أن يكتمل مشروع خط أنابيب نابوكو البالغ طوله 3300 كليومترا في 12/08/2014 بتكلفة تُقدر بـ 7.9 مليارات يورو أي نحو 10.9 مليارات دولار. ومن المقرر أن تصل طاقة ضخ الغاز في المشروع 31 مليار متر مكعب انطلاقا من بحر قزوين مرورا بالنمسا وتركيا ودول البلقان دون المرور على الأراضي الروسية.", language: "ar")
|
321
|
+
expect(ps.segment).to eq(["ومن المنتظر أن يكتمل مشروع خط أنابيب نابوكو البالغ طوله 3300 كليومترا في 12/08/2014 بتكلفة تُقدر بـ 7.9 مليارات يورو أي نحو 10.9 مليارات دولار.", "ومن المقرر أن تصل طاقة ضخ الغاز في المشروع 31 مليار متر مكعب انطلاقا من بحر قزوين مرورا بالنمسا وتركيا ودول البلقان دون المرور على الأراضي الروسية."])
|
322
|
+
end
|
323
|
+
|
324
|
+
it "Time #004" do
|
325
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "الاحد, 21 فبراير/ شباط, 2010, 05:01 GMT الصنداي تايمز: رئيس الموساد قد يصبح ضحية الحرب السرية التي شتنها بنفسه. العقل المنظم هو مئير داجان رئيس الموساد الإسرائيلي الذي يشتبه بقيامه باغتيال القائد الفلسطيني في حركة حماس محمود المبحوح في دبي.", language: "ar")
|
326
|
+
expect(ps.segment).to eq(["الاحد, 21 فبراير/ شباط, 2010, 05:01 GMT الصنداي تايمز:", "رئيس الموساد قد يصبح ضحية الحرب السرية التي شتنها بنفسه.", "العقل المنظم هو مئير داجان رئيس الموساد الإسرائيلي الذي يشتبه بقيامه باغتيال القائد الفلسطيني في حركة حماس محمود المبحوح في دبي."])
|
327
|
+
end
|
328
|
+
|
329
|
+
it "Comma #005" do
|
330
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "عثر في الغرفة على بعض أدوية علاج ارتفاع ضغط الدم، والقلب، زرعها عملاء الموساد كما تقول مصادر إسرائيلية، وقرر الطبيب أن الفلسطيني قد توفي وفاة طبيعية ربما إثر نوبة قلبية، وبدأت مراسم الحداد عليه", language: "ar")
|
331
|
+
expect(ps.segment).to eq(["عثر في الغرفة على بعض أدوية علاج ارتفاع ضغط الدم، والقلب،", "زرعها عملاء الموساد كما تقول مصادر إسرائيلية،", "وقرر الطبيب أن الفلسطيني قد توفي وفاة طبيعية ربما إثر نوبة قلبية،", "وبدأت مراسم الحداد عليه"])
|
332
|
+
end
|
333
|
+
end
|
334
|
+
|
335
|
+
context "Golden Rules (Italian)" do
|
336
|
+
it "Abbreviations #001" do
|
337
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Salve Sig.ra Mengoni! Come sta oggi?", language: "it")
|
338
|
+
expect(ps.segment).to eq(["Salve Sig.ra Mengoni!", "Come sta oggi?"])
|
339
|
+
end
|
340
|
+
|
341
|
+
it "Quotations #002" do
|
342
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Una lettera si può iniziare in questo modo «Il/la sottoscritto/a.».", language: "it")
|
343
|
+
expect(ps.segment).to eq(["Una lettera si può iniziare in questo modo «Il/la sottoscritto/a.»."])
|
344
|
+
end
|
345
|
+
|
346
|
+
it "Numbers #003" do
|
347
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "La casa costa 170.500.000,00€!", language: "it")
|
348
|
+
expect(ps.segment).to eq(["La casa costa 170.500.000,00€!"])
|
349
|
+
end
|
350
|
+
end
|
351
|
+
|
352
|
+
context "Golden Rules (Russian)" do
|
353
|
+
it "Abbreviations #001" do
|
354
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Объем составляет 5 куб.м.", language: "ru")
|
355
|
+
expect(ps.segment).to eq(["Объем составляет 5 куб.м."])
|
356
|
+
end
|
357
|
+
|
358
|
+
it "Quotations #002" do
|
359
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Маленькая девочка бежала и кричала: «Не видали маму?».", language: "ru")
|
360
|
+
expect(ps.segment).to eq(["Маленькая девочка бежала и кричала: «Не видали маму?»."])
|
361
|
+
end
|
362
|
+
|
363
|
+
it "Numbers #003" do
|
364
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Сегодня 27.10.14", language: "ru")
|
365
|
+
expect(ps.segment).to eq(["Сегодня 27.10.14"])
|
366
|
+
end
|
367
|
+
end
|
368
|
+
|
369
|
+
context "Golden Rules (Spanish)" do
|
370
|
+
it "Question mark to end sentence #001" do
|
371
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "¿Cómo está hoy? Espero que muy bien.", language: "es")
|
372
|
+
expect(ps.segment).to eq(["¿Cómo está hoy?", "Espero que muy bien."])
|
373
|
+
end
|
374
|
+
|
375
|
+
it "Exclamation point to end sentence #002" do
|
376
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "¡Hola señorita! Espero que muy bien.", language: "es")
|
377
|
+
expect(ps.segment).to eq(["¡Hola señorita!", "Espero que muy bien."])
|
378
|
+
end
|
379
|
+
|
380
|
+
it "Abbreviations #003" do
|
381
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Hola Srta. Ledesma. Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. Naser.", language: "es")
|
382
|
+
expect(ps.segment).to eq(["Hola Srta. Ledesma.", "Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. Naser."])
|
383
|
+
end
|
384
|
+
|
385
|
+
it "Numbers #004" do
|
386
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "¡La casa cuesta $170.500.000,00! ¡Muy costosa! Se prevé una disminución del 12.5% para el próximo año.", language: "es")
|
387
|
+
expect(ps.segment).to eq(["¡La casa cuesta $170.500.000,00!", "¡Muy costosa!", "Se prevé una disminución del 12.5% para el próximo año."])
|
388
|
+
end
|
389
|
+
|
390
|
+
it "Quotations #005" do
|
391
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "«Ninguna mente extraordinaria está exenta de un toque de demencia.», dijo Aristóteles.", language: "es")
|
392
|
+
expect(ps.segment).to eq(["«Ninguna mente extraordinaria está exenta de un toque de demencia.», dijo Aristóteles."])
|
393
|
+
end
|
394
|
+
end
|
395
|
+
|
396
|
+
context "Golden Rules (Greek)" do
|
397
|
+
it "Question mark to end sentence #001" do
|
398
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Με συγχωρείτε· πού είναι οι τουαλέτες; Τις Κυριακές δε δούλευε κανένας. το κόστος του σπιτιού ήταν £260.950,00.", language: "el")
|
399
|
+
expect(ps.segment).to eq(["Με συγχωρείτε· πού είναι οι τουαλέτες;", "Τις Κυριακές δε δούλευε κανένας.", "το κόστος του σπιτιού ήταν £260.950,00."])
|
400
|
+
end
|
401
|
+
end
|
402
|
+
|
403
|
+
context "Golden Rules (Hindi)" do
|
404
|
+
it "Full stop #001" do
|
405
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "सच्चाई यह है कि इसे कोई नहीं जानता। हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।", language: "hi")
|
406
|
+
expect(ps.segment).to eq(["सच्चाई यह है कि इसे कोई नहीं जानता।", "हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।"])
|
407
|
+
end
|
408
|
+
end
|
409
|
+
|
410
|
+
context "Golden Rules (Armenian)" do
|
411
|
+
it "Sentence ending punctuation #001" do
|
412
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Ի՞նչ ես մտածում: Ոչինչ:", language: "hy")
|
413
|
+
expect(ps.segment).to eq(["Ի՞նչ ես մտածում:", "Ոչինչ:"])
|
414
|
+
end
|
415
|
+
|
416
|
+
it "Ellipsis #002" do
|
417
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:", language: "hy")
|
418
|
+
expect(ps.segment).to eq(["Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:"])
|
419
|
+
end
|
420
|
+
|
421
|
+
it "Period is not a sentence boundary #003" do
|
422
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Այսպիսով` մոտենում ենք ավարտին: Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:", language: "hy")
|
423
|
+
expect(ps.segment).to eq(["Այսպիսով` մոտենում ենք ավարտին:", "Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:"])
|
424
|
+
end
|
425
|
+
end
|
426
|
+
|
427
|
+
context "Golden Rules (Burmese)" do
|
428
|
+
it "Sentence ending punctuation #001" do
|
429
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။၇ွင္ေနေကာင္းလား။", language: 'my')
|
430
|
+
expect(ps.segment).to eq(["ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။", "၇ွင္ေနေကာင္းလား။"])
|
431
|
+
end
|
432
|
+
end
|
433
|
+
|
434
|
+
context "Golden Rules (Amharic)" do
|
435
|
+
it "Sentence ending punctuation #001" do
|
436
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "እንደምን አለህ፧መልካም ቀን ይሁንልህ።እባክሽ ያልሽዉን ድገሚልኝ።", language: 'am')
|
437
|
+
expect(ps.segment).to eq(["እንደምን አለህ፧", "መልካም ቀን ይሁንልህ።", "እባክሽ ያልሽዉን ድገሚልኝ።"])
|
438
|
+
end
|
439
|
+
end
|
440
|
+
|
441
|
+
context "Golden Rules (Persian)" do
|
442
|
+
it "Sentence ending punctuation #001" do
|
443
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "خوشبختم، آقای رضا. شما کجایی هستید؟ من از تهران هستم.", language: 'fa')
|
444
|
+
expect(ps.segment).to eq(["خوشبختم، آقای رضا.", "شما کجایی هستید؟", "من از تهران هستم."])
|
445
|
+
end
|
446
|
+
end
|
447
|
+
|
448
|
+
context "Golden Rules (Urdu)" do
|
449
|
+
it "Sentence ending punctuation #001" do
|
450
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "کیا حال ہے؟ ميرا نام ___ ەے۔ میں حالا تاوان دےدوں؟", language: 'ur')
|
451
|
+
expect(ps.segment).to eq(["کیا حال ہے؟", "ميرا نام ___ ەے۔", "میں حالا تاوان دےدوں؟"])
|
452
|
+
end
|
453
|
+
end
|
454
|
+
end
|
455
|
+
|
456
|
+
context 'Language: English (en)' do
|
457
|
+
describe '#segment' do
|
458
|
+
it 'correctly segments text #001' do
|
459
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversations?'\nSo she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.", language: 'en')
|
460
|
+
expect(ps.segment).to eq(["Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversations?'", "So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her."])
|
461
|
+
end
|
462
|
+
|
463
|
+
it 'correctly segments text #002' do
|
464
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.\n'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)", language: 'en', doc_type: 'pdf')
|
465
|
+
expect(ps.segment).to eq(["Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next.", "First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs.", "She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.", "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"])
|
466
|
+
end
|
467
|
+
|
468
|
+
it 'correctly segments text #003' do
|
469
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.\r'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)", language: 'en', doc_type: 'pdf')
|
470
|
+
expect(ps.segment).to eq(["Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next.", "First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs.", "She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.", "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"])
|
471
|
+
end
|
472
|
+
|
473
|
+
it 'correctly segments text #004' do
|
474
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)", language: 'en')
|
475
|
+
expect(ps.segment).to eq(["'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"])
|
476
|
+
end
|
477
|
+
|
478
|
+
it 'correctly segments text #005' do
|
479
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Down, down, down. Would the fall NEVER come to an end! 'I wonder how many miles I've fallen by this time?' she said aloud.", language: 'en')
|
480
|
+
expect(ps.segment).to eq(["Down, down, down.", "Would the fall NEVER come to an end!", "'I wonder how many miles I've fallen by this time?' she said aloud."])
|
481
|
+
end
|
482
|
+
|
483
|
+
it 'correctly segments text #006' do
|
484
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it. 'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)", language: 'en')
|
485
|
+
expect(ps.segment).to eq(["Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next.", "First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs.", "She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.", "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"])
|
486
|
+
end
|
487
|
+
|
488
|
+
it 'correctly segments text #007' do
|
489
|
+
ps = PragmaticSegmenter::Segmenter.new(text: 'A minute is a unit of measurement of time or of angle. The minute is a unit of time equal to 1/60th of an hour or 60 seconds by 1. In the UTC time scale, a minute occasionally has 59 or 61 seconds; see leap second. The minute is not an SI unit; however, it is accepted for use with SI units. The symbol for minute or minutes is min. The fact that an hour contains 60 minutes is probably due to influences from the Babylonians, who used a base-60 or sexagesimal counting system. Colloquially, a min. may also refer to an indefinite amount of time substantially longer than the standardized length.', language: 'en')
|
490
|
+
expect(ps.segment).to eq(["A minute is a unit of measurement of time or of angle.", "The minute is a unit of time equal to 1/60th of an hour or 60 seconds by 1.", "In the UTC time scale, a minute occasionally has 59 or 61 seconds; see leap second.", "The minute is not an SI unit; however, it is accepted for use with SI units.", "The symbol for minute or minutes is min.", "The fact that an hour contains 60 minutes is probably due to influences from the Babylonians, who used a base-60 or sexagesimal counting system.", "Colloquially, a min. may also refer to an indefinite amount of time substantially longer than the standardized length."])
|
491
|
+
end
|
492
|
+
|
493
|
+
it 'correctly segments text #008' do
|
494
|
+
text = <<-EOF
|
495
|
+
About Me...............................................................................................5
|
496
|
+
Chapter 2 ...................................................................... 6
|
497
|
+
Three Weeks Later............................................................................ 7
|
498
|
+
Better Eating........................................................................................ 8
|
499
|
+
What's the Score?.............................................................. 9
|
500
|
+
How To Calculate the Score................... 16-17
|
501
|
+
EOF
|
502
|
+
|
503
|
+
ps = PragmaticSegmenter::Segmenter.new(text: text, language: 'en')
|
504
|
+
expect(ps.segment).to eq(["About Me", "Chapter 2", "Three Weeks Later", "Better Eating", "What's the Score?", "How To Calculate the Score"])
|
505
|
+
end
|
506
|
+
|
507
|
+
it 'correctly segments text #009' do
|
508
|
+
ps = PragmaticSegmenter::Segmenter.new(text: 'I think Jun. is a great month, said Mr. Suzuki.', language: 'en')
|
509
|
+
expect(ps.segment).to eq(["I think Jun. is a great month, said Mr. Suzuki."])
|
510
|
+
end
|
511
|
+
|
512
|
+
it 'correctly segments text #010' do
|
513
|
+
ps = PragmaticSegmenter::Segmenter.new(text: 'Jun. is a great month, said Mr. Suzuki.', language: 'en')
|
514
|
+
expect(ps.segment).to eq(["Jun. is a great month, said Mr. Suzuki."])
|
515
|
+
end
|
516
|
+
|
517
|
+
it 'correctly segments text #011' do
|
518
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "I have 1.000.00. Yay $.50 and .50! That's 600.", language: 'en')
|
519
|
+
expect(ps.segment).to eq(["I have 1.000.00.", "Yay $.50 and .50!", "That's 600."])
|
520
|
+
end
|
521
|
+
|
522
|
+
it 'correctly segments text #012' do
|
523
|
+
ps = PragmaticSegmenter::Segmenter.new(text: '1.) This is a list item with a parens.', language: 'en')
|
524
|
+
expect(ps.segment).to eq(["1.) This is a list item with a parens."])
|
525
|
+
end
|
526
|
+
|
527
|
+
it 'correctly segments text #013' do
|
528
|
+
ps = PragmaticSegmenter::Segmenter.new(text: '1. This is a list item.', language: 'en')
|
529
|
+
expect(ps.segment).to eq(['1. This is a list item.'])
|
530
|
+
end
|
531
|
+
|
532
|
+
it 'correctly segments text #014' do
|
533
|
+
ps = PragmaticSegmenter::Segmenter.new(text: 'I live in the U.S.A. I went to J.C. Penney.', language: 'en')
|
534
|
+
expect(ps.segment).to eq(["I live in the U.S.A.", "I went to J.C. Penney."])
|
535
|
+
end
|
536
|
+
|
537
|
+
it 'correctly segments text #015' do
|
538
|
+
ps = PragmaticSegmenter::Segmenter.new(text: 'His name is Alfred E. Sloan.', language: 'en')
|
539
|
+
expect(ps.segment).to eq(['His name is Alfred E. Sloan.'])
|
540
|
+
end
|
541
|
+
|
542
|
+
it 'correctly segments text #016' do
|
543
|
+
ps = PragmaticSegmenter::Segmenter.new(text: 'Q. What is his name? A. His name is Alfred E. Sloan.', language: 'en')
|
544
|
+
expect(ps.segment).to eq(['Q. What is his name?', 'A. His name is Alfred E. Sloan.'])
|
545
|
+
end
|
546
|
+
|
547
|
+
it 'correctly segments text #017' do
|
548
|
+
ps = PragmaticSegmenter::Segmenter.new(text: 'Today is 11.18.2014.', language: 'en')
|
549
|
+
expect(ps.segment).to eq(['Today is 11.18.2014.'])
|
550
|
+
end
|
551
|
+
|
552
|
+
it 'correctly segments text #018' do
|
553
|
+
ps = PragmaticSegmenter::Segmenter.new(text: 'I need you to find 3 items, e.g. a hat, a coat, and a bag.', language: 'en')
|
554
|
+
expect(ps.segment).to eq(['I need you to find 3 items, e.g. a hat, a coat, and a bag.'])
|
555
|
+
end
|
556
|
+
|
557
|
+
it 'correctly segments text #019' do
|
558
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "The game is the Giants vs. the Tigers at 10 p.m. I'm going are you?", language: 'en')
|
559
|
+
expect(ps.segment).to eq(["The game is the Giants vs. the Tigers at 10 p.m.", "I'm going are you?"])
|
560
|
+
end
|
561
|
+
|
562
|
+
it 'correctly segments text #020' do
|
563
|
+
ps = PragmaticSegmenter::Segmenter.new(text: 'He is no. 5, the shortstop.', language: 'en')
|
564
|
+
expect(ps.segment).to eq(['He is no. 5, the shortstop.'])
|
565
|
+
end
|
566
|
+
|
567
|
+
it 'correctly segments text #021' do
|
568
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Remove long strings of dots........please.", language: 'en')
|
569
|
+
expect(ps.segment).to eq(["Remove long strings of dots please."])
|
570
|
+
end
|
571
|
+
|
572
|
+
it 'correctly segments text #022' do
|
573
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "See our additional services section or contact us for pricing\n.\n\n\nPricing Additionl Info\n", language: 'en')
|
574
|
+
expect(ps.segment).to eq(["See our additional services section or contact us for pricing.", "Pricing Additionl Info"])
|
575
|
+
end
|
576
|
+
|
577
|
+
it 'correctly segments text #023' do
|
578
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "As payment for 1. above, pay us a commission fee of 0 yen and for 2. above, no fee will be paid.", language: 'en')
|
579
|
+
expect(ps.segment).to eq(["As payment for 1. above, pay us a commission fee of 0 yen and for 2. above, no fee will be paid."])
|
580
|
+
end
|
581
|
+
|
582
|
+
it 'correctly segments text #024' do
|
583
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "features\ncontact manager\nevents, activities\n", language: 'en')
|
584
|
+
expect(ps.segment).to eq(['features', 'contact manager', 'events, activities'])
|
585
|
+
end
|
586
|
+
|
587
|
+
it 'correctly segments text #025' do
|
588
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Git rid of unnecessary white space.", language: 'en')
|
589
|
+
expect(ps.segment).to eq(["Git rid of unnecessary white space."])
|
590
|
+
end
|
591
|
+
|
592
|
+
it 'correctly segments text #026' do
|
593
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "See our additional services section or contact us for pricing\n. Pricing Additionl Info", language: 'en')
|
594
|
+
expect(ps.segment).to eq(["See our additional services section or contact us for pricing.", "Pricing Additionl Info"])
|
595
|
+
end
|
596
|
+
|
597
|
+
it 'correctly segments text #027' do
|
598
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Organising your care early \nmeans you'll have months to build a good relationship with your midwife or doctor, ready for \nthe birth.", language: 'en', doc_type: 'pdf')
|
599
|
+
expect(ps.segment).to eq(["Organising your care early means you'll have months to build a good relationship with your midwife or doctor, ready for the birth."])
|
600
|
+
end
|
601
|
+
|
602
|
+
it 'correctly segments text #028' do
|
603
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "10. Get some rest \n \nYou have the best chance of having a problem-free pregnancy and a healthy baby if you follow \na few simple guidelines:", language: 'en', doc_type: 'pdf')
|
604
|
+
expect(ps.segment).to eq(["10. Get some rest", "You have the best chance of having a problem-free pregnancy and a healthy baby if you follow a few simple guidelines:"])
|
605
|
+
end
|
606
|
+
|
607
|
+
it 'correctly segments text #029' do
|
608
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "• 9. Stop smoking \n• 10. Get some rest \n \nYou have the best chance of having a problem-free pregnancy and a healthy baby if you follow \na few simple guidelines: \n\n1. Organise your pregnancy care early", language: 'en', doc_type: 'pdf')
|
609
|
+
expect(ps.segment).to eq(["• 9. Stop smoking", "• 10. Get some rest", "You have the best chance of having a problem-free pregnancy and a healthy baby if you follow a few simple guidelines:", "1. Organise your pregnancy care early"])
|
610
|
+
end
|
611
|
+
|
612
|
+
it 'correctly segments text #030' do
|
613
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "I have 600. How many do you have?", language: 'en')
|
614
|
+
expect(ps.segment).to eq(["I have 600.", "How many do you have?"])
|
615
|
+
end
|
616
|
+
|
617
|
+
it 'correctly segments text #031' do
|
618
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "\n3\n\nIntroduction\n\n", language: 'en')
|
619
|
+
expect(ps.segment).to eq(["Introduction"])
|
620
|
+
end
|
621
|
+
|
622
|
+
it 'correctly segments text #032' do
|
623
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "\nW\nA\nRN\nI\nNG\n", language: 'en')
|
624
|
+
expect(ps.segment).to eq(["WARNING"])
|
625
|
+
end
|
626
|
+
|
627
|
+
it 'correctly segments text #033' do
|
628
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "\n\n\nW\nA\nRN\nI\nNG\n \n/\n \nA\nV\nE\nR\nT\nI\nS\nE\nM\nE\nNT\n", language: 'en')
|
629
|
+
expect(ps.segment).to eq(["WARNING", "AVERTISEMENT"])
|
630
|
+
end
|
631
|
+
|
632
|
+
it 'correctly segments text #034' do
|
633
|
+
ps = PragmaticSegmenter::Segmenter.new(text: '"Help yourself, sweetie," shouted Candy and gave her the cookie.', language: 'en')
|
634
|
+
expect(ps.segment).to eq(["\"Help yourself, sweetie,\" shouted Candy and gave her the cookie."])
|
635
|
+
end
|
636
|
+
|
637
|
+
it 'correctly segments text #035' do
|
638
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Until its release, a generic mechanism was known, where the sear keeps the hammer in back position, and when one pulls the trigger, the sear slips out of hammer’s notches, the hammer falls initiating \na shot.", language: 'en')
|
639
|
+
expect(ps.segment).to eq(["Until its release, a generic mechanism was known, where the sear keeps the hammer in back position, and when one pulls the trigger, the sear slips out of hammer’s notches, the hammer falls initiating a shot."])
|
640
|
+
end
|
641
|
+
|
642
|
+
it 'correctly segments text #036' do
|
643
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "This is a test. Until its release, a generic mechanism was known, where the sear keeps the hammer in back position, and when one pulls the trigger, the sear slips out of hammer’s notches, the hammer falls initiating \na shot.", language: 'en')
|
644
|
+
expect(ps.segment).to eq(["This is a test.", "Until its release, a generic mechanism was known, where the sear keeps the hammer in back position, and when one pulls the trigger, the sear slips out of hammer’s notches, the hammer falls initiating a shot."])
|
645
|
+
end
|
646
|
+
|
647
|
+
it 'correctly segments text #037' do
|
648
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "This was because it was an offensive weapon, designed to fight at a distance up to 400 yd \n( 365.8 m ).", language: 'en')
|
649
|
+
expect(ps.segment).to eq(["This was because it was an offensive weapon, designed to fight at a distance up to 400 yd ( 365.8 m )."])
|
650
|
+
end
|
651
|
+
|
652
|
+
it 'correctly segments text #038' do
|
653
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "“Are demonstrations are evidence of the public anger and frustration at opaque environmental management and decision-making?” Others yet say: \"Should we be scared about these 'protests'?\"", language: 'en')
|
654
|
+
expect(ps.segment).to eq(["“Are demonstrations are evidence of the public anger and frustration at opaque environmental management and decision-making?”", "Others yet say: \"Should we be scared about these 'protests'?\""])
|
655
|
+
end
|
656
|
+
|
657
|
+
it 'correctly segments text #039' do
|
658
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "www.testurl.Awesome.com", language: 'en')
|
659
|
+
expect(ps.segment).to eq(["www.testurl.Awesome.com"])
|
660
|
+
end
|
661
|
+
|
662
|
+
it 'correctly segments text #040' do
|
663
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "http://testurl.Awesome.com", language: 'en')
|
664
|
+
expect(ps.segment).to eq(["http://testurl.Awesome.com"])
|
665
|
+
end
|
666
|
+
|
667
|
+
it 'correctly segments text #041' do
|
668
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "St. Michael's Church in is a church.", language: 'en')
|
669
|
+
expect(ps.segment).to eq(["St. Michael's Church in is a church."])
|
670
|
+
end
|
671
|
+
|
672
|
+
it 'correctly segments text #042' do
|
673
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "JFK Jr.'s book is on sale.", language: 'en')
|
674
|
+
expect(ps.segment).to eq(["JFK Jr.'s book is on sale."])
|
675
|
+
end
|
676
|
+
|
677
|
+
it 'correctly segments text #043' do
|
678
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "This is e.g. Mr. Smith, who talks slowly... And this is another sentence.", language: 'en')
|
679
|
+
expect(ps.segment).to eq(["This is e.g. Mr. Smith, who talks slowly...", "And this is another sentence."])
|
680
|
+
end
|
681
|
+
|
682
|
+
it 'correctly segments text #044' do
|
683
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Leave me alone!, he yelled. I am in the U.S. Army. Charles (Ind.) said he.", language: 'en')
|
684
|
+
expect(ps.segment).to eq(["Leave me alone!, he yelled.", "I am in the U.S. Army.", "Charles (Ind.) said he."])
|
685
|
+
end
|
686
|
+
|
687
|
+
it 'correctly segments text #045' do
|
688
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "This is the U.S. Senate my friends. <em>Yes.</em> <em>It is</em>!", language: 'en')
|
689
|
+
expect(ps.segment).to eq(["This is the U.S. Senate my friends.", "Yes.", "It is!"])
|
690
|
+
end
|
691
|
+
|
692
|
+
it 'correctly segments text #046' do
|
693
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Send it to P.O. box 6554", language: 'en')
|
694
|
+
expect(ps.segment).to eq(["Send it to P.O. box 6554"])
|
695
|
+
end
|
696
|
+
|
697
|
+
it 'correctly segments text #047' do
|
698
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "There were 500 cases in the U.S. The U.S. Commission asked the U.S. Government to give their opinion on the issue.", language: 'en')
|
699
|
+
expect(ps.segment).to eq(["There were 500 cases in the U.S.", "The U.S. Commission asked the U.S. Government to give their opinion on the issue."])
|
700
|
+
end
|
701
|
+
|
702
|
+
it 'correctly segments text #048' do
|
703
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co. (cited from WSJ 05/29/1987)", language: 'en')
|
704
|
+
expect(ps.segment).to eq(["CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co.", "(cited from WSJ 05/29/1987)"])
|
705
|
+
end
|
706
|
+
|
707
|
+
it 'correctly segments text #049' do
|
708
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Rolls-Royce Motor Cars Inc. said it expects its U.S. sales to remain steady at about 1,200 cars in 1990. `So what if you miss 50 tanks somewhere?' asks Rep. Norman Dicks (D., Wash.), a member of the House group that visited the talks in Vienna. Later, he recalls the words of his Marxist mentor: `The people! Theft! The holy fire!'", language: 'en')
|
709
|
+
expect(ps.segment).to eq(["Rolls-Royce Motor Cars Inc. said it expects its U.S. sales to remain steady at about 1,200 cars in 1990.", "'So what if you miss 50 tanks somewhere?' asks Rep. Norman Dicks (D., Wash.), a member of the House group that visited the talks in Vienna.", "Later, he recalls the words of his Marxist mentor: 'The people! Theft! The holy fire!'"])
|
710
|
+
end
|
711
|
+
|
712
|
+
it 'correctly segments text #050' do
|
713
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "He climbed Mt. Fuji.", language: 'en')
|
714
|
+
expect(ps.segment).to eq(["He climbed Mt. Fuji."])
|
715
|
+
end
|
716
|
+
|
717
|
+
it 'correctly segments text #051' do
|
718
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "He speaks !Xũ, !Kung, ǃʼOǃKung, !Xuun, !Kung-Ekoka, ǃHu, ǃKhung, ǃKu, ǃung, ǃXo, ǃXû, ǃXung, ǃXũ, and !Xun.", language: 'en')
|
719
|
+
expect(ps.segment).to eq(["He speaks !Xũ, !Kung, ǃʼOǃKung, !Xuun, !Kung-Ekoka, ǃHu, ǃKhung, ǃKu, ǃung, ǃXo, ǃXû, ǃXung, ǃXũ, and !Xun."])
|
720
|
+
end
|
721
|
+
|
722
|
+
it 'correctly segments text #052' do
|
723
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Test strange period.Does it segment correctly.", language: 'en')
|
724
|
+
expect(ps.segment).to eq(["Test strange period.", "Does it segment correctly."])
|
725
|
+
end
|
726
|
+
|
727
|
+
it 'correctly segments text #053' do
|
728
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "<h2 class=\"lined\">Hello</h2>\n<p>This is a test. Another test.</p>\n<div class=\"center\"><p>\n<img src=\"/images/content/example.jpg\">\n</p></div>", language: 'en')
|
729
|
+
expect(ps.segment).to eq(["Hello", "This is a test.", "Another test."])
|
730
|
+
end
|
731
|
+
|
732
|
+
it 'correctly segments text #054' do
|
733
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "This sentence ends with the psuedo-number x10. This one with the psuedo-number %3.00. One last sentence.", language: 'en')
|
734
|
+
expect(ps.segment).to eq(["This sentence ends with the psuedo-number x10.", "This one with the psuedo-number %3.00.", "One last sentence."])
|
735
|
+
end
|
736
|
+
|
737
|
+
it 'correctly segments text #055' do
|
738
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Testing mixed numbers Jahr10. And another 0.3 %11. That's weird.", language: 'en')
|
739
|
+
expect(ps.segment).to eq(["Testing mixed numbers Jahr10.", "And another 0.3 %11.", "That's weird."])
|
740
|
+
end
|
741
|
+
|
742
|
+
it 'correctly segments text #056' do
|
743
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Were Jane and co. at the party?", language: 'en')
|
744
|
+
expect(ps.segment).to eq(["Were Jane and co. at the party?"])
|
745
|
+
end
|
746
|
+
|
747
|
+
it 'correctly segments text #057' do
|
748
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "St. Michael's Church is on 5th st. near the light.", language: 'en')
|
749
|
+
expect(ps.segment).to eq(["St. Michael's Church is on 5th st. near the light."])
|
750
|
+
end
|
751
|
+
|
752
|
+
it 'correctly segments text #058' do
|
753
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Let's ask Jane and co. They should know.", language: 'en')
|
754
|
+
expect(ps.segment).to eq(["Let's ask Jane and co.", "They should know."])
|
755
|
+
end
|
756
|
+
|
757
|
+
it 'correctly segments text #059' do
|
758
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "He works at Yahoo! and Y!J.", language: 'en')
|
759
|
+
expect(ps.segment).to eq(["He works at Yahoo! and Y!J."])
|
760
|
+
end
|
761
|
+
|
762
|
+
it 'correctly segments text #060' do
|
763
|
+
ps = PragmaticSegmenter::Segmenter.new(text: 'The Scavenger Hunt ends on Dec. 31st, 2011.', language: 'en')
|
764
|
+
expect(ps.segment).to eq(['The Scavenger Hunt ends on Dec. 31st, 2011.'])
|
765
|
+
end
|
766
|
+
|
767
|
+
it 'correctly segments text #061' do
|
768
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Putter King Scavenger Hunt Trophy\n(6 3/4\" Engraved Crystal Trophy - Picture Coming Soon)\nThe Putter King team will judge the scavenger hunt and all decisions will be final. The scavenger hunt is open to anyone and everyone. The scavenger hunt ends on Dec. 31st, 2011.", language: 'en')
|
769
|
+
expect(ps.segment).to eq(["Putter King Scavenger Hunt Trophy", "(6 3/4\" Engraved Crystal Trophy - Picture Coming Soon)", "The Putter King team will judge the scavenger hunt and all decisions will be final.", "The scavenger hunt is open to anyone and everyone.", "The scavenger hunt ends on Dec. 31st, 2011."])
|
770
|
+
end
|
771
|
+
|
772
|
+
it 'correctly segments text #062' do
|
773
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Unauthorized modifications, alterations or installations of or to this equipment are prohibited and are in violation of AR 750-10. Any such unauthorized modifications, alterations or installations could result in death, injury or damage to the equipment.", language: 'en')
|
774
|
+
expect(ps.segment).to eq(["Unauthorized modifications, alterations or installations of or to this equipment are prohibited and are in violation of AR 750-10.", "Any such unauthorized modifications, alterations or installations could result in death, injury or damage to the equipment."])
|
775
|
+
end
|
776
|
+
|
777
|
+
it 'correctly segments text #063' do
|
778
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Header 1.2; Attachment Z\n\n\td. Compliance Log – Volume 12 \n\tAttachment A\n\n\te. Additional Logistics Data\n\tSection 10", language: 'en')
|
779
|
+
expect(ps.segment).to eq(["Header 1.2; Attachment Z", "d. Compliance Log – Volume 12", "Attachment A", "e. Additional Logistics Data", "Section 10"])
|
780
|
+
end
|
781
|
+
|
782
|
+
it 'correctly segments text #064' do
|
783
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "a.) The first item b.) The second item c.) The third list item", language: 'en')
|
784
|
+
expect(ps.segment).to eq(["a.) The first item", "b.) The second item", "c.) The third list item"])
|
785
|
+
end
|
786
|
+
|
787
|
+
it 'correctly segments text #065' do
|
788
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "a) The first item b) The second item c) The third list item", language: 'en')
|
789
|
+
expect(ps.segment).to eq(["a) The first item", "b) The second item", "c) The third list item"])
|
790
|
+
end
|
791
|
+
|
792
|
+
it 'correctly segments text #066' do
|
793
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Hello Wolrd. Here is a secret code AS750-10. Another sentence. Finally, this. 1. The first item 2. The second item 3. The third list item 4. Hello 5. Hello 6. Hello 7. Hello 8. Hello 9. Hello 10. Hello 11. Hello", language: 'en')
|
794
|
+
expect(ps.segment).to eq(["Hello Wolrd.", "Here is a secret code AS750-10.", "Another sentence.", "Finally, this.", "1. The first item", "2. The second item", "3. The third list item", "4. Hello", "5. Hello", "6. Hello", "7. Hello", "8. Hello", "9. Hello", "10. Hello", "11. Hello"])
|
795
|
+
end
|
796
|
+
|
797
|
+
it 'correctly segments text #067' do
|
798
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "He works for ABC Ltd. and sometimes for BCD Ltd. She works for ABC Co. and BCD Co. They work for ABC Corp. and BCD Corp.", language: 'en')
|
799
|
+
expect(ps.segment).to eq(["He works for ABC Ltd. and sometimes for BCD Ltd.", "She works for ABC Co. and BCD Co.", "They work for ABC Corp. and BCD Corp."])
|
800
|
+
end
|
801
|
+
|
802
|
+
it 'correctly segments text #068' do
|
803
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "<bpt i=\"0\" type=\"bold\"><b></bpt>J1.txt<ept i=\"1\"></b></ept>", language: 'en')
|
804
|
+
expect(ps.segment).to eq(["J1.txt"])
|
805
|
+
end
|
806
|
+
|
807
|
+
it 'correctly segments text #069' do
|
808
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "On Jan. 20, former Sen. Barack Obama became the 44th President of the U.S. Millions attended the Inauguration.", language: 'en')
|
809
|
+
expect(ps.segment).to eq(["On Jan. 20, former Sen. Barack Obama became the 44th President of the U.S.", "Millions attended the Inauguration."])
|
810
|
+
end
|
811
|
+
|
812
|
+
it 'correctly segments text #070' do
|
813
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "The U.K. Panel on enivronmental issues said it was true. Finally he left the U.K. He went to a new location.", language: 'en')
|
814
|
+
expect(ps.segment).to eq(["The U.K. Panel on enivronmental issues said it was true.", "Finally he left the U.K.", "He went to a new location."])
|
815
|
+
end
|
816
|
+
|
817
|
+
it 'correctly segments text #071' do
|
818
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "He left at 6 P.M. Travelers who didn't get the warning at 5 P.M. left later.", language: 'en')
|
819
|
+
expect(ps.segment).to eq(["He left at 6 P.M.", "Travelers who didn't get the warning at 5 P.M. left later."])
|
820
|
+
end
|
821
|
+
|
822
|
+
it 'correctly segments text #072' do
|
823
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "He left at 6 a.m. Travelers who didn't get the warning at 5 a.m. left later.", language: 'en')
|
824
|
+
expect(ps.segment).to eq(["He left at 6 a.m.", "Travelers who didn't get the warning at 5 a.m. left later."])
|
825
|
+
end
|
826
|
+
|
827
|
+
it 'correctly segments text #073' do
|
828
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "He left at 6 A.M. Travelers who didn't get the warning at 5 A.M. left later.", language: 'en')
|
829
|
+
expect(ps.segment).to eq(["He left at 6 A.M.", "Travelers who didn't get the warning at 5 A.M. left later."])
|
830
|
+
end
|
831
|
+
|
832
|
+
it 'correctly segments text #074' do
|
833
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Hello World. My name is Jonas. What is your name? My name is Jonas. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item \rIt was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”. \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .", language: nil)
|
834
|
+
expect(ps.segment).to eq(["Hello World.", "My name is Jonas.", "What is your name?", "My name is Jonas.", "There it is!", "I found it.", "My name is Jonas E. Smith.", "Please turn to p. 55.", "Were Jane and co. at the party?", "They closed the deal with Pitt, Briggs & Co. at noon.", "Let's ask Jane and co.", "They should know.", "They closed the deal with Pitt, Briggs & Co.", "It closed yesterday.", "I can see Mt. Fuji from here.", "St. Michael's Church is on 5th st. near the light.", "That is JFK Jr.'s book.", "I visited the U.S.A. last year.", "I live in the E.U.", "How about you?", "I live in the U.S.", "How about you?", "I work for the U.S. Government in Virginia.", "I have lived in the U.S. for 20 years.", "She has $100.00 in her bag.", "She has $100.00.", "It is in her bag.", "He teaches science (He previously worked for 5 years as an engineer.) at the local University.", "Her email is Jane.Doe@example.com.", "I sent her an email.", "The site is: https://www.example.50.com/new-site/awesome_content.html.", "Please check it out.", "She turned to him, 'This is great.' she said.", "She turned to him, \"This is great.\" she said.", "She turned to him, \"This is great.\"", "She held the book out to show him.", "Hello!!", "Long time no see.", "Hello??", "Who is there?", "Hello!?", "Is that you?", "Hello?!", "Is that you?", "1.) The first item", "2.) The second item", "1.) The first item.", "2.) The second item.", "1) The first item", "2) The second item", "1) The first item.", "2) The second item.", "1. The first item", "2. The second item", "1. The first item.", "2. The second item.", "• 9. The first item", "• 10. The second item", "⁃9. The first item", "⁃10. The second item", "a. The first item", "b. The second item", "c. The third list item", "It was a cold night in the city.", "features", "contact manager", "events, activities", "You can find it at N°. 1026.253.553.", "That is where the treasure is.", "She works at Yahoo! in the accounting department.", "We make a good team, you and I.", "Did you see Albert I. Jones yesterday?", "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”.", "\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55).", "If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .", "Next sentence.", "I never meant that....", "She left the store.", "I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.", "One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.", ". . . The practice was not abandoned. . . ."])
|
835
|
+
end
|
836
|
+
|
837
|
+
it 'correctly segments text #075' do
|
838
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "His name is Mark E. Smith. a. here it is b. another c. one more\n They went to the store. It was John A. Smith. She was Jane B. Smith.", language: "en")
|
839
|
+
expect(ps.segment).to eq(["His name is Mark E. Smith.", "a. here it is", "b. another", "c. one more", "They went to the store.", "It was John A. Smith.", "She was Jane B. Smith."])
|
840
|
+
end
|
841
|
+
|
842
|
+
it 'correctly segments text #076' do
|
843
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "a) here it is b) another c) one more\n They went to the store. w) hello x) hello y) hello", language: "en")
|
844
|
+
expect(ps.segment).to eq(["a) here it is", "b) another", "c) one more", "They went to the store.", "w) hello", "x) hello", "y) hello"])
|
845
|
+
end
|
846
|
+
|
847
|
+
it 'correctly segments text #077' do
|
848
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Hello{b^>1<b^} hello{b^>1<b^}.", language: "en")
|
849
|
+
expect(ps.segment).to eq(["Hello hello."])
|
850
|
+
end
|
851
|
+
|
852
|
+
it 'correctly segments text #078' do
|
853
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "'Well?' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs? How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)", language: 'en')
|
854
|
+
expect(ps.segment).to eq(["'Well?' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs? How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"])
|
855
|
+
end
|
856
|
+
|
857
|
+
it 'correctly segments text #079' do
|
858
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Leave me alone! he yelled. I am in the U.S. Army. Charles (Ind.) said he.", language: 'en')
|
859
|
+
expect(ps.segment).to eq(["Leave me alone! he yelled.", "I am in the U.S. Army.", "Charles (Ind.) said he."])
|
860
|
+
end
|
861
|
+
|
862
|
+
it 'correctly segments text #080' do
|
863
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "She turned to him, “This is great.” She held the book out to show him.", language: 'en')
|
864
|
+
expect(ps.segment).to eq(["She turned to him, “This is great.”", "She held the book out to show him."])
|
865
|
+
end
|
866
|
+
end
|
867
|
+
end
|
868
|
+
|
869
|
+
context 'Language: Japanese (ja)' do
|
870
|
+
describe '#segment' do
|
871
|
+
it 'correctly segments text #001' do
|
872
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "これは山です \nこれは山です \nこれは山です(「これは山です」) \nこれは山です(これは山です「これは山です」)これは山です・これは山です、これは山です。 \nこれは山です(これは山です。これは山です)。これは山です、これは山です、これは山です、これは山です(これは山です。これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です。 \n1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です。 \n※1 これは山です。 \n2.)これは山です、これは山です、これは山です、これは山です。 \n3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です。 \n4.)これは山です、これは山です(これは山です、これは山です、これは山です。これは山です)これは山です、これは山です(これは山です、これは山です)。 \nこれは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です。 \n(1) 「これは山です」(これは山です:0円) (※1) \n① これは山です", language: 'ja')
|
873
|
+
expect(ps.segment).to eq(["これは山です", "これは山です", "これは山です(「これは山です」)", "これは山です(これは山です「これは山です」)これは山です・これは山です、これは山です。", "これは山です(これは山です。これは山です)。", "これは山です、これは山です、これは山です、これは山です(これは山です。これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です。", "1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です。", "※1 これは山です。", "2.)これは山です、これは山です、これは山です、これは山です。", "3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です。", "4.)これは山です、これは山です(これは山です、これは山です、これは山です。これは山です)これは山です、これは山です(これは山です、これは山です)。", "これは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です。", "(1) 「これは山です」(これは山です:0円) (※1)", "① これは山です"])
|
874
|
+
end
|
875
|
+
|
876
|
+
it 'correctly segments text #002' do
|
877
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "フフーの\n主たる債務", language: 'ja')
|
878
|
+
expect(ps.segment).to eq(["フフーの主たる債務"])
|
879
|
+
end
|
880
|
+
|
881
|
+
it 'correctly segments text #003' do
|
882
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "これは山です \nこれは山です \nこれは山です(「これは山です」) \nこれは山です(これは山です「これは山です」)これは山です・これは山です、これは山です. \nこれは山です(これは山です.これは山です).これは山です、これは山です、これは山です、これは山です(これは山です.これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です. \n1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です. \n※1 これは山です. \n2.)これは山です、これは山です、これは山です、これは山です. \n3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です. \n4.)これは山です、これは山です(これは山です、これは山です、これは山です.これは山です)これは山です、これは山です(これは山です、これは山です). \nこれは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です. \n(1) 「これは山です」(これは山です:0円) (※1) \n① これは山です", language: 'ja')
|
883
|
+
expect(ps.segment).to eq(["これは山です", "これは山です", "これは山です(「これは山です」)", "これは山です(これは山です「これは山です」)これは山です・これは山です、これは山です.", "これは山です(これは山です.これは山です).", "これは山です、これは山です、これは山です、これは山です(これは山です.これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です.", "1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です.", "※1 これは山です.", "2.)これは山です、これは山です、これは山です、これは山です.", "3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です.", "4.)これは山です、これは山です(これは山です、これは山です、これは山です.これは山です)これは山です、これは山です(これは山です、これは山です).", "これは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です.", "(1) 「これは山です」(これは山です:0円) (※1)", "① これは山です"])
|
884
|
+
end
|
885
|
+
|
886
|
+
it 'correctly segments text #004' do
|
887
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "これは山です \nこれは山です \nこれは山です(「これは山です」) \nこれは山です(これは山です「これは山です」)これは山です・これは山です、これは山です! \nこれは山です(これは山です!これは山です)!これは山です、これは山です、これは山です、これは山です(これは山です!これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です! \n1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です! \n※1 これは山です! \n2.)これは山です、これは山です、これは山です、これは山です! \n3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です! \n4.)これは山です、これは山です(これは山です、これは山です、これは山です!これは山です)これは山です、これは山です(これは山です、これは山です)! \nこれは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です! \n(1) 「これは山です」(これは山です:0円) (※1) \n① これは山です", language: 'ja')
|
888
|
+
expect(ps.segment).to eq(["これは山です", "これは山です", "これは山です(「これは山です」)", "これは山です(これは山です「これは山です」)これは山です・これは山です、これは山です!", "これは山です(これは山です!これは山です)!", "これは山です、これは山です、これは山です、これは山です(これは山です!これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です!", "1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です!", "※1 これは山です!", "2.)これは山です、これは山です、これは山です、これは山です!", "3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です!", "4.)これは山です、これは山です(これは山です、これは山です、これは山です!これは山です)これは山です、これは山です(これは山です、これは山です)!", "これは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です!", "(1) 「これは山です」(これは山です:0円) (※1)", "① これは山です"])
|
889
|
+
end
|
890
|
+
end
|
891
|
+
end
|
892
|
+
|
893
|
+
context 'Language: Arabic (ar)' do
|
894
|
+
# Thanks to Mahmoud Holmez for the Arabic test examples.
|
895
|
+
describe '#segment' do
|
896
|
+
it 'correctly segments text #001' do
|
897
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "سؤال وجواب: ماذا حدث بعد الانتخابات الايرانية؟ طرح الكثير من التساؤلات غداة ظهور نتائج الانتخابات الرئاسية الايرانية التي أججت مظاهرات واسعة واعمال عنف بين المحتجين على النتائج ورجال الامن. يقول معارضو الرئيس الإيراني إن الطريقة التي اعلنت بها النتائج كانت مثيرة للاستغراب.", language: 'ar')
|
898
|
+
expect(ps.segment).to eq(["سؤال وجواب:", "ماذا حدث بعد الانتخابات الايرانية؟", "طرح الكثير من التساؤلات غداة ظهور نتائج الانتخابات الرئاسية الايرانية التي أججت مظاهرات واسعة واعمال عنف بين المحتجين على النتائج ورجال الامن.", "يقول معارضو الرئيس الإيراني إن الطريقة التي اعلنت بها النتائج كانت مثيرة للاستغراب."])
|
899
|
+
end
|
900
|
+
|
901
|
+
it 'correctly segments text #002' do
|
902
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "وقال د. ديفيد ريدي و الأطباء الذين كانوا يعالجونها في مستشفى برمنجهام إنها كانت تعاني من أمراض أخرى. وليس معروفا ما اذا كانت قد توفيت بسبب اصابتها بأنفلونزا الخنازير.", language: 'ar')
|
903
|
+
expect(ps.segment).to eq(["وقال د. ديفيد ريدي و الأطباء الذين كانوا يعالجونها في مستشفى برمنجهام إنها كانت تعاني من أمراض أخرى.", "وليس معروفا ما اذا كانت قد توفيت بسبب اصابتها بأنفلونزا الخنازير."])
|
904
|
+
end
|
905
|
+
|
906
|
+
it 'correctly segments text #003' do
|
907
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "ومن المنتظر أن يكتمل مشروع خط أنابيب نابوكو البالغ طوله 3300 كليومترا في 12/08/2014 بتكلفة تُقدر بـ 7.9 مليارات يورو أي نحو 10.9 مليارات دولار. ومن المقرر أن تصل طاقة ضخ الغاز في المشروع 31 مليار متر مكعب انطلاقا من بحر قزوين مرورا بالنمسا وتركيا ودول البلقان دون المرور على الأراضي الروسية.", language: 'ar')
|
908
|
+
expect(ps.segment).to eq(["ومن المنتظر أن يكتمل مشروع خط أنابيب نابوكو البالغ طوله 3300 كليومترا في 12/08/2014 بتكلفة تُقدر بـ 7.9 مليارات يورو أي نحو 10.9 مليارات دولار.", "ومن المقرر أن تصل طاقة ضخ الغاز في المشروع 31 مليار متر مكعب انطلاقا من بحر قزوين مرورا بالنمسا وتركيا ودول البلقان دون المرور على الأراضي الروسية."])
|
909
|
+
end
|
910
|
+
|
911
|
+
it 'correctly segments text #004' do
|
912
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "الاحد, 21 فبراير/ شباط, 2010, 05:01 GMT الصنداي تايمز: رئيس الموساد قد يصبح ضحية الحرب السرية التي شتنها بنفسه. العقل المنظم هو مئير داجان رئيس الموساد الإسرائيلي الذي يشتبه بقيامه باغتيال القائد الفلسطيني في حركة حماس محمود المبحوح في دبي.", language: 'ar')
|
913
|
+
expect(ps.segment).to eq(["الاحد, 21 فبراير/ شباط, 2010, 05:01 GMT الصنداي تايمز:", "رئيس الموساد قد يصبح ضحية الحرب السرية التي شتنها بنفسه.", "العقل المنظم هو مئير داجان رئيس الموساد الإسرائيلي الذي يشتبه بقيامه باغتيال القائد الفلسطيني في حركة حماس محمود المبحوح في دبي."])
|
914
|
+
end
|
915
|
+
|
916
|
+
it 'correctly segments text #005' do
|
917
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "عثر في الغرفة على بعض أدوية علاج ارتفاع ضغط الدم، والقلب، زرعها عملاء الموساد كما تقول مصادر إسرائيلية، وقرر الطبيب أن الفلسطيني قد توفي وفاة طبيعية ربما إثر نوبة قلبية، وبدأت مراسم الحداد عليه", language: 'ar')
|
918
|
+
expect(ps.segment).to eq(["عثر في الغرفة على بعض أدوية علاج ارتفاع ضغط الدم، والقلب،", "زرعها عملاء الموساد كما تقول مصادر إسرائيلية،", "وقرر الطبيب أن الفلسطيني قد توفي وفاة طبيعية ربما إثر نوبة قلبية،", "وبدأت مراسم الحداد عليه"])
|
919
|
+
end
|
920
|
+
end
|
921
|
+
end
|
922
|
+
|
923
|
+
context 'Language: Italian (it)' do
|
924
|
+
# Thanks to Davide Fornelli for the Italian test examples.
|
925
|
+
describe '#segment' do
|
926
|
+
|
927
|
+
it 'correctly segments text #001' do
|
928
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Salve Sig.ra Mengoni! Come sta oggi?", language: 'it')
|
929
|
+
expect(ps.segment).to eq(["Salve Sig.ra Mengoni!", "Come sta oggi?"])
|
930
|
+
end
|
931
|
+
|
932
|
+
it 'correctly segments text #002' do
|
933
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Buongiorno! Sono l'Ing. Mengozzi. È presente l'Avv. Cassioni?", language: 'it')
|
934
|
+
expect(ps.segment).to eq(["Buongiorno!", "Sono l'Ing. Mengozzi.", "È presente l'Avv. Cassioni?"])
|
935
|
+
end
|
936
|
+
|
937
|
+
it 'correctly segments text #003' do
|
938
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Mi fissi un appuntamento per mar. 23 Nov.. Grazie.", language: 'it')
|
939
|
+
expect(ps.segment).to eq(["Mi fissi un appuntamento per mar. 23 Nov..", "Grazie."])
|
940
|
+
end
|
941
|
+
|
942
|
+
it 'correctly segments text #004' do
|
943
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Ecco il mio tel.:01234567. Mi saluti la Sig.na Manelli. Arrivederci.", language: 'it')
|
944
|
+
expect(ps.segment).to eq(["Ecco il mio tel.:01234567.", "Mi saluti la Sig.na Manelli.", "Arrivederci."])
|
945
|
+
end
|
946
|
+
|
947
|
+
it 'correctly segments text #005' do
|
948
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "La centrale meteor. si è guastata. Gli idraul. son dovuti andare a sistemarla.", language: 'it')
|
949
|
+
expect(ps.segment).to eq(["La centrale meteor. si è guastata.", "Gli idraul. son dovuti andare a sistemarla."])
|
950
|
+
end
|
951
|
+
|
952
|
+
it 'correctly segments text #006' do
|
953
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Hanno creato un algoritmo allo st. d. arte. Si ringrazia lo psicol. Serenti.", language: 'it')
|
954
|
+
expect(ps.segment).to eq(["Hanno creato un algoritmo allo st. d. arte.", "Si ringrazia lo psicol. Serenti."])
|
955
|
+
end
|
956
|
+
|
957
|
+
it 'correctly segments text #007' do
|
958
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Chiamate il V.Cte. delle F.P., adesso!", language: 'it')
|
959
|
+
expect(ps.segment).to eq(["Chiamate il V.Cte. delle F.P., adesso!"])
|
960
|
+
end
|
961
|
+
|
962
|
+
it 'correctly segments text #008' do
|
963
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Giancarlo ha sostenuto l'esame di econ. az..", language: 'it')
|
964
|
+
expect(ps.segment).to eq(["Giancarlo ha sostenuto l'esame di econ. az.."])
|
965
|
+
end
|
966
|
+
|
967
|
+
it 'correctly segments text #009' do
|
968
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Stava viaggiando a 90 km/h verso la provincia di TR quando il Dott. Mesini ha sentito un rumore e si fermò!", language: 'it')
|
969
|
+
expect(ps.segment).to eq(["Stava viaggiando a 90 km/h verso la provincia di TR quando il Dott. Mesini ha sentito un rumore e si fermò!"])
|
970
|
+
end
|
971
|
+
|
972
|
+
it 'correctly segments text #010' do
|
973
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Egregio Dir. Amm., le faccio sapere che l'ascensore non funziona.", language: 'it')
|
974
|
+
expect(ps.segment).to eq(["Egregio Dir. Amm., le faccio sapere che l'ascensore non funziona."])
|
975
|
+
end
|
976
|
+
|
977
|
+
it 'correctly segments text #011' do
|
978
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Stava mangiando e/o dormendo.", language: 'it')
|
979
|
+
expect(ps.segment).to eq(["Stava mangiando e/o dormendo."])
|
980
|
+
end
|
981
|
+
|
982
|
+
it 'correctly segments text #012' do
|
983
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Ricordatevi che dom 25 Set. sarà il compleanno di Maria; dovremo darle un regalo.", language: 'it')
|
984
|
+
expect(ps.segment).to eq(["Ricordatevi che dom 25 Set. sarà il compleanno di Maria; dovremo darle un regalo."])
|
985
|
+
end
|
986
|
+
|
987
|
+
it 'correctly segments text #013' do
|
988
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "La politica è quella della austerità; quindi verranno fatti tagli agli sprechi.", language: 'it')
|
989
|
+
expect(ps.segment).to eq(["La politica è quella della austerità; quindi verranno fatti tagli agli sprechi."])
|
990
|
+
end
|
991
|
+
|
992
|
+
it 'correctly segments text #014' do
|
993
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Nel tribunale, l'Avv. Fabrizi ha urlato \"Io, l'illustrissimo Fabrizi, vi si oppone!\".", language: 'it')
|
994
|
+
expect(ps.segment).to eq(["Nel tribunale, l'Avv. Fabrizi ha urlato \"Io, l'illustrissimo Fabrizi, vi si oppone!\"."])
|
995
|
+
end
|
996
|
+
|
997
|
+
it 'correctly segments text #015' do
|
998
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Le parti fisiche di un computer (ad es. RAM, CPU, tastiera, mouse, etc.) sono definiti HW.", language: 'it')
|
999
|
+
expect(ps.segment).to eq(["Le parti fisiche di un computer (ad es. RAM, CPU, tastiera, mouse, etc.) sono definiti HW."])
|
1000
|
+
end
|
1001
|
+
|
1002
|
+
it 'correctly segments text #016' do
|
1003
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "La parola 'casa' è sinonimo di abitazione.", language: 'it')
|
1004
|
+
expect(ps.segment).to eq(["La parola 'casa' è sinonimo di abitazione."])
|
1005
|
+
end
|
1006
|
+
|
1007
|
+
it 'correctly segments text #017' do
|
1008
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "La \"Mulino Bianco\" fa alimentari pre-confezionati.", language: 'it')
|
1009
|
+
expect(ps.segment).to eq(["La \"Mulino Bianco\" fa alimentari pre-confezionati."])
|
1010
|
+
end
|
1011
|
+
|
1012
|
+
it 'correctly segments text #018' do
|
1013
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "\"Ei fu. Siccome immobile / dato il mortal sospiro / stette la spoglia immemore / orba di tanto spiro / [...]\" (Manzoni).", language: 'it')
|
1014
|
+
expect(ps.segment).to eq(["\"Ei fu. Siccome immobile / dato il mortal sospiro / stette la spoglia immemore / orba di tanto spiro / [...]\" (Manzoni)."])
|
1015
|
+
end
|
1016
|
+
|
1017
|
+
it 'correctly segments text #019' do
|
1018
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Una lettera si può iniziare in questo modo «Il/la sottoscritto/a ... nato/a a ...».", language: 'it')
|
1019
|
+
expect(ps.segment).to eq(["Una lettera si può iniziare in questo modo «Il/la sottoscritto/a ... nato/a a ...»."])
|
1020
|
+
end
|
1021
|
+
|
1022
|
+
it 'correctly segments text #020' do
|
1023
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Per casa, in uno degli esercizi per i bambini c'era \"3 + (14/7) = 5\"", language: 'it')
|
1024
|
+
expect(ps.segment).to eq(["Per casa, in uno degli esercizi per i bambini c'era \"3 + (14/7) = 5\""])
|
1025
|
+
end
|
1026
|
+
|
1027
|
+
it 'correctly segments text #021' do
|
1028
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Ai bambini è stato chiesto di fare \"4:2*2\"", language: 'it')
|
1029
|
+
expect(ps.segment).to eq(["Ai bambini è stato chiesto di fare \"4:2*2\""])
|
1030
|
+
end
|
1031
|
+
|
1032
|
+
it 'correctly segments text #022' do
|
1033
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "La maestra esclamò: \"Bambini, quanto fa '2/3 + 4/3?'\".", language: 'it')
|
1034
|
+
expect(ps.segment).to eq(["La maestra esclamò: \"Bambini, quanto fa \'2/3 + 4/3?\'\"."])
|
1035
|
+
end
|
1036
|
+
|
1037
|
+
it 'correctly segments text #023' do
|
1038
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Il motore misurava 120°C.", language: 'it')
|
1039
|
+
expect(ps.segment).to eq(["Il motore misurava 120°C."])
|
1040
|
+
end
|
1041
|
+
|
1042
|
+
it 'correctly segments text #024' do
|
1043
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Il volume era di 3m³.", language: 'it')
|
1044
|
+
expect(ps.segment).to eq(["Il volume era di 3m³."])
|
1045
|
+
end
|
1046
|
+
|
1047
|
+
it 'correctly segments text #025' do
|
1048
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "La stanza misurava 20m².", language: 'it')
|
1049
|
+
expect(ps.segment).to eq(["La stanza misurava 20m²."])
|
1050
|
+
end
|
1051
|
+
|
1052
|
+
it 'correctly segments text #026' do
|
1053
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "1°C corrisponde a 33.8°F.", language: 'it')
|
1054
|
+
expect(ps.segment).to eq(["1°C corrisponde a 33.8°F."])
|
1055
|
+
end
|
1056
|
+
|
1057
|
+
it 'correctly segments text #027' do
|
1058
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Oggi è il 27-10-14.", language: 'it')
|
1059
|
+
expect(ps.segment).to eq(["Oggi è il 27-10-14."])
|
1060
|
+
end
|
1061
|
+
|
1062
|
+
it 'correctly segments text #028' do
|
1063
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "La casa costa 170.500.000,00€!", language: 'it')
|
1064
|
+
expect(ps.segment).to eq(["La casa costa 170.500.000,00€!"])
|
1065
|
+
end
|
1066
|
+
|
1067
|
+
it 'correctly segments text #029' do
|
1068
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Il corridore 103 è arrivato 4°.", language: 'it')
|
1069
|
+
expect(ps.segment).to eq(["Il corridore 103 è arrivato 4°."])
|
1070
|
+
end
|
1071
|
+
|
1072
|
+
it 'correctly segments text #030' do
|
1073
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Oggi è il 27/10/2014.", language: 'it')
|
1074
|
+
expect(ps.segment).to eq(["Oggi è il 27/10/2014."])
|
1075
|
+
end
|
1076
|
+
|
1077
|
+
it 'correctly segments text #031' do
|
1078
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Ecco l'elenco: 1.gelato, 2.carne, 3.riso.", language: 'it')
|
1079
|
+
expect(ps.segment).to eq(["Ecco l'elenco: 1.gelato, 2.carne, 3.riso."])
|
1080
|
+
end
|
1081
|
+
|
1082
|
+
it 'correctly segments text #032' do
|
1083
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Devi comprare : 1)pesce 2)sale.", language: 'it')
|
1084
|
+
expect(ps.segment).to eq(["Devi comprare : 1)pesce 2)sale."])
|
1085
|
+
end
|
1086
|
+
|
1087
|
+
it 'correctly segments text #033' do
|
1088
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "La macchina viaggiava a 100 km/h.", language: 'it')
|
1089
|
+
expect(ps.segment).to eq(["La macchina viaggiava a 100 km/h."])
|
1090
|
+
end
|
1091
|
+
end
|
1092
|
+
end
|
1093
|
+
|
1094
|
+
context 'Language: Russian (ru)' do
  # Thanks to Anastasiia Tsvitailo for the Russian test examples.
  describe '#segment' do
    # Ordered list of [input text, expected segments] pairs. The position of a
    # pair in the list determines the generated example name (#001, #002, ...),
    # so entries must not be reordered.
    russian_examples = [
      # 001
      ["Маленькая девочка бежала и кричала: «Не видали маму?».",
       ["Маленькая девочка бежала и кричала: «Не видали маму?»."]],
      # 002
      ["«Я приду поздно», — сказал Андрей.",
       ["«Я приду поздно», — сказал Андрей."]],
      # 003
      ["«К чему ты готовишься? – спросила мама. – Завтра ведь выходной».",
       ["«К чему ты готовишься? – спросила мама. – Завтра ведь выходной»."]],
      # 004
      ["По словам Пушкина, «Привычка свыше дана, замена счастью она».",
       ["По словам Пушкина, «Привычка свыше дана, замена счастью она»."]],
      # 005
      ["Он сказал: «Я очень устал», и сразу же замолчал.",
       ["Он сказал: «Я очень устал», и сразу же замолчал."]],
      # 006
      ["Мне стало как-то ужасно грустно в это мгновение; однако что-то похожее на смех зашевелилось в душе моей.",
       ["Мне стало как-то ужасно грустно в это мгновение; однако что-то похожее на смех зашевелилось в душе моей."]],
      # 007
      ["Шухов как был в ватных брюках, не снятых на ночь (повыше левого колена их тоже был пришит затасканный, погрязневший лоскут, и на нем выведен черной, уже поблекшей краской номер Щ-854), надел телогрейку…",
       ["Шухов как был в ватных брюках, не снятых на ночь (повыше левого колена их тоже был пришит затасканный, погрязневший лоскут, и на нем выведен черной, уже поблекшей краской номер Щ-854), надел телогрейку…"]],
      # 008
      ["Слово «дом» является синонимом жилища",
       ["Слово «дом» является синонимом жилища"]],
      # 009
      ["В Санкт-Петербург на гастроли приехал театр «Современник»",
       ["В Санкт-Петербург на гастроли приехал театр «Современник»"]],
      # 010
      ["Машина едет со скоростью 100 км/ч.",
       ["Машина едет со скоростью 100 км/ч."]],
      # 011
      ["Я поем и/или лягу спать.",
       ["Я поем и/или лягу спать."]],
      # 012
      ["Он не мог справиться с примером \"3 + (14:7) = 5\"",
       ["Он не мог справиться с примером \"3 + (14:7) = 5\""]],
      # 013
      ["Вот список: 1.мороженое, 2.мясо, 3.рис.",
       ["Вот список: 1.мороженое, 2.мясо, 3.рис."]],
      # 014
      ["Квартира 234 находится на 4-ом этаже.",
       ["Квартира 234 находится на 4-ом этаже."]],
      # 015
      ["В это время года температура может подниматься до 40°C.",
       ["В это время года температура может подниматься до 40°C."]],
      # 016
      ["Объем составляет 5м³.",
       ["Объем составляет 5м³."]],
      # 017
      ["Объем составляет 5 куб.м.",
       ["Объем составляет 5 куб.м."]],
      # 018
      ["Площадь комнаты 14м².",
       ["Площадь комнаты 14м²."]],
      # 019
      ["Площадь комнаты 14 кв.м.",
       ["Площадь комнаты 14 кв.м."]],
      # 020
      ["1°C соответствует 33.8°F.",
       ["1°C соответствует 33.8°F."]],
      # 021
      ["Сегодня 27.10.14",
       ["Сегодня 27.10.14"]],
      # 022
      ["Сегодня 27 октября 2014 года.",
       ["Сегодня 27 октября 2014 года."]],
      # 023
      ["Эта машина стоит 150 000 дол.!",
       ["Эта машина стоит 150 000 дол.!"]],
      # 024
      ["Эта машина стоит $150 000!",
       ["Эта машина стоит $150 000!"]],
      # 025
      ["Вот номер моего телефона: +39045969798. Передавайте привет г-ну Шапочкину. До свидания.",
       ["Вот номер моего телефона: +39045969798.", "Передавайте привет г-ну Шапочкину.", "До свидания."]],
      # 026
      ["Постойте, разве можно указывать цены в у.е.!",
       ["Постойте, разве можно указывать цены в у.е.!"]],
      # 027
      ["Едем на скорости 90 км/ч в сторону пгт. Брагиновка, о котором мы так много слышали по ТВ!",
       ["Едем на скорости 90 км/ч в сторону пгт. Брагиновка, о котором мы так много слышали по ТВ!"]],
      # 028
      ["Д-р ветеринарных наук А. И. Семенов и пр. выступали на этом семинаре.",
       ["Д-р ветеринарных наук А. И. Семенов и пр. выступали на этом семинаре."]],
      # 029
      ["Уважаемый проф. Семенов! Просьба до 20.10 сдать отчет на кафедру.",
       ["Уважаемый проф. Семенов!", "Просьба до 20.10 сдать отчет на кафедру."]],
      # 030
      ["Первоначальная стоимость этого комплекта 30 долл., но сейчас действует скидка. Предъявите дисконтную карту, пожалуйста!",
       ["Первоначальная стоимость этого комплекта 30 долл., но сейчас действует скидка.", "Предъявите дисконтную карту, пожалуйста!"]],
      # 031
      ["Виктор съел пол-лимона и ушел по-английски из дома на ул. 1 Мая.",
       ["Виктор съел пол-лимона и ушел по-английски из дома на ул. 1 Мая."]],
      # 032
      ["Напоминаю Вам, что 25.10 день рождения у Маши К., нужно будет купить ей подарок.",
       ["Напоминаю Вам, что 25.10 день рождения у Маши К., нужно будет купить ей подарок."]],
      # 033
      ["В 2010-2012 гг. Виктор посещал г. Волгоград неоднократно.",
       ["В 2010-2012 гг. Виктор посещал г. Волгоград неоднократно."]],
      # 034
      ["Маленькая девочка бежала и кричала: «Не видали маму?»",
       ["Маленькая девочка бежала и кричала: «Не видали маму?»"]],
      # 035
      ["Кв. 234 находится на 4 этаже.",
       ["Кв. 234 находится на 4 этаже."]],
      # 036 (NOTE: same input and expectation as #015)
      ["В это время года температура может подниматься до 40°C.",
       ["В это время года температура может подниматься до 40°C."]],
      # 037
      ["Нужно купить 1)рыбу 2)соль.",
       ["Нужно купить 1)рыбу 2)соль."]],
      # 038 (NOTE: same input and expectation as #010)
      ["Машина едет со скоростью 100 км/ч.",
       ["Машина едет со скоростью 100 км/ч."]],
      # 039
      ["Л.Н. Толстой написал \"Войну и мир\". Кроме Волконских, Л. Н. Толстой состоял в близком родстве с некоторыми другими аристократическими родами. Дом, где родился Л.Н.Толстой, 1898 г. В 1854 году дом продан по распоряжению писателя на вывоз в село Долгое.",
       ["Л.Н. Толстой написал \"Войну и мир\".", "Кроме Волконских, Л. Н. Толстой состоял в близком родстве с некоторыми другими аристократическими родами.", "Дом, где родился Л.Н.Толстой, 1898 г. В 1854 году дом продан по распоряжению писателя на вывоз в село Долгое."]]
    ]

    # Generate one example per pair, named exactly like the hand-written
    # originals ('correctly segments text #NNN', zero-padded to three digits).
    russian_examples.each_with_index do |(text, expected_segments), position|
      it(format('correctly segments text #%03d', position + 1)) do
        segmenter = PragmaticSegmenter::Segmenter.new(text: text, language: 'ru')
        expect(segmenter.segment).to eq(expected_segments)
      end
    end
  end
end
|
1293
|
+
|
1294
|
+
context 'Language: German (de)' do
  # Thanks to Silvia Busse for the German test examples.
  describe '#segment' do
    # Ordered list of [input text, expected segments, extra Segmenter options].
    # The third element is optional and is only present for examples that pass
    # a doc_type. The position of an entry determines the generated example
    # name (#001, #002, ...), so entries must not be reordered.
    german_examples = [
      # 001
      ["\n \n\n http:www.babycentre.co.uk/midwives \n\n \n\n \n\n10 steps to a healthy pregnancy (German) \n\n10 Schritte zu einer gesunden Schwangerschaft \n \n• 1. Planen und organisieren Sie die Zeit der Schwangerschaft frühzeitig! \n• 2. Essen Sie gesund! \n• 3. Seien Sie achtsam bei der Auswahl der Nahrungsmittel! \n• 4. Nehmen Sie zusätzlich Folsäurepräparate und essen Sie Fisch! \n• 5. Treiben Sie regelmäßig Sport! \n• 6. Beginnen Sie mit Übungen für die Beckenbodenmuskulatur! \n• 7. Reduzieren Sie Ihren Alkoholgenuss! \n• 8. Reduzieren Sie Ihren Koffeingenuß! \n• 9. Hören Sie mit dem Rauchen auf! \n• 10. Gönnen Sie sich Erholung! \n \n \nZehn einfach zu befolgende Tipps sollen Ihnen helfen, eine möglichst problemlose \nSchwangerschaft zu erleben und ein gesundes Baby auf die Welt zu bringen: \n\n1. Planen und organisieren Sie die Zeit der Schwangerschaft frühzeitig!",
       ["http:www.babycentre.co.uk/midwives", "10 steps to a healthy pregnancy (German)", "10 Schritte zu einer gesunden Schwangerschaft", "• 1. Planen und organisieren Sie die Zeit der Schwangerschaft frühzeitig!", "• 2. Essen Sie gesund!", "• 3. Seien Sie achtsam bei der Auswahl der Nahrungsmittel!", "• 4. Nehmen Sie zusätzlich Folsäurepräparate und essen Sie Fisch!", "• 5. Treiben Sie regelmäßig Sport!", "• 6. Beginnen Sie mit Übungen für die Beckenbodenmuskulatur!", "• 7. Reduzieren Sie Ihren Alkoholgenuss!", "• 8. Reduzieren Sie Ihren Koffeingenuß!", "• 9. Hören Sie mit dem Rauchen auf!", "• 10. Gönnen Sie sich Erholung!", "Zehn einfach zu befolgende Tipps sollen Ihnen helfen, eine möglichst problemlose Schwangerschaft zu erleben und ein gesundes Baby auf die Welt zu bringen:", "1. Planen und organisieren Sie die Zeit der Schwangerschaft frühzeitig!"],
       { doc_type: 'pdf' }],
      # 002
      ['„Ich habe heute keine Zeit“, sagte die Frau und flüsterte leise: „Und auch keine Lust.“ Wir haben 1.000.000 Euro.',
       ["„Ich habe heute keine Zeit“, sagte die Frau und flüsterte leise: „Und auch keine Lust.“", "Wir haben 1.000.000 Euro."]],
      # 003
      ['Thomas sagte: ,,Wann kommst zu mir?” ,,Das weiß ich noch nicht“, antwortete Susi, ,,wahrscheinlich am Sonntag.“ Wir haben 1.000.000 Euro.',
       ["Thomas sagte: ,,Wann kommst zu mir?” ,,Das weiß ich noch nicht“, antwortete Susi, ,,wahrscheinlich am Sonntag.“", "Wir haben 1.000.000 Euro."]],
      # 004
      ['„Lass uns jetzt essen gehen!“, sagte die Mutter zu ihrer Freundin, „am besten zum Italiener.“',
       ['„Lass uns jetzt essen gehen!“, sagte die Mutter zu ihrer Freundin, „am besten zum Italiener.“']],
      # 005
      ['Wir haben 1.000.000 Euro.',
       ['Wir haben 1.000.000 Euro.']],
      # 006
      ['Sie bekommen 3,50 Euro zurück.',
       ['Sie bekommen 3,50 Euro zurück.']],
      # 007
      ['Dafür brauchen wir 5,5 Stunden.',
       ['Dafür brauchen wir 5,5 Stunden.']],
      # 008
      ['Bitte überweisen Sie 5.300,25 Euro.',
       ['Bitte überweisen Sie 5.300,25 Euro.']],
      # 009
      ['1. Dies ist eine Punkteliste.',
       ['1. Dies ist eine Punkteliste.']],
      # 010
      ['Wir trafen Dr. med. Meyer in der Stadt.',
       ['Wir trafen Dr. med. Meyer in der Stadt.']],
      # 011
      ['Wir brauchen Getränke, z. B. Wasser, Saft, Bier usw.',
       ['Wir brauchen Getränke, z. B. Wasser, Saft, Bier usw.']],
      # 012
      ['Ich kann u.a. Spanisch sprechen.',
       ['Ich kann u.a. Spanisch sprechen.']],
      # 013
      ['Frau Prof. Schulze ist z. Z. nicht da.',
       ['Frau Prof. Schulze ist z. Z. nicht da.']],
      # 014
      ['Sie erhalten ein neues Bank-Statement bzw. ein neues Schreiben.',
       ['Sie erhalten ein neues Bank-Statement bzw. ein neues Schreiben.']],
      # 015
      ['Z. T. ist die Lieferung unvollständig.',
       ['Z. T. ist die Lieferung unvollständig.']],
      # 016
      ['Das finden Sie auf S. 225.',
       ['Das finden Sie auf S. 225.']],
      # 017
      ['Sie besucht eine kath. Schule.',
       ['Sie besucht eine kath. Schule.']],
      # 018
      ['Wir benötigen Zeitungen, Zeitschriften u. Ä. für unser Projekt.',
       ['Wir benötigen Zeitungen, Zeitschriften u. Ä. für unser Projekt.']],
      # 019
      ['Das steht auf S. 23, s. vorherige Anmerkung.',
       ['Das steht auf S. 23, s. vorherige Anmerkung.']],
      # 020
      ['Dies ist meine Adresse: Dr. Meier, Berliner Str. 5, 21234 Bremen.',
       ['Dies ist meine Adresse: Dr. Meier, Berliner Str. 5, 21234 Bremen.']],
      # 021
      ['Er sagte: „Hallo, wie geht´s Ihnen, Frau Prof. Müller?“',
       ['Er sagte: „Hallo, wie geht´s Ihnen, Frau Prof. Müller?“']],
      # 022
      # Both literals are kept on a single physical line on purpose: a raw
      # line break inside a double-quoted literal becomes part of the string,
      # and an expected segment containing a line break that the input does
      # not have ("Ausrüstung in gutem Zustand") could never match.
      ["Fit in vier Wochen\n\nDeine Anleitung für eine reine Ernährung und ein gesünderes und glücklicheres Leben\n\nRECHTLICHE HINWEISE\n\nOhne die ausdrückliche schriftliche Genehmigung der Eigentümerin von instafemmefitness, Anna Anderson, darf dieses E-Book weder teilweise noch in vollem Umfang reproduziert, gespeichert, kopiert oder auf irgendeine Weise übertragen werden. Wenn Du das E-Book auf einem öffentlich zugänglichen Computer ausdruckst, musst Du es nach dem Ausdrucken von dem Computer löschen. Jedes E-Book wird mit einem Benutzernamen und Transaktionsinformationen versehen.\n\nVerstöße gegen dieses Urheberrecht werden im vollen gesetzlichen Umfang geltend gemacht. Obgleich die Autorin und Herausgeberin alle Anstrengungen unternommen hat, sicherzustellen, dass die Informationen in diesem Buch zum Zeitpunkt der Drucklegung korrekt sind, übernimmt die Autorin und Herausgeberin keine Haftung für etwaige Verluste, Schäden oder Störungen, die durch Fehler oder Auslassungen in Folge von Fahrlässigkeit, zufälligen Umständen oder sonstigen Ursachen entstehen, und lehnt hiermit jedwede solche Haftung ab.\n\nDieses Buch ist kein Ersatz für die medizinische Beratung durch Ärzte. Der Leser/die Leserin sollte regelmäßig einen Arzt/eine Ärztin hinsichtlich Fragen zu seiner/ihrer Gesundheit und vor allem in Bezug auf Symptome, die eventuell einer ärztlichen Diagnose oder Behandlung bedürfen, konsultieren.\n\nDie Informationen in diesem Buch sind dazu gedacht, ein ordnungsgemäßes Training zu ergänzen, nicht aber zu ersetzen. Wie jeder andere Sport, der Geschwindigkeit, Ausrüstung, Gleichgewicht und Umweltfaktoren einbezieht, stellt dieser Sport ein gewisses Risiko dar. Die Autorin und Herausgeberin rät den Lesern dazu, die volle Verantwortung für die eigene Sicherheit zu übernehmen und die eigenen Grenzen zu beachten. Vor dem Ausüben der in diesem Buch beschriebenen Übungen solltest Du sicherstellen, dass Deine Ausrüstung in gutem Zustand ist, und Du solltest keine Risiken außerhalb Deines Erfahrungs- oder Trainingsniveaus, Deiner Fähigkeiten oder Deines Komfortbereichs eingehen.\nHintergrundillustrationen Urheberrecht © 2013 bei Shuttershock, Buchgestaltung und -produktion durch Anna Anderson Verfasst von Anna Anderson\nUrheberrecht © 2014 Instafemmefitness. Alle Rechte vorbehalten\n\nÜber mich",
       ["Fit in vier Wochen", "Deine Anleitung für eine reine Ernährung und ein gesünderes und glücklicheres Leben", "RECHTLICHE HINWEISE", "Ohne die ausdrückliche schriftliche Genehmigung der Eigentümerin von instafemmefitness, Anna Anderson, darf dieses E-Book weder teilweise noch in vollem Umfang reproduziert, gespeichert, kopiert oder auf irgendeine Weise übertragen werden.", "Wenn Du das E-Book auf einem öffentlich zugänglichen Computer ausdruckst, musst Du es nach dem Ausdrucken von dem Computer löschen.", "Jedes E-Book wird mit einem Benutzernamen und Transaktionsinformationen versehen.", "Verstöße gegen dieses Urheberrecht werden im vollen gesetzlichen Umfang geltend gemacht.", "Obgleich die Autorin und Herausgeberin alle Anstrengungen unternommen hat, sicherzustellen, dass die Informationen in diesem Buch zum Zeitpunkt der Drucklegung korrekt sind, übernimmt die Autorin und Herausgeberin keine Haftung für etwaige Verluste, Schäden oder Störungen, die durch Fehler oder Auslassungen in Folge von Fahrlässigkeit, zufälligen Umständen oder sonstigen Ursachen entstehen, und lehnt hiermit jedwede solche Haftung ab.", "Dieses Buch ist kein Ersatz für die medizinische Beratung durch Ärzte.", "Der Leser/die Leserin sollte regelmäßig einen Arzt/eine Ärztin hinsichtlich Fragen zu seiner/ihrer Gesundheit und vor allem in Bezug auf Symptome, die eventuell einer ärztlichen Diagnose oder Behandlung bedürfen, konsultieren.", "Die Informationen in diesem Buch sind dazu gedacht, ein ordnungsgemäßes Training zu ergänzen, nicht aber zu ersetzen.", "Wie jeder andere Sport, der Geschwindigkeit, Ausrüstung, Gleichgewicht und Umweltfaktoren einbezieht, stellt dieser Sport ein gewisses Risiko dar.", "Die Autorin und Herausgeberin rät den Lesern dazu, die volle Verantwortung für die eigene Sicherheit zu übernehmen und die eigenen Grenzen zu beachten.", "Vor dem Ausüben der in diesem Buch beschriebenen Übungen solltest Du sicherstellen, dass Deine Ausrüstung in gutem Zustand ist, und Du solltest keine Risiken außerhalb Deines Erfahrungs- oder Trainingsniveaus, Deiner Fähigkeiten oder Deines Komfortbereichs eingehen.", "Hintergrundillustrationen Urheberrecht © 2013 bei Shuttershock, Buchgestaltung und -produktion durch Anna Anderson Verfasst von Anna Anderson", "Urheberrecht © 2014 Instafemmefitness.", "Alle Rechte vorbehalten", "Über mich"]],
      # 023
      ["Es gibt jedoch einige Vorsichtsmaßnahmen, die Du ergreifen kannst, z. B. ist es sehr empfehlenswert, dass Du Dein Zuhause von allem Junkfood befreist. Ich persönlich kaufe kein Junkfood oder etwas, das nicht rein ist (ich traue mir da selbst nicht!). Ich finde jeden Vorwand, um das Junkfood zu essen, vor allem die Vorstellung, dass ich nicht mehr in Versuchung kommen werde, wenn ich es jetzt aufesse und es weg ist. Es ist schon komisch, was unser Verstand mitunter anstellt!",
       ["Es gibt jedoch einige Vorsichtsmaßnahmen, die Du ergreifen kannst, z. B. ist es sehr empfehlenswert, dass Du Dein Zuhause von allem Junkfood befreist.", "Ich persönlich kaufe kein Junkfood oder etwas, das nicht rein ist (ich traue mir da selbst nicht!).", "Ich finde jeden Vorwand, um das Junkfood zu essen, vor allem die Vorstellung, dass ich nicht mehr in Versuchung kommen werde, wenn ich es jetzt aufesse und es weg ist.", "Es ist schon komisch, was unser Verstand mitunter anstellt!"]],
      # 024
      ["Ob Sie in Hannover nur auf der Durchreise, für einen längeren Aufenthalt oder zum Besuch einer der zahlreichen Messen sind: Die Hauptstadt des Landes Niedersachsens hat viele Sehenswürdigkeiten und ist zu jeder Jahreszeit eine Reise Wert. \nHannovers Ursprünge können bis zur römischen Kaiserzeit zurückverfolgt werden, und zwar durch Ausgrabungen von Tongefäßen aus dem 1. -3. Jahrhundert nach Christus, die an mehreren Stellen im Untergrund des Stadtzentrums durchgeführt wurden.",
       ["Ob Sie in Hannover nur auf der Durchreise, für einen längeren Aufenthalt oder zum Besuch einer der zahlreichen Messen sind: Die Hauptstadt des Landes Niedersachsens hat viele Sehenswürdigkeiten und ist zu jeder Jahreszeit eine Reise Wert.", "Hannovers Ursprünge können bis zur römischen Kaiserzeit zurückverfolgt werden, und zwar durch Ausgrabungen von Tongefäßen aus dem 1. -3. Jahrhundert nach Christus, die an mehreren Stellen im Untergrund des Stadtzentrums durchgeführt wurden."]],
      # 025
      ["• 3. Seien Sie achtsam bei der Auswahl der Nahrungsmittel! \n• 4. Nehmen Sie zusätzlich Folsäurepräparate und essen Sie Fisch! \n• 5. Treiben Sie regelmäßig Sport! \n• 6. Beginnen Sie mit Übungen für die Beckenbodenmuskulatur! \n• 7. Reduzieren Sie Ihren Alkoholgenuss! \n",
       ["• 3. Seien Sie achtsam bei der Auswahl der Nahrungsmittel!", "• 4. Nehmen Sie zusätzlich Folsäurepräparate und essen Sie Fisch!", "• 5. Treiben Sie regelmäßig Sport!", "• 6. Beginnen Sie mit Übungen für die Beckenbodenmuskulatur!", "• 7. Reduzieren Sie Ihren Alkoholgenuss!"]],
      # 026
      ["Schwangere Frauen sollten während der \nersten drei Monate eine tägliche Dosis von 400 Mikrogramm Folsäure zusätzlich nehmen. \nFolsäure befindet sich auch in einigen Gemüse- und Müslisorten.",
       ["Schwangere Frauen sollten während der ersten drei Monate eine tägliche Dosis von 400 Mikrogramm Folsäure zusätzlich nehmen.", "Folsäure befindet sich auch in einigen Gemüse- und Müslisorten."],
       { doc_type: 'pdf' }],
      # 027
      ["Andere \nFischsorten (z.B. Hai, Thunfisch, Aal und Seeteufel) weisen einen erhöhten Quecksilbergehalt \nauf und sollten deshalb in der Schwangerschaft nur selten verzehrt werden.",
       ["Andere Fischsorten (z.B. Hai, Thunfisch, Aal und Seeteufel) weisen einen erhöhten Quecksilbergehalt auf und sollten deshalb in der Schwangerschaft nur selten verzehrt werden."],
       { doc_type: 'pdf' }],
      # 028
      ["Übung Präsens\n1. Ich ___ gern Tennis.\nspielen\nspielt\nspiele\n2. Karl __ mir den Ball.\ngebt\ngibt\ngeben\n3. Ihr ___ fast jeden Tag.\narbeitet\narbeite\narbeiten\n4. ___ Susi Deutsch?\nSprichst\nSprecht\nSpricht\n5. Wann ___ Karl und Julia? Heute?\nkommen\nkommt\nkomme\n\n\n\n\n",
       ["Übung Präsens", "1. Ich ___ gern Tennis.", "spielen", "spielt", "spiele", "2. Karl __ mir den Ball.", "gebt", "gibt", "geben", "3. Ihr ___ fast jeden Tag.", "arbeitet", "arbeite", "arbeiten", "4. ___ Susi Deutsch?", "Sprichst", "Sprecht", "Spricht", "5. Wann ___ Karl und Julia?", "Heute?", "kommen", "kommt", "komme"],
       { doc_type: 'docx' }],
      # 029
      ["\n• einige Sorten Weichkäse \n• rohes oder nicht ganz durchgebratenes Fleisch \n• ungeputztes Gemüse und ungewaschener Salat \n• nicht ganz durchgebratenes Hühnerfleisch, rohe oder nur weich gekochte Eier",
       ["• einige Sorten Weichkäse", "• rohes oder nicht ganz durchgebratenes Fleisch", "• ungeputztes Gemüse und ungewaschener Salat", "• nicht ganz durchgebratenes Hühnerfleisch, rohe oder nur weich gekochte Eier"],
       { doc_type: 'pdf' }],
      # 030
      ["Was sind die Konsequenzen der Abstimmung vom 12. Juni?",
       ["Was sind die Konsequenzen der Abstimmung vom 12. Juni?"]],
      # 031
      ["Was pro Jahr10. Zudem pro Jahr um 0.3 %11. Der gängigen Theorie nach erfolgt der Anstieg.",
       ["Was pro Jahr10.", "Zudem pro Jahr um 0.3 %11.", "Der gängigen Theorie nach erfolgt der Anstieg."]],
      # 032
      ["s. vorherige Anmerkung.",
       ["s. vorherige Anmerkung."]]
    ]

    # Generate one example per entry, named exactly like the hand-written
    # originals ('correctly segments text #NNN', zero-padded to three digits).
    german_examples.each_with_index do |(text, expected_segments, extra_options), position|
      it(format('correctly segments text #%03d', position + 1)) do
        # extra_options is nil for entries without a doc_type; splat an empty
        # hash then so that no additional keyword is passed to the segmenter.
        ps = PragmaticSegmenter::Segmenter.new(text: text, language: 'de', **(extra_options || {}))
        expect(ps.segment).to eq(expected_segments)
      end
    end
  end
end
|
1458
|
+
|
1459
|
+
context 'Language: Spanish (es)' do
|
1460
|
+
# Thanks to Alejandro Naser Pastoriza for the Spanish test examples.
|
1461
|
+
describe '#segment' do
|
1462
|
+
# Guillemet-quoted statement followed by inverted-punctuation question and
# exclamation: three segments are expected.
it 'correctly segments text #001' do
  input = '«Ninguna mente extraordinaria está exenta de un toque de demencia», dijo Aristóteles. Pablo, ¿adónde vas? ¡¿Qué viste?!'
  expected_segments = [
    '«Ninguna mente extraordinaria está exenta de un toque de demencia», dijo Aristóteles.',
    'Pablo, ¿adónde vas?',
    '¡¿Qué viste?!'
  ]
  expect(PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment).to eq(expected_segments)
end
|
1466
|
+
|
1467
|
+
# The sentence starts with the abbreviation "Admón." and is expected to stay
# in one segment.
it 'correctly segments text #002' do
  input = 'Admón. es administración o me equivoco.'
  expected_segments = ['Admón. es administración o me equivoco.']
  expect(PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment).to eq(expected_segments)
end
|
1471
|
+
|
1472
|
+
# Bulleted, numbered list with ten items: one segment per item is expected.
it 'correctly segments text #003' do
  input = "• 1. Busca atención prenatal desde el principio \n• 2. Aliméntate bien \n• 3. Presta mucha atención a la higiene de los alimentos \n• 4. Toma suplementos de ácido fólico y come pescado \n• 5. Haz ejercicio regularmente \n• 6. Comienza a hacer ejercicios de Kegel \n• 7. Restringe el consumo de alcohol \n• 8. Disminuye el consumo de cafeína \n• 9. Deja de fumar \n• 10. Descansa"
  expected_segments = [
    "• 1. Busca atención prenatal desde el principio",
    "• 2. Aliméntate bien",
    "• 3. Presta mucha atención a la higiene de los alimentos",
    "• 4. Toma suplementos de ácido fólico y come pescado",
    "• 5. Haz ejercicio regularmente",
    "• 6. Comienza a hacer ejercicios de Kegel",
    "• 7. Restringe el consumo de alcohol",
    "• 8. Disminuye el consumo de cafeína",
    "• 9. Deja de fumar",
    "• 10. Descansa"
  ]
  expect(PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment).to eq(expected_segments)
end
|
1476
|
+
|
1477
|
+
# Bulleted, numbered list with eleven items: one segment per item is expected.
it 'correctly segments text #004' do
  input = "• 1. Busca atención prenatal desde el principio \n• 2. Aliméntate bien \n• 3. Presta mucha atención a la higiene de los alimentos \n• 4. Toma suplementos de ácido fólico y come pescado \n• 5. Haz ejercicio regularmente \n• 6. Comienza a hacer ejercicios de Kegel \n• 7. Restringe el consumo de alcohol \n• 8. Disminuye el consumo de cafeína \n• 9. Deja de fumar \n• 10. Descansa \n• 11. Hola"
  expected_segments = [
    "• 1. Busca atención prenatal desde el principio",
    "• 2. Aliméntate bien",
    "• 3. Presta mucha atención a la higiene de los alimentos",
    "• 4. Toma suplementos de ácido fólico y come pescado",
    "• 5. Haz ejercicio regularmente",
    "• 6. Comienza a hacer ejercicios de Kegel",
    "• 7. Restringe el consumo de alcohol",
    "• 8. Disminuye el consumo de cafeína",
    "• 9. Deja de fumar",
    "• 10. Descansa",
    "• 11. Hola"
  ]
  expect(PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment).to eq(expected_segments)
end
|
1481
|
+
|
1482
|
+
# The period in 'Srta.' must not split; breaks are expected after '!' and '?'.
it 'correctly segments text #005' do
  input = "¡Hola Srta. Ledesma! ¿Cómo está hoy? Espero que muy bien."
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(
    [
      "¡Hola Srta. Ledesma!",
      "¿Cómo está hoy?",
      "Espero que muy bien."
    ]
  )
end
|
1486
|
+
|
1487
|
+
# The periods in 'Lic.' and 'Dr.' must not split: one segment is expected.
it 'correctly segments text #006' do
  input = "Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. Naser."
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(["Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. Naser."])
end
|
1491
|
+
|
1492
|
+
# 'Mar.' and 'Nov.' inside the date must not split; the only break expected
# is after '2014.'.
it 'correctly segments text #007' do
  input = "He apuntado una cita para la siguiente fecha: Mar. 23 de Nov. de 2014. Gracias."
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(
    [
      "He apuntado una cita para la siguiente fecha: Mar. 23 de Nov. de 2014.",
      "Gracias."
    ]
  )
end
|
1496
|
+
|
1497
|
+
# 'Núm.', the dot-separated phone number and 'Sra.' must not split; the break
# is expected after the period that closes the phone number.
it 'correctly segments text #008' do
  input = "Núm. de tel: 351.123.465.4. Envíe mis saludos a la Sra. Rescia."
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(
    [
      "Núm. de tel: 351.123.465.4.",
      "Envíe mis saludos a la Sra. Rescia."
    ]
  )
end
|
1501
|
+
|
1502
|
+
# Decimal numbers (273.15, 373.15) must not split; two sentences are expected.
it 'correctly segments text #009' do
  input = "Cero en la escala Celsius o de grados centígrados (0 °C) se define como el equivalente a 273.15 K, con una diferencia de temperatura de 1 °C equivalente a una diferencia de 1 Kelvin. Esto significa que 100 °C, definido como el punto de ebullición del agua, se define como el equivalente a 373.15 K."
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(
    [
      "Cero en la escala Celsius o de grados centígrados (0 °C) se define como el equivalente a 273.15 K, con una diferencia de temperatura de 1 °C equivalente a una diferencia de 1 Kelvin.",
      "Esto significa que 100 °C, definido como el punto de ebullición del agua, se define como el equivalente a 373.15 K."
    ]
  )
end
|
1506
|
+
|
1507
|
+
# 'Ago.' and the time '15:08.10' inside the parentheses must not split:
# one segment is expected.
it 'correctly segments text #010' do
  input = "Durante la primera misión del Discovery (30 Ago. 1984 15:08.10) tuvo lugar el lanzamiento de dos satélites de comunicación, el nombre de esta misión fue STS-41-D."
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(["Durante la primera misión del Discovery (30 Ago. 1984 15:08.10) tuvo lugar el lanzamiento de dos satélites de comunicación, el nombre de esta misión fue STS-41-D."])
end
|
1511
|
+
|
1512
|
+
# A quoted verse containing 'consuela. /' and '[...]' is expected to stay
# in a single segment.
it 'correctly segments text #011' do
  input = "Frase del gran José Hernández: \"Aquí me pongo a cantar / al compás de la vigüela, / que el hombre que lo desvela / una pena estrordinaria, / como la ave solitaria / con el cantar se consuela. / [...] \"."
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(["Frase del gran José Hernández: \"Aquí me pongo a cantar / al compás de la vigüela, / que el hombre que lo desvela / una pena estrordinaria, / como la ave solitaria / con el cantar se consuela. / [...] \"."])
end
|
1516
|
+
|
1517
|
+
# A period just before the closing guillemet ('.»,') must not split:
# one segment is expected.
it 'correctly segments text #012' do
  input = "Citando a Criss Jami «Prefiero ser un artista a ser un líder, irónicamente, un líder tiene que seguir las reglas.», lo cual parece muy acertado."
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(["Citando a Criss Jami «Prefiero ser un artista a ser un líder, irónicamente, un líder tiene que seguir las reglas.», lo cual parece muy acertado."])
end
|
1521
|
+
|
1522
|
+
# The '.' inside the quoted expression '(14/7).x' must not split; the break
# is expected after the period that follows the closing quote.
it 'correctly segments text #013' do
  input = "Cuando llegué, le estaba dando ejercicios a los niños, uno de los cuales era \"3 + (14/7).x = 5\". ¿Qué te parece?"
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(
    [
      "Cuando llegué, le estaba dando ejercicios a los niños, uno de los cuales era \"3 + (14/7).x = 5\".",
      "¿Qué te parece?"
    ]
  )
end
|
1526
|
+
|
1527
|
+
# 'párrf.', 'art.' and the trailing 'EE. UU..' must not split:
# one segment is expected.
it 'correctly segments text #014' do
  input = "Se le pidió a los niños que leyeran los párrf. 5 y 6 del art. 4 de la constitución de los EE. UU.."
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(["Se le pidió a los niños que leyeran los párrf. 5 y 6 del art. 4 de la constitución de los EE. UU.."])
end
|
1531
|
+
|
1532
|
+
# 'Lun.', 'Mar.', the decimal '1.1', the '?' inside the quotation and 'min.'
# must not split; the only break expected is after '".'.
it 'correctly segments text #015' do
  input = "Una de las preguntas realizadas en la evaluación del día Lun. 15 de Mar. fue la siguiente: \"Alumnos, ¿cuál es el resultado de la operación 1.1 + 4/5?\". Disponían de 1 min. para responder esa pregunta."
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(
    [
      "Una de las preguntas realizadas en la evaluación del día Lun. 15 de Mar. fue la siguiente: \"Alumnos, ¿cuál es el resultado de la operación 1.1 + 4/5?\".",
      "Disponían de 1 min. para responder esa pregunta."
    ]
  )
end
|
1536
|
+
|
1537
|
+
# The decimal point in '120.5°C' must not split; the period after '°C' does.
it 'correctly segments text #016' do
  input = "La temperatura del motor alcanzó los 120.5°C. Afortunadamente, pudo llegar al final de carrera."
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(
    [
      "La temperatura del motor alcanzó los 120.5°C.",
      "Afortunadamente, pudo llegar al final de carrera."
    ]
  )
end
|
1541
|
+
|
1542
|
+
# A sentence ending in '3m³.' followed by a '¿…?' question: two segments.
it 'correctly segments text #017' do
  input = "El volumen del cuerpo es 3m³. ¿Cuál es la superficie de cada cara del prisma?"
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(
    [
      "El volumen del cuerpo es 3m³.",
      "¿Cuál es la superficie de cada cara del prisma?"
    ]
  )
end
|
1546
|
+
|
1547
|
+
# Decimal points in '20.55m²' and '50.0m²' must not split; the periods after
# 'm²' do.
it 'correctly segments text #018' do
  input = "La habitación tiene 20.55m². El living tiene 50.0m²."
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(
    [
      "La habitación tiene 20.55m².",
      "El living tiene 50.0m²."
    ]
  )
end
|
1551
|
+
|
1552
|
+
# The decimal point in '33.8°F' must not split; the period after '°F' does.
it 'correctly segments text #019' do
  input = "1°C corresponde a 33.8°F. ¿A cuánto corresponde 35°C?"
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(
    [
      "1°C corresponde a 33.8°F.",
      "¿A cuánto corresponde 35°C?"
    ]
  )
end
|
1556
|
+
|
1557
|
+
# The timing '1:39:02.619 Hs.' and the decimal '2.5' must not split; the
# break is expected after 'segundos.'.
it 'correctly segments text #020' do
  input = "Hamilton ganó el último gran premio de Fórmula 1, luego de 1:39:02.619 Hs. de carrera, segundo resultó Massa, a una diferencia de 2.5 segundos. De esta manera se consagró ¡Campeón mundial!"
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(
    [
      "Hamilton ganó el último gran premio de Fórmula 1, luego de 1:39:02.619 Hs. de carrera, segundo resultó Massa, a una diferencia de 2.5 segundos.",
      "De esta manera se consagró ¡Campeón mundial!"
    ]
  )
end
|
1561
|
+
|
1562
|
+
# The dotted amount '$170.500.000,00' and the percentage '12.5%' must not
# split; breaks are expected after each '!'.
it 'correctly segments text #021' do
  input = "¡La casa cuesta $170.500.000,00! ¡Muy costosa! Se prevé una disminución del 12.5% para el próximo año."
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(
    [
      "¡La casa cuesta $170.500.000,00!",
      "¡Muy costosa!",
      "Se prevé una disminución del 12.5% para el próximo año."
    ]
  )
end
|
1566
|
+
|
1567
|
+
# The period in 'No.' must not split: one segment ending in '4°.' is expected.
it 'correctly segments text #022' do
  input = "El corredor No. 103 arrivó 4°."
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(["El corredor No. 103 arrivó 4°."])
end
|
1571
|
+
|
1572
|
+
# A slash-separated date followed by a '¿…?' question: two segments expected.
it 'correctly segments text #023' do
  input = "Hoy es 27/04/2014, y es mi cumpleaños. ¿Cuándo es el tuyo?"
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(
    [
      "Hoy es 27/04/2014, y es mi cumpleaños.",
      "¿Cuándo es el tuyo?"
    ]
  )
end
|
1576
|
+
|
1577
|
+
# The inline list '1.Helado, 2.Carne, 3.Arroz.' and the amount '$12.5' must
# not split internally; three segments are expected.
it 'correctly segments text #024' do
  input = "Aquí está la lista de compras para el almuerzo: 1.Helado, 2.Carne, 3.Arroz. ¿Cuánto costará? Quizás $12.5."
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(
    [
      "Aquí está la lista de compras para el almuerzo: 1.Helado, 2.Carne, 3.Arroz.",
      "¿Cuánto costará?",
      "Quizás $12.5."
    ]
  )
end
|
1581
|
+
|
1582
|
+
# Sentences that end with a digit and are followed by a sentence starting
# with a digit ('2. 2 + 2') are still split: three segments expected.
it 'correctly segments text #025' do
  input = "1 + 1 es 2. 2 + 2 es 4. El auto es de color rojo."
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(
    [
      "1 + 1 es 2.",
      "2 + 2 es 4.",
      "El auto es de color rojo."
    ]
  )
end
|
1586
|
+
|
1587
|
+
# The period after 'km/h' ends the first sentence; 'Km.?' stays attached to
# the end of the second one.
it 'correctly segments text #026' do
  input = "La máquina viajaba a 100 km/h. ¿En cuánto tiempo recorrió los 153 Km.?"
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(
    [
      "La máquina viajaba a 100 km/h.",
      "¿En cuánto tiempo recorrió los 153 Km.?"
    ]
  )
end
|
1591
|
+
|
1592
|
+
# Letterhead-style text with blank and whitespace-only lines: each non-empty
# line is expected as one segment, and 'Cra.', 'No.', 'Tel.', 'Ext.', 'D.C.'
# and the URLs must not split inside a line.
it 'correctly segments text #027' do
  input = "\n \nCentro de Relaciones Interinstitucionales -CERI \n\nCra. 7 No. 40-53 Piso 10 Tel. (57-1) 3239300 Ext. 1010 Fax: (57-1) 3402973 Bogotá, D.C. - Colombia \n\nhttp://www.udistrital.edu.co - http://ceri.udistrital.edu.co - relinter@udistrital.edu.co \n\n \n\nCERI 0908 \n \nBogotá, D.C. 6 de noviembre de 2014. \n \nSeñores: \nEMBAJADA DE UNITED KINGDOM \n \n"
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(
    [
      "Centro de Relaciones Interinstitucionales -CERI",
      "Cra. 7 No. 40-53 Piso 10 Tel. (57-1) 3239300 Ext. 1010 Fax: (57-1) 3402973 Bogotá, D.C. - Colombia",
      "http://www.udistrital.edu.co - http://ceri.udistrital.edu.co - relinter@udistrital.edu.co",
      "CERI 0908",
      "Bogotá, D.C. 6 de noviembre de 2014.",
      "Señores:",
      "EMBAJADA DE UNITED KINGDOM"
    ]
  )
end
|
1596
|
+
|
1597
|
+
# 'N°.' followed by a dot-separated number is expected to stay one segment.
it 'correctly segments text #028' do
  input = "N°. 1026.253.553"
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(["N°. 1026.253.553"])
end
|
1601
|
+
|
1602
|
+
# With doc_type: 'pdf', the line breaks inside the paragraph are expected to
# be joined with single spaces, yielding one segment.
it 'correctly segments text #029' do
  input = "\nA continuación me permito presentar a la Ingeniera LAURA MILENA LEÓN \nSANDOVAL, identificada con el documento N°. 1026.253.553 de Bogotá, \negresada del Programa Ingeniería Industrial en el año 2012, quien se desatacó por \nsu excelencia académica, actualmente cursa el programa de Maestría en \nIngeniería Industrial y se encuentra en un intercambio cultural en Bangalore – \nIndia."
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es', doc_type: 'pdf').segment
  expect(segments).to eq(["A continuación me permito presentar a la Ingeniera LAURA MILENA LEÓN SANDOVAL, identificada con el documento N°. 1026.253.553 de Bogotá, egresada del Programa Ingeniería Industrial en el año 2012, quien se desatacó por su excelencia académica, actualmente cursa el programa de Maestría en Ingeniería Industrial y se encuentra en un intercambio cultural en Bangalore – India."])
end
|
1606
|
+
|
1607
|
+
# The leading line of underscores is expected to be dropped, leaving only
# the sentence that follows it.
it 'correctly segments text #030' do
  input = "\n__________________________________________________________\nEl Board para Servicios Educativos de Putnam/Northern Westchester según el título IX, Sección 504 del “Rehabilitation Act” del 1973, del Título VII y del Acta “American with Disabilities” no discrimina para la admisión a programas educativos por sexo, creencia, nacionalidad, origen, edad o discapacidad."
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(["El Board para Servicios Educativos de Putnam/Northern Westchester según el título IX, Sección 504 del “Rehabilitation Act” del 1973, del Título VII y del Acta “American with Disabilities” no discrimina para la admisión a programas educativos por sexo, creencia, nacionalidad, origen, edad o discapacidad."])
end
|
1611
|
+
|
1612
|
+
# The period in 'Mt.' must not split: one segment is expected.
it 'correctly segments text #031' do
  input = "Explora oportunidades de carrera en el área de Salud en el Hospital de Northern en Mt. Kisco."
  segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'es').segment
  expect(segments).to eq(["Explora oportunidades de carrera en el área de Salud en el Hospital de Northern en Mt. Kisco."])
end
|
1616
|
+
end
|
1617
|
+
end
|
1618
|
+
|
1619
|
+
# Hindi examples, run with language: 'hi'.
context 'Language: Hindi (hi)' do
  describe '#segment' do
    # Each sentence in the input ends with '।' and is expected as its own segment.
    it 'correctly segments text #001' do
      input = "सच्चाई यह है कि इसे कोई नहीं जानता। हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।"
      segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'hi').segment
      expect(segments).to eq(
        [
          "सच्चाई यह है कि इसे कोई नहीं जानता।",
          "हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।"
        ]
      )
    end
  end
end
|
1627
|
+
|
1628
|
+
# Greek examples, run with language: 'el'.
context 'Language: Greek (el)' do
  describe '#segment' do
    # Breaks are expected after ';' and after '.', but not at '·' nor inside
    # the amount '£260.950,00'.
    it 'correctly segments text #001' do
      input = "Με συγχωρείτε· πού είναι οι τουαλέτες; Τις Κυριακές δε δούλευε κανένας. το κόστος του σπιτιού ήταν £260.950,00."
      segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'el').segment
      expect(segments).to eq(
        [
          "Με συγχωρείτε· πού είναι οι τουαλέτες;",
          "Τις Κυριακές δε δούλευε κανένας.",
          "το κόστος του σπιτιού ήταν £260.950,00."
        ]
      )
    end
  end
end
|
1636
|
+
|
1637
|
+
# French examples, run with language: 'fr'.
context 'Language: French (fr)' do
  describe '#segment' do
    # Builds a segmenter for +text+ with language 'fr' and returns its segments.
    def segment_fr(text)
      PragmaticSegmenter::Segmenter.new(text: text, language: 'fr').segment
    end

    # Two plain sentences, each ending with a period.
    it 'correctly segments text #001' do
      segments = segment_fr("Après avoir été l'un des acteurs du projet génome humain, le Genoscope met aujourd'hui le cap vers la génomique environnementale. L'exploitation des données de séquences, prolongée par l'identification expérimentale des fonctions biologiques, notamment dans le domaine de la biocatalyse, ouvrent des perspectives de développements en biotechnologie industrielle.")
      expect(segments).to eq(
        [
          "Après avoir été l'un des acteurs du projet génome humain, le Genoscope met aujourd'hui le cap vers la génomique environnementale.",
          "L'exploitation des données de séquences, prolongée par l'identification expérimentale des fonctions biologiques, notamment dans le domaine de la biocatalyse, ouvrent des perspectives de développements en biotechnologie industrielle."
        ]
      )
    end

    # A quotation followed by ', a-t-il ajouté.' is expected as one segment.
    it 'correctly segments text #002' do
      segments = segment_fr("\"Airbus livrera comme prévu 30 appareils 380 cette année avec en ligne de mire l'objectif d'équilibre financier du programme en 2015\", a-t-il ajouté.")
      expect(segments).to eq(["\"Airbus livrera comme prévu 30 appareils 380 cette année avec en ligne de mire l'objectif d'équilibre financier du programme en 2015\", a-t-il ajouté."])
    end

    # The spaced colon (' : ') must not split; the second sentence ends
    # with '».'.
    it 'correctly segments text #003' do
      segments = segment_fr("À 11 heures ce matin, la direction ne décomptait que douze grévistes en tout sur la France : ce sont ceux du site de Saran (Loiret), dont l’effectif est de 809 salariés, dont la moitié d’intérimaires. Elle assure que ce mouvement « n’aura aucun impact sur les livraisons ».")
      expect(segments).to eq(
        [
          "À 11 heures ce matin, la direction ne décomptait que douze grévistes en tout sur la France : ce sont ceux du site de Saran (Loiret), dont l’effectif est de 809 salariés, dont la moitié d’intérimaires.",
          "Elle assure que ce mouvement « n’aura aucun impact sur les livraisons »."
        ]
      )
    end

    # The dotted abbreviation 'LL.AA.II.RR.' inside guillemets must not split.
    it 'correctly segments text #004' do
      segments = segment_fr("Ce modèle permet d’afficher le texte « LL.AA.II.RR. » pour l’abréviation de « Leurs Altesses impériales et royales » avec son infobulle.")
      expect(segments).to eq(["Ce modèle permet d’afficher le texte « LL.AA.II.RR. » pour l’abréviation de « Leurs Altesses impériales et royales » avec son infobulle."])
    end

    # The period in 'Ltd.' must not split: one segment is expected.
    it 'correctly segments text #005' do
      segments = segment_fr("Les derniers ouvrages de Intercept Ltd. sont ici.")
      expect(segments).to eq(["Les derniers ouvrages de Intercept Ltd. sont ici."])
    end
  end
end
|
1665
|
+
|
1666
|
+
# Armenian examples, run with language: 'hy'.
context 'Language: Armenian (hy)' do
  describe '#segment' do
    # Thanks to Armine Abelyan for the Armenian test examples.
    # Where present, the comment above an example gives the contributor's
    # English rendering of the input and of the expected segments.

    # Builds a segmenter for +text+ with language 'hy' and returns its segments.
    def segment_hy(text)
      PragmaticSegmenter::Segmenter.new(text: text, language: 'hy').segment
    end

    # Two sentences, each ending with ':'; the comma-separated clauses and
    # 'և/կամ' inside the second sentence must not split.
    it 'correctly segments text #001' do
      segments = segment_hy("Սա այն փուլն է, երբ տեղի է ունենում Համակարգի մշակումը: Համաձայն Փուլ 2-ի, Մատակարարը մշակում և/կամ հարմարեցնում է համապատասխան ծրագիրը, տեղադրում ծրագրի բաղկացուցիչները, կատարում առանձին բլոկի և համակարգի թեստավորում և ներառում տարբեր մոդուլներ եզակի աշխատանքային համակարգում, որը կազմում է այս Փուլի արդյունքը:")
      expect(segments).to eq(
        [
          "Սա այն փուլն է, երբ տեղի է ունենում Համակարգի մշակումը:",
          "Համաձայն Փուլ 2-ի, Մատակարարը մշակում և/կամ հարմարեցնում է համապատասխան ծրագիրը, տեղադրում ծրագրի բաղկացուցիչները, կատարում առանձին բլոկի և համակարգի թեստավորում և ներառում տարբեր մոդուլներ եզակի աշխատանքային համակարգում, որը կազմում է այս Փուլի արդյունքը:"
        ]
      )
    end

    # Six segments: five sentences ending with ':' plus a final fragment
    # ending with '`'. The '`' inside the third sentence must not split.
    it 'correctly segments text #002' do
      segments = segment_hy("Մատակարարի նախագծի անձնակազմի կողմից համակարգի թեստերը հաջող անցնելուց հետո, Համակարգը տրվում է Գնորդին թեստավորման համար: 2-րդ փուլում, հիմք ընդունելով թեստային սցենարիոները, թեստերը կատարվում են Կառավարության կողմից Մատակարարի աջակցությամբ: Այս թեստերի թիրախը հանդիսանում է Համակարգի` որպես մեկ ամբողջության և համակարգի գործունեության ստուգումը համաձայն տեխնիկական բնութագրերի: Այս թեստերի հաջողակ ավարտից հետո, Համակարգը ժամանակավոր ընդունվում է Կառավարության կողմից: Այս թեստերի արդյունքները փաստաթղթային ձևով կներակայացվեն Թեստային Արդյունքների Հաշվետվություններում: Մատակարարը պետք է տրամադրի հետևյալը`")
      expect(segments).to eq(
        [
          "Մատակարարի նախագծի անձնակազմի կողմից համակարգի թեստերը հաջող անցնելուց հետո, Համակարգը տրվում է Գնորդին թեստավորման համար:",
          "2-րդ փուլում, հիմք ընդունելով թեստային սցենարիոները, թեստերը կատարվում են Կառավարության կողմից Մատակարարի աջակցությամբ:",
          "Այս թեստերի թիրախը հանդիսանում է Համակարգի` որպես մեկ ամբողջության և համակարգի գործունեության ստուգումը համաձայն տեխնիկական բնութագրերի:",
          "Այս թեստերի հաջողակ ավարտից հետո, Համակարգը ժամանակավոր ընդունվում է Կառավարության կողմից:",
          "Այս թեստերի արդյունքները փաստաթղթային ձևով կներակայացվեն Թեստային Արդյունքների Հաշվետվություններում:",
          "Մատակարարը պետք է տրամադրի հետևյալը`"
        ]
      )
    end

    # NOTE(review): input and expectation are identical to example #002;
    # confirm whether a different sample was intended here.
    it 'correctly segments text #003' do
      segments = segment_hy("Մատակարարի նախագծի անձնակազմի կողմից համակարգի թեստերը հաջող անցնելուց հետո, Համակարգը տրվում է Գնորդին թեստավորման համար: 2-րդ փուլում, հիմք ընդունելով թեստային սցենարիոները, թեստերը կատարվում են Կառավարության կողմից Մատակարարի աջակցությամբ: Այս թեստերի թիրախը հանդիսանում է Համակարգի` որպես մեկ ամբողջության և համակարգի գործունեության ստուգումը համաձայն տեխնիկական բնութագրերի: Այս թեստերի հաջողակ ավարտից հետո, Համակարգը ժամանակավոր ընդունվում է Կառավարության կողմից: Այս թեստերի արդյունքները փաստաթղթային ձևով կներակայացվեն Թեստային Արդյունքների Հաշվետվություններում: Մատակարարը պետք է տրամադրի հետևյալը`")
      expect(segments).to eq(
        [
          "Մատակարարի նախագծի անձնակազմի կողմից համակարգի թեստերը հաջող անցնելուց հետո, Համակարգը տրվում է Գնորդին թեստավորման համար:",
          "2-րդ փուլում, հիմք ընդունելով թեստային սցենարիոները, թեստերը կատարվում են Կառավարության կողմից Մատակարարի աջակցությամբ:",
          "Այս թեստերի թիրախը հանդիսանում է Համակարգի` որպես մեկ ամբողջության և համակարգի գործունեության ստուգումը համաձայն տեխնիկական բնութագրերի:",
          "Այս թեստերի հաջողակ ավարտից հետո, Համակարգը ժամանակավոր ընդունվում է Կառավարության կողմից:",
          "Այս թեստերի արդյունքները փաստաթղթային ձևով կներակայացվեն Թեստային Արդյունքների Հաշվետվություններում:",
          "Մատակարարը պետք է տրամադրի հետևյալը`"
        ]
      )
    end

    # "Hello world. My name is Armine." ==> ["Hello world.", "My name is Armine."]
    it 'correctly segments text #004' do
      segments = segment_hy("Բարև Ձեզ: Իմ անունն էԱրմինե:")
      expect(segments).to eq(["Բարև Ձեզ:", "Իմ անունն էԱրմինե:"])
    end

    # "Today is Monday. I am going to work." ==> ["Today is Monday.", "I am going to work."]
    it 'correctly segments text #005' do
      segments = segment_hy("Այսօր երկուշաբթի է: Ես գնում եմ աշխատանքի:")
      expect(segments).to eq(["Այսօր երկուշաբթի է:", "Ես գնում եմ աշխատանքի:"])
    end

    # "Tomorrow is September 1st. We are going to school." ==> ["Tomorrow is September 1st.", "We are going to school."]
    it 'correctly segments text #006' do
      segments = segment_hy("Վաղը սեպտեմբերի 1-ն է: Մենք գնում ենք դպրոց:")
      expect(segments).to eq(["Վաղը սեպտեմբերի 1-ն է:", "Մենք գնում ենք դպրոց:"])
    end

    # "Yes, I understood. I really love you." ==> ["Yes, I understood.", "I really love you."]
    it 'correctly segments text #007' do
      segments = segment_hy("Այո, ես հասկացա: Ես իսկապես քեզ սիրում եմ:")
      expect(segments).to eq(["Այո, ես հասկացա:", "Ես իսկապես քեզ սիրում եմ:"])
    end

    # "Close the windows. It is raining in the evening." ==> ["Close the windows.", "It is raining in the evening."]
    it 'correctly segments text #008' do
      segments = segment_hy("Փակիր պատուհանները: Երեկոյան անձրևում է:")
      expect(segments).to eq(["Փակիր պատուհանները:", "Երեկոյան անձրևում է:"])
    end

    # "It is dark. I should go home." ==> ["It is dark.", "I should go home."]
    it 'correctly segments text #009' do
      segments = segment_hy("Մութ է: Ես պետք է տուն վերադառնամ:")
      expect(segments).to eq(["Մութ է:", "Ես պետք է տուն վերադառնամ:"])
    end

    # "You know, I am starting to believe. Everything is changing." ==> ["You know, I am starting to believe.", "Everything is changing."]
    it 'correctly segments text #010' do
      segments = segment_hy("Գիտես, սկսել եմ հավատալ: Ամեն ինչ փոխվում է:")
      expect(segments).to eq(["Գիտես, սկսել եմ հավատալ:", "Ամեն ինչ փոխվում է:"])
    end

    # "It is a new Christmas tree. We should decorate it." ==> ["It is a new Christmas tree.", "We should decorate it."]
    it 'correctly segments text #011' do
      segments = segment_hy("Տոնածառը նոր է: Պետք է այն զարդարել:")
      expect(segments).to eq(["Տոնածառը նոր է:", "Պետք է այն զարդարել:"])
    end

    # "I am in a hurry. I cannot wait for you." ==> ["I am in a hurry.", "I cannot wait for you."]
    it 'correctly segments text #012' do
      segments = segment_hy("Ես շտապում եմ: Ես քեզ չեմ կարող սպասել:")
      expect(segments).to eq(["Ես շտապում եմ:", "Ես քեզ չեմ կարող սպասել:"])
    end

    # "Wait, we love each other. I want us to live together." ==> ["Wait, we love each other.", "I want us to live together."]
    it 'correctly segments text #013' do
      segments = segment_hy("Սպասիր, մենք իրար սիրում ենք: Ցանկանում եմ միասին ապրենք:")
      expect(segments).to eq(["Սպասիր, մենք իրար սիրում ենք:", "Ցանկանում եմ միասին ապրենք:"])
    end

    # "No, I do not think so. It is not true." ==> ["No, I do not think so.", "It is not true."]
    it 'correctly segments text #014' do
      segments = segment_hy("Ոչ, այդպես չեմ կարծում: Դա ճիշտ չէ:")
      expect(segments).to eq(["Ոչ, այդպես չեմ կարծում:", "Դա ճիշտ չէ:"])
    end

    # "On April 24 it started to rain... I was thinking about it." ==> ["On April 24 it started to rain... I was thinking about it."]
    # The '...' followed directly by text must not split.
    it 'correctly segments text #015' do
      segments = segment_hy("Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:")
      expect(segments).to eq(["Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:"])
    end

    # "It was 1960...it was winter...it was night. It was cold...emptiness." ==> ["It was 1960...it was winter...it was night.", "It was cold...emptiness."]
    # The single-character ellipsis '…' must not split.
    it 'correctly segments text #016' do
      segments = segment_hy("1960 թվական…ձմեռ…գիշեր: Սառն էր…դատարկություն:")
      expect(segments).to eq(["1960 թվական…ձմեռ…գիշեր:", "Սառն էր…դատարկություն:"])
    end

    # "Why can a computer not do what a man can do? Simply, it doesn't have a human brain." ==> ["Why can a computer not do what a man can do?", "Simply, it doesn't have a human brain."]
    it 'correctly segments text #017' do
      segments = segment_hy("Ինչ՟ու այն, ինչ անում է մարդը, չի կարող անել համակարգիչը: Պարզապես չունի մարդկային ուղեղ:")
      expect(segments).to eq(["Ինչ՟ու այն, ինչ անում է մարդը, չի կարող անել համակարգիչը:", "Պարզապես չունի մարդկային ուղեղ:"])
    end

    # "Enumerate for me 3 things that are important to you - I answer: love, knowledge, sincerity." ==> ["Enumerate for me 3 things that are important to you - I answer: love, knowledge, sincerity."]
    # The '.' after 'եմ' must not split: one segment is expected.
    it 'correctly segments text #018' do
      segments = segment_hy("Թվարկիր ինձ համար 3 բան, որ կարևոր է քեզ համար - Պատասխանում եմ. սեր, գիտելիք, ազնվություն:")
      expect(segments).to eq(["Թվարկիր ինձ համար 3 բան, որ կարևոր է քեզ համար - Պատասխանում եմ. սեր, գիտելիք, ազնվություն:"])
    end

    # "So, we are coming to the end. The logic is...simplicity and work." ==> ["So, we are coming to the end.", "The logic is...simplicity and work."]
    # The '.' inside the second sentence must not split it.
    it 'correctly segments text #019' do
      segments = segment_hy("Այսպիսով` մոտենում ենք ավարտին: Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:")
      expect(segments).to eq(["Այսպիսով` մոտենում ենք ավարտին:", "Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:"])
    end

    # "What are you thinking? Nothing!" ==> ["What are you thinking?", "Nothing!"]
    it 'correctly segments text #020' do
      segments = segment_hy("Ի՞նչ ես մտածում: Ոչինչ:")
      expect(segments).to eq(["Ի՞նչ ես մտածում:", "Ոչինչ:"])
    end

    # "Can we work together? Maybe what you are thinking is possible." ==> ["Can we work together?", "Maybe what you are thinking is possible."]
    it 'correctly segments text #021' do
      segments = segment_hy("Կարող ե՞նք միասին աշխատել: Գուցե այն ինչ մտածում ես, իրականանալի է:")
      expect(segments).to eq(["Կարող ե՞նք միասին աշխատել:", "Գուցե այն ինչ մտածում ես, իրականանալի է:"])
    end

    # "Now what we have started comes to the end. However, the questions are numerous... ." ==> ["Now what we have started comes to the end.", "However, the questions are numerous... ."]
    it 'correctly segments text #022' do
      segments = segment_hy("Հիմա, այն ինչ սկսել ենք, ավարտին է մոտենում: Հարցերը սակայն շատ են...:")
      expect(segments).to eq(["Հիմա, այն ինչ սկսել ենք, ավարտին է մոտենում:", "Հարցերը սակայն շատ են...:"])
    end

    # "Honey... I am waiting. Shall I go... or?" ==> ["Honey... I am waiting.", "Shall I go... or?"]
    it 'correctly segments text #023' do
      segments = segment_hy("Սիրելիս...սպասում եմ: Գնամ թ՟ե …:")
      expect(segments).to eq(["Սիրելիս...սպասում եմ:", "Գնամ թ՟ե …:"])
    end
  end
end
|
1806
|
+
|
1807
|
+
# Burmese examples, run with language: 'my'.
context 'Language: Burmese (my)' do
  describe '#segment' do
    # The input has no whitespace between sentences; a break is expected
    # after each '။'.
    it 'correctly segments text #001' do
      input = "ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။၇ွင္ေနေကာင္းလား။"
      segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'my').segment
      expect(segments).to eq(
        [
          "ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။",
          "၇ွင္ေနေကာင္းလား။"
        ]
      )
    end
  end
end
|
1815
|
+
|
1816
|
+
# Amharic examples, run with language: 'am'.
context 'Language: Amharic (am)' do
  describe '#segment' do
    # The input has no whitespace between sentences; breaks are expected
    # after '፧' and after each '።'.
    it 'correctly segments text #001' do
      input = "እንደምን አለህ፧መልካም ቀን ይሁንልህ።እባክሽ ያልሽዉን ድገሚልኝ።"
      segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'am').segment
      expect(segments).to eq(
        [
          "እንደምን አለህ፧",
          "መልካም ቀን ይሁንልህ።",
          "እባክሽ ያልሽዉን ድገሚልኝ።"
        ]
      )
    end
  end
end
|
1824
|
+
|
1825
|
+
# Persian examples, run with language: 'fa'.
context 'Language: Persian (fa)' do
  describe '#segment' do
    # Breaks are expected after '.' and after '؟'; the comma '،' must not split.
    it 'correctly segments text #001' do
      input = "خوشبختم، آقای رضا. شما کجایی هستید؟ من از تهران هستم."
      segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'fa').segment
      expect(segments).to eq(
        [
          "خوشبختم، آقای رضا.",
          "شما کجایی هستید؟",
          "من از تهران هستم."
        ]
      )
    end
  end
end
|
1833
|
+
|
1834
|
+
# Urdu examples, run with language: 'ur'.
context 'Language: Urdu (ur)' do
  describe '#segment' do
    # Breaks are expected after '؟' and after '۔'; the '___' placeholder
    # stays inside its sentence.
    it 'correctly segments text #001' do
      input = "کیا حال ہے؟ ميرا نام ___ ەے۔ میں حالا تاوان دےدوں؟"
      segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'ur').segment
      expect(segments).to eq(
        [
          "کیا حال ہے؟",
          "ميرا نام ___ ەے۔",
          "میں حالا تاوان دےدوں؟"
        ]
      )
    end
  end
end
|
1842
|
+
|
1843
|
+
# Chinese examples, run with language: 'zh'.
context 'Language: Chinese (zh)' do
  describe '#segment' do
    # A break is expected after each '。'; the commas and '、' must not split.
    it 'correctly segments text #001' do
      input = "安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。"
      segments = PragmaticSegmenter::Segmenter.new(text: input, language: 'zh').segment
      expect(segments).to eq(
        [
          "安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。",
          "周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。"
        ]
      )
    end
  end
end
|
1851
|
+
|
1852
|
+
# Edge cases that are not tied to one language: nil / blank input, the
# default language, the `clean:` option, and input immutability.
context 'miscellaneous tests' do
  describe '#segment' do
    # A nil text is expected to produce no segments rather than raise.
    it 'handles nil' do
      ps = PragmaticSegmenter::Segmenter.new(text: nil)
      expect(ps.segment).to eq([])
    end

    # No `language:` argument is passed here.
    it 'handles no language' do
      ps = PragmaticSegmenter::Segmenter.new(text: 'Hello world. Hello.')
      expect(ps.segment).to eq(["Hello world.", "Hello."])
    end

    # The three blank-input examples below used to share one description,
    # which made a failure impossible to attribute from the RSpec output.
    it 'handles newline-only strings' do
      ps = PragmaticSegmenter::Segmenter.new(text: "\n")
      expect(ps.segment).to eq([])
    end

    it 'handles strings containing only empty HTML tags' do
      ps = PragmaticSegmenter::Segmenter.new(text: "<b></b>")
      expect(ps.segment).to eq([])
    end

    it 'handles empty strings' do
      ps = PragmaticSegmenter::Segmenter.new(text: '')
      expect(ps.segment).to eq([])
    end

    # With clean: false the newline inside the sentence is kept as a
    # boundary, so two segments are expected.
    it 'has an option to not use the cleaner' do
      ps = PragmaticSegmenter::Segmenter.new(text: "It was a cold \nnight in the city.", language: "en", clean: false)
      expect(ps.segment).to eq(["It was a cold", "night in the city."])
    end

    # Segmenting must leave the caller's string object untouched.
    it 'does not mutate the input string' do
      text = "It was a cold \nnight in the city."
      PragmaticSegmenter::Segmenter.new(text: text, language: "en").segment
      expect(text).to eq("It was a cold \nnight in the city.")
    end
  end

  describe '#clean' do
    # The " \n" in the middle of the sentence is expected to collapse to a
    # single space.
    it 'cleans the text' do
      ps = PragmaticSegmenter::Cleaner.new(text: "It was a cold \nnight in the city.", language: "en")
      expect(ps.clean).to eq("It was a cold night in the city.")
    end

    # Cleaning must leave the caller's string object untouched.
    it 'does not mutate the input string (cleaner)' do
      text = "It was a cold \nnight in the city."
      PragmaticSegmenter::Cleaner.new(text: text, language: "en").clean
      expect(text).to eq("It was a cold \nnight in the city.")
    end
  end
end
|
1906
|
+
end
|