pragmatic_segmenter 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +1 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +730 -0
  7. data/Rakefile +4 -0
  8. data/lib/pragmatic_segmenter.rb +2 -0
  9. data/lib/pragmatic_segmenter/abbreviation.rb +22 -0
  10. data/lib/pragmatic_segmenter/abbreviation_replacer.rb +149 -0
  11. data/lib/pragmatic_segmenter/between_punctuation.rb +78 -0
  12. data/lib/pragmatic_segmenter/cleaner.rb +141 -0
  13. data/lib/pragmatic_segmenter/ellipsis.rb +36 -0
  14. data/lib/pragmatic_segmenter/exclamation_words.rb +19 -0
  15. data/lib/pragmatic_segmenter/languages/amharic.rb +33 -0
  16. data/lib/pragmatic_segmenter/languages/arabic.rb +83 -0
  17. data/lib/pragmatic_segmenter/languages/armenian.rb +33 -0
  18. data/lib/pragmatic_segmenter/languages/burmese.rb +33 -0
  19. data/lib/pragmatic_segmenter/languages/deutsch.rb +132 -0
  20. data/lib/pragmatic_segmenter/languages/english.rb +44 -0
  21. data/lib/pragmatic_segmenter/languages/french.rb +29 -0
  22. data/lib/pragmatic_segmenter/languages/greek.rb +29 -0
  23. data/lib/pragmatic_segmenter/languages/hindi.rb +33 -0
  24. data/lib/pragmatic_segmenter/languages/italian.rb +39 -0
  25. data/lib/pragmatic_segmenter/languages/japanese.rb +58 -0
  26. data/lib/pragmatic_segmenter/languages/persian.rb +56 -0
  27. data/lib/pragmatic_segmenter/languages/russian.rb +60 -0
  28. data/lib/pragmatic_segmenter/languages/spanish.rb +39 -0
  29. data/lib/pragmatic_segmenter/languages/urdu.rb +33 -0
  30. data/lib/pragmatic_segmenter/list.rb +169 -0
  31. data/lib/pragmatic_segmenter/number.rb +35 -0
  32. data/lib/pragmatic_segmenter/process.rb +126 -0
  33. data/lib/pragmatic_segmenter/punctuation.rb +12 -0
  34. data/lib/pragmatic_segmenter/punctuation_replacer.rb +62 -0
  35. data/lib/pragmatic_segmenter/rules.rb +38 -0
  36. data/lib/pragmatic_segmenter/segmenter.rb +81 -0
  37. data/lib/pragmatic_segmenter/sentence_boundary_punctuation.rb +17 -0
  38. data/lib/pragmatic_segmenter/single_letter_abbreviation.rb +37 -0
  39. data/lib/pragmatic_segmenter/types.rb +12 -0
  40. data/lib/pragmatic_segmenter/version.rb +3 -0
  41. data/pragmatic_segmenter.gemspec +25 -0
  42. data/spec/performance_spec.rb +24 -0
  43. data/spec/pragmatic_segmenter_spec.rb +1906 -0
  44. data/spec/spec_helper.rb +1 -0
  45. metadata +150 -0
@@ -0,0 +1,24 @@
1
+ # -*- encoding : utf-8 -*-
2
+ require 'benchmark'
3
+ require 'spec_helper'
4
+
5
+ RSpec.describe PragmaticSegmenter::Segmenter do
6
+
7
+ # Speed benchmark tests
8
+
9
+ # it 'is fast' do
10
+ # string = "Hello World. My name is Jonas. What is your name? My name is Jonas. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. 
. . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ."
11
+ # benchmark do
12
+ # 100.times do
13
+ # PragmaticSegmenter::Segmenter.new(text: string, language: 'en').segment
14
+ # end
15
+ # end
16
+ # end
17
+
18
+ end
19
+
20
# Runs the given block twice and prints the duration of the second run.
#
# The first call is not timed (presumably a warm-up so one-time setup cost
# is not attributed to the measured run); only the second call is wrapped in
# Benchmark.realtime.
#
# @yield the code to measure; invoked exactly twice
# @raise [ArgumentError] if no block is given
# @return [Float] elapsed seconds of the timed run, as reported by Benchmark.realtime
def benchmark(&block)
  # Fail with a clear message instead of a NoMethodError on nil.
  raise ArgumentError, 'benchmark requires a block' unless block

  block.call # first run is deliberately left out of the measurement
  time = Benchmark.realtime(&block)
  puts "RUNTIME: #{time}"
  time # hand the measurement back so callers can assert on it
end
@@ -0,0 +1,1906 @@
1
+ require 'spec_helper'
2
+
3
+ RSpec.describe PragmaticSegmenter::Segmenter do
4
+ context "Golden Rules (English)" do
5
+ it "Simple period to end sentence #001" do
6
+ ps = PragmaticSegmenter::Segmenter.new(text: "Hello World. My name is Jonas.", language: "en")
7
+ expect(ps.segment).to eq(["Hello World.", "My name is Jonas."])
8
+ end
9
+
10
+ it "Question mark to end sentence #002" do
11
+ ps = PragmaticSegmenter::Segmenter.new(text: "What is your name? My name is Jonas.", language: "en")
12
+ expect(ps.segment).to eq(["What is your name?", "My name is Jonas."])
13
+ end
14
+
15
+ it "Exclamation point to end sentence #003" do
16
+ ps = PragmaticSegmenter::Segmenter.new(text: "There it is! I found it.", language: "en")
17
+ expect(ps.segment).to eq(["There it is!", "I found it."])
18
+ end
19
+
20
+ it "One letter upper case abbreviations #004" do
21
+ ps = PragmaticSegmenter::Segmenter.new(text: "My name is Jonas E. Smith.", language: "en")
22
+ expect(ps.segment).to eq(["My name is Jonas E. Smith."])
23
+ end
24
+
25
+ it "One letter lower case abbreviations #005" do
26
+ ps = PragmaticSegmenter::Segmenter.new(text: "Please turn to p. 55.", language: "en")
27
+ expect(ps.segment).to eq(["Please turn to p. 55."])
28
+ end
29
+
30
+ it "Two letter lower case abbreviations in the middle of a sentence #006" do
31
+ ps = PragmaticSegmenter::Segmenter.new(text: "Were Jane and co. at the party?", language: "en")
32
+ expect(ps.segment).to eq(["Were Jane and co. at the party?"])
33
+ end
34
+
35
+ it "Two letter upper case abbreviations in the middle of a sentence #007" do
36
+ ps = PragmaticSegmenter::Segmenter.new(text: "They closed the deal with Pitt, Briggs & Co. at noon.", language: "en")
37
+ expect(ps.segment).to eq(["They closed the deal with Pitt, Briggs & Co. at noon."])
38
+ end
39
+
40
+ it "Two letter lower case abbreviations at the end of a sentence #008" do
41
+ ps = PragmaticSegmenter::Segmenter.new(text: "Let's ask Jane and co. They should know.", language: "en")
42
+ expect(ps.segment).to eq(["Let's ask Jane and co.", "They should know."])
43
+ end
44
+
45
+ it "Two letter upper case abbreviations at the end of a sentence #009" do
46
+ ps = PragmaticSegmenter::Segmenter.new(text: "They closed the deal with Pitt, Briggs & Co. It closed yesterday.", language: "en")
47
+ expect(ps.segment).to eq(["They closed the deal with Pitt, Briggs & Co.", "It closed yesterday."])
48
+ end
49
+
50
+ it "Two letter (prepositive) abbreviations #010" do
51
+ ps = PragmaticSegmenter::Segmenter.new(text: "I can see Mt. Fuji from here.", language: "en")
52
+ expect(ps.segment).to eq(["I can see Mt. Fuji from here."])
53
+ end
54
+
55
+ it "Two letter (prepositive & postpositive) abbreviations #011" do
56
+ ps = PragmaticSegmenter::Segmenter.new(text: "St. Michael's Church is on 5th st. near the light.", language: "en")
57
+ expect(ps.segment).to eq(["St. Michael's Church is on 5th st. near the light."])
58
+ end
59
+
60
# Golden rule #012: the period in "Jr." directly followed by a possessive 's
# is expected to stay inside a single segment.
it "Possessive two letter abbreviations #012" do
  ps = PragmaticSegmenter::Segmenter.new(text: "That is JFK Jr.'s book.", language: "en")
  expect(ps.segment).to eq(["That is JFK Jr.'s book."])
end
64
+
65
+ it "Multi-period abbreviations in the middle of a sentence #013" do
66
+ ps = PragmaticSegmenter::Segmenter.new(text: "I visited the U.S.A. last year.", language: "en")
67
+ expect(ps.segment).to eq(["I visited the U.S.A. last year."])
68
+ end
69
+
70
+ it "Multi-period abbreviations at the end of a sentence #014" do
71
+ ps = PragmaticSegmenter::Segmenter.new(text: "I live in the E.U. How about you?", language: "en")
72
+ expect(ps.segment).to eq(["I live in the E.U.", "How about you?"])
73
+ end
74
+
75
+ it "U.S. as sentence boundary #015" do
76
+ ps = PragmaticSegmenter::Segmenter.new(text: "I live in the U.S. How about you?", language: "en")
77
+ expect(ps.segment).to eq(["I live in the U.S.", "How about you?"])
78
+ end
79
+
80
+ it "U.S. as non sentence boundary with next word capitalized #016" do
81
+ ps = PragmaticSegmenter::Segmenter.new(text: "I work for the U.S. Government in Virginia.", language: "en")
82
+ expect(ps.segment).to eq(["I work for the U.S. Government in Virginia."])
83
+ end
84
+
85
+ it "U.S. as non sentence boundary #017" do
86
+ ps = PragmaticSegmenter::Segmenter.new(text: "I have lived in the U.S. for 20 years.", language: "en")
87
+ expect(ps.segment).to eq(["I have lived in the U.S. for 20 years."])
88
+ end
89
+
90
+ xdescribe "not yet implemented" do
91
+ it "A.M. / P.M. as non sentence boundary and sentence boundary #018" do
92
+ ps = PragmaticSegmenter::Segmenter.new(text: "At 5 a.m. Mr. Smith went to the bank. He left the bank at 6 P.M. Mr. Smith then went to the store.", language: "en")
93
+ expect(ps.segment).to eq(["At 5 a.m. Mr. Smith went to the bank.", "He left the bank at 6 P.M.", "Mr. Smith then went to the store."])
94
+ end
95
+ end
96
+
97
+ it "Number as non sentence boundary #019" do
98
+ ps = PragmaticSegmenter::Segmenter.new(text: "She has $100.00 in her bag.", language: "en")
99
+ expect(ps.segment).to eq(["She has $100.00 in her bag."])
100
+ end
101
+
102
+ it "Number as sentence boundary #020" do
103
+ ps = PragmaticSegmenter::Segmenter.new(text: "She has $100.00. It is in her bag.", language: "en")
104
+ expect(ps.segment).to eq(["She has $100.00.", "It is in her bag."])
105
+ end
106
+
107
+ it "Parenthetical inside sentence #021" do
108
+ ps = PragmaticSegmenter::Segmenter.new(text: "He teaches science (He previously worked for 5 years as an engineer.) at the local University.", language: "en")
109
+ expect(ps.segment).to eq(["He teaches science (He previously worked for 5 years as an engineer.) at the local University."])
110
+ end
111
+
112
+ it "Email addresses #022" do
113
+ ps = PragmaticSegmenter::Segmenter.new(text: "Her email is Jane.Doe@example.com. I sent her an email.", language: "en")
114
+ expect(ps.segment).to eq(["Her email is Jane.Doe@example.com.", "I sent her an email."])
115
+ end
116
+
117
+ it "Web addresses #023" do
118
+ ps = PragmaticSegmenter::Segmenter.new(text: "The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.", language: "en")
119
+ expect(ps.segment).to eq(["The site is: https://www.example.50.com/new-site/awesome_content.html.", "Please check it out."])
120
+ end
121
+
122
+ it "Single quotations inside sentence #024" do
123
+ ps = PragmaticSegmenter::Segmenter.new(text: "She turned to him, 'This is great.' she said.", language: "en")
124
+ expect(ps.segment).to eq(["She turned to him, 'This is great.' she said."])
125
+ end
126
+
127
+ it "Double quotations inside sentence #025" do
128
+ ps = PragmaticSegmenter::Segmenter.new(text: "She turned to him, \"This is great.\" she said.", language: "en")
129
+ expect(ps.segment).to eq(["She turned to him, \"This is great.\" she said."])
130
+ end
131
+
132
+ it "Double quotations at the end of a sentence #026" do
133
+ ps = PragmaticSegmenter::Segmenter.new(text: "She turned to him, \"This is great.\" She held the book out to show him.", language: "en")
134
+ expect(ps.segment).to eq(["She turned to him, \"This is great.\"", "She held the book out to show him."])
135
+ end
136
+
137
+ it "Double punctuation (exclamation point) #027" do
138
+ ps = PragmaticSegmenter::Segmenter.new(text: "Hello!! Long time no see.", language: "en")
139
+ expect(ps.segment).to eq(["Hello!!", "Long time no see."])
140
+ end
141
+
142
+ it "Double punctuation (question mark) #028" do
143
+ ps = PragmaticSegmenter::Segmenter.new(text: "Hello?? Who is there?", language: "en")
144
+ expect(ps.segment).to eq(["Hello??", "Who is there?"])
145
+ end
146
+
147
+ it "Double punctuation (exclamation point / question mark) #029" do
148
+ ps = PragmaticSegmenter::Segmenter.new(text: "Hello!? Is that you?", language: "en")
149
+ expect(ps.segment).to eq(["Hello!?", "Is that you?"])
150
+ end
151
+
152
+ it "Double punctuation (question mark / exclamation point) #030" do
153
+ ps = PragmaticSegmenter::Segmenter.new(text: "Hello?! Is that you?", language: "en")
154
+ expect(ps.segment).to eq(["Hello?!", "Is that you?"])
155
+ end
156
+
157
+ it "List (period followed by parens and no period to end item) #031" do
158
+ ps = PragmaticSegmenter::Segmenter.new(text: "1.) The first item 2.) The second item", language: "en")
159
+ expect(ps.segment).to eq(["1.) The first item", "2.) The second item"])
160
+ end
161
+
162
+ it "List (period followed by parens and period to end item) #032" do
163
+ ps = PragmaticSegmenter::Segmenter.new(text: "1.) The first item. 2.) The second item.", language: "en")
164
+ expect(ps.segment).to eq(["1.) The first item.", "2.) The second item."])
165
+ end
166
+
167
+ it "List (parens and no period to end item) #033" do
168
+ ps = PragmaticSegmenter::Segmenter.new(text: "1) The first item 2) The second item", language: "en")
169
+ expect(ps.segment).to eq(["1) The first item", "2) The second item"])
170
+ end
171
+
172
+ it "List (parens and period to end item) #034" do
173
+ ps = PragmaticSegmenter::Segmenter.new(text: "1) The first item. 2) The second item.", language: "en")
174
+ expect(ps.segment).to eq(["1) The first item.", "2) The second item."])
175
+ end
176
+
177
+ it "List (period to mark list and no period to end item) #035" do
178
+ ps = PragmaticSegmenter::Segmenter.new(text: "1. The first item 2. The second item", language: "en")
179
+ expect(ps.segment).to eq(["1. The first item", "2. The second item"])
180
+ end
181
+
182
+ it "List (period to mark list and period to end item) #036" do
183
+ ps = PragmaticSegmenter::Segmenter.new(text: "1. The first item. 2. The second item.", language: "en")
184
+ expect(ps.segment).to eq(["1. The first item.", "2. The second item."])
185
+ end
186
+
187
+ it "List with bullet #037" do
188
+ ps = PragmaticSegmenter::Segmenter.new(text: "• 9. The first item • 10. The second item", language: "en")
189
+ expect(ps.segment).to eq(["• 9. The first item", "• 10. The second item"])
190
+ end
191
+
192
# Golden rule #038: numbered list items introduced by a hyphen bullet ("⁃")
# are expected to come back as one segment per item.
it "List with hyphen #038" do
  ps = PragmaticSegmenter::Segmenter.new(text: "⁃9. The first item ⁃10. The second item", language: "en")
  expect(ps.segment).to eq(["⁃9. The first item", "⁃10. The second item"])
end
196
+
197
+ it "Alphabetical list #039" do
198
+ ps = PragmaticSegmenter::Segmenter.new(text: "a. The first item b. The second item c. The third list item", language: "en")
199
+ expect(ps.segment).to eq(["a. The first item", "b. The second item", "c. The third list item"])
200
+ end
201
+
202
+ it "Errant newlines in the middle of sentences (PDF) #040" do
203
+ ps = PragmaticSegmenter::Segmenter.new(text: "This is a sentence\ncut off in the middle because pdf.", language: "en", doc_type: "pdf")
204
+ expect(ps.segment).to eq(["This is a sentence cut off in the middle because pdf."])
205
+ end
206
+
207
+ it "Errant newlines in the middle of sentences #041" do
208
+ ps = PragmaticSegmenter::Segmenter.new(text: "It was a cold \nnight in the city.", language: "en")
209
+ expect(ps.segment).to eq(["It was a cold night in the city."])
210
+ end
211
+
212
+ it "Lower case list separated by newline #042" do
213
+ ps = PragmaticSegmenter::Segmenter.new(text: "features\ncontact manager\nevents, activities\n", language: "en")
214
+ expect(ps.segment).to eq(["features", "contact manager", "events, activities"])
215
+ end
216
+
217
+ it "Geo Coordinates #043" do
218
+ ps = PragmaticSegmenter::Segmenter.new(text: "You can find it at N°. 1026.253.553. That is where the treasure is.", language: "en")
219
+ expect(ps.segment).to eq(["You can find it at N°. 1026.253.553.", "That is where the treasure is."])
220
+ end
221
+
222
+ it "Named entities with an exclamation point #044" do
223
+ ps = PragmaticSegmenter::Segmenter.new(text: "She works at Yahoo! in the accounting department.", language: "en")
224
+ expect(ps.segment).to eq(["She works at Yahoo! in the accounting department."])
225
+ end
226
+
227
+ it "I as a sentence boundary and I as an abbreviation #045" do
228
+ ps = PragmaticSegmenter::Segmenter.new(text: "We make a good team, you and I. Did you see Albert I. Jones yesterday?", language: "en")
229
+ expect(ps.segment).to eq(["We make a good team, you and I.", "Did you see Albert I. Jones yesterday?"])
230
+ end
231
+
232
+ it "Ellipsis at end of quotation #046" do
233
+ ps = PragmaticSegmenter::Segmenter.new(text: "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”", language: "en")
234
+ expect(ps.segment).to eq(["Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”"])
235
+ end
236
+
237
+ it "Ellipsis with square brackets #047" do
238
+ ps = PragmaticSegmenter::Segmenter.new(text: "\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55).", language: "en")
239
+ expect(ps.segment).to eq(["\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55)."])
240
+ end
241
+
242
+ it "Ellipsis as sentence boundary (standard ellipsis rules) #048" do
243
+ ps = PragmaticSegmenter::Segmenter.new(text: "If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.", language: "en")
244
+ expect(ps.segment).to eq(["If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .", "Next sentence."])
245
+ end
246
+
247
+ it "Ellipsis as sentence boundary (non-standard ellipsis rules) #049" do
248
+ ps = PragmaticSegmenter::Segmenter.new(text: "I never meant that.... She left the store.", language: "en")
249
+ expect(ps.segment).to eq(["I never meant that....", "She left the store."])
250
+ end
251
+
252
+ it "Ellipsis as non sentence boundary #050" do
253
+ ps = PragmaticSegmenter::Segmenter.new(text: "I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.", language: "en")
254
+ expect(ps.segment).to eq(["I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it."])
255
+ end
256
+
257
+ it "4-dot ellipsis #051" do
258
+ ps = PragmaticSegmenter::Segmenter.new(text: "One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .", language: "en")
259
+ expect(ps.segment).to eq(["One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.", ". . . The practice was not abandoned. . . ."])
260
+ end
261
+ end
262
+
263
+ context "Golden Rules (languages other than English)" do
264
+ context "Golden Rules (German)" do
265
+ it "Quotation at end of sentence #001" do
266
+ ps = PragmaticSegmenter::Segmenter.new(text: "„Ich habe heute keine Zeit“, sagte die Frau und flüsterte leise: „Und auch keine Lust.“ Wir haben 1.000.000 Euro.", language: "de")
267
+ expect(ps.segment).to eq(["„Ich habe heute keine Zeit“, sagte die Frau und flüsterte leise: „Und auch keine Lust.“", "Wir haben 1.000.000 Euro."])
268
+ end
269
+
270
+ it "Abbreviations #002" do
271
+ ps = PragmaticSegmenter::Segmenter.new(text: "Es gibt jedoch einige Vorsichtsmaßnahmen, die Du ergreifen kannst, z. B. ist es sehr empfehlenswert, dass Du Dein Zuhause von allem Junkfood befreist.", language: "de")
272
+ expect(ps.segment).to eq(["Es gibt jedoch einige Vorsichtsmaßnahmen, die Du ergreifen kannst, z. B. ist es sehr empfehlenswert, dass Du Dein Zuhause von allem Junkfood befreist."])
273
+ end
274
+
275
+ it "Numbers #003" do
276
+ ps = PragmaticSegmenter::Segmenter.new(text: "Was sind die Konsequenzen der Abstimmung vom 12. Juni?", language: "de")
277
+ expect(ps.segment).to eq(["Was sind die Konsequenzen der Abstimmung vom 12. Juni?"])
278
+ end
279
+ end
280
+
281
+ context "Golden Rules (Japanese)" do
282
+ it "Simple period to end sentence #001" do
283
+ ps = PragmaticSegmenter::Segmenter.new(text: "これはペンです。それはマーカーです。", language: "ja")
284
+ expect(ps.segment).to eq(["これはペンです。", "それはマーカーです。"])
285
+ end
286
+
287
+ it "Question mark to end sentence #002" do
288
+ ps = PragmaticSegmenter::Segmenter.new(text: "それは何ですか?ペンですか?", language: "ja")
289
+ expect(ps.segment).to eq(["それは何ですか?", "ペンですか?"])
290
+ end
291
+
292
+ it "Exclamation point to end sentence #003" do
293
+ ps = PragmaticSegmenter::Segmenter.new(text: "良かったね!すごい!", language: "ja")
294
+ expect(ps.segment).to eq(["良かったね!", "すごい!"])
295
+ end
296
+
297
+ it "Quotation #004" do
298
+ ps = PragmaticSegmenter::Segmenter.new(text: "自民党税制調査会の幹部は、「引き下げ幅は3.29%以上を目指すことになる」と指摘していて、今後、公明党と合意したうえで、30日に決定する与党税制改正大綱に盛り込むことにしています。2%台後半を目指すとする方向で最終調整に入りました。", language: "ja")
299
+ expect(ps.segment).to eq(["自民党税制調査会の幹部は、「引き下げ幅は3.29%以上を目指すことになる」と指摘していて、今後、公明党と合意したうえで、30日に決定する与党税制改正大綱に盛り込むことにしています。", "2%台後半を目指すとする方向で最終調整に入りました。"])
300
+ end
301
+
302
+ it "Errant newlines in the middle of sentences #005" do
303
+ ps = PragmaticSegmenter::Segmenter.new(text: "これは父の\n家です。", language: "ja")
304
+ expect(ps.segment).to eq(["これは父の家です。"])
305
+ end
306
+ end
307
+
308
+ context "Golden Rules (Arabic)" do
309
+ it "Regular punctuation #001" do
310
+ ps = PragmaticSegmenter::Segmenter.new(text: "سؤال وجواب: ماذا حدث بعد الانتخابات الايرانية؟ طرح الكثير من التساؤلات غداة ظهور نتائج الانتخابات الرئاسية الايرانية التي أججت مظاهرات واسعة واعمال عنف بين المحتجين على النتائج ورجال الامن. يقول معارضو الرئيس الإيراني إن الطريقة التي اعلنت بها النتائج كانت مثيرة للاستغراب.", language: "ar")
311
+ expect(ps.segment).to eq(["سؤال وجواب:", "ماذا حدث بعد الانتخابات الايرانية؟", "طرح الكثير من التساؤلات غداة ظهور نتائج الانتخابات الرئاسية الايرانية التي أججت مظاهرات واسعة واعمال عنف بين المحتجين على النتائج ورجال الامن.", "يقول معارضو الرئيس الإيراني إن الطريقة التي اعلنت بها النتائج كانت مثيرة للاستغراب."])
312
+ end
313
+
314
+ it "Abbreviations #002" do
315
+ ps = PragmaticSegmenter::Segmenter.new(text: "وقال د‪.‬ ديفيد ريدي و الأطباء الذين كانوا يعالجونها في مستشفى برمنجهام إنها كانت تعاني من أمراض أخرى. وليس معروفا ما اذا كانت قد توفيت بسبب اصابتها بأنفلونزا الخنازير.", language: "ar")
316
+ expect(ps.segment).to eq(["وقال د‪.‬ ديفيد ريدي و الأطباء الذين كانوا يعالجونها في مستشفى برمنجهام إنها كانت تعاني من أمراض أخرى.", "وليس معروفا ما اذا كانت قد توفيت بسبب اصابتها بأنفلونزا الخنازير."])
317
+ end
318
+
319
+ it "Numbers and Dates #003" do
320
+ ps = PragmaticSegmenter::Segmenter.new(text: "ومن المنتظر أن يكتمل مشروع خط أنابيب نابوكو البالغ طوله 3300 كليومترا في 12‪/‬08‪/‬2014 بتكلفة تُقدر بـ 7.9 مليارات يورو أي نحو 10.9 مليارات دولار. ومن المقرر أن تصل طاقة ضخ الغاز في المشروع 31 مليار متر مكعب انطلاقا من بحر قزوين مرورا بالنمسا وتركيا ودول البلقان دون المرور على الأراضي الروسية.", language: "ar")
321
+ expect(ps.segment).to eq(["ومن المنتظر أن يكتمل مشروع خط أنابيب نابوكو البالغ طوله 3300 كليومترا في 12‪/‬08‪/‬2014 بتكلفة تُقدر بـ 7.9 مليارات يورو أي نحو 10.9 مليارات دولار.", "ومن المقرر أن تصل طاقة ضخ الغاز في المشروع 31 مليار متر مكعب انطلاقا من بحر قزوين مرورا بالنمسا وتركيا ودول البلقان دون المرور على الأراضي الروسية."])
322
+ end
323
+
324
+ it "Time #004" do
325
+ ps = PragmaticSegmenter::Segmenter.new(text: "الاحد, 21 فبراير/ شباط, 2010, 05:01 GMT الصنداي تايمز: رئيس الموساد قد يصبح ضحية الحرب السرية التي شتنها بنفسه. العقل المنظم هو مئير داجان رئيس الموساد الإسرائيلي الذي يشتبه بقيامه باغتيال القائد الفلسطيني في حركة حماس محمود المبحوح في دبي.", language: "ar")
326
+ expect(ps.segment).to eq(["الاحد, 21 فبراير/ شباط, 2010, 05:01 GMT الصنداي تايمز:", "رئيس الموساد قد يصبح ضحية الحرب السرية التي شتنها بنفسه.", "العقل المنظم هو مئير داجان رئيس الموساد الإسرائيلي الذي يشتبه بقيامه باغتيال القائد الفلسطيني في حركة حماس محمود المبحوح في دبي."])
327
+ end
328
+
329
+ it "Comma #005" do
330
+ ps = PragmaticSegmenter::Segmenter.new(text: "عثر في الغرفة على بعض أدوية علاج ارتفاع ضغط الدم، والقلب، زرعها عملاء الموساد كما تقول مصادر إسرائيلية، وقرر الطبيب أن الفلسطيني قد توفي وفاة طبيعية ربما إثر نوبة قلبية، وبدأت مراسم الحداد عليه", language: "ar")
331
+ expect(ps.segment).to eq(["عثر في الغرفة على بعض أدوية علاج ارتفاع ضغط الدم، والقلب،", "زرعها عملاء الموساد كما تقول مصادر إسرائيلية،", "وقرر الطبيب أن الفلسطيني قد توفي وفاة طبيعية ربما إثر نوبة قلبية،", "وبدأت مراسم الحداد عليه"])
332
+ end
333
+ end
334
+
335
+ context "Golden Rules (Italian)" do
336
+ it "Abbreviations #001" do
337
+ ps = PragmaticSegmenter::Segmenter.new(text: "Salve Sig.ra Mengoni! Come sta oggi?", language: "it")
338
+ expect(ps.segment).to eq(["Salve Sig.ra Mengoni!", "Come sta oggi?"])
339
+ end
340
+
341
+ it "Quotations #002" do
342
+ ps = PragmaticSegmenter::Segmenter.new(text: "Una lettera si può iniziare in questo modo «Il/la sottoscritto/a.».", language: "it")
343
+ expect(ps.segment).to eq(["Una lettera si può iniziare in questo modo «Il/la sottoscritto/a.»."])
344
+ end
345
+
346
+ it "Numbers #003" do
347
+ ps = PragmaticSegmenter::Segmenter.new(text: "La casa costa 170.500.000,00€!", language: "it")
348
+ expect(ps.segment).to eq(["La casa costa 170.500.000,00€!"])
349
+ end
350
+ end
351
+
352
+ context "Golden Rules (Russian)" do
353
+ it "Abbreviations #001" do
354
+ ps = PragmaticSegmenter::Segmenter.new(text: "Объем составляет 5 куб.м.", language: "ru")
355
+ expect(ps.segment).to eq(["Объем составляет 5 куб.м."])
356
+ end
357
+
358
+ it "Quotations #002" do
359
+ ps = PragmaticSegmenter::Segmenter.new(text: "Маленькая девочка бежала и кричала: «Не видали маму?».", language: "ru")
360
+ expect(ps.segment).to eq(["Маленькая девочка бежала и кричала: «Не видали маму?»."])
361
+ end
362
+
363
+ it "Numbers #003" do
364
+ ps = PragmaticSegmenter::Segmenter.new(text: "Сегодня 27.10.14", language: "ru")
365
+ expect(ps.segment).to eq(["Сегодня 27.10.14"])
366
+ end
367
+ end
368
+
369
+ context "Golden Rules (Spanish)" do
370
+ it "Question mark to end sentence #001" do
371
+ ps = PragmaticSegmenter::Segmenter.new(text: "¿Cómo está hoy? Espero que muy bien.", language: "es")
372
+ expect(ps.segment).to eq(["¿Cómo está hoy?", "Espero que muy bien."])
373
+ end
374
+
375
+ it "Exclamation point to end sentence #002" do
376
+ ps = PragmaticSegmenter::Segmenter.new(text: "¡Hola señorita! Espero que muy bien.", language: "es")
377
+ expect(ps.segment).to eq(["¡Hola señorita!", "Espero que muy bien."])
378
+ end
379
+
380
+ it "Abbreviations #003" do
381
+ ps = PragmaticSegmenter::Segmenter.new(text: "Hola Srta. Ledesma. Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. Naser.", language: "es")
382
+ expect(ps.segment).to eq(["Hola Srta. Ledesma.", "Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. Naser."])
383
+ end
384
+
385
+ it "Numbers #004" do
386
+ ps = PragmaticSegmenter::Segmenter.new(text: "¡La casa cuesta $170.500.000,00! ¡Muy costosa! Se prevé una disminución del 12.5% para el próximo año.", language: "es")
387
+ expect(ps.segment).to eq(["¡La casa cuesta $170.500.000,00!", "¡Muy costosa!", "Se prevé una disminución del 12.5% para el próximo año."])
388
+ end
389
+
390
+ it "Quotations #005" do
391
+ ps = PragmaticSegmenter::Segmenter.new(text: "«Ninguna mente extraordinaria está exenta de un toque de demencia.», dijo Aristóteles.", language: "es")
392
+ expect(ps.segment).to eq(["«Ninguna mente extraordinaria está exenta de un toque de demencia.», dijo Aristóteles."])
393
+ end
394
+ end
395
+
396
+ context "Golden Rules (Greek)" do
397
+ it "Question mark to end sentence #001" do
398
+ ps = PragmaticSegmenter::Segmenter.new(text: "Με συγχωρείτε· πού είναι οι τουαλέτες; Τις Κυριακές δε δούλευε κανένας. το κόστος του σπιτιού ήταν £260.950,00.", language: "el")
399
+ expect(ps.segment).to eq(["Με συγχωρείτε· πού είναι οι τουαλέτες;", "Τις Κυριακές δε δούλευε κανένας.", "το κόστος του σπιτιού ήταν £260.950,00."])
400
+ end
401
+ end
402
+
403
+ context "Golden Rules (Hindi)" do
404
+ it "Full stop #001" do
405
+ ps = PragmaticSegmenter::Segmenter.new(text: "सच्चाई यह है कि इसे कोई नहीं जानता। हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।", language: "hi")
406
+ expect(ps.segment).to eq(["सच्चाई यह है कि इसे कोई नहीं जानता।", "हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।"])
407
+ end
408
+ end
409
+
410
+ context "Golden Rules (Armenian)" do
411
+ it "Sentence ending punctuation #001" do
412
+ ps = PragmaticSegmenter::Segmenter.new(text: "Ի՞նչ ես մտածում: Ոչինչ:", language: "hy")
413
+ expect(ps.segment).to eq(["Ի՞նչ ես մտածում:", "Ոչինչ:"])
414
+ end
415
+
416
+ it "Ellipsis #002" do
417
+ ps = PragmaticSegmenter::Segmenter.new(text: "Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:", language: "hy")
418
+ expect(ps.segment).to eq(["Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:"])
419
+ end
420
+
421
+ it "Period is not a sentence boundary #003" do
422
+ ps = PragmaticSegmenter::Segmenter.new(text: "Այսպիսով` մոտենում ենք ավարտին: Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:", language: "hy")
423
+ expect(ps.segment).to eq(["Այսպիսով` մոտենում ենք ավարտին:", "Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:"])
424
+ end
425
+ end
426
+
427
+ context "Golden Rules (Burmese)" do
428
+ it "Sentence ending punctuation #001" do
429
+ ps = PragmaticSegmenter::Segmenter.new(text: "ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။၇ွင္ေနေကာင္းလား။", language: 'my')
430
+ expect(ps.segment).to eq(["ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။", "၇ွင္ေနေကာင္းလား။"])
431
+ end
432
+ end
433
+
434
+ context "Golden Rules (Amharic)" do
435
+ it "Sentence ending punctuation #001" do
436
+ ps = PragmaticSegmenter::Segmenter.new(text: "እንደምን አለህ፧መልካም ቀን ይሁንልህ።እባክሽ ያልሽዉን ድገሚልኝ።", language: 'am')
437
+ expect(ps.segment).to eq(["እንደምን አለህ፧", "መልካም ቀን ይሁንልህ።", "እባክሽ ያልሽዉን ድገሚልኝ።"])
438
+ end
439
+ end
440
+
441
+ context "Golden Rules (Persian)" do
442
+ it "Sentence ending punctuation #001" do
443
+ ps = PragmaticSegmenter::Segmenter.new(text: "خوشبختم، آقای رضا. شما کجایی هستید؟ من از تهران هستم.", language: 'fa')
444
+ expect(ps.segment).to eq(["خوشبختم، آقای رضا.", "شما کجایی هستید؟", "من از تهران هستم."])
445
+ end
446
+ end
447
+
448
+ context "Golden Rules (Urdu)" do
449
+ it "Sentence ending punctuation #001" do
450
+ ps = PragmaticSegmenter::Segmenter.new(text: "کیا حال ہے؟ ميرا نام ___ ەے۔ میں حالا تاوان دےدوں؟", language: 'ur')
451
+ expect(ps.segment).to eq(["کیا حال ہے؟", "ميرا نام ___ ەے۔", "میں حالا تاوان دےدوں؟"])
452
+ end
453
+ end
454
+ end
455
+
456
+ context 'Language: English (en)' do
457
+ describe '#segment' do
458
+ it 'correctly segments text #001' do
459
+ ps = PragmaticSegmenter::Segmenter.new(text: "Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversations?'\nSo she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.", language: 'en')
460
+ expect(ps.segment).to eq(["Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversations?'", "So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her."])
461
+ end
462
+
463
+ it 'correctly segments text #002' do
464
+ ps = PragmaticSegmenter::Segmenter.new(text: "Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.\n'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)", language: 'en', doc_type: 'pdf')
465
+ expect(ps.segment).to eq(["Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next.", "First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs.", "She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.", "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"])
466
+ end
467
+
468
+ it 'correctly segments text #003' do
469
+ ps = PragmaticSegmenter::Segmenter.new(text: "Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.\r'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)", language: 'en', doc_type: 'pdf')
470
+ expect(ps.segment).to eq(["Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next.", "First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs.", "She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.", "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"])
471
+ end
472
+
473
+ it 'correctly segments text #004' do
474
+ ps = PragmaticSegmenter::Segmenter.new(text: "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)", language: 'en')
475
+ expect(ps.segment).to eq(["'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"])
476
+ end
477
+
478
+ it 'correctly segments text #005' do
479
+ ps = PragmaticSegmenter::Segmenter.new(text: "Down, down, down. Would the fall NEVER come to an end! 'I wonder how many miles I've fallen by this time?' she said aloud.", language: 'en')
480
+ expect(ps.segment).to eq(["Down, down, down.", "Would the fall NEVER come to an end!", "'I wonder how many miles I've fallen by this time?' she said aloud."])
481
+ end
482
+
483
+ it 'correctly segments text #006' do
484
+ ps = PragmaticSegmenter::Segmenter.new(text: "Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it. 'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)", language: 'en')
485
+ expect(ps.segment).to eq(["Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next.", "First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs.", "She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.", "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"])
486
+ end
487
+
488
+ it 'correctly segments text #007' do
489
+ ps = PragmaticSegmenter::Segmenter.new(text: 'A minute is a unit of measurement of time or of angle. The minute is a unit of time equal to 1/60th of an hour or 60 seconds by 1. In the UTC time scale, a minute occasionally has 59 or 61 seconds; see leap second. The minute is not an SI unit; however, it is accepted for use with SI units. The symbol for minute or minutes is min. The fact that an hour contains 60 minutes is probably due to influences from the Babylonians, who used a base-60 or sexagesimal counting system. Colloquially, a min. may also refer to an indefinite amount of time substantially longer than the standardized length.', language: 'en')
490
+ expect(ps.segment).to eq(["A minute is a unit of measurement of time or of angle.", "The minute is a unit of time equal to 1/60th of an hour or 60 seconds by 1.", "In the UTC time scale, a minute occasionally has 59 or 61 seconds; see leap second.", "The minute is not an SI unit; however, it is accepted for use with SI units.", "The symbol for minute or minutes is min.", "The fact that an hour contains 60 minutes is probably due to influences from the Babylonians, who used a base-60 or sexagesimal counting system.", "Colloquially, a min. may also refer to an indefinite amount of time substantially longer than the standardized length."])
491
+ end
492
+
493
+ it 'correctly segments text #008' do
494
+ text = <<-EOF
495
+ About Me...............................................................................................5
496
+ Chapter 2 ...................................................................... 6
497
+ Three Weeks Later............................................................................ 7
498
+ Better Eating........................................................................................ 8
499
+ What's the Score?.............................................................. 9
500
+ How To Calculate the Score................... 16-17
501
+ EOF
502
+
503
+ ps = PragmaticSegmenter::Segmenter.new(text: text, language: 'en')
504
+ expect(ps.segment).to eq(["About Me", "Chapter 2", "Three Weeks Later", "Better Eating", "What's the Score?", "How To Calculate the Score"])
505
+ end
506
+
507
+ it 'correctly segments text #009' do
508
+ ps = PragmaticSegmenter::Segmenter.new(text: 'I think Jun. is a great month, said Mr. Suzuki.', language: 'en')
509
+ expect(ps.segment).to eq(["I think Jun. is a great month, said Mr. Suzuki."])
510
+ end
511
+
512
+ it 'correctly segments text #010' do
513
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Jun. is a great month, said Mr. Suzuki.', language: 'en')
514
+ expect(ps.segment).to eq(["Jun. is a great month, said Mr. Suzuki."])
515
+ end
516
+
517
+ it 'correctly segments text #011' do
518
+ ps = PragmaticSegmenter::Segmenter.new(text: "I have 1.000.00. Yay $.50 and .50! That's 600.", language: 'en')
519
+ expect(ps.segment).to eq(["I have 1.000.00.", "Yay $.50 and .50!", "That's 600."])
520
+ end
521
+
522
+ it 'correctly segments text #012' do
523
+ ps = PragmaticSegmenter::Segmenter.new(text: '1.) This is a list item with a parens.', language: 'en')
524
+ expect(ps.segment).to eq(["1.) This is a list item with a parens."])
525
+ end
526
+
527
+ it 'correctly segments text #013' do
528
+ ps = PragmaticSegmenter::Segmenter.new(text: '1. This is a list item.', language: 'en')
529
+ expect(ps.segment).to eq(['1. This is a list item.'])
530
+ end
531
+
532
+ it 'correctly segments text #014' do
533
+ ps = PragmaticSegmenter::Segmenter.new(text: 'I live in the U.S.A. I went to J.C. Penney.', language: 'en')
534
+ expect(ps.segment).to eq(["I live in the U.S.A.", "I went to J.C. Penney."])
535
+ end
536
+
537
+ it 'correctly segments text #015' do
538
+ ps = PragmaticSegmenter::Segmenter.new(text: 'His name is Alfred E. Sloan.', language: 'en')
539
+ expect(ps.segment).to eq(['His name is Alfred E. Sloan.'])
540
+ end
541
+
542
+ it 'correctly segments text #016' do
543
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Q. What is his name? A. His name is Alfred E. Sloan.', language: 'en')
544
+ expect(ps.segment).to eq(['Q. What is his name?', 'A. His name is Alfred E. Sloan.'])
545
+ end
546
+
547
+ it 'correctly segments text #017' do
548
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Today is 11.18.2014.', language: 'en')
549
+ expect(ps.segment).to eq(['Today is 11.18.2014.'])
550
+ end
551
+
552
+ it 'correctly segments text #018' do
553
+ ps = PragmaticSegmenter::Segmenter.new(text: 'I need you to find 3 items, e.g. a hat, a coat, and a bag.', language: 'en')
554
+ expect(ps.segment).to eq(['I need you to find 3 items, e.g. a hat, a coat, and a bag.'])
555
+ end
556
+
557
+ it 'correctly segments text #019' do
558
+ ps = PragmaticSegmenter::Segmenter.new(text: "The game is the Giants vs. the Tigers at 10 p.m. I'm going are you?", language: 'en')
559
+ expect(ps.segment).to eq(["The game is the Giants vs. the Tigers at 10 p.m.", "I'm going are you?"])
560
+ end
561
+
562
+ it 'correctly segments text #020' do
563
+ ps = PragmaticSegmenter::Segmenter.new(text: 'He is no. 5, the shortstop.', language: 'en')
564
+ expect(ps.segment).to eq(['He is no. 5, the shortstop.'])
565
+ end
566
+
567
+ it 'correctly segments text #021' do
568
+ ps = PragmaticSegmenter::Segmenter.new(text: "Remove long strings of dots........please.", language: 'en')
569
+ expect(ps.segment).to eq(["Remove long strings of dots please."])
570
+ end
571
+
572
+ it 'correctly segments text #022' do
573
+ ps = PragmaticSegmenter::Segmenter.new(text: "See our additional services section or contact us for pricing\n.\n\n\nPricing Additionl Info\n", language: 'en')
574
+ expect(ps.segment).to eq(["See our additional services section or contact us for pricing.", "Pricing Additionl Info"])
575
+ end
576
+
577
+ it 'correctly segments text #023' do
578
+ ps = PragmaticSegmenter::Segmenter.new(text: "As payment for 1. above, pay us a commission fee of 0 yen and for 2. above, no fee will be paid.", language: 'en')
579
+ expect(ps.segment).to eq(["As payment for 1. above, pay us a commission fee of 0 yen and for 2. above, no fee will be paid."])
580
+ end
581
+
582
+ it 'correctly segments text #024' do
583
+ ps = PragmaticSegmenter::Segmenter.new(text: "features\ncontact manager\nevents, activities\n", language: 'en')
584
+ expect(ps.segment).to eq(['features', 'contact manager', 'events, activities'])
585
+ end
586
+
587
+ it 'correctly segments text #025' do
588
+ ps = PragmaticSegmenter::Segmenter.new(text: "Git rid of unnecessary white space.", language: 'en')
589
+ expect(ps.segment).to eq(["Git rid of unnecessary white space."])
590
+ end
591
+
592
+ it 'correctly segments text #026' do
593
+ ps = PragmaticSegmenter::Segmenter.new(text: "See our additional services section or contact us for pricing\n. Pricing Additionl Info", language: 'en')
594
+ expect(ps.segment).to eq(["See our additional services section or contact us for pricing.", "Pricing Additionl Info"])
595
+ end
596
+
597
+ it 'correctly segments text #027' do
598
+ ps = PragmaticSegmenter::Segmenter.new(text: "Organising your care early \nmeans you'll have months to build a good relationship with your midwife or doctor, ready for \nthe birth.", language: 'en', doc_type: 'pdf')
599
+ expect(ps.segment).to eq(["Organising your care early means you'll have months to build a good relationship with your midwife or doctor, ready for the birth."])
600
+ end
601
+
602
+ it 'correctly segments text #028' do
603
+ ps = PragmaticSegmenter::Segmenter.new(text: "10. Get some rest \n \nYou have the best chance of having a problem-free pregnancy and a healthy baby if you follow \na few simple guidelines:", language: 'en', doc_type: 'pdf')
604
+ expect(ps.segment).to eq(["10. Get some rest", "You have the best chance of having a problem-free pregnancy and a healthy baby if you follow a few simple guidelines:"])
605
+ end
606
+
607
+ it 'correctly segments text #029' do
608
+ ps = PragmaticSegmenter::Segmenter.new(text: "• 9. Stop smoking \n• 10. Get some rest \n \nYou have the best chance of having a problem-free pregnancy and a healthy baby if you follow \na few simple guidelines: \n\n1. Organise your pregnancy care early", language: 'en', doc_type: 'pdf')
609
+ expect(ps.segment).to eq(["• 9. Stop smoking", "• 10. Get some rest", "You have the best chance of having a problem-free pregnancy and a healthy baby if you follow a few simple guidelines:", "1. Organise your pregnancy care early"])
610
+ end
611
+
612
+ it 'correctly segments text #030' do
613
+ ps = PragmaticSegmenter::Segmenter.new(text: "I have 600. How many do you have?", language: 'en')
614
+ expect(ps.segment).to eq(["I have 600.", "How many do you have?"])
615
+ end
616
+
617
+ it 'correctly segments text #031' do
618
+ ps = PragmaticSegmenter::Segmenter.new(text: "\n3\n\nIntroduction\n\n", language: 'en')
619
+ expect(ps.segment).to eq(["Introduction"])
620
+ end
621
+
622
+ it 'correctly segments text #032' do
623
+ ps = PragmaticSegmenter::Segmenter.new(text: "\nW\nA\nRN\nI\nNG\n", language: 'en')
624
+ expect(ps.segment).to eq(["WARNING"])
625
+ end
626
+
627
+ it 'correctly segments text #033' do
628
+ ps = PragmaticSegmenter::Segmenter.new(text: "\n\n\nW\nA\nRN\nI\nNG\n \n/\n \nA\nV\nE\nR\nT\nI\nS\nE\nM\nE\nNT\n", language: 'en')
629
+ expect(ps.segment).to eq(["WARNING", "AVERTISEMENT"])
630
+ end
631
+
632
+ it 'correctly segments text #034' do
633
+ ps = PragmaticSegmenter::Segmenter.new(text: '"Help yourself, sweetie," shouted Candy and gave her the cookie.', language: 'en')
634
+ expect(ps.segment).to eq(["\"Help yourself, sweetie,\" shouted Candy and gave her the cookie."])
635
+ end
636
+
637
+ it 'correctly segments text #035' do
638
+ ps = PragmaticSegmenter::Segmenter.new(text: "Until its release, a generic mechanism was known, where the sear keeps the hammer in back position, and when one pulls the trigger, the sear slips out of hammer’s notches, the hammer falls initiating \na shot.", language: 'en')
639
+ expect(ps.segment).to eq(["Until its release, a generic mechanism was known, where the sear keeps the hammer in back position, and when one pulls the trigger, the sear slips out of hammer’s notches, the hammer falls initiating a shot."])
640
+ end
641
+
642
+ it 'correctly segments text #036' do
643
+ ps = PragmaticSegmenter::Segmenter.new(text: "This is a test. Until its release, a generic mechanism was known, where the sear keeps the hammer in back position, and when one pulls the trigger, the sear slips out of hammer’s notches, the hammer falls initiating \na shot.", language: 'en')
644
+ expect(ps.segment).to eq(["This is a test.", "Until its release, a generic mechanism was known, where the sear keeps the hammer in back position, and when one pulls the trigger, the sear slips out of hammer’s notches, the hammer falls initiating a shot."])
645
+ end
646
+
647
+ it 'correctly segments text #037' do
648
+ ps = PragmaticSegmenter::Segmenter.new(text: "This was because it was an offensive weapon, designed to fight at a distance up to 400 yd \n( 365.8 m ).", language: 'en')
649
+ expect(ps.segment).to eq(["This was because it was an offensive weapon, designed to fight at a distance up to 400 yd ( 365.8 m )."])
650
+ end
651
+
652
+ it 'correctly segments text #038' do
653
+ ps = PragmaticSegmenter::Segmenter.new(text: "“Are demonstrations are evidence of the public anger and frustration at opaque environmental management and decision-making?” Others yet say: \"Should we be scared about these 'protests'?\"", language: 'en')
654
+ expect(ps.segment).to eq(["“Are demonstrations are evidence of the public anger and frustration at opaque environmental management and decision-making?”", "Others yet say: \"Should we be scared about these 'protests'?\""])
655
+ end
656
+
657
+ it 'correctly segments text #039' do
658
+ ps = PragmaticSegmenter::Segmenter.new(text: "www.testurl.Awesome.com", language: 'en')
659
+ expect(ps.segment).to eq(["www.testurl.Awesome.com"])
660
+ end
661
+
662
+ it 'correctly segments text #040' do
663
+ ps = PragmaticSegmenter::Segmenter.new(text: "http://testurl.Awesome.com", language: 'en')
664
+ expect(ps.segment).to eq(["http://testurl.Awesome.com"])
665
+ end
666
+
667
+ it 'correctly segments text #041' do
668
+ ps = PragmaticSegmenter::Segmenter.new(text: "St. Michael's Church in is a church.", language: 'en')
669
+ expect(ps.segment).to eq(["St. Michael's Church in is a church."])
670
+ end
671
+
672
+ it 'correctly segments text #042' do
673
+ ps = PragmaticSegmenter::Segmenter.new(text: "JFK Jr.'s book is on sale.", language: 'en')
674
+ expect(ps.segment).to eq(["JFK Jr.'s book is on sale."])
675
+ end
676
+
677
+ it 'correctly segments text #043' do
678
+ ps = PragmaticSegmenter::Segmenter.new(text: "This is e.g. Mr. Smith, who talks slowly... And this is another sentence.", language: 'en')
679
+ expect(ps.segment).to eq(["This is e.g. Mr. Smith, who talks slowly...", "And this is another sentence."])
680
+ end
681
+
682
+ it 'correctly segments text #044' do
683
+ ps = PragmaticSegmenter::Segmenter.new(text: "Leave me alone!, he yelled. I am in the U.S. Army. Charles (Ind.) said he.", language: 'en')
684
+ expect(ps.segment).to eq(["Leave me alone!, he yelled.", "I am in the U.S. Army.", "Charles (Ind.) said he."])
685
+ end
686
+
687
+ it 'correctly segments text #045' do
688
+ ps = PragmaticSegmenter::Segmenter.new(text: "This is the U.S. Senate my friends. <em>Yes.</em> <em>It is</em>!", language: 'en')
689
+ expect(ps.segment).to eq(["This is the U.S. Senate my friends.", "Yes.", "It is!"])
690
+ end
691
+
692
+ it 'correctly segments text #046' do
693
+ ps = PragmaticSegmenter::Segmenter.new(text: "Send it to P.O. box 6554", language: 'en')
694
+ expect(ps.segment).to eq(["Send it to P.O. box 6554"])
695
+ end
696
+
697
+ it 'correctly segments text #047' do
698
+ ps = PragmaticSegmenter::Segmenter.new(text: "There were 500 cases in the U.S. The U.S. Commission asked the U.S. Government to give their opinion on the issue.", language: 'en')
699
+ expect(ps.segment).to eq(["There were 500 cases in the U.S.", "The U.S. Commission asked the U.S. Government to give their opinion on the issue."])
700
+ end
701
+
702
+ it 'correctly segments text #048' do
703
+ ps = PragmaticSegmenter::Segmenter.new(text: "CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co. (cited from WSJ 05/29/1987)", language: 'en')
704
+ expect(ps.segment).to eq(["CELLULAR COMMUNICATIONS INC. sold 1,550,000 common shares at $21.75 each yesterday, according to lead underwriter L.F. Rothschild & Co.", "(cited from WSJ 05/29/1987)"])
705
+ end
706
+
707
+ it 'correctly segments text #049' do
708
+ ps = PragmaticSegmenter::Segmenter.new(text: "Rolls-Royce Motor Cars Inc. said it expects its U.S. sales to remain steady at about 1,200 cars in 1990. `So what if you miss 50 tanks somewhere?' asks Rep. Norman Dicks (D., Wash.), a member of the House group that visited the talks in Vienna. Later, he recalls the words of his Marxist mentor: `The people! Theft! The holy fire!'", language: 'en')
709
+ expect(ps.segment).to eq(["Rolls-Royce Motor Cars Inc. said it expects its U.S. sales to remain steady at about 1,200 cars in 1990.", "'So what if you miss 50 tanks somewhere?' asks Rep. Norman Dicks (D., Wash.), a member of the House group that visited the talks in Vienna.", "Later, he recalls the words of his Marxist mentor: 'The people! Theft! The holy fire!'"])
710
+ end
711
+
712
+ it 'correctly segments text #050' do
713
+ ps = PragmaticSegmenter::Segmenter.new(text: "He climbed Mt. Fuji.", language: 'en')
714
+ expect(ps.segment).to eq(["He climbed Mt. Fuji."])
715
+ end
716
+
717
+ it 'correctly segments text #051' do
718
+ ps = PragmaticSegmenter::Segmenter.new(text: "He speaks !Xũ, !Kung, ǃʼOǃKung, !Xuun, !Kung-Ekoka, ǃHu, ǃKhung, ǃKu, ǃung, ǃXo, ǃXû, ǃXung, ǃXũ, and !Xun.", language: 'en')
719
+ expect(ps.segment).to eq(["He speaks !Xũ, !Kung, ǃʼOǃKung, !Xuun, !Kung-Ekoka, ǃHu, ǃKhung, ǃKu, ǃung, ǃXo, ǃXû, ǃXung, ǃXũ, and !Xun."])
720
+ end
721
+
722
+ it 'correctly segments text #052' do
723
+ ps = PragmaticSegmenter::Segmenter.new(text: "Test strange period.Does it segment correctly.", language: 'en')
724
+ expect(ps.segment).to eq(["Test strange period.", "Does it segment correctly."])
725
+ end
726
+
727
+ it 'correctly segments text #053' do
728
+ ps = PragmaticSegmenter::Segmenter.new(text: "<h2 class=\"lined\">Hello</h2>\n<p>This is a test. Another test.</p>\n<div class=\"center\"><p>\n<img src=\"/images/content/example.jpg\">\n</p></div>", language: 'en')
729
+ expect(ps.segment).to eq(["Hello", "This is a test.", "Another test."])
730
+ end
731
+
732
+ it 'correctly segments text #054' do
733
+ ps = PragmaticSegmenter::Segmenter.new(text: "This sentence ends with the psuedo-number x10. This one with the psuedo-number %3.00. One last sentence.", language: 'en')
734
+ expect(ps.segment).to eq(["This sentence ends with the psuedo-number x10.", "This one with the psuedo-number %3.00.", "One last sentence."])
735
+ end
736
+
737
+ it 'correctly segments text #055' do
738
+ ps = PragmaticSegmenter::Segmenter.new(text: "Testing mixed numbers Jahr10. And another 0.3 %11. That's weird.", language: 'en')
739
+ expect(ps.segment).to eq(["Testing mixed numbers Jahr10.", "And another 0.3 %11.", "That's weird."])
740
+ end
741
+
742
+ it 'correctly segments text #056' do
743
+ ps = PragmaticSegmenter::Segmenter.new(text: "Were Jane and co. at the party?", language: 'en')
744
+ expect(ps.segment).to eq(["Were Jane and co. at the party?"])
745
+ end
746
+
747
+ it 'correctly segments text #057' do
748
+ ps = PragmaticSegmenter::Segmenter.new(text: "St. Michael's Church is on 5th st. near the light.", language: 'en')
749
+ expect(ps.segment).to eq(["St. Michael's Church is on 5th st. near the light."])
750
+ end
751
+
752
+ it 'correctly segments text #058' do
753
+ ps = PragmaticSegmenter::Segmenter.new(text: "Let's ask Jane and co. They should know.", language: 'en')
754
+ expect(ps.segment).to eq(["Let's ask Jane and co.", "They should know."])
755
+ end
756
+
757
+ it 'correctly segments text #059' do
758
+ ps = PragmaticSegmenter::Segmenter.new(text: "He works at Yahoo! and Y!J.", language: 'en')
759
+ expect(ps.segment).to eq(["He works at Yahoo! and Y!J."])
760
+ end
761
+
762
+ it 'correctly segments text #060' do
763
+ ps = PragmaticSegmenter::Segmenter.new(text: 'The Scavenger Hunt ends on Dec. 31st, 2011.', language: 'en')
764
+ expect(ps.segment).to eq(['The Scavenger Hunt ends on Dec. 31st, 2011.'])
765
+ end
766
+
767
+ it 'correctly segments text #061' do
768
+ ps = PragmaticSegmenter::Segmenter.new(text: "Putter King Scavenger Hunt Trophy\n(6 3/4\" Engraved Crystal Trophy - Picture Coming Soon)\nThe Putter King team will judge the scavenger hunt and all decisions will be final. The scavenger hunt is open to anyone and everyone. The scavenger hunt ends on Dec. 31st, 2011.", language: 'en')
769
+ expect(ps.segment).to eq(["Putter King Scavenger Hunt Trophy", "(6 3/4\" Engraved Crystal Trophy - Picture Coming Soon)", "The Putter King team will judge the scavenger hunt and all decisions will be final.", "The scavenger hunt is open to anyone and everyone.", "The scavenger hunt ends on Dec. 31st, 2011."])
770
+ end
771
+
772
+ it 'correctly segments text #062' do
773
+ ps = PragmaticSegmenter::Segmenter.new(text: "Unauthorized modifications, alterations or installations of or to this equipment are prohibited and are in violation of AR 750-10. Any such unauthorized modifications, alterations or installations could result in death, injury or damage to the equipment.", language: 'en')
774
+ expect(ps.segment).to eq(["Unauthorized modifications, alterations or installations of or to this equipment are prohibited and are in violation of AR 750-10.", "Any such unauthorized modifications, alterations or installations could result in death, injury or damage to the equipment."])
775
+ end
776
+
777
+ it 'correctly segments text #063' do
778
+ ps = PragmaticSegmenter::Segmenter.new(text: "Header 1.2; Attachment Z\n\n\td. Compliance Log – Volume 12 \n\tAttachment A\n\n\te. Additional Logistics Data\n\tSection 10", language: 'en')
779
+ expect(ps.segment).to eq(["Header 1.2; Attachment Z", "d. Compliance Log – Volume 12", "Attachment A", "e. Additional Logistics Data", "Section 10"])
780
+ end
781
+
782
+ it 'correctly segments text #064' do
783
+ ps = PragmaticSegmenter::Segmenter.new(text: "a.) The first item b.) The second item c.) The third list item", language: 'en')
784
+ expect(ps.segment).to eq(["a.) The first item", "b.) The second item", "c.) The third list item"])
785
+ end
786
+
787
+ it 'correctly segments text #065' do
788
+ ps = PragmaticSegmenter::Segmenter.new(text: "a) The first item b) The second item c) The third list item", language: 'en')
789
+ expect(ps.segment).to eq(["a) The first item", "b) The second item", "c) The third list item"])
790
+ end
791
+
792
+ it 'correctly segments text #066' do
793
+ ps = PragmaticSegmenter::Segmenter.new(text: "Hello Wolrd. Here is a secret code AS750-10. Another sentence. Finally, this. 1. The first item 2. The second item 3. The third list item 4. Hello 5. Hello 6. Hello 7. Hello 8. Hello 9. Hello 10. Hello 11. Hello", language: 'en')
794
+ expect(ps.segment).to eq(["Hello Wolrd.", "Here is a secret code AS750-10.", "Another sentence.", "Finally, this.", "1. The first item", "2. The second item", "3. The third list item", "4. Hello", "5. Hello", "6. Hello", "7. Hello", "8. Hello", "9. Hello", "10. Hello", "11. Hello"])
795
+ end
796
+
797
+ it 'correctly segments text #067' do
798
+ ps = PragmaticSegmenter::Segmenter.new(text: "He works for ABC Ltd. and sometimes for BCD Ltd. She works for ABC Co. and BCD Co. They work for ABC Corp. and BCD Corp.", language: 'en')
799
+ expect(ps.segment).to eq(["He works for ABC Ltd. and sometimes for BCD Ltd.", "She works for ABC Co. and BCD Co.", "They work for ABC Corp. and BCD Corp."])
800
+ end
801
+
802
+ it 'correctly segments text #068' do
803
+ ps = PragmaticSegmenter::Segmenter.new(text: "<bpt i=\"0\" type=\"bold\">&lt;b&gt;</bpt>J1.txt<ept i=\"1\">&lt;/b&gt;</ept>", language: 'en')
804
+ expect(ps.segment).to eq(["J1.txt"])
805
+ end
806
+
807
+ it 'correctly segments text #069' do
808
+ ps = PragmaticSegmenter::Segmenter.new(text: "On Jan. 20, former Sen. Barack Obama became the 44th President of the U.S. Millions attended the Inauguration.", language: 'en')
809
+ expect(ps.segment).to eq(["On Jan. 20, former Sen. Barack Obama became the 44th President of the U.S.", "Millions attended the Inauguration."])
810
+ end
811
+
812
+ it 'correctly segments text #070' do
813
+ ps = PragmaticSegmenter::Segmenter.new(text: "The U.K. Panel on enivronmental issues said it was true. Finally he left the U.K. He went to a new location.", language: 'en')
814
+ expect(ps.segment).to eq(["The U.K. Panel on enivronmental issues said it was true.", "Finally he left the U.K.", "He went to a new location."])
815
+ end
816
+
817
+ it 'correctly segments text #071' do
818
+ ps = PragmaticSegmenter::Segmenter.new(text: "He left at 6 P.M. Travelers who didn't get the warning at 5 P.M. left later.", language: 'en')
819
+ expect(ps.segment).to eq(["He left at 6 P.M.", "Travelers who didn't get the warning at 5 P.M. left later."])
820
+ end
821
+
822
+ it 'correctly segments text #072' do
823
+ ps = PragmaticSegmenter::Segmenter.new(text: "He left at 6 a.m. Travelers who didn't get the warning at 5 a.m. left later.", language: 'en')
824
+ expect(ps.segment).to eq(["He left at 6 a.m.", "Travelers who didn't get the warning at 5 a.m. left later."])
825
+ end
826
+
827
+ it 'correctly segments text #073' do
828
+ ps = PragmaticSegmenter::Segmenter.new(text: "He left at 6 A.M. Travelers who didn't get the warning at 5 A.M. left later.", language: 'en')
829
+ expect(ps.segment).to eq(["He left at 6 A.M.", "Travelers who didn't get the warning at 5 A.M. left later."])
830
+ end
831
+
832
+ it 'correctly segments text #074' do
833
+ ps = PragmaticSegmenter::Segmenter.new(text: "Hello World. My name is Jonas. What is your name? My name is Jonas. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item \rIt was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”. \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .", language: nil)
834
+ expect(ps.segment).to eq(["Hello World.", "My name is Jonas.", "What is your name?", "My name is Jonas.", "There it is!", "I found it.", "My name is Jonas E. Smith.", "Please turn to p. 55.", "Were Jane and co. at the party?", "They closed the deal with Pitt, Briggs & Co. at noon.", "Let's ask Jane and co.", "They should know.", "They closed the deal with Pitt, Briggs & Co.", "It closed yesterday.", "I can see Mt. Fuji from here.", "St. Michael's Church is on 5th st. near the light.", "That is JFK Jr.'s book.", "I visited the U.S.A. last year.", "I live in the E.U.", "How about you?", "I live in the U.S.", "How about you?", "I work for the U.S. Government in Virginia.", "I have lived in the U.S. for 20 years.", "She has $100.00 in her bag.", "She has $100.00.", "It is in her bag.", "He teaches science (He previously worked for 5 years as an engineer.) at the local University.", "Her email is Jane.Doe@example.com.", "I sent her an email.", "The site is: https://www.example.50.com/new-site/awesome_content.html.", "Please check it out.", "She turned to him, 'This is great.' she said.", "She turned to him, \"This is great.\" she said.", "She turned to him, \"This is great.\"", "She held the book out to show him.", "Hello!!", "Long time no see.", "Hello??", "Who is there?", "Hello!?", "Is that you?", "Hello?!", "Is that you?", "1.) The first item", "2.) The second item", "1.) The first item.", "2.) The second item.", "1) The first item", "2) The second item", "1) The first item.", "2) The second item.", "1. The first item", "2. The second item", "1. The first item.", "2. The second item.", "• 9. The first item", "• 10. The second item", "⁃9. The first item", "⁃10. The second item", "a. The first item", "b. The second item", "c. The third list item", "It was a cold night in the city.", "features", "contact manager", "events, activities", "You can find it at N°. 1026.253.553.", "That is where the treasure is.", "She works at Yahoo! in the accounting department.", "We make a good team, you and I.", "Did you see Albert I. Jones yesterday?", "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”.", "\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55).", "If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .", "Next sentence.", "I never meant that....", "She left the store.", "I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.", "One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.", ". . . The practice was not abandoned. . . ."])
835
+ end
836
+
837
+ it 'correctly segments text #075' do
838
+ ps = PragmaticSegmenter::Segmenter.new(text: "His name is Mark E. Smith. a. here it is b. another c. one more\n They went to the store. It was John A. Smith. She was Jane B. Smith.", language: "en")
839
+ expect(ps.segment).to eq(["His name is Mark E. Smith.", "a. here it is", "b. another", "c. one more", "They went to the store.", "It was John A. Smith.", "She was Jane B. Smith."])
840
+ end
841
+
842
+ it 'correctly segments text #076' do
843
+ ps = PragmaticSegmenter::Segmenter.new(text: "a) here it is b) another c) one more\n They went to the store. w) hello x) hello y) hello", language: "en")
844
+ expect(ps.segment).to eq(["a) here it is", "b) another", "c) one more", "They went to the store.", "w) hello", "x) hello", "y) hello"])
845
+ end
846
+
847
+ it 'correctly segments text #077' do
848
+ ps = PragmaticSegmenter::Segmenter.new(text: "Hello{b^&gt;1&lt;b^} hello{b^>1<b^}.", language: "en")
849
+ expect(ps.segment).to eq(["Hello hello."])
850
+ end
851
+
852
+ it 'correctly segments text #078' do
853
+ ps = PragmaticSegmenter::Segmenter.new(text: "'Well?' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs? How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)", language: 'en')
854
+ expect(ps.segment).to eq(["'Well?' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs? How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"])
855
+ end
856
+
857
+ it 'correctly segments text #079' do
858
+ ps = PragmaticSegmenter::Segmenter.new(text: "Leave me alone! he yelled. I am in the U.S. Army. Charles (Ind.) said he.", language: 'en')
859
+ expect(ps.segment).to eq(["Leave me alone! he yelled.", "I am in the U.S. Army.", "Charles (Ind.) said he."])
860
+ end
861
+
862
+ it 'correctly segments text #080' do
863
+ ps = PragmaticSegmenter::Segmenter.new(text: "She turned to him, “This is great.” She held the book out to show him.", language: 'en')
864
+ expect(ps.segment).to eq(["She turned to him, “This is great.”", "She held the book out to show him."])
865
+ end
866
+ end
867
+ end
868
+
869
+ context 'Language: Japanese (ja)' do
870
+ describe '#segment' do
871
+ it 'correctly segments text #001' do
872
+ ps = PragmaticSegmenter::Segmenter.new(text: "これは山です \nこれは山です \nこれは山です(「これは山です」) \nこれは山です(これは山です「これは山です」)これは山です・これは山です、これは山です。 \nこれは山です(これは山です。これは山です)。これは山です、これは山です、これは山です、これは山です(これは山です。これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です。 \n1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です。 \n※1 これは山です。 \n2.)これは山です、これは山です、これは山です、これは山です。 \n3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です。 \n4.)これは山です、これは山です(これは山です、これは山です、これは山です。これは山です)これは山です、これは山です(これは山です、これは山です)。 \nこれは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です。 \n(1) 「これは山です」(これは山です:0円) (※1) \n① これは山です", language: 'ja')
873
+ expect(ps.segment).to eq(["これは山です", "これは山です", "これは山です(「これは山です」)", "これは山です(これは山です「これは山です」)これは山です・これは山です、これは山です。", "これは山です(これは山です。これは山です)。", "これは山です、これは山です、これは山です、これは山です(これは山です。これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です。", "1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です。", "※1 これは山です。", "2.)これは山です、これは山です、これは山です、これは山です。", "3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です。", "4.)これは山です、これは山です(これは山です、これは山です、これは山です。これは山です)これは山です、これは山です(これは山です、これは山です)。", "これは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です。", "(1) 「これは山です」(これは山です:0円) (※1)", "① これは山です"])
874
+ end
875
+
876
+ it 'correctly segments text #002' do
877
+ ps = PragmaticSegmenter::Segmenter.new(text: "フフーの\n主たる債務", language: 'ja')
878
+ expect(ps.segment).to eq(["フフーの主たる債務"])
879
+ end
880
+
881
+ it 'correctly segments text #003' do
882
+ ps = PragmaticSegmenter::Segmenter.new(text: "これは山です \nこれは山です \nこれは山です(「これは山です」) \nこれは山です(これは山です「これは山です」)これは山です・これは山です、これは山です. \nこれは山です(これは山です.これは山です).これは山です、これは山です、これは山です、これは山です(これは山です.これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です. \n1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です. \n※1 これは山です. \n2.)これは山です、これは山です、これは山です、これは山です. \n3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です. \n4.)これは山です、これは山です(これは山です、これは山です、これは山です.これは山です)これは山です、これは山です(これは山です、これは山です). \nこれは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です. \n(1) 「これは山です」(これは山です:0円) (※1) \n① これは山です", language: 'ja')
883
+ expect(ps.segment).to eq(["これは山です", "これは山です", "これは山です(「これは山です」)", "これは山です(これは山です「これは山です」)これは山です・これは山です、これは山です.", "これは山です(これは山です.これは山です).", "これは山です、これは山です、これは山です、これは山です(これは山です.これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です.", "1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です.", "※1 これは山です.", "2.)これは山です、これは山です、これは山です、これは山です.", "3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です.", "4.)これは山です、これは山です(これは山です、これは山です、これは山です.これは山です)これは山です、これは山です(これは山です、これは山です).", "これは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です.", "(1) 「これは山です」(これは山です:0円) (※1)", "① これは山です"])
884
+ end
885
+
886
+ it 'correctly segments text #004' do
887
+ ps = PragmaticSegmenter::Segmenter.new(text: "これは山です \nこれは山です \nこれは山です(「これは山です」) \nこれは山です(これは山です「これは山です」)これは山です・これは山です、これは山です! \nこれは山です(これは山です!これは山です)!これは山です、これは山です、これは山です、これは山です(これは山です!これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です! \n1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です! \n※1 これは山です! \n2.)これは山です、これは山です、これは山です、これは山です! \n3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です! \n4.)これは山です、これは山です(これは山です、これは山です、これは山です!これは山です)これは山です、これは山です(これは山です、これは山です)! \nこれは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です! \n(1) 「これは山です」(これは山です:0円) (※1) \n① これは山です", language: 'ja')
888
+ expect(ps.segment).to eq(["これは山です", "これは山です", "これは山です(「これは山です」)", "これは山です(これは山です「これは山です」)これは山です・これは山です、これは山です!", "これは山です(これは山です!これは山です)!", "これは山です、これは山です、これは山です、これは山です(これは山です!これは山です)これは山です、これは山です、これは山です「これは山です」これは山です(これは山です:0円)これは山です!", "1.)これは山です、これは山です(これは山です、これは山です6円(※1))これは山です!", "※1 これは山です!", "2.)これは山です、これは山です、これは山です、これは山です!", "3.)これは山です、これは山です・これは山です、これは山です、これは山です、これは山です(これは山です「これは山です」)これは山です、これは山です、これは山です、これは山です!", "4.)これは山です、これは山です(これは山です、これは山です、これは山です!これは山です)これは山です、これは山です(これは山です、これは山です)!", "これは山です、これは山です、これは山です、これは山です、これは山です(者)これは山です!", "(1) 「これは山です」(これは山です:0円) (※1)", "① これは山です"])
889
+ end
890
+ end
891
+ end
892
+
893
+ context 'Language: Arabic (ar)' do
894
+ # Thanks to Mahmoud Holmez for the Arabic test examples.
895
+ describe '#segment' do
896
+ it 'correctly segments text #001' do
897
+ ps = PragmaticSegmenter::Segmenter.new(text: "سؤال وجواب: ماذا حدث بعد الانتخابات الايرانية؟ طرح الكثير من التساؤلات غداة ظهور نتائج الانتخابات الرئاسية الايرانية التي أججت مظاهرات واسعة واعمال عنف بين المحتجين على النتائج ورجال الامن. يقول معارضو الرئيس الإيراني إن الطريقة التي اعلنت بها النتائج كانت مثيرة للاستغراب.", language: 'ar')
898
+ expect(ps.segment).to eq(["سؤال وجواب:", "ماذا حدث بعد الانتخابات الايرانية؟", "طرح الكثير من التساؤلات غداة ظهور نتائج الانتخابات الرئاسية الايرانية التي أججت مظاهرات واسعة واعمال عنف بين المحتجين على النتائج ورجال الامن.", "يقول معارضو الرئيس الإيراني إن الطريقة التي اعلنت بها النتائج كانت مثيرة للاستغراب."])
899
+ end
900
+
901
+ it 'correctly segments text #002' do
902
+ ps = PragmaticSegmenter::Segmenter.new(text: "وقال د‪.‬ ديفيد ريدي و الأطباء الذين كانوا يعالجونها في مستشفى برمنجهام إنها كانت تعاني من أمراض أخرى. وليس معروفا ما اذا كانت قد توفيت بسبب اصابتها بأنفلونزا الخنازير.", language: 'ar')
903
+ expect(ps.segment).to eq(["وقال د‪.‬ ديفيد ريدي و الأطباء الذين كانوا يعالجونها في مستشفى برمنجهام إنها كانت تعاني من أمراض أخرى.", "وليس معروفا ما اذا كانت قد توفيت بسبب اصابتها بأنفلونزا الخنازير."])
904
+ end
905
+
906
+ it 'correctly segments text #003' do
907
+ ps = PragmaticSegmenter::Segmenter.new(text: "ومن المنتظر أن يكتمل مشروع خط أنابيب نابوكو البالغ طوله 3300 كليومترا في 12‪/‬08‪/‬2014 بتكلفة تُقدر بـ 7.9 مليارات يورو أي نحو 10.9 مليارات دولار. ومن المقرر أن تصل طاقة ضخ الغاز في المشروع 31 مليار متر مكعب انطلاقا من بحر قزوين مرورا بالنمسا وتركيا ودول البلقان دون المرور على الأراضي الروسية.", language: 'ar')
908
+ expect(ps.segment).to eq(["ومن المنتظر أن يكتمل مشروع خط أنابيب نابوكو البالغ طوله 3300 كليومترا في 12‪/‬08‪/‬2014 بتكلفة تُقدر بـ 7.9 مليارات يورو أي نحو 10.9 مليارات دولار.", "ومن المقرر أن تصل طاقة ضخ الغاز في المشروع 31 مليار متر مكعب انطلاقا من بحر قزوين مرورا بالنمسا وتركيا ودول البلقان دون المرور على الأراضي الروسية."])
909
+ end
910
+
911
+ it 'correctly segments text #004' do
912
+ ps = PragmaticSegmenter::Segmenter.new(text: "الاحد, 21 فبراير/ شباط, 2010, 05:01 GMT الصنداي تايمز: رئيس الموساد قد يصبح ضحية الحرب السرية التي شتنها بنفسه. العقل المنظم هو مئير داجان رئيس الموساد الإسرائيلي الذي يشتبه بقيامه باغتيال القائد الفلسطيني في حركة حماس محمود المبحوح في دبي.", language: 'ar')
913
+ expect(ps.segment).to eq(["الاحد, 21 فبراير/ شباط, 2010, 05:01 GMT الصنداي تايمز:", "رئيس الموساد قد يصبح ضحية الحرب السرية التي شتنها بنفسه.", "العقل المنظم هو مئير داجان رئيس الموساد الإسرائيلي الذي يشتبه بقيامه باغتيال القائد الفلسطيني في حركة حماس محمود المبحوح في دبي."])
914
+ end
915
+
916
+ it 'correctly segments text #005' do
917
+ ps = PragmaticSegmenter::Segmenter.new(text: "عثر في الغرفة على بعض أدوية علاج ارتفاع ضغط الدم، والقلب، زرعها عملاء الموساد كما تقول مصادر إسرائيلية، وقرر الطبيب أن الفلسطيني قد توفي وفاة طبيعية ربما إثر نوبة قلبية، وبدأت مراسم الحداد عليه", language: 'ar')
918
+ expect(ps.segment).to eq(["عثر في الغرفة على بعض أدوية علاج ارتفاع ضغط الدم، والقلب،", "زرعها عملاء الموساد كما تقول مصادر إسرائيلية،", "وقرر الطبيب أن الفلسطيني قد توفي وفاة طبيعية ربما إثر نوبة قلبية،", "وبدأت مراسم الحداد عليه"])
919
+ end
920
+ end
921
+ end
922
+
923
+ context 'Language: Italian (it)' do
924
+ # Thanks to Davide Fornelli for the Italian test examples.
925
+ describe '#segment' do
926
+
927
+ it 'correctly segments text #001' do
928
+ ps = PragmaticSegmenter::Segmenter.new(text: "Salve Sig.ra Mengoni! Come sta oggi?", language: 'it')
929
+ expect(ps.segment).to eq(["Salve Sig.ra Mengoni!", "Come sta oggi?"])
930
+ end
931
+
932
+ it 'correctly segments text #002' do
933
+ ps = PragmaticSegmenter::Segmenter.new(text: "Buongiorno! Sono l'Ing. Mengozzi. È presente l'Avv. Cassioni?", language: 'it')
934
+ expect(ps.segment).to eq(["Buongiorno!", "Sono l'Ing. Mengozzi.", "È presente l'Avv. Cassioni?"])
935
+ end
936
+
937
+ it 'correctly segments text #003' do
938
+ ps = PragmaticSegmenter::Segmenter.new(text: "Mi fissi un appuntamento per mar. 23 Nov.. Grazie.", language: 'it')
939
+ expect(ps.segment).to eq(["Mi fissi un appuntamento per mar. 23 Nov..", "Grazie."])
940
+ end
941
+
942
+ it 'correctly segments text #004' do
943
+ ps = PragmaticSegmenter::Segmenter.new(text: "Ecco il mio tel.:01234567. Mi saluti la Sig.na Manelli. Arrivederci.", language: 'it')
944
+ expect(ps.segment).to eq(["Ecco il mio tel.:01234567.", "Mi saluti la Sig.na Manelli.", "Arrivederci."])
945
+ end
946
+
947
+ it 'correctly segments text #005' do
948
+ ps = PragmaticSegmenter::Segmenter.new(text: "La centrale meteor. si è guastata. Gli idraul. son dovuti andare a sistemarla.", language: 'it')
949
+ expect(ps.segment).to eq(["La centrale meteor. si è guastata.", "Gli idraul. son dovuti andare a sistemarla."])
950
+ end
951
+
952
+ it 'correctly segments text #006' do
953
+ ps = PragmaticSegmenter::Segmenter.new(text: "Hanno creato un algoritmo allo st. d. arte. Si ringrazia lo psicol. Serenti.", language: 'it')
954
+ expect(ps.segment).to eq(["Hanno creato un algoritmo allo st. d. arte.", "Si ringrazia lo psicol. Serenti."])
955
+ end
956
+
957
+ it 'correctly segments text #007' do
958
+ ps = PragmaticSegmenter::Segmenter.new(text: "Chiamate il V.Cte. delle F.P., adesso!", language: 'it')
959
+ expect(ps.segment).to eq(["Chiamate il V.Cte. delle F.P., adesso!"])
960
+ end
961
+
962
+ it 'correctly segments text #008' do
963
+ ps = PragmaticSegmenter::Segmenter.new(text: "Giancarlo ha sostenuto l'esame di econ. az..", language: 'it')
964
+ expect(ps.segment).to eq(["Giancarlo ha sostenuto l'esame di econ. az.."])
965
+ end
966
+
967
+ it 'correctly segments text #009' do
968
+ ps = PragmaticSegmenter::Segmenter.new(text: "Stava viaggiando a 90 km/h verso la provincia di TR quando il Dott. Mesini ha sentito un rumore e si fermò!", language: 'it')
969
+ expect(ps.segment).to eq(["Stava viaggiando a 90 km/h verso la provincia di TR quando il Dott. Mesini ha sentito un rumore e si fermò!"])
970
+ end
971
+
972
+ it 'correctly segments text #010' do
973
+ ps = PragmaticSegmenter::Segmenter.new(text: "Egregio Dir. Amm., le faccio sapere che l'ascensore non funziona.", language: 'it')
974
+ expect(ps.segment).to eq(["Egregio Dir. Amm., le faccio sapere che l'ascensore non funziona."])
975
+ end
976
+
977
+ it 'correctly segments text #011' do
978
+ ps = PragmaticSegmenter::Segmenter.new(text: "Stava mangiando e/o dormendo.", language: 'it')
979
+ expect(ps.segment).to eq(["Stava mangiando e/o dormendo."])
980
+ end
981
+
982
+ it 'correctly segments text #012' do
983
+ ps = PragmaticSegmenter::Segmenter.new(text: "Ricordatevi che dom 25 Set. sarà il compleanno di Maria; dovremo darle un regalo.", language: 'it')
984
+ expect(ps.segment).to eq(["Ricordatevi che dom 25 Set. sarà il compleanno di Maria; dovremo darle un regalo."])
985
+ end
986
+
987
+ it 'correctly segments text #013' do
988
+ ps = PragmaticSegmenter::Segmenter.new(text: "La politica è quella della austerità; quindi verranno fatti tagli agli sprechi.", language: 'it')
989
+ expect(ps.segment).to eq(["La politica è quella della austerità; quindi verranno fatti tagli agli sprechi."])
990
+ end
991
+
992
+ it 'correctly segments text #014' do
993
+ ps = PragmaticSegmenter::Segmenter.new(text: "Nel tribunale, l'Avv. Fabrizi ha urlato \"Io, l'illustrissimo Fabrizi, vi si oppone!\".", language: 'it')
994
+ expect(ps.segment).to eq(["Nel tribunale, l'Avv. Fabrizi ha urlato \"Io, l'illustrissimo Fabrizi, vi si oppone!\"."])
995
+ end
996
+
997
+ it 'correctly segments text #015' do
998
+ ps = PragmaticSegmenter::Segmenter.new(text: "Le parti fisiche di un computer (ad es. RAM, CPU, tastiera, mouse, etc.) sono definiti HW.", language: 'it')
999
+ expect(ps.segment).to eq(["Le parti fisiche di un computer (ad es. RAM, CPU, tastiera, mouse, etc.) sono definiti HW."])
1000
+ end
1001
+
1002
+ it 'correctly segments text #016' do
1003
+ ps = PragmaticSegmenter::Segmenter.new(text: "La parola 'casa' è sinonimo di abitazione.", language: 'it')
1004
+ expect(ps.segment).to eq(["La parola 'casa' è sinonimo di abitazione."])
1005
+ end
1006
+
1007
+ it 'correctly segments text #017' do
1008
+ ps = PragmaticSegmenter::Segmenter.new(text: "La \"Mulino Bianco\" fa alimentari pre-confezionati.", language: 'it')
1009
+ expect(ps.segment).to eq(["La \"Mulino Bianco\" fa alimentari pre-confezionati."])
1010
+ end
1011
+
1012
+ it 'correctly segments text #018' do
1013
+ ps = PragmaticSegmenter::Segmenter.new(text: "\"Ei fu. Siccome immobile / dato il mortal sospiro / stette la spoglia immemore / orba di tanto spiro / [...]\" (Manzoni).", language: 'it')
1014
+ expect(ps.segment).to eq(["\"Ei fu. Siccome immobile / dato il mortal sospiro / stette la spoglia immemore / orba di tanto spiro / [...]\" (Manzoni)."])
1015
+ end
1016
+
1017
+ it 'correctly segments text #019' do
1018
+ ps = PragmaticSegmenter::Segmenter.new(text: "Una lettera si può iniziare in questo modo «Il/la sottoscritto/a ... nato/a a ...».", language: 'it')
1019
+ expect(ps.segment).to eq(["Una lettera si può iniziare in questo modo «Il/la sottoscritto/a ... nato/a a ...»."])
1020
+ end
1021
+
1022
+ it 'correctly segments text #020' do
1023
+ ps = PragmaticSegmenter::Segmenter.new(text: "Per casa, in uno degli esercizi per i bambini c'era \"3 + (14/7) = 5\"", language: 'it')
1024
+ expect(ps.segment).to eq(["Per casa, in uno degli esercizi per i bambini c'era \"3 + (14/7) = 5\""])
1025
+ end
1026
+
1027
+ it 'correctly segments text #021' do
1028
+ ps = PragmaticSegmenter::Segmenter.new(text: "Ai bambini è stato chiesto di fare \"4:2*2\"", language: 'it')
1029
+ expect(ps.segment).to eq(["Ai bambini è stato chiesto di fare \"4:2*2\""])
1030
+ end
1031
+
1032
+ it 'correctly segments text #022' do
1033
+ ps = PragmaticSegmenter::Segmenter.new(text: "La maestra esclamò: \"Bambini, quanto fa '2/3 + 4/3?'\".", language: 'it')
1034
+ expect(ps.segment).to eq(["La maestra esclamò: \"Bambini, quanto fa \'2/3 + 4/3?\'\"."])
1035
+ end
1036
+
1037
+ it 'correctly segments text #023' do
1038
+ ps = PragmaticSegmenter::Segmenter.new(text: "Il motore misurava 120°C.", language: 'it')
1039
+ expect(ps.segment).to eq(["Il motore misurava 120°C."])
1040
+ end
1041
+
1042
+ it 'correctly segments text #024' do
1043
+ ps = PragmaticSegmenter::Segmenter.new(text: "Il volume era di 3m³.", language: 'it')
1044
+ expect(ps.segment).to eq(["Il volume era di 3m³."])
1045
+ end
1046
+
1047
+ it 'correctly segments text #025' do
1048
+ ps = PragmaticSegmenter::Segmenter.new(text: "La stanza misurava 20m².", language: 'it')
1049
+ expect(ps.segment).to eq(["La stanza misurava 20m²."])
1050
+ end
1051
+
1052
+ it 'correctly segments text #026' do
1053
+ ps = PragmaticSegmenter::Segmenter.new(text: "1°C corrisponde a 33.8°F.", language: 'it')
1054
+ expect(ps.segment).to eq(["1°C corrisponde a 33.8°F."])
1055
+ end
1056
+
1057
+ it 'correctly segments text #027' do
1058
+ ps = PragmaticSegmenter::Segmenter.new(text: "Oggi è il 27-10-14.", language: 'it')
1059
+ expect(ps.segment).to eq(["Oggi è il 27-10-14."])
1060
+ end
1061
+
1062
+ it 'correctly segments text #028' do
1063
+ ps = PragmaticSegmenter::Segmenter.new(text: "La casa costa 170.500.000,00€!", language: 'it')
1064
+ expect(ps.segment).to eq(["La casa costa 170.500.000,00€!"])
1065
+ end
1066
+
1067
+ it 'correctly segments text #029' do
1068
+ ps = PragmaticSegmenter::Segmenter.new(text: "Il corridore 103 è arrivato 4°.", language: 'it')
1069
+ expect(ps.segment).to eq(["Il corridore 103 è arrivato 4°."])
1070
+ end
1071
+
1072
+ it 'correctly segments text #030' do
1073
+ ps = PragmaticSegmenter::Segmenter.new(text: "Oggi è il 27/10/2014.", language: 'it')
1074
+ expect(ps.segment).to eq(["Oggi è il 27/10/2014."])
1075
+ end
1076
+
1077
+ it 'correctly segments text #031' do
1078
+ ps = PragmaticSegmenter::Segmenter.new(text: "Ecco l'elenco: 1.gelato, 2.carne, 3.riso.", language: 'it')
1079
+ expect(ps.segment).to eq(["Ecco l'elenco: 1.gelato, 2.carne, 3.riso."])
1080
+ end
1081
+
1082
+ it 'correctly segments text #032' do
1083
+ ps = PragmaticSegmenter::Segmenter.new(text: "Devi comprare : 1)pesce 2)sale.", language: 'it')
1084
+ expect(ps.segment).to eq(["Devi comprare : 1)pesce 2)sale."])
1085
+ end
1086
+
1087
+ it 'correctly segments text #033' do
1088
+ ps = PragmaticSegmenter::Segmenter.new(text: "La macchina viaggiava a 100 km/h.", language: 'it')
1089
+ expect(ps.segment).to eq(["La macchina viaggiava a 100 km/h."])
1090
+ end
1091
+ end
1092
+ end
1093
+
1094
+ context 'Language: Russian (ru)' do
1095
+ # Thanks to Anastasiia Tsvitailo for the Russian test examples.
1096
+ describe '#segment' do
1097
+ it 'correctly segments text #001' do
1098
+ ps = PragmaticSegmenter::Segmenter.new(text: "Маленькая девочка бежала и кричала: «Не видали маму?».", language: 'ru')
1099
+ expect(ps.segment).to eq(["Маленькая девочка бежала и кричала: «Не видали маму?»."])
1100
+ end
1101
+
1102
+ it 'correctly segments text #002' do
1103
+ ps = PragmaticSegmenter::Segmenter.new(text: "«Я приду поздно», — сказал Андрей.", language: 'ru')
1104
+ expect(ps.segment).to eq(["«Я приду поздно», — сказал Андрей."])
1105
+ end
1106
+
1107
+ it 'correctly segments text #003' do
1108
+ ps = PragmaticSegmenter::Segmenter.new(text: "«К чему ты готовишься? – спросила мама. – Завтра ведь выходной».", language: 'ru')
1109
+ expect(ps.segment).to eq(["«К чему ты готовишься? – спросила мама. – Завтра ведь выходной»."])
1110
+ end
1111
+
1112
+ it 'correctly segments text #004' do
1113
+ ps = PragmaticSegmenter::Segmenter.new(text: "По словам Пушкина, «Привычка свыше дана, замена счастью она».", language: 'ru')
1114
+ expect(ps.segment).to eq(["По словам Пушкина, «Привычка свыше дана, замена счастью она»."])
1115
+ end
1116
+
1117
+ it 'correctly segments text #005' do
1118
+ ps = PragmaticSegmenter::Segmenter.new(text: "Он сказал: «Я очень устал», и сразу же замолчал.", language: 'ru')
1119
+ expect(ps.segment).to eq(["Он сказал: «Я очень устал», и сразу же замолчал."])
1120
+ end
1121
+
1122
+ it 'correctly segments text #006' do
1123
+ ps = PragmaticSegmenter::Segmenter.new(text: "Мне стало как-то ужасно грустно в это мгновение; однако что-то похожее на смех зашевелилось в душе моей.", language: 'ru')
1124
+ expect(ps.segment).to eq(["Мне стало как-то ужасно грустно в это мгновение; однако что-то похожее на смех зашевелилось в душе моей."])
1125
+ end
1126
+
1127
+ it 'correctly segments text #007' do
1128
+ ps = PragmaticSegmenter::Segmenter.new(text: "Шухов как был в ватных брюках, не снятых на ночь (повыше левого колена их тоже был пришит затасканный, погрязневший лоскут, и на нем выведен черной, уже поблекшей краской номер Щ-854), надел телогрейку…", language: 'ru')
1129
+ expect(ps.segment).to eq(["Шухов как был в ватных брюках, не снятых на ночь (повыше левого колена их тоже был пришит затасканный, погрязневший лоскут, и на нем выведен черной, уже поблекшей краской номер Щ-854), надел телогрейку…"])
1130
+ end
1131
+
1132
+ it 'correctly segments text #008' do
1133
+ ps = PragmaticSegmenter::Segmenter.new(text: "Слово «дом» является синонимом жилища", language: 'ru')
1134
+ expect(ps.segment).to eq(["Слово «дом» является синонимом жилища"])
1135
+ end
1136
+
1137
+ it 'correctly segments text #009' do
1138
+ ps = PragmaticSegmenter::Segmenter.new(text: "В Санкт-Петербург на гастроли приехал театр «Современник»", language: 'ru')
1139
+ expect(ps.segment).to eq(["В Санкт-Петербург на гастроли приехал театр «Современник»"])
1140
+ end
1141
+
1142
+ it 'correctly segments text #010' do
1143
+ ps = PragmaticSegmenter::Segmenter.new(text: "Машина едет со скоростью 100 км/ч.", language: 'ru')
1144
+ expect(ps.segment).to eq(["Машина едет со скоростью 100 км/ч."])
1145
+ end
1146
+
1147
+ it 'correctly segments text #011' do
1148
+ ps = PragmaticSegmenter::Segmenter.new(text: "Я поем и/или лягу спать.", language: 'ru')
1149
+ expect(ps.segment).to eq(["Я поем и/или лягу спать."])
1150
+ end
1151
+
1152
+ it 'correctly segments text #012' do
1153
+ ps = PragmaticSegmenter::Segmenter.new(text: "Он не мог справиться с примером \"3 + (14:7) = 5\"", language: 'ru')
1154
+ expect(ps.segment).to eq(["Он не мог справиться с примером \"3 + (14:7) = 5\""])
1155
+ end
1156
+
1157
+ it 'correctly segments text #013' do
1158
+ ps = PragmaticSegmenter::Segmenter.new(text: "Вот список: 1.мороженое, 2.мясо, 3.рис.", language: 'ru')
1159
+ expect(ps.segment).to eq(["Вот список: 1.мороженое, 2.мясо, 3.рис."])
1160
+ end
1161
+
1162
+ it 'correctly segments text #014' do
1163
+ ps = PragmaticSegmenter::Segmenter.new(text: "Квартира 234 находится на 4-ом этаже.", language: 'ru')
1164
+ expect(ps.segment).to eq(["Квартира 234 находится на 4-ом этаже."])
1165
+ end
1166
+
1167
+ it 'correctly segments text #015' do
1168
+ ps = PragmaticSegmenter::Segmenter.new(text: "В это время года температура может подниматься до 40°C.", language: 'ru')
1169
+ expect(ps.segment).to eq(["В это время года температура может подниматься до 40°C."])
1170
+ end
1171
+
1172
+ it 'correctly segments text #016' do
1173
+ ps = PragmaticSegmenter::Segmenter.new(text: "Объем составляет 5м³.", language: 'ru')
1174
+ expect(ps.segment).to eq(["Объем составляет 5м³."])
1175
+ end
1176
+
1177
+ it 'correctly segments text #017' do
1178
+ ps = PragmaticSegmenter::Segmenter.new(text: "Объем составляет 5 куб.м.", language: 'ru')
1179
+ expect(ps.segment).to eq(["Объем составляет 5 куб.м."])
1180
+ end
1181
+
1182
+ it 'correctly segments text #018' do
1183
+ ps = PragmaticSegmenter::Segmenter.new(text: "Площадь комнаты 14м².", language: 'ru')
1184
+ expect(ps.segment).to eq(["Площадь комнаты 14м²."])
1185
+ end
1186
+
1187
+ it 'correctly segments text #019' do
1188
+ ps = PragmaticSegmenter::Segmenter.new(text: "Площадь комнаты 14 кв.м.", language: 'ru')
1189
+ expect(ps.segment).to eq(["Площадь комнаты 14 кв.м."])
1190
+ end
1191
+
1192
+ it 'correctly segments text #020' do
1193
+ ps = PragmaticSegmenter::Segmenter.new(text: "1°C соответствует 33.8°F.", language: 'ru')
1194
+ expect(ps.segment).to eq(["1°C соответствует 33.8°F."])
1195
+ end
1196
+
1197
+ it 'correctly segments text #021' do
1198
+ ps = PragmaticSegmenter::Segmenter.new(text: "Сегодня 27.10.14", language: 'ru')
1199
+ expect(ps.segment).to eq(["Сегодня 27.10.14"])
1200
+ end
1201
+
1202
+ it 'correctly segments text #022' do
1203
+ ps = PragmaticSegmenter::Segmenter.new(text: "Сегодня 27 октября 2014 года.", language: 'ru')
1204
+ expect(ps.segment).to eq(["Сегодня 27 октября 2014 года."])
1205
+ end
1206
+
1207
+ it 'correctly segments text #023' do
1208
+ ps = PragmaticSegmenter::Segmenter.new(text: "Эта машина стоит 150 000 дол.!", language: 'ru')
1209
+ expect(ps.segment).to eq(["Эта машина стоит 150 000 дол.!"])
1210
+ end
1211
+
1212
+ it 'correctly segments text #024' do
1213
+ ps = PragmaticSegmenter::Segmenter.new(text: "Эта машина стоит $150 000!", language: 'ru')
1214
+ expect(ps.segment).to eq(["Эта машина стоит $150 000!"])
1215
+ end
1216
+
1217
+ it 'correctly segments text #025' do
1218
+ ps = PragmaticSegmenter::Segmenter.new(text: "Вот номер моего телефона: +39045969798. Передавайте привет г-ну Шапочкину. До свидания.", language: 'ru')
1219
+ expect(ps.segment).to eq(["Вот номер моего телефона: +39045969798.", "Передавайте привет г-ну Шапочкину.", "До свидания."])
1220
+ end
1221
+
1222
+ it 'correctly segments text #026' do
1223
+ ps = PragmaticSegmenter::Segmenter.new(text: "Постойте, разве можно указывать цены в у.е.!", language: 'ru')
1224
+ expect(ps.segment).to eq(["Постойте, разве можно указывать цены в у.е.!"])
1225
+ end
1226
+
1227
+ it 'correctly segments text #027' do
1228
+ ps = PragmaticSegmenter::Segmenter.new(text: "Едем на скорости 90 км/ч в сторону пгт. Брагиновка, о котором мы так много слышали по ТВ!", language: 'ru')
1229
+ expect(ps.segment).to eq(["Едем на скорости 90 км/ч в сторону пгт. Брагиновка, о котором мы так много слышали по ТВ!"])
1230
+ end
1231
+
1232
+ it 'correctly segments text #028' do
1233
+ ps = PragmaticSegmenter::Segmenter.new(text: "Д-р ветеринарных наук А. И. Семенов и пр. выступали на этом семинаре.", language: 'ru')
1234
+ expect(ps.segment).to eq(["Д-р ветеринарных наук А. И. Семенов и пр. выступали на этом семинаре."])
1235
+ end
1236
+
1237
+ it 'correctly segments text #029' do
1238
+ ps = PragmaticSegmenter::Segmenter.new(text: "Уважаемый проф. Семенов! Просьба до 20.10 сдать отчет на кафедру.", language: 'ru')
1239
+ expect(ps.segment).to eq(["Уважаемый проф. Семенов!", "Просьба до 20.10 сдать отчет на кафедру."])
1240
+ end
1241
+
1242
+ it 'correctly segments text #030' do
1243
+ ps = PragmaticSegmenter::Segmenter.new(text: "Первоначальная стоимость этого комплекта 30 долл., но сейчас действует скидка. Предъявите дисконтную карту, пожалуйста!", language: 'ru')
1244
+ expect(ps.segment).to eq(["Первоначальная стоимость этого комплекта 30 долл., но сейчас действует скидка.", "Предъявите дисконтную карту, пожалуйста!"])
1245
+ end
1246
+
1247
+ it 'correctly segments text #031' do
1248
+ ps = PragmaticSegmenter::Segmenter.new(text: "Виктор съел пол-лимона и ушел по-английски из дома на ул. 1 Мая.", language: 'ru')
1249
+ expect(ps.segment).to eq(["Виктор съел пол-лимона и ушел по-английски из дома на ул. 1 Мая."])
1250
+ end
1251
+
1252
+ it 'correctly segments text #032' do
1253
+ ps = PragmaticSegmenter::Segmenter.new(text: "Напоминаю Вам, что 25.10 день рождения у Маши К., нужно будет купить ей подарок.", language: 'ru')
1254
+ expect(ps.segment).to eq(["Напоминаю Вам, что 25.10 день рождения у Маши К., нужно будет купить ей подарок."])
1255
+ end
1256
+
1257
+ it 'correctly segments text #033' do
1258
+ ps = PragmaticSegmenter::Segmenter.new(text: "В 2010-2012 гг. Виктор посещал г. Волгоград неоднократно.", language: 'ru')
1259
+ expect(ps.segment).to eq(["В 2010-2012 гг. Виктор посещал г. Волгоград неоднократно."])
1260
+ end
1261
+
1262
+ it 'correctly segments text #034' do
1263
+ ps = PragmaticSegmenter::Segmenter.new(text: "Маленькая девочка бежала и кричала: «Не видали маму?»", language: 'ru')
1264
+ expect(ps.segment).to eq(["Маленькая девочка бежала и кричала: «Не видали маму?»"])
1265
+ end
1266
+
1267
+ it 'correctly segments text #035' do
1268
+ ps = PragmaticSegmenter::Segmenter.new(text: "Кв. 234 находится на 4 этаже.", language: 'ru')
1269
+ expect(ps.segment).to eq(["Кв. 234 находится на 4 этаже."])
1270
+ end
1271
+
1272
+ it 'correctly segments text #036' do
1273
+ ps = PragmaticSegmenter::Segmenter.new(text: "В это время года температура может подниматься до 40°C.", language: 'ru')
1274
+ expect(ps.segment).to eq(["В это время года температура может подниматься до 40°C."])
1275
+ end
1276
+
1277
+ it 'correctly segments text #037' do
1278
+ ps = PragmaticSegmenter::Segmenter.new(text: "Нужно купить 1)рыбу 2)соль.", language: 'ru')
1279
+ expect(ps.segment).to eq(["Нужно купить 1)рыбу 2)соль."])
1280
+ end
1281
+
1282
+ it 'correctly segments text #038' do
1283
+ ps = PragmaticSegmenter::Segmenter.new(text: "Машина едет со скоростью 100 км/ч.", language: 'ru')
1284
+ expect(ps.segment).to eq(["Машина едет со скоростью 100 км/ч."])
1285
+ end
1286
+
1287
+ it 'correctly segments text #039' do
1288
+ ps = PragmaticSegmenter::Segmenter.new(text: "Л.Н. Толстой написал \"Войну и мир\". Кроме Волконских, Л. Н. Толстой состоял в близком родстве с некоторыми другими аристократическими родами. Дом, где родился Л.Н.Толстой, 1898 г. В 1854 году дом продан по распоряжению писателя на вывоз в село Долгое.", language: 'ru')
1289
+ expect(ps.segment).to eq(["Л.Н. Толстой написал \"Войну и мир\".", "Кроме Волконских, Л. Н. Толстой состоял в близком родстве с некоторыми другими аристократическими родами.", "Дом, где родился Л.Н.Толстой, 1898 г. В 1854 году дом продан по распоряжению писателя на вывоз в село Долгое."])
1290
+ end
1291
+ end
1292
+ end
1293
+
1294
+ context 'Language: German (de)' do
1295
+ # Thanks to Silvia Busse for the German test examples.
1296
+ describe '#segment' do
1297
+ it 'correctly segments text #001' do
1298
+ ps = PragmaticSegmenter::Segmenter.new(text: "\n \n\n http:www.babycentre.co.uk/midwives \n\n \n\n \n\n10 steps to a healthy pregnancy (German) \n\n10 Schritte zu einer gesunden Schwangerschaft \n \n• 1. Planen und organisieren Sie die Zeit der Schwangerschaft frühzeitig! \n• 2. Essen Sie gesund! \n• 3. Seien Sie achtsam bei der Auswahl der Nahrungsmittel! \n• 4. Nehmen Sie zusätzlich Folsäurepräparate und essen Sie Fisch! \n• 5. Treiben Sie regelmäßig Sport! \n• 6. Beginnen Sie mit Übungen für die Beckenbodenmuskulatur! \n• 7. Reduzieren Sie Ihren Alkoholgenuss! \n• 8. Reduzieren Sie Ihren Koffeingenuß! \n• 9. Hören Sie mit dem Rauchen auf! \n• 10. Gönnen Sie sich Erholung! \n \n \nZehn einfach zu befolgende Tipps sollen Ihnen helfen, eine möglichst problemlose \nSchwangerschaft zu erleben und ein gesundes Baby auf die Welt zu bringen: \n\n1. Planen und organisieren Sie die Zeit der Schwangerschaft frühzeitig!", language: 'de', doc_type: 'pdf')
1299
+ expect(ps.segment).to eq(["http:www.babycentre.co.uk/midwives", "10 steps to a healthy pregnancy (German)", "10 Schritte zu einer gesunden Schwangerschaft", "• 1. Planen und organisieren Sie die Zeit der Schwangerschaft frühzeitig!", "• 2. Essen Sie gesund!", "• 3. Seien Sie achtsam bei der Auswahl der Nahrungsmittel!", "• 4. Nehmen Sie zusätzlich Folsäurepräparate und essen Sie Fisch!", "• 5. Treiben Sie regelmäßig Sport!", "• 6. Beginnen Sie mit Übungen für die Beckenbodenmuskulatur!", "• 7. Reduzieren Sie Ihren Alkoholgenuss!", "• 8. Reduzieren Sie Ihren Koffeingenuß!", "• 9. Hören Sie mit dem Rauchen auf!", "• 10. Gönnen Sie sich Erholung!", "Zehn einfach zu befolgende Tipps sollen Ihnen helfen, eine möglichst problemlose Schwangerschaft zu erleben und ein gesundes Baby auf die Welt zu bringen:", "1. Planen und organisieren Sie die Zeit der Schwangerschaft frühzeitig!"])
1300
+ end
1301
+
1302
+ it 'correctly segments text #002' do
1303
+ ps = PragmaticSegmenter::Segmenter.new(text: '„Ich habe heute keine Zeit“, sagte die Frau und flüsterte leise: „Und auch keine Lust.“ Wir haben 1.000.000 Euro.', language: 'de')
1304
+ expect(ps.segment).to eq(["„Ich habe heute keine Zeit“, sagte die Frau und flüsterte leise: „Und auch keine Lust.“", "Wir haben 1.000.000 Euro."])
1305
+ end
1306
+
1307
+ it 'correctly segments text #003' do
1308
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Thomas sagte: ,,Wann kommst zu mir?” ,,Das weiß ich noch nicht“, antwortete Susi, ,,wahrscheinlich am Sonntag.“ Wir haben 1.000.000 Euro.', language: 'de')
1309
+ expect(ps.segment).to eq(["Thomas sagte: ,,Wann kommst zu mir?” ,,Das weiß ich noch nicht“, antwortete Susi, ,,wahrscheinlich am Sonntag.“", "Wir haben 1.000.000 Euro."])
1310
+ end
1311
+
1312
+ it 'correctly segments text #004' do
1313
+ ps = PragmaticSegmenter::Segmenter.new(text: '„Lass uns jetzt essen gehen!“, sagte die Mutter zu ihrer Freundin, „am besten zum Italiener.“', language: 'de')
1314
+ expect(ps.segment).to eq(['„Lass uns jetzt essen gehen!“, sagte die Mutter zu ihrer Freundin, „am besten zum Italiener.“'])
1315
+ end
1316
+
1317
+ it 'correctly segments text #005' do
1318
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Wir haben 1.000.000 Euro.', language: 'de')
1319
+ expect(ps.segment).to eq(['Wir haben 1.000.000 Euro.'])
1320
+ end
1321
+
1322
+ it 'correctly segments text #006' do
1323
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Sie bekommen 3,50 Euro zurück.', language: 'de')
1324
+ expect(ps.segment).to eq(['Sie bekommen 3,50 Euro zurück.'])
1325
+ end
1326
+
1327
+ it 'correctly segments text #007' do
1328
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Dafür brauchen wir 5,5 Stunden.', language: 'de')
1329
+ expect(ps.segment).to eq(['Dafür brauchen wir 5,5 Stunden.'])
1330
+ end
1331
+
1332
+ it 'correctly segments text #008' do
1333
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Bitte überweisen Sie 5.300,25 Euro.', language: 'de')
1334
+ expect(ps.segment).to eq(['Bitte überweisen Sie 5.300,25 Euro.'])
1335
+ end
1336
+
1337
+ it 'correctly segments text #009' do
1338
+ ps = PragmaticSegmenter::Segmenter.new(text: '1. Dies ist eine Punkteliste.', language: 'de')
1339
+ expect(ps.segment).to eq(['1. Dies ist eine Punkteliste.'])
1340
+ end
1341
+
1342
+ it 'correctly segments text #010' do
1343
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Wir trafen Dr. med. Meyer in der Stadt.', language: 'de')
1344
+ expect(ps.segment).to eq(['Wir trafen Dr. med. Meyer in der Stadt.'])
1345
+ end
1346
+
1347
+ it 'correctly segments text #011' do
1348
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Wir brauchen Getränke, z. B. Wasser, Saft, Bier usw.', language: 'de')
1349
+ expect(ps.segment).to eq(['Wir brauchen Getränke, z. B. Wasser, Saft, Bier usw.'])
1350
+ end
1351
+
1352
+ it 'correctly segments text #012' do
1353
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Ich kann u.a. Spanisch sprechen.', language: 'de')
1354
+ expect(ps.segment).to eq(['Ich kann u.a. Spanisch sprechen.'])
1355
+ end
1356
+
1357
+ it 'correctly segments text #013' do
1358
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Frau Prof. Schulze ist z. Z. nicht da.', language: 'de')
1359
+ expect(ps.segment).to eq(['Frau Prof. Schulze ist z. Z. nicht da.'])
1360
+ end
1361
+
1362
+ it 'correctly segments text #014' do
1363
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Sie erhalten ein neues Bank-Statement bzw. ein neues Schreiben.', language: 'de')
1364
+ expect(ps.segment).to eq(['Sie erhalten ein neues Bank-Statement bzw. ein neues Schreiben.'])
1365
+ end
1366
+
1367
+ it 'correctly segments text #015' do
1368
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Z. T. ist die Lieferung unvollständig.', language: 'de')
1369
+ expect(ps.segment).to eq(['Z. T. ist die Lieferung unvollständig.'])
1370
+ end
1371
+
1372
+ it 'correctly segments text #016' do
1373
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Das finden Sie auf S. 225.', language: 'de')
1374
+ expect(ps.segment).to eq(['Das finden Sie auf S. 225.'])
1375
+ end
1376
+
1377
+ it 'correctly segments text #017' do
1378
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Sie besucht eine kath. Schule.', language: 'de')
1379
+ expect(ps.segment).to eq(['Sie besucht eine kath. Schule.'])
1380
+ end
1381
+
1382
+ it 'correctly segments text #018' do
1383
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Wir benötigen Zeitungen, Zeitschriften u. Ä. für unser Projekt.', language: 'de')
1384
+ expect(ps.segment).to eq(['Wir benötigen Zeitungen, Zeitschriften u. Ä. für unser Projekt.'])
1385
+ end
1386
+
1387
+ it 'correctly segments text #019' do
1388
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Das steht auf S. 23, s. vorherige Anmerkung.', language: 'de')
1389
+ expect(ps.segment).to eq(['Das steht auf S. 23, s. vorherige Anmerkung.'])
1390
+ end
1391
+
1392
+ it 'correctly segments text #020' do
1393
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Dies ist meine Adresse: Dr. Meier, Berliner Str. 5, 21234 Bremen.', language: 'de')
1394
+ expect(ps.segment).to eq(['Dies ist meine Adresse: Dr. Meier, Berliner Str. 5, 21234 Bremen.'])
1395
+ end
1396
+
1397
+ it 'correctly segments text #021' do
1398
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Er sagte: „Hallo, wie geht´s Ihnen, Frau Prof. Müller?“', language: 'de')
1399
+ expect(ps.segment).to eq(['Er sagte: „Hallo, wie geht´s Ihnen, Frau Prof. Müller?“'])
1400
+ end
1401
+
1402
+ it 'correctly segments text #022' do
1403
+ ps = PragmaticSegmenter::Segmenter.new(text: "Fit in vier Wochen\n\nDeine Anleitung für eine reine Ernährung und ein gesünderes und glücklicheres Leben\n\nRECHTLICHE HINWEISE\n\nOhne die ausdrückliche schriftliche Genehmigung der Eigentümerin von instafemmefitness, Anna Anderson, darf dieses E-Book weder teilweise noch in vollem Umfang reproduziert, gespeichert, kopiert oder auf irgendeine Weise übertragen werden. Wenn Du das E-Book auf einem öffentlich zugänglichen Computer ausdruckst, musst Du es nach dem Ausdrucken von dem Computer löschen. Jedes E-Book wird mit einem Benutzernamen und Transaktionsinformationen versehen.\n\nVerstöße gegen dieses Urheberrecht werden im vollen gesetzlichen Umfang geltend gemacht. Obgleich die Autorin und Herausgeberin alle Anstrengungen unternommen hat, sicherzustellen, dass die Informationen in diesem Buch zum Zeitpunkt der Drucklegung korrekt sind, übernimmt die Autorin und Herausgeberin keine Haftung für etwaige Verluste, Schäden oder Störungen, die durch Fehler oder Auslassungen in Folge von Fahrlässigkeit, zufälligen Umständen oder sonstigen Ursachen entstehen, und lehnt hiermit jedwede solche Haftung ab.\n\nDieses Buch ist kein Ersatz für die medizinische Beratung durch Ärzte. Der Leser/die Leserin sollte regelmäßig einen Arzt/eine Ärztin hinsichtlich Fragen zu seiner/ihrer Gesundheit und vor allem in Bezug auf Symptome, die eventuell einer ärztlichen Diagnose oder Behandlung bedürfen, konsultieren.\n\nDie Informationen in diesem Buch sind dazu gedacht, ein ordnungsgemäßes Training zu ergänzen, nicht aber zu ersetzen. Wie jeder andere Sport, der Geschwindigkeit, Ausrüstung, Gleichgewicht und Umweltfaktoren einbezieht, stellt dieser Sport ein gewisses Risiko dar. Die Autorin und Herausgeberin rät den Lesern dazu, die volle Verantwortung für die eigene Sicherheit zu übernehmen und die eigenen Grenzen zu beachten. Vor dem Ausüben der in diesem Buch beschriebenen Übungen solltest Du sicherstellen, dass Deine Ausrüstung in gutem Zustand ist, und Du solltest keine Risiken außerhalb Deines Erfahrungs- oder Trainingsniveaus, Deiner Fähigkeiten oder Deines Komfortbereichs eingehen.\nHintergrundillustrationen Urheberrecht © 2013 bei Shuttershock, Buchgestaltung und -produktion durch Anna Anderson Verfasst von Anna Anderson\nUrheberrecht © 2014 Instafemmefitness. Alle Rechte vorbehalten\n\nÜber mich", language: 'de')
1404
+ expect(ps.segment).to eq(["Fit in vier Wochen", "Deine Anleitung für eine reine Ernährung und ein gesünderes und glücklicheres Leben", "RECHTLICHE HINWEISE", "Ohne die ausdrückliche schriftliche Genehmigung der Eigentümerin von instafemmefitness, Anna Anderson, darf dieses E-Book weder teilweise noch in vollem Umfang reproduziert, gespeichert, kopiert oder auf irgendeine Weise übertragen werden.", "Wenn Du das E-Book auf einem öffentlich zugänglichen Computer ausdruckst, musst Du es nach dem Ausdrucken von dem Computer löschen.", "Jedes E-Book wird mit einem Benutzernamen und Transaktionsinformationen versehen.", "Verstöße gegen dieses Urheberrecht werden im vollen gesetzlichen Umfang geltend gemacht.", "Obgleich die Autorin und Herausgeberin alle Anstrengungen unternommen hat, sicherzustellen, dass die Informationen in diesem Buch zum Zeitpunkt der Drucklegung korrekt sind, übernimmt die Autorin und Herausgeberin keine Haftung für etwaige Verluste, Schäden oder Störungen, die durch Fehler oder Auslassungen in Folge von Fahrlässigkeit, zufälligen Umständen oder sonstigen Ursachen entstehen, und lehnt hiermit jedwede solche Haftung ab.", "Dieses Buch ist kein Ersatz für die medizinische Beratung durch Ärzte.", "Der Leser/die Leserin sollte regelmäßig einen Arzt/eine Ärztin hinsichtlich Fragen zu seiner/ihrer Gesundheit und vor allem in Bezug auf Symptome, die eventuell einer ärztlichen Diagnose oder Behandlung bedürfen, konsultieren.", "Die Informationen in diesem Buch sind dazu gedacht, ein ordnungsgemäßes Training zu ergänzen, nicht aber zu ersetzen.", "Wie jeder andere Sport, der Geschwindigkeit, Ausrüstung, Gleichgewicht und Umweltfaktoren einbezieht, stellt dieser Sport ein gewisses Risiko dar.", "Die Autorin und Herausgeberin rät den Lesern dazu, die volle Verantwortung für die eigene Sicherheit zu übernehmen und die eigenen Grenzen zu beachten.", "Vor dem Ausüben der in diesem Buch beschriebenen Übungen solltest Du sicherstellen, dass Deine Ausrüstung in gutem Zustand ist, und Du solltest keine Risiken außerhalb Deines Erfahrungs- oder Trainingsniveaus, Deiner Fähigkeiten oder Deines Komfortbereichs eingehen.", "Hintergrundillustrationen Urheberrecht © 2013 bei Shuttershock, Buchgestaltung und -produktion durch Anna Anderson Verfasst von Anna Anderson", "Urheberrecht © 2014 Instafemmefitness.", "Alle Rechte vorbehalten", "Über mich"])
1405
+ end
1406
+
1407
+ it 'correctly segments text #023' do
1408
+ ps = PragmaticSegmenter::Segmenter.new(text: "Es gibt jedoch einige Vorsichtsmaßnahmen, die Du ergreifen kannst, z. B. ist es sehr empfehlenswert, dass Du Dein Zuhause von allem Junkfood befreist. Ich persönlich kaufe kein Junkfood oder etwas, das nicht rein ist (ich traue mir da selbst nicht!). Ich finde jeden Vorwand, um das Junkfood zu essen, vor allem die Vorstellung, dass ich nicht mehr in Versuchung kommen werde, wenn ich es jetzt aufesse und es weg ist. Es ist schon komisch, was unser Verstand mitunter anstellt!", language: 'de')
1409
+ expect(ps.segment).to eq(["Es gibt jedoch einige Vorsichtsmaßnahmen, die Du ergreifen kannst, z. B. ist es sehr empfehlenswert, dass Du Dein Zuhause von allem Junkfood befreist.", "Ich persönlich kaufe kein Junkfood oder etwas, das nicht rein ist (ich traue mir da selbst nicht!).", "Ich finde jeden Vorwand, um das Junkfood zu essen, vor allem die Vorstellung, dass ich nicht mehr in Versuchung kommen werde, wenn ich es jetzt aufesse und es weg ist.", "Es ist schon komisch, was unser Verstand mitunter anstellt!"])
1410
+ end
1411
+
1412
+ it 'correctly segments text #024' do
1413
+ ps = PragmaticSegmenter::Segmenter.new(text: "Ob Sie in Hannover nur auf der Durchreise, für einen längeren Aufenthalt oder zum Besuch einer der zahlreichen Messen sind: Die Hauptstadt des Landes Niedersachsens hat viele Sehenswürdigkeiten und ist zu jeder Jahreszeit eine Reise Wert. \nHannovers Ursprünge können bis zur römischen Kaiserzeit zurückverfolgt werden, und zwar durch Ausgrabungen von Tongefäßen aus dem 1. -3. Jahrhundert nach Christus, die an mehreren Stellen im Untergrund des Stadtzentrums durchgeführt wurden.", language: 'de')
1414
+ expect(ps.segment).to eq(["Ob Sie in Hannover nur auf der Durchreise, für einen längeren Aufenthalt oder zum Besuch einer der zahlreichen Messen sind: Die Hauptstadt des Landes Niedersachsens hat viele Sehenswürdigkeiten und ist zu jeder Jahreszeit eine Reise Wert.", "Hannovers Ursprünge können bis zur römischen Kaiserzeit zurückverfolgt werden, und zwar durch Ausgrabungen von Tongefäßen aus dem 1. -3. Jahrhundert nach Christus, die an mehreren Stellen im Untergrund des Stadtzentrums durchgeführt wurden."])
1415
+ end
1416
+
1417
+ it 'correctly segments text #025' do
1418
+ ps = PragmaticSegmenter::Segmenter.new(text: "• 3. Seien Sie achtsam bei der Auswahl der Nahrungsmittel! \n• 4. Nehmen Sie zusätzlich Folsäurepräparate und essen Sie Fisch! \n• 5. Treiben Sie regelmäßig Sport! \n• 6. Beginnen Sie mit Übungen für die Beckenbodenmuskulatur! \n• 7. Reduzieren Sie Ihren Alkoholgenuss! \n", language: 'de')
1419
+ expect(ps.segment).to eq(["• 3. Seien Sie achtsam bei der Auswahl der Nahrungsmittel!", "• 4. Nehmen Sie zusätzlich Folsäurepräparate und essen Sie Fisch!", "• 5. Treiben Sie regelmäßig Sport!", "• 6. Beginnen Sie mit Übungen für die Beckenbodenmuskulatur!", "• 7. Reduzieren Sie Ihren Alkoholgenuss!"])
1420
+ end
1421
+
1422
+ it 'correctly segments text #026' do
1423
+ ps = PragmaticSegmenter::Segmenter.new(text: "Schwangere Frauen sollten während der \nersten drei Monate eine tägliche Dosis von 400 Mikrogramm Folsäure zusätzlich nehmen. \nFolsäure befindet sich auch in einigen Gemüse- und Müslisorten.", language: 'de', doc_type: 'pdf')
1424
+ expect(ps.segment).to eq(["Schwangere Frauen sollten während der ersten drei Monate eine tägliche Dosis von 400 Mikrogramm Folsäure zusätzlich nehmen.", "Folsäure befindet sich auch in einigen Gemüse- und Müslisorten."])
1425
+ end
1426
+
1427
+ it 'correctly segments text #027' do
1428
+ ps = PragmaticSegmenter::Segmenter.new(text: "Andere \nFischsorten (z.B. Hai, Thunfisch, Aal und Seeteufel) weisen einen erhöhten Quecksilbergehalt \nauf und sollten deshalb in der Schwangerschaft nur selten verzehrt werden.", language: 'de', doc_type: 'pdf')
1429
+ expect(ps.segment).to eq(["Andere Fischsorten (z.B. Hai, Thunfisch, Aal und Seeteufel) weisen einen erhöhten Quecksilbergehalt auf und sollten deshalb in der Schwangerschaft nur selten verzehrt werden."])
1430
+ end
1431
+
1432
+ it 'correctly segments text #028' do
1433
+ ps = PragmaticSegmenter::Segmenter.new(text: "Übung Präsens\n1. Ich ___ gern Tennis.\nspielen\nspielt\nspiele\n2. Karl __ mir den Ball.\ngebt\ngibt\ngeben\n3. Ihr ___ fast jeden Tag.\narbeitet\narbeite\narbeiten\n4. ___ Susi Deutsch?\nSprichst\nSprecht\nSpricht\n5. Wann ___ Karl und Julia? Heute?\nkommen\nkommt\nkomme\n\n\n\n\n", language: 'de', doc_type: 'docx')
1434
+ expect(ps.segment).to eq(["Übung Präsens", "1. Ich ___ gern Tennis.", "spielen", "spielt", "spiele", "2. Karl __ mir den Ball.", "gebt", "gibt", "geben", "3. Ihr ___ fast jeden Tag.", "arbeitet", "arbeite", "arbeiten", "4. ___ Susi Deutsch?", "Sprichst", "Sprecht", "Spricht", "5. Wann ___ Karl und Julia?", "Heute?", "kommen", "kommt", "komme"])
1435
+ end
1436
+
1437
+ it 'correctly segments text #029' do
1438
+ ps = PragmaticSegmenter::Segmenter.new(text: "\n• einige Sorten Weichkäse \n• rohes oder nicht ganz durchgebratenes Fleisch \n• ungeputztes Gemüse und ungewaschener Salat \n• nicht ganz durchgebratenes Hühnerfleisch, rohe oder nur weich gekochte Eier", language: 'de', doc_type: 'pdf')
1439
+ expect(ps.segment).to eq(["• einige Sorten Weichkäse", "• rohes oder nicht ganz durchgebratenes Fleisch", "• ungeputztes Gemüse und ungewaschener Salat", "• nicht ganz durchgebratenes Hühnerfleisch, rohe oder nur weich gekochte Eier"])
1440
+ end
1441
+
1442
+ it 'correctly segments text #030' do
1443
+ ps = PragmaticSegmenter::Segmenter.new(text: "Was sind die Konsequenzen der Abstimmung vom 12. Juni?", language: 'de')
1444
+ expect(ps.segment).to eq(["Was sind die Konsequenzen der Abstimmung vom 12. Juni?"])
1445
+ end
1446
+
1447
+ it 'correctly segments text #031' do
1448
+ ps = PragmaticSegmenter::Segmenter.new(text: "Was pro Jahr10. Zudem pro Jahr um 0.3 %11. Der gängigen Theorie nach erfolgt der Anstieg.", language: 'de')
1449
+ expect(ps.segment).to eq(["Was pro Jahr10.", "Zudem pro Jahr um 0.3 %11.", "Der gängigen Theorie nach erfolgt der Anstieg."])
1450
+ end
1451
+
1452
+ it 'correctly segments text #032' do
1453
+ ps = PragmaticSegmenter::Segmenter.new(text: "s. vorherige Anmerkung.", language: 'de')
1454
+ expect(ps.segment).to eq(["s. vorherige Anmerkung."])
1455
+ end
1456
+ end
1457
+ end
1458
+
1459
+ context 'Language: Spanish (es)' do
1460
+ # Thanks to Alejandro Naser Pastoriza for the Spanish test examples.
1461
+ describe '#segment' do
1462
+ it 'correctly segments text #001' do
1463
+ ps = PragmaticSegmenter::Segmenter.new(text: '«Ninguna mente extraordinaria está exenta de un toque de demencia», dijo Aristóteles. Pablo, ¿adónde vas? ¡¿Qué viste?!', language: 'es')
1464
+ expect(ps.segment).to eq(['«Ninguna mente extraordinaria está exenta de un toque de demencia», dijo Aristóteles.', 'Pablo, ¿adónde vas?', '¡¿Qué viste?!'])
1465
+ end
1466
+
1467
+ it 'correctly segments text #002' do
1468
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Admón. es administración o me equivoco.', language: 'es')
1469
+ expect(ps.segment).to eq(['Admón. es administración o me equivoco.'])
1470
+ end
1471
+
1472
+ it 'correctly segments text #003' do
1473
+ ps = PragmaticSegmenter::Segmenter.new(text: "• 1. Busca atención prenatal desde el principio \n• 2. Aliméntate bien \n• 3. Presta mucha atención a la higiene de los alimentos \n• 4. Toma suplementos de ácido fólico y come pescado \n• 5. Haz ejercicio regularmente \n• 6. Comienza a hacer ejercicios de Kegel \n• 7. Restringe el consumo de alcohol \n• 8. Disminuye el consumo de cafeína \n• 9. Deja de fumar \n• 10. Descansa", language: 'es')
1474
+ expect(ps.segment).to eq(["• 1. Busca atención prenatal desde el principio", "• 2. Aliméntate bien", "• 3. Presta mucha atención a la higiene de los alimentos", "• 4. Toma suplementos de ácido fólico y come pescado", "• 5. Haz ejercicio regularmente", "• 6. Comienza a hacer ejercicios de Kegel", "• 7. Restringe el consumo de alcohol", "• 8. Disminuye el consumo de cafeína", "• 9. Deja de fumar", "• 10. Descansa"])
1475
+ end
1476
+
1477
+ it 'correctly segments text #004' do
1478
+ ps = PragmaticSegmenter::Segmenter.new(text: "• 1. Busca atención prenatal desde el principio \n• 2. Aliméntate bien \n• 3. Presta mucha atención a la higiene de los alimentos \n• 4. Toma suplementos de ácido fólico y come pescado \n• 5. Haz ejercicio regularmente \n• 6. Comienza a hacer ejercicios de Kegel \n• 7. Restringe el consumo de alcohol \n• 8. Disminuye el consumo de cafeína \n• 9. Deja de fumar \n• 10. Descansa \n• 11. Hola", language: 'es')
1479
+ expect(ps.segment).to eq(["• 1. Busca atención prenatal desde el principio", "• 2. Aliméntate bien", "• 3. Presta mucha atención a la higiene de los alimentos", "• 4. Toma suplementos de ácido fólico y come pescado", "• 5. Haz ejercicio regularmente", "• 6. Comienza a hacer ejercicios de Kegel", "• 7. Restringe el consumo de alcohol", "• 8. Disminuye el consumo de cafeína", "• 9. Deja de fumar", "• 10. Descansa", "• 11. Hola"])
1480
+ end
1481
+
1482
+ it 'correctly segments text #005' do
1483
+ ps = PragmaticSegmenter::Segmenter.new(text: "¡Hola Srta. Ledesma! ¿Cómo está hoy? Espero que muy bien.", language: 'es')
1484
+ expect(ps.segment).to eq(["¡Hola Srta. Ledesma!", "¿Cómo está hoy?", "Espero que muy bien."])
1485
+ end
1486
+
1487
+ it 'correctly segments text #006' do
1488
+ ps = PragmaticSegmenter::Segmenter.new(text: "Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. Naser.", language: 'es')
1489
+ expect(ps.segment).to eq(["Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre, el Dr. Naser."])
1490
+ end
1491
+
1492
+ it 'correctly segments text #007' do
1493
+ ps = PragmaticSegmenter::Segmenter.new(text: "He apuntado una cita para la siguiente fecha: Mar. 23 de Nov. de 2014. Gracias.", language: 'es')
1494
+ expect(ps.segment).to eq(["He apuntado una cita para la siguiente fecha: Mar. 23 de Nov. de 2014.", "Gracias."])
1495
+ end
1496
+
1497
+ it 'correctly segments text #008' do
1498
+ ps = PragmaticSegmenter::Segmenter.new(text: "Núm. de tel: 351.123.465.4. Envíe mis saludos a la Sra. Rescia.", language: 'es')
1499
+ expect(ps.segment).to eq(["Núm. de tel: 351.123.465.4.", "Envíe mis saludos a la Sra. Rescia."])
1500
+ end
1501
+
1502
+ it 'correctly segments text #009' do
1503
+ ps = PragmaticSegmenter::Segmenter.new(text: "Cero en la escala Celsius o de grados centígrados (0 °C) se define como el equivalente a 273.15 K, con una diferencia de temperatura de 1 °C equivalente a una diferencia de 1 Kelvin. Esto significa que 100 °C, definido como el punto de ebullición del agua, se define como el equivalente a 373.15 K.", language: 'es')
1504
+ expect(ps.segment).to eq(["Cero en la escala Celsius o de grados centígrados (0 °C) se define como el equivalente a 273.15 K, con una diferencia de temperatura de 1 °C equivalente a una diferencia de 1 Kelvin.", "Esto significa que 100 °C, definido como el punto de ebullición del agua, se define como el equivalente a 373.15 K."])
1505
+ end
1506
+
1507
+ it 'correctly segments text #010' do
1508
+ ps = PragmaticSegmenter::Segmenter.new(text: "Durante la primera misión del Discovery (30 Ago. 1984 15:08.10) tuvo lugar el lanzamiento de dos satélites de comunicación, el nombre de esta misión fue STS-41-D.", language: 'es')
1509
+ expect(ps.segment).to eq(["Durante la primera misión del Discovery (30 Ago. 1984 15:08.10) tuvo lugar el lanzamiento de dos satélites de comunicación, el nombre de esta misión fue STS-41-D."])
1510
+ end
1511
+
1512
+ it 'correctly segments text #011' do
1513
+ ps = PragmaticSegmenter::Segmenter.new(text: "Frase del gran José Hernández: \"Aquí me pongo a cantar / al compás de la vigüela, / que el hombre que lo desvela / una pena estrordinaria, / como la ave solitaria / con el cantar se consuela. / [...] \".", language: 'es')
1514
+ expect(ps.segment).to eq(["Frase del gran José Hernández: \"Aquí me pongo a cantar / al compás de la vigüela, / que el hombre que lo desvela / una pena estrordinaria, / como la ave solitaria / con el cantar se consuela. / [...] \"."])
1515
+ end
1516
+
1517
+ it 'correctly segments text #012' do
1518
+ ps = PragmaticSegmenter::Segmenter.new(text: "Citando a Criss Jami «Prefiero ser un artista a ser un líder, irónicamente, un líder tiene que seguir las reglas.», lo cual parece muy acertado.", language: 'es')
1519
+ expect(ps.segment).to eq(["Citando a Criss Jami «Prefiero ser un artista a ser un líder, irónicamente, un líder tiene que seguir las reglas.», lo cual parece muy acertado."])
1520
+ end
1521
+
1522
+ it 'correctly segments text #013' do
1523
+ ps = PragmaticSegmenter::Segmenter.new(text: "Cuando llegué, le estaba dando ejercicios a los niños, uno de los cuales era \"3 + (14/7).x = 5\". ¿Qué te parece?", language: 'es')
1524
+ expect(ps.segment).to eq(["Cuando llegué, le estaba dando ejercicios a los niños, uno de los cuales era \"3 + (14/7).x = 5\".", "¿Qué te parece?"])
1525
+ end
1526
+
1527
+ it 'correctly segments text #014' do
1528
+ ps = PragmaticSegmenter::Segmenter.new(text: "Se le pidió a los niños que leyeran los párrf. 5 y 6 del art. 4 de la constitución de los EE. UU..", language: 'es')
1529
+ expect(ps.segment).to eq(["Se le pidió a los niños que leyeran los párrf. 5 y 6 del art. 4 de la constitución de los EE. UU.."])
1530
+ end
1531
+
1532
+ it 'correctly segments text #015' do
1533
+ ps = PragmaticSegmenter::Segmenter.new(text: "Una de las preguntas realizadas en la evaluación del día Lun. 15 de Mar. fue la siguiente: \"Alumnos, ¿cuál es el resultado de la operación 1.1 + 4/5?\". Disponían de 1 min. para responder esa pregunta.", language: 'es')
1534
+ expect(ps.segment).to eq(["Una de las preguntas realizadas en la evaluación del día Lun. 15 de Mar. fue la siguiente: \"Alumnos, ¿cuál es el resultado de la operación 1.1 + 4/5?\".", "Disponían de 1 min. para responder esa pregunta."])
1535
+ end
1536
+
1537
+ it 'correctly segments text #016' do
1538
+ ps = PragmaticSegmenter::Segmenter.new(text: "La temperatura del motor alcanzó los 120.5°C. Afortunadamente, pudo llegar al final de carrera.", language: 'es')
1539
+ expect(ps.segment).to eq(["La temperatura del motor alcanzó los 120.5°C.", "Afortunadamente, pudo llegar al final de carrera."])
1540
+ end
1541
+
1542
+ it 'correctly segments text #017' do
1543
+ ps = PragmaticSegmenter::Segmenter.new(text: "El volumen del cuerpo es 3m³. ¿Cuál es la superficie de cada cara del prisma?", language: 'es')
1544
+ expect(ps.segment).to eq(["El volumen del cuerpo es 3m³.", "¿Cuál es la superficie de cada cara del prisma?"])
1545
+ end
1546
+
1547
+ it 'correctly segments text #018' do
1548
+ ps = PragmaticSegmenter::Segmenter.new(text: "La habitación tiene 20.55m². El living tiene 50.0m².", language: 'es')
1549
+ expect(ps.segment).to eq(["La habitación tiene 20.55m².", "El living tiene 50.0m²."])
1550
+ end
1551
+
1552
+ it 'correctly segments text #019' do
1553
+ ps = PragmaticSegmenter::Segmenter.new(text: "1°C corresponde a 33.8°F. ¿A cuánto corresponde 35°C?", language: 'es')
1554
+ expect(ps.segment).to eq(["1°C corresponde a 33.8°F.", "¿A cuánto corresponde 35°C?"])
1555
+ end
1556
+
1557
+ it 'correctly segments text #020' do
1558
+ ps = PragmaticSegmenter::Segmenter.new(text: "Hamilton ganó el último gran premio de Fórmula 1, luego de 1:39:02.619 Hs. de carrera, segundo resultó Massa, a una diferencia de 2.5 segundos. De esta manera se consagró ¡Campeón mundial!", language: 'es')
1559
+ expect(ps.segment).to eq(["Hamilton ganó el último gran premio de Fórmula 1, luego de 1:39:02.619 Hs. de carrera, segundo resultó Massa, a una diferencia de 2.5 segundos.", "De esta manera se consagró ¡Campeón mundial!"])
1560
+ end
1561
+
1562
+ it 'correctly segments text #021' do
1563
+ ps = PragmaticSegmenter::Segmenter.new(text: "¡La casa cuesta $170.500.000,00! ¡Muy costosa! Se prevé una disminución del 12.5% para el próximo año.", language: 'es')
1564
+ expect(ps.segment).to eq(["¡La casa cuesta $170.500.000,00!", "¡Muy costosa!", "Se prevé una disminución del 12.5% para el próximo año."])
1565
+ end
1566
+
1567
+ it 'correctly segments text #022' do
1568
+ ps = PragmaticSegmenter::Segmenter.new(text: "El corredor No. 103 arrivó 4°.", language: 'es')
1569
+ expect(ps.segment).to eq(["El corredor No. 103 arrivó 4°."])
1570
+ end
1571
+
1572
+ it 'correctly segments text #023' do
1573
+ ps = PragmaticSegmenter::Segmenter.new(text: "Hoy es 27/04/2014, y es mi cumpleaños. ¿Cuándo es el tuyo?", language: 'es')
1574
+ expect(ps.segment).to eq(["Hoy es 27/04/2014, y es mi cumpleaños.", "¿Cuándo es el tuyo?"])
1575
+ end
1576
+
1577
+ it 'correctly segments text #024' do
1578
+ ps = PragmaticSegmenter::Segmenter.new(text: "Aquí está la lista de compras para el almuerzo: 1.Helado, 2.Carne, 3.Arroz. ¿Cuánto costará? Quizás $12.5.", language: 'es')
1579
+ expect(ps.segment).to eq(["Aquí está la lista de compras para el almuerzo: 1.Helado, 2.Carne, 3.Arroz.", "¿Cuánto costará?", "Quizás $12.5."])
1580
+ end
1581
+
1582
+ it 'correctly segments text #025' do
1583
+ ps = PragmaticSegmenter::Segmenter.new(text: "1 + 1 es 2. 2 + 2 es 4. El auto es de color rojo.", language: 'es')
1584
+ expect(ps.segment).to eq(["1 + 1 es 2.", "2 + 2 es 4.", "El auto es de color rojo."])
1585
+ end
1586
+
1587
+ it 'correctly segments text #026' do
1588
+ ps = PragmaticSegmenter::Segmenter.new(text: "La máquina viajaba a 100 km/h. ¿En cuánto tiempo recorrió los 153 Km.?", language: 'es')
1589
+ expect(ps.segment).to eq(["La máquina viajaba a 100 km/h.", "¿En cuánto tiempo recorrió los 153 Km.?"])
1590
+ end
1591
+
1592
+ it 'correctly segments text #027' do
1593
+ ps = PragmaticSegmenter::Segmenter.new(text: "\n \nCentro de Relaciones Interinstitucionales -CERI \n\nCra. 7 No. 40-53 Piso 10 Tel. (57-1) 3239300 Ext. 1010 Fax: (57-1) 3402973 Bogotá, D.C. - Colombia \n\nhttp://www.udistrital.edu.co - http://ceri.udistrital.edu.co - relinter@udistrital.edu.co \n\n \n\nCERI 0908 \n \nBogotá, D.C. 6 de noviembre de 2014. \n \nSeñores: \nEMBAJADA DE UNITED KINGDOM \n \n", language: 'es')
1594
+ expect(ps.segment).to eq(["Centro de Relaciones Interinstitucionales -CERI", "Cra. 7 No. 40-53 Piso 10 Tel. (57-1) 3239300 Ext. 1010 Fax: (57-1) 3402973 Bogotá, D.C. - Colombia", "http://www.udistrital.edu.co - http://ceri.udistrital.edu.co - relinter@udistrital.edu.co", "CERI 0908", "Bogotá, D.C. 6 de noviembre de 2014.", "Señores:", "EMBAJADA DE UNITED KINGDOM"])
1595
+ end
1596
+
1597
+ it 'correctly segments text #028' do
1598
+ ps = PragmaticSegmenter::Segmenter.new(text: "N°. 1026.253.553", language: 'es')
1599
+ expect(ps.segment).to eq(["N°. 1026.253.553"])
1600
+ end
1601
+
1602
+ it 'correctly segments text #029' do
1603
+ ps = PragmaticSegmenter::Segmenter.new(text: "\nA continuación me permito presentar a la Ingeniera LAURA MILENA LEÓN \nSANDOVAL, identificada con el documento N°. 1026.253.553 de Bogotá, \negresada del Programa Ingeniería Industrial en el año 2012, quien se desatacó por \nsu excelencia académica, actualmente cursa el programa de Maestría en \nIngeniería Industrial y se encuentra en un intercambio cultural en Bangalore – \nIndia.", language: 'es', doc_type: 'pdf')
1604
+ expect(ps.segment).to eq(["A continuación me permito presentar a la Ingeniera LAURA MILENA LEÓN SANDOVAL, identificada con el documento N°. 1026.253.553 de Bogotá, egresada del Programa Ingeniería Industrial en el año 2012, quien se desatacó por su excelencia académica, actualmente cursa el programa de Maestría en Ingeniería Industrial y se encuentra en un intercambio cultural en Bangalore – India."])
1605
+ end
1606
+
1607
+ it 'correctly segments text #030' do
1608
+ ps = PragmaticSegmenter::Segmenter.new(text: "\n__________________________________________________________\nEl Board para Servicios Educativos de Putnam/Northern Westchester según el título IX, Sección 504 del “Rehabilitation Act” del 1973, del Título VII y del Acta “American with Disabilities” no discrimina para la admisión a programas educativos por sexo, creencia, nacionalidad, origen, edad o discapacidad.", language: 'es')
1609
+ expect(ps.segment).to eq(["El Board para Servicios Educativos de Putnam/Northern Westchester según el título IX, Sección 504 del “Rehabilitation Act” del 1973, del Título VII y del Acta “American with Disabilities” no discrimina para la admisión a programas educativos por sexo, creencia, nacionalidad, origen, edad o discapacidad."])
1610
+ end
1611
+
1612
+ it 'correctly segments text #031' do
1613
+ ps = PragmaticSegmenter::Segmenter.new(text: "Explora oportunidades de carrera en el área de Salud en el Hospital de Northern en Mt. Kisco.", language: 'es')
1614
+ expect(ps.segment).to eq(["Explora oportunidades de carrera en el área de Salud en el Hospital de Northern en Mt. Kisco."])
1615
+ end
1616
+ end
1617
+ end
1618
+
1619
+ context 'Language: Hindi (hi)' do
1620
+ describe '#segment' do
1621
+ it 'correctly segments text #001' do
1622
+ ps = PragmaticSegmenter::Segmenter.new(text: "सच्चाई यह है कि इसे कोई नहीं जानता। हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।", language: 'hi')
1623
+ expect(ps.segment).to eq(["सच्चाई यह है कि इसे कोई नहीं जानता।", "हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।"])
1624
+ end
1625
+ end
1626
+ end
1627
+
1628
+ context 'Language: Greek (el)' do
1629
+ describe '#segment' do
1630
+ it 'correctly segments text #001' do
1631
+ ps = PragmaticSegmenter::Segmenter.new(text: "Με συγχωρείτε· πού είναι οι τουαλέτες; Τις Κυριακές δε δούλευε κανένας. το κόστος του σπιτιού ήταν £260.950,00.", language: 'el')
1632
+ expect(ps.segment).to eq(["Με συγχωρείτε· πού είναι οι τουαλέτες;", "Τις Κυριακές δε δούλευε κανένας.", "το κόστος του σπιτιού ήταν £260.950,00."])
1633
+ end
1634
+ end
1635
+ end
1636
+
1637
+ context 'Language: French (fr)' do
1638
+ describe '#segment' do
1639
+ it 'correctly segments text #001' do
1640
+ ps = PragmaticSegmenter::Segmenter.new(text: "Après avoir été l'un des acteurs du projet génome humain, le Genoscope met aujourd'hui le cap vers la génomique environnementale. L'exploitation des données de séquences, prolongée par l'identification expérimentale des fonctions biologiques, notamment dans le domaine de la biocatalyse, ouvrent des perspectives de développements en biotechnologie industrielle.", language: 'fr')
1641
+ expect(ps.segment).to eq(["Après avoir été l'un des acteurs du projet génome humain, le Genoscope met aujourd'hui le cap vers la génomique environnementale.", "L'exploitation des données de séquences, prolongée par l'identification expérimentale des fonctions biologiques, notamment dans le domaine de la biocatalyse, ouvrent des perspectives de développements en biotechnologie industrielle."])
1642
+ end
1643
+
1644
+ it 'correctly segments text #002' do
1645
+ ps = PragmaticSegmenter::Segmenter.new(text: "\"Airbus livrera comme prévu 30 appareils 380 cette année avec en ligne de mire l'objectif d'équilibre financier du programme en 2015\", a-t-il ajouté.", language: 'fr')
1646
+ expect(ps.segment).to eq(["\"Airbus livrera comme prévu 30 appareils 380 cette année avec en ligne de mire l'objectif d'équilibre financier du programme en 2015\", a-t-il ajouté."])
1647
+ end
1648
+
1649
+ it 'correctly segments text #003' do
1650
+ ps = PragmaticSegmenter::Segmenter.new(text: "À 11 heures ce matin, la direction ne décomptait que douze grévistes en tout sur la France : ce sont ceux du site de Saran (Loiret), dont l’effectif est de 809 salariés, dont la moitié d’intérimaires. Elle assure que ce mouvement « n’aura aucun impact sur les livraisons ».", language: 'fr')
1651
+ expect(ps.segment).to eq(["À 11 heures ce matin, la direction ne décomptait que douze grévistes en tout sur la France : ce sont ceux du site de Saran (Loiret), dont l’effectif est de 809 salariés, dont la moitié d’intérimaires.", "Elle assure que ce mouvement « n’aura aucun impact sur les livraisons »."])
1652
+ end
1653
+
1654
+ it 'correctly segments text #004' do
1655
+ ps = PragmaticSegmenter::Segmenter.new(text: "Ce modèle permet d’afficher le texte « LL.AA.II.RR. » pour l’abréviation de « Leurs Altesses impériales et royales » avec son infobulle.", language: 'fr')
1656
+ expect(ps.segment).to eq(["Ce modèle permet d’afficher le texte « LL.AA.II.RR. » pour l’abréviation de « Leurs Altesses impériales et royales » avec son infobulle."])
1657
+ end
1658
+
1659
+ it 'correctly segments text #005' do
1660
+ ps = PragmaticSegmenter::Segmenter.new(text: "Les derniers ouvrages de Intercept Ltd. sont ici.", language: 'fr')
1661
+ expect(ps.segment).to eq(["Les derniers ouvrages de Intercept Ltd. sont ici."])
1662
+ end
1663
+ end
1664
+ end
1665
+
1666
+ context 'Language: Armenian (hy)' do
1667
+ describe '#segment' do
1668
+ # Thanks to Armine Abelyan for the Armenian test examples.
1669
+
1670
+ it 'correctly segments text #001' do
1671
+ ps = PragmaticSegmenter::Segmenter.new(text: "Սա այն փուլն է, երբ տեղի է ունենում Համակարգի մշակումը: Համաձայն Փուլ 2-ի, Մատակարարը մշակում և/կամ հարմարեցնում է համապատասխան ծրագիրը, տեղադրում ծրագրի բաղկացուցիչները, կատարում առանձին բլոկի և համակարգի թեստավորում և ներառում տարբեր մոդուլներ եզակի աշխատանքային համակարգում, որը կազմում է այս Փուլի արդյունքը:", language: 'hy')
1672
+ expect(ps.segment).to eq(["Սա այն փուլն է, երբ տեղի է ունենում Համակարգի մշակումը:", "Համաձայն Փուլ 2-ի, Մատակարարը մշակում և/կամ հարմարեցնում է համապատասխան ծրագիրը, տեղադրում ծրագրի բաղկացուցիչները, կատարում առանձին բլոկի և համակարգի թեստավորում և ներառում տարբեր մոդուլներ եզակի աշխատանքային համակարգում, որը կազմում է այս Փուլի արդյունքը:"])
1673
+ end
1674
+
1675
+ it 'correctly segments text #002' do
1676
+ ps = PragmaticSegmenter::Segmenter.new(text: "Մատակարարի նախագծի անձնակազմի կողմից համակարգի թեստերը հաջող անցնելուց հետո, Համակարգը տրվում է Գնորդին թեստավորման համար: 2-րդ փուլում, հիմք ընդունելով թեստային սցենարիոները, թեստերը կատարվում են Կառավարության կողմից Մատակարարի աջակցությամբ: Այս թեստերի թիրախը հանդիսանում է Համակարգի` որպես մեկ ամբողջության և համակարգի գործունեության ստուգումը համաձայն տեխնիկական բնութագրերի: Այս թեստերի հաջողակ ավարտից հետո, Համակարգը ժամանակավոր ընդունվում է Կառավարության կողմից: Այս թեստերի արդյունքները փաստաթղթային ձևով կներակայացվեն Թեստային Արդյունքների Հաշվետվություններում: Մատակարարը պետք է տրամադրի հետևյալը`", language: 'hy')
1677
+ expect(ps.segment).to eq(["Մատակարարի նախագծի անձնակազմի կողմից համակարգի թեստերը հաջող անցնելուց հետո, Համակարգը տրվում է Գնորդին թեստավորման համար:", "2-րդ փուլում, հիմք ընդունելով թեստային սցենարիոները, թեստերը կատարվում են Կառավարության կողմից Մատակարարի աջակցությամբ:", "Այս թեստերի թիրախը հանդիսանում է Համակարգի` որպես մեկ ամբողջության և համակարգի գործունեության ստուգումը համաձայն տեխնիկական բնութագրերի:", "Այս թեստերի հաջողակ ավարտից հետո, Համակարգը ժամանակավոր ընդունվում է Կառավարության կողմից:", "Այս թեստերի արդյունքները փաստաթղթային ձևով կներակայացվեն Թեստային Արդյունքների Հաշվետվություններում:", "Մատակարարը պետք է տրամադրի հետևյալը`"])
1678
+ end
1679
+
1680
+ it 'correctly segments text #003' do
1681
+ ps = PragmaticSegmenter::Segmenter.new(text: "Մատակարարի նախագծի անձնակազմի կողմից համակարգի թեստերը հաջող անցնելուց հետո, Համակարգը տրվում է Գնորդին թեստավորման համար: 2-րդ փուլում, հիմք ընդունելով թեստային սցենարիոները, թեստերը կատարվում են Կառավարության կողմից Մատակարարի աջակցությամբ: Այս թեստերի թիրախը հանդիսանում է Համակարգի` որպես մեկ ամբողջության և համակարգի գործունեության ստուգումը համաձայն տեխնիկական բնութագրերի: Այս թեստերի հաջողակ ավարտից հետո, Համակարգը ժամանակավոր ընդունվում է Կառավարության կողմից: Այս թեստերի արդյունքները փաստաթղթային ձևով կներակայացվեն Թեստային Արդյունքների Հաշվետվություններում: Մատակարարը պետք է տրամադրի հետևյալը`", language: 'hy')
1682
+ expect(ps.segment).to eq(["Մատակարարի նախագծի անձնակազմի կողմից համակարգի թեստերը հաջող անցնելուց հետո, Համակարգը տրվում է Գնորդին թեստավորման համար:", "2-րդ փուլում, հիմք ընդունելով թեստային սցենարիոները, թեստերը կատարվում են Կառավարության կողմից Մատակարարի աջակցությամբ:", "Այս թեստերի թիրախը հանդիսանում է Համակարգի` որպես մեկ ամբողջության և համակարգի գործունեության ստուգումը համաձայն տեխնիկական բնութագրերի:", "Այս թեստերի հաջողակ ավարտից հետո, Համակարգը ժամանակավոր ընդունվում է Կառավարության կողմից:", "Այս թեստերի արդյունքները փաստաթղթային ձևով կներակայացվեն Թեստային Արդյունքների Հաշվետվություններում:", "Մատակարարը պետք է տրամադրի հետևյալը`"])
1683
+ end
1684
+
1685
+ it 'correctly segments text #004' do
1686
+ # "Hello world. My name is Armine." ==> ["Hello world.", "My name is Armine."]
1687
+ ps = PragmaticSegmenter::Segmenter.new(text: "Բարև Ձեզ: Իմ անունն էԱրմինե:", language: 'hy')
1688
+ expect(ps.segment).to eq(["Բարև Ձեզ:", "Իմ անունն էԱրմինե:"])
1689
+ end
1690
+
1691
+ it 'correctly segments text #005' do
1692
+ # "Today is Monday. I am going to work." ==> ["Today is Monday.", "I am going to work."]
1693
+ ps = PragmaticSegmenter::Segmenter.new(text: "Այսօր երկուշաբթի է: Ես գնում եմ աշխատանքի:", language: 'hy')
1694
+ expect(ps.segment).to eq(["Այսօր երկուշաբթի է:", "Ես գնում եմ աշխատանքի:"])
1695
+ end
1696
+
1697
+ it 'correctly segments text #006' do
1698
+ # "Tomorrow is September 1st. We are going to school." ==> ["Tomorrow is September 1st.", "We are going to school."]
1699
+ ps = PragmaticSegmenter::Segmenter.new(text: "Վաղը սեպտեմբերի 1-ն է: Մենք գնում ենք դպրոց:", language: 'hy')
1700
+ expect(ps.segment).to eq(["Վաղը սեպտեմբերի 1-ն է:", "Մենք գնում ենք դպրոց:"])
1701
+ end
1702
+
1703
+ it 'correctly segments text #007' do
1704
+ # "Yes, I understood. I really love you." ==> ["Yes, I understood.", "I really love you."]
1705
+ ps = PragmaticSegmenter::Segmenter.new(text: "Այո, ես հասկացա: Ես իսկապես քեզ սիրում եմ:", language: 'hy')
1706
+ expect(ps.segment).to eq(["Այո, ես հասկացա:", "Ես իսկապես քեզ սիրում եմ:"])
1707
+ end
1708
+
1709
+ it 'correctly segments text #008' do
1710
+ # "Close the windows. It is raining in the evening." ==> ["Close the windows.", "It is raining in the evening."]
1711
+ ps = PragmaticSegmenter::Segmenter.new(text: "Փակիր պատուհանները: Երեկոյան անձրևում է:", language: 'hy')
1712
+ expect(ps.segment).to eq(["Փակիր պատուհանները:", "Երեկոյան անձրևում է:"])
1713
+ end
1714
+
1715
+ it 'correctly segments text #009' do
1716
+ # "It is dark. I should go home." ==> ["It is dark.", "I should go home."]
1717
+ ps = PragmaticSegmenter::Segmenter.new(text: "Մութ է: Ես պետք է տուն վերադառնամ:", language: 'hy')
1718
+ expect(ps.segment).to eq(["Մութ է:", "Ես պետք է տուն վերադառնամ:"])
1719
+ end
1720
+
1721
+ it 'correctly segments text #010' do
1722
+ # "You know, I am starting to believe. Everything is changing." ==> ["You know, I am starting to believe.", "Everything is changing."]
1723
+ ps = PragmaticSegmenter::Segmenter.new(text: "Գիտես, սկսել եմ հավատալ: Ամեն ինչ փոխվում է:", language: 'hy')
1724
+ expect(ps.segment).to eq(["Գիտես, սկսել եմ հավատալ:", "Ամեն ինչ փոխվում է:"])
1725
+ end
1726
+
1727
+ it 'correctly segments text #011' do
1728
+ # "It is a new Christmas tree. We should decorate it." ==> ["It is a new Christmas tree.", "We should decorate it."]
1729
+ ps = PragmaticSegmenter::Segmenter.new(text: "Տոնածառը նոր է: Պետք է այն զարդարել:", language: 'hy')
1730
+ expect(ps.segment).to eq(["Տոնածառը նոր է:", "Պետք է այն զարդարել:"])
1731
+ end
1732
+
1733
+ it 'correctly segments text #012' do
1734
+ # "I am in a hurry. I cannot wait for you." ==> ["I am in a hurry.", "I cannot wait for you."]
1735
+ ps = PragmaticSegmenter::Segmenter.new(text: "Ես շտապում եմ: Ես քեզ չեմ կարող սպասել:", language: 'hy')
1736
+ expect(ps.segment).to eq(["Ես շտապում եմ:", "Ես քեզ չեմ կարող սպասել:"])
1737
+ end
1738
+
1739
+ it 'correctly segments text #013' do
1740
+ # "Wait, we love each other. I want us to live together." ==> ["Wait, we love each other.", "I want us to live together."]
1741
+ ps = PragmaticSegmenter::Segmenter.new(text: "Սպասիր, մենք իրար սիրում ենք: Ցանկանում եմ միասին ապրենք:", language: 'hy')
1742
+ expect(ps.segment).to eq(["Սպասիր, մենք իրար սիրում ենք:", "Ցանկանում եմ միասին ապրենք:"])
1743
+ end
1744
+
1745
+ it 'correctly segments text #014' do
1746
+ # "No, I do not think so. It is not true." ==> ["No, I do not think so.", "It is not true."]
1747
+ ps = PragmaticSegmenter::Segmenter.new(text: "Ոչ, այդպես չեմ կարծում: Դա ճիշտ չէ:", language: 'hy')
1748
+ expect(ps.segment).to eq(["Ոչ, այդպես չեմ կարծում:", "Դա ճիշտ չէ:"])
1749
+ end
1750
+
1751
+ it 'correctly segments text #015' do
1752
+ # "On April 24 it started to rain... I thought so." ==> ["On April 24 it started to rain... I thought so."]
1753
+ ps = PragmaticSegmenter::Segmenter.new(text: "Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:", language: 'hy')
1754
+ expect(ps.segment).to eq(["Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:"])
1755
+ end
1756
+
1757
+ it 'correctly segments text #016' do
1758
+ # "It was 1960...it was winter...it was night. It was cold...emptiness." ==> ["It was 1960...it was winter...it was night.", "It was cold...emptiness."]
1759
+ ps = PragmaticSegmenter::Segmenter.new(text: "1960 թվական…ձմեռ…գիշեր: Սառն էր…դատարկություն:", language: 'hy')
1760
+ expect(ps.segment).to eq(["1960 թվական…ձմեռ…գիշեր:", "Սառն էր…դատարկություն:"])
1761
+ end
1762
+
1763
+ it 'correctly segments text #017' do
1764
+ # "Why can't a computer do what a man can do? It simply doesn't have a human brain." ==> ["Why can't a computer do what a man can do?", "It simply doesn't have a human brain."]
1765
+ ps = PragmaticSegmenter::Segmenter.new(text: "Ինչ՟ու այն, ինչ անում է մարդը, չի կարող անել համակարգիչը: Պարզապես չունի մարդկային ուղեղ:", language: 'hy')
1766
+ expect(ps.segment).to eq(["Ինչ՟ու այն, ինչ անում է մարդը, չի կարող անել համակարգիչը:", "Պարզապես չունի մարդկային ուղեղ:"])
1767
+ end
1768
+
1769
+ it 'correctly segments text #018' do
1770
+ # "Enumerate for me 3 things that are important to you - I answer love, knowledge, sincerity." ==> ["Enumerate for me 3 things that are important to you - I answer love, knowledge, sincerity."]
1771
+ ps = PragmaticSegmenter::Segmenter.new(text: "Թվարկիր ինձ համար 3 բան, որ կարևոր է քեզ համար - Պատասխանում եմ. սեր, գիտելիք, ազնվություն:", language: 'hy')
1772
+ expect(ps.segment).to eq(["Թվարկիր ինձ համար 3 բան, որ կարևոր է քեզ համար - Պատասխանում եմ. սեր, գիտելիք, ազնվություն:"])
1773
+ end
1774
+
1775
+ it 'correctly segments text #019' do
1776
+ # "So, we are coming to the end. The logic is...simplicity and work." ==> ["So, we are coming to the end.", "The logic is...simplicity and work."]
1777
+ ps = PragmaticSegmenter::Segmenter.new(text: "Այսպիսով` մոտենում ենք ավարտին: Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:", language: 'hy')
1778
+ expect(ps.segment).to eq(["Այսպիսով` մոտենում ենք ավարտին:", "Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:"])
1779
+ end
1780
+
1781
+ it 'correctly segments text #020' do
1782
+ # "What are you thinking? Nothing!" ==> ["What are you thinking?", "Nothing!"]
1783
+ ps = PragmaticSegmenter::Segmenter.new(text: "Ի՞նչ ես մտածում: Ոչինչ:", language: 'hy')
1784
+ expect(ps.segment).to eq(["Ի՞նչ ես մտածում:", "Ոչինչ:"])
1785
+ end
1786
+
1787
+ it 'correctly segments text #021' do
1788
+ # "Can we work together?. Maybe what you are thinking, is possible." ==> ["Can we work together?.", "Maybe what you are thinking, is possible."]
1789
+ ps = PragmaticSegmenter::Segmenter.new(text: "Կարող ե՞նք միասին աշխատել: Գուցե այն ինչ մտածում ես, իրականանալի է:", language: 'hy')
1790
+ expect(ps.segment).to eq(["Կարող ե՞նք միասին աշխատել:", "Գուցե այն ինչ մտածում ես, իրականանալի է:"])
1791
+ end
1792
+
1793
+ it 'correctly segments text #022' do
1794
+ # "Now what we have started, comes to the end. However the questions are numerous... ." ==> ["Now what we have started, comes to the end.", "However the questions are numerous... ."]
1795
+ ps = PragmaticSegmenter::Segmenter.new(text: "Հիմա, այն ինչ սկսել ենք, ավարտին է մոտենում: Հարցերը սակայն շատ են...:", language: 'hy')
1796
+ expect(ps.segment).to eq(["Հիմա, այն ինչ սկսել ենք, ավարտին է մոտենում:", "Հարցերը սակայն շատ են...:"])
1797
+ end
1798
+
1799
+ it 'correctly segments text #023' do
1800
+ # "Honey... I am waiting. Shall I go... or?" ==> ["Honey... I am waiting.", "Shall I go... or?"]
1801
+ ps = PragmaticSegmenter::Segmenter.new(text: "Սիրելիս...սպասում եմ: Գնամ թ՟ե …:", language: 'hy')
1802
+ expect(ps.segment).to eq(["Սիրելիս...սպասում եմ:", "Գնամ թ՟ե …:"])
1803
+ end
1804
+ end
1805
+ end
1806
+
1807
+ context 'Language: Burmese (my)' do
1808
+ describe '#segment' do
1809
+ it 'correctly segments text #001' do
1810
+ ps = PragmaticSegmenter::Segmenter.new(text: "ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။၇ွင္ေနေကာင္းလား။", language: 'my')
1811
+ expect(ps.segment).to eq(["ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။", "၇ွင္ေနေကာင္းလား။"])
1812
+ end
1813
+ end
1814
+ end
1815
+
1816
+ context 'Language: Amharic (am)' do
1817
+ describe '#segment' do
1818
+ it 'correctly segments text #001' do
1819
+ ps = PragmaticSegmenter::Segmenter.new(text: "እንደምን አለህ፧መልካም ቀን ይሁንልህ።እባክሽ ያልሽዉን ድገሚልኝ።", language: 'am')
1820
+ expect(ps.segment).to eq(["እንደምን አለህ፧", "መልካም ቀን ይሁንልህ።", "እባክሽ ያልሽዉን ድገሚልኝ።"])
1821
+ end
1822
+ end
1823
+ end
1824
+
1825
+ context 'Language: Persian (fa)' do
1826
+ describe '#segment' do
1827
+ it 'correctly segments text #001' do
1828
+ ps = PragmaticSegmenter::Segmenter.new(text: "خوشبختم، آقای رضا. شما کجایی هستید؟ من از تهران هستم.", language: 'fa')
1829
+ expect(ps.segment).to eq(["خوشبختم، آقای رضا.", "شما کجایی هستید؟", "من از تهران هستم."])
1830
+ end
1831
+ end
1832
+ end
1833
+
1834
+ context 'Language: Urdu (ur)' do
1835
+ describe '#segment' do
1836
+ it 'correctly segments text #001' do
1837
+ ps = PragmaticSegmenter::Segmenter.new(text: "کیا حال ہے؟ ميرا نام ___ ەے۔ میں حالا تاوان دےدوں؟", language: 'ur')
1838
+ expect(ps.segment).to eq(["کیا حال ہے؟", "ميرا نام ___ ەے۔", "میں حالا تاوان دےدوں؟"])
1839
+ end
1840
+ end
1841
+ end
1842
+
1843
+ context 'Language: Chinese (zh)' do
1844
+ describe '#segment' do
1845
+ it 'correctly segments text #001' do
1846
+ ps = PragmaticSegmenter::Segmenter.new(text: "安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。", language: 'zh')
1847
+ expect(ps.segment).to eq(["安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。", "周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。"])
1848
+ end
1849
+ end
1850
+ end
1851
+
1852
+ context 'miscellaneous tests' do
1853
+ describe '#segment' do
1854
+ it 'handles nil' do
1855
+ ps = PragmaticSegmenter::Segmenter.new(text: nil)
1856
+ expect(ps.segment).to eq([])
1857
+ end
1858
+
1859
+ it 'handles no language' do
1860
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Hello world. Hello.')
1861
+ expect(ps.segment).to eq(["Hello world.", "Hello."])
1862
+ end
1863
+
1864
+ it 'handles empty strings' do
1865
+ ps = PragmaticSegmenter::Segmenter.new(text: "\n")
1866
+ expect(ps.segment).to eq([])
1867
+ end
1868
+
1869
+ it 'handles empty strings' do
1870
+ ps = PragmaticSegmenter::Segmenter.new(text: "<b></b>")
1871
+ expect(ps.segment).to eq([])
1872
+ end
1873
+
1874
+ it 'handles empty strings' do
1875
+ ps = PragmaticSegmenter::Segmenter.new(text: '')
1876
+ expect(ps.segment).to eq([])
1877
+ end
1878
+
1879
+ it 'has an option to not use the cleaner' do
1880
+ ps = PragmaticSegmenter::Segmenter.new(text: "It was a cold \nnight in the city.", language: "en", clean: false)
1881
+ expect(ps.segment).to eq(["It was a cold", "night in the city."])
1882
+ end
1883
+
1884
+ it 'does not mutate the input string' do
1885
+ text = "It was a cold \nnight in the city."
1886
+ PragmaticSegmenter::Segmenter.new(text: text, language: "en").segment
1887
+ expect(text).to eq("It was a cold \nnight in the city.")
1888
+ end
1889
+
1890
+
1891
+ end
1892
+
1893
+ describe '#clean' do
1894
+ it 'cleans the text' do
1895
+ ps = PragmaticSegmenter::Cleaner.new(text: "It was a cold \nnight in the city.", language: "en")
1896
+ expect(ps.clean).to eq("It was a cold night in the city.")
1897
+ end
1898
+
1899
+ it 'does not mutate the input string (cleaner)' do
1900
+ text = "It was a cold \nnight in the city."
1901
+ PragmaticSegmenter::Cleaner.new(text: text, language: "en").clean
1902
+ expect(text).to eq("It was a cold \nnight in the city.")
1903
+ end
1904
+ end
1905
+ end
1906
+ end