pragmatic_segmenter 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +6 -0
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +1 -1
- data/lib/pragmatic_segmenter/cleaner.rb +44 -1
- data/lib/pragmatic_segmenter/languages/deutsch.rb +6 -1
- data/lib/pragmatic_segmenter/languages/english.rb +5 -1
- data/lib/pragmatic_segmenter/languages/french.rb +5 -0
- data/lib/pragmatic_segmenter/languages/italian.rb +5 -0
- data/lib/pragmatic_segmenter/languages/russian.rb +5 -0
- data/lib/pragmatic_segmenter/languages/spanish.rb +5 -0
- data/lib/pragmatic_segmenter/process.rb +3 -2
- data/lib/pragmatic_segmenter/rules.rb +1 -1
- data/lib/pragmatic_segmenter/segmenter.rb +0 -2
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter_spec.rb +10 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ae3798fa47a86a8928835153af20c91d181ab2d5
|
4
|
+
data.tar.gz: 265670562de5e8b25aa90454919f044c910353f8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 33eea4d021662c497763950fb5815e29b59214d2c4d7056f77f081ea3edc77a0fc9c54a9467d89dc5d278174e316ced772a6f826bb477c711239eb2b0d0b1722
|
7
|
+
data.tar.gz: 73fb101b5a2c6a3d2f57bdede1d37a70286519dd54ca93aaa2cfc467dd5202961bd3f27b3f25c906ea67e3038542823303bbb411bb4ffd6fafd9f7145f3aa116
|
data/README.md
CHANGED
@@ -407,6 +407,12 @@ One further habit which was somewhat weakened . . . was that of combining words
|
|
407
407
|
=> ["One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.", ". . . The practice was not abandoned. . . ."]
|
408
408
|
```
|
409
409
|
|
410
|
+
52.) **No whitespace in between sentences** *Credit: Don_Patrick*
|
411
|
+
```
|
412
|
+
Hello world.Today is Tuesday.Mr. Smith went to the store and bought 1,000.That is a lot.
|
413
|
+
=> ["Hello world.", "Today is Tuesday.", "Mr. Smith went to the store and bought 1,000.", "That is a lot."]
|
414
|
+
```
|
415
|
+
|
410
416
|
####Golden Rules (German)
|
411
417
|
|
412
418
|
1.) **Quotation at end of sentence**
|
data/lib/pragmatic_segmenter/cleaner.rb
CHANGED
@@ -17,6 +17,14 @@ module PragmaticSegmenter
|
|
17
17
|
# xhtml, inline formatting, etc.
|
18
18
|
class Cleaner
|
19
19
|
include Rules
|
20
|
+
URL_EMAIL_KEYWORDS = ['@', 'http', '.com', 'net', 'www', '//']
|
21
|
+
|
22
|
+
# Rubular: http://rubular.com/r/6dt98uI76u
|
23
|
+
NO_SPACE_BETWEEN_SENTENCES_REGEX = /(?<=[a-z])\.(?=[A-Z])/
|
24
|
+
|
25
|
+
# Rubular: http://rubular.com/r/l6KN6rH5XE
|
26
|
+
NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX = /(?<=\d)\.(?=[A-Z])/
|
27
|
+
|
20
28
|
# Rubular: http://rubular.com/r/V57WnM9Zut
|
21
29
|
NewLineInMiddleOfWordRule = Rule.new(/\n(?=[a-zA-Z]{1,2}\n)/, '')
|
22
30
|
|
@@ -53,9 +61,17 @@ module PragmaticSegmenter
|
|
53
61
|
# Rubular: http://rubular.com/r/IQ4TPfsbd8
|
54
62
|
ConsecutiveForwardSlashRule = Rule.new(/\/{3}/, '')
|
55
63
|
|
64
|
+
# Rubular: http://rubular.com/r/6dt98uI76u
|
65
|
+
NoSpaceBetweenSentencesRule = Rule.new(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')
|
66
|
+
|
67
|
+
# Rubular: http://rubular.com/r/l6KN6rH5XE
|
68
|
+
NoSpaceBetweenSentencesDigitRule = Rule.new(NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')
|
69
|
+
|
56
70
|
EscapedCarriageReturnRule = Rule.new(/\\r/, "\r")
|
71
|
+
TypoEscapedCarriageReturnRule = Rule.new(/\\\ r/, "\r")
|
57
72
|
|
58
73
|
EscapedNewLineRule = Rule.new(/\\n/, "\n")
|
74
|
+
TypoEscapedNewLineRule = Rule.new(/\\\ n/, "\n")
|
59
75
|
|
60
76
|
ReplaceNewlineWithCarriageReturnRule = Rule.new(/\n/, "\r")
|
61
77
|
|
@@ -92,11 +108,36 @@ module PragmaticSegmenter
|
|
92
108
|
@clean_text.apply(InlineFormattingRule)
|
93
109
|
clean_quotations(@clean_text)
|
94
110
|
clean_table_of_contents(@clean_text)
|
111
|
+
check_for_no_space_in_between_sentences(@clean_text)
|
95
112
|
clean_consecutive_characters(@clean_text)
|
96
113
|
end
|
97
114
|
|
98
115
|
private
|
99
116
|
|
117
|
+
def check_for_no_space_in_between_sentences(txt)
|
118
|
+
words = txt.split(' ')
|
119
|
+
words.each do |word|
|
120
|
+
search_for_connected_sentences(word, txt, NO_SPACE_BETWEEN_SENTENCES_REGEX, NoSpaceBetweenSentencesRule)
|
121
|
+
search_for_connected_sentences(word, txt, NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, NoSpaceBetweenSentencesDigitRule)
|
122
|
+
end
|
123
|
+
txt
|
124
|
+
end
|
125
|
+
|
126
|
+
def search_for_connected_sentences(word, txt, regex, rule)
|
127
|
+
if word =~ regex
|
128
|
+
unless URL_EMAIL_KEYWORDS.any? { |web| word =~ /#{web}/ }
|
129
|
+
unless abbreviations.any? { |abbr| word =~ /#{abbr}/i }
|
130
|
+
new_word = word.dup.apply(rule)
|
131
|
+
txt.gsub!(/#{Regexp.escape(word)}/, new_word)
|
132
|
+
end
|
133
|
+
end
|
134
|
+
end
|
135
|
+
end
|
136
|
+
|
137
|
+
def abbreviations
|
138
|
+
@abbr ||= PragmaticSegmenter::Abbreviation.new.all
|
139
|
+
end
|
140
|
+
|
100
141
|
def remove_all_newlines(txt)
|
101
142
|
clean_text = remove_newline_in_middle_of_sentence(txt)
|
102
143
|
remove_newline_in_middle_of_word(clean_text)
|
@@ -118,7 +159,9 @@ module PragmaticSegmenter
|
|
118
159
|
|
119
160
|
def replace_escaped_newlines(txt)
|
120
161
|
txt.apply(EscapedNewLineRule).
|
121
|
-
apply(EscapedCarriageReturnRule)
|
162
|
+
apply(EscapedCarriageReturnRule).
|
163
|
+
apply(TypoEscapedNewLineRule).
|
164
|
+
apply(TypoEscapedCarriageReturnRule)
|
122
165
|
end
|
123
166
|
|
124
167
|
def replace_double_newlines(txt)
|
data/lib/pragmatic_segmenter/languages/deutsch.rb
CHANGED
@@ -18,6 +18,11 @@ module PragmaticSegmenter
|
|
18
18
|
end
|
19
19
|
|
20
20
|
class Cleaner < PragmaticSegmenter::Cleaner
|
21
|
+
private
|
22
|
+
|
23
|
+
def abbreviations
|
24
|
+
PragmaticSegmenter::Languages::Deutsch::Abbreviation.new.all
|
25
|
+
end
|
21
26
|
end
|
22
27
|
|
23
28
|
class Number < PragmaticSegmenter::Number
|
@@ -58,7 +63,7 @@ module PragmaticSegmenter
|
|
58
63
|
end
|
59
64
|
|
60
65
|
class Abbreviation < PragmaticSegmenter::Abbreviation
|
61
|
-
ABBREVIATIONS = ['Ä', 'ä', 'adj', 'adm', 'adv', 'art', 'asst', 'b.a', 'b.s', 'bart', 'bldg', 'brig', 'bros', 'bse', 'buchst', 'bzgl', 'bzw', 'c.-à-d', 'ca', 'capt', 'chr', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'cpl', 'd.h', 'd.j', 'dergl', 'dgl', 'dkr', 'dr ', 'ens', 'etc', 'ev ', 'evtl', 'ff', 'g.g.a', 'g.u', 'gen', 'ggf', 'gov', 'hon', 'hosp', 'i.f', 'i.h.v', 'ii', 'iii', 'insp', 'iv', 'ix', 'jun', 'k.o', 'kath ', 'lfd', 'lt', 'ltd', 'm.e', 'maj', 'med', 'messrs', 'mio', 'mlle', 'mm', 'mme', 'mr', 'mrd', 'mrs', 'ms', 'msgr', 'mwst', 'no', 'nos', 'nr', 'o.ä', 'op', 'ord', 'pfc', 'ph', 'pp', 'prof', 'pvt', 'rep', 'reps', 'res', 'rev', 'rt', 's.p.a', 'sa', 'sen', 'sens', 'sfc', 'sgt', 'sog', 'sogen', 'spp', 'sr', 'st', 'std', 'str ', 'supt', 'surg', 'u.a ', 'u.e', 'u.s.w', 'u.u', 'u.ä', 'usf', 'usw', 'v', 'vgl', 'vi', 'vii', 'viii', 'vs', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'z.b
|
66
|
+
ABBREVIATIONS = ['Ä', 'ä', 'adj', 'adm', 'adv', 'art', 'asst', 'b.a', 'b.s', 'bart', 'bldg', 'brig', 'bros', 'bse', 'buchst', 'bzgl', 'bzw', 'c.-à-d', 'ca', 'capt', 'chr', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'cpl', 'd.h', 'd.j', 'dergl', 'dgl', 'dkr', 'dr ', 'ens', 'etc', 'ev ', 'evtl', 'ff', 'g.g.a', 'g.u', 'gen', 'ggf', 'gov', 'hon', 'hosp', 'i.f', 'i.h.v', 'ii', 'iii', 'insp', 'iv', 'ix', 'jun', 'k.o', 'kath ', 'lfd', 'lt', 'ltd', 'm.e', 'maj', 'med', 'messrs', 'mio', 'mlle', 'mm', 'mme', 'mr', 'mrd', 'mrs', 'ms', 'msgr', 'mwst', 'no', 'nos', 'nr', 'o.ä', 'op', 'ord', 'pfc', 'ph', 'pp', 'prof', 'pvt', 'rep', 'reps', 'res', 'rev', 'rt', 's.p.a', 'sa', 'sen', 'sens', 'sfc', 'sgt', 'sog', 'sogen', 'spp', 'sr', 'st', 'std', 'str ', 'supt', 'surg', 'u.a ', 'u.e', 'u.s.w', 'u.u', 'u.ä', 'usf', 'usw', 'v', 'vgl', 'vi', 'vii', 'viii', 'vs', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'z.b', 'z.t', 'z.z', 'z.zt', 'zt', 'zzt']
|
62
67
|
NUMBER_ABBREVIATIONS = ['art', 'ca', 'no', 'nos', 'nr', 'pp']
|
63
68
|
|
64
69
|
def all
|
data/lib/pragmatic_segmenter/languages/english.rb
CHANGED
@@ -15,13 +15,17 @@ module PragmaticSegmenter
|
|
15
15
|
def clean_quotations(txt)
|
16
16
|
txt.gsub(/`/, "'")
|
17
17
|
end
|
18
|
+
|
19
|
+
def abbreviations
|
20
|
+
[]
|
21
|
+
end
|
18
22
|
end
|
19
23
|
|
20
24
|
class AbbreviationReplacer < PragmaticSegmenter::AbbreviationReplacer
|
21
25
|
private
|
22
26
|
|
23
27
|
def abbreviations
|
24
|
-
PragmaticSegmenter::
|
28
|
+
PragmaticSegmenter::Abbreviation.new
|
25
29
|
end
|
26
30
|
end
|
27
31
|
end
|
data/lib/pragmatic_segmenter/languages/italian.rb
CHANGED
@@ -10,6 +10,11 @@ module PragmaticSegmenter
|
|
10
10
|
end
|
11
11
|
|
12
12
|
class Cleaner < PragmaticSegmenter::Cleaner
|
13
|
+
private
|
14
|
+
|
15
|
+
def abbreviations
|
16
|
+
PragmaticSegmenter::Languages::Italian::Abbreviation.new.all
|
17
|
+
end
|
13
18
|
end
|
14
19
|
|
15
20
|
class Abbreviation < PragmaticSegmenter::Abbreviation
|
data/lib/pragmatic_segmenter/languages/russian.rb
CHANGED
@@ -10,6 +10,11 @@ module PragmaticSegmenter
|
|
10
10
|
end
|
11
11
|
|
12
12
|
class Cleaner < PragmaticSegmenter::Cleaner
|
13
|
+
private
|
14
|
+
|
15
|
+
def abbreviations
|
16
|
+
PragmaticSegmenter::Languages::Russian::Abbreviation.new.all
|
17
|
+
end
|
13
18
|
end
|
14
19
|
|
15
20
|
class Abbreviation < PragmaticSegmenter::Abbreviation
|
data/lib/pragmatic_segmenter/languages/spanish.rb
CHANGED
@@ -10,6 +10,11 @@ module PragmaticSegmenter
|
|
10
10
|
end
|
11
11
|
|
12
12
|
class Cleaner < PragmaticSegmenter::Cleaner
|
13
|
+
private
|
14
|
+
|
15
|
+
def abbreviations
|
16
|
+
PragmaticSegmenter::Languages::Spanish::Abbreviation.new.all
|
17
|
+
end
|
13
18
|
end
|
14
19
|
|
15
20
|
class Abbreviation < PragmaticSegmenter::Abbreviation
|
data/lib/pragmatic_segmenter/process.rb
CHANGED
@@ -29,7 +29,8 @@ module PragmaticSegmenter
|
|
29
29
|
reformatted_text = PragmaticSegmenter::List.new(text: text).add_line_break
|
30
30
|
reformatted_text = replace_abbreviations(reformatted_text)
|
31
31
|
reformatted_text = replace_numbers(reformatted_text)
|
32
|
-
reformatted_text
|
32
|
+
reformatted_text.apply(AbbreviationsWithMultiplePeriodsAndEmailRule)
|
33
|
+
reformatted_text.apply(GeoLocationRule)
|
33
34
|
split_into_segments(reformatted_text)
|
34
35
|
end
|
35
36
|
|
@@ -37,7 +38,7 @@ module PragmaticSegmenter
|
|
37
38
|
|
38
39
|
def split_into_segments(txt)
|
39
40
|
txt.split("\r")
|
40
|
-
.map! { |segment| segment.apply(SingleNewLineRule, EllipsisRules::All
|
41
|
+
.map! { |segment| segment.apply(SingleNewLineRule, EllipsisRules::All) }
|
41
42
|
.map { |segment| check_for_punctuation(segment) }.flatten
|
42
43
|
.map! { |segment| segment.apply(SubSymbolsRules::All) }
|
43
44
|
.map { |segment| post_process_segments(segment) }
|
data/lib/pragmatic_segmenter/rules.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module PragmaticSegmenter
|
2
2
|
module Rules
|
3
3
|
# Rubular: http://rubular.com/r/EUbZCNfgei
|
4
|
-
|
4
|
+
AbbreviationsWithMultiplePeriodsAndEmailRule = Rule.new(/(\w)(\.)(\w)/, '\1∮\3')
|
5
5
|
|
6
6
|
# Rubular: http://rubular.com/r/G2opjedIm9
|
7
7
|
GeoLocationRule = Rule.new(/(?<=[a-zA-z]°)\.(?=\s*\d+)/, '∯')
|
data/lib/pragmatic_segmenter/segmenter.rb
CHANGED
@@ -32,7 +32,6 @@ module PragmaticSegmenter
|
|
32
32
|
@language = args[:language] || 'en'
|
33
33
|
@doc_type = args[:doc_type]
|
34
34
|
@text = text.dup
|
35
|
-
|
36
35
|
unless args[:clean].eql?(false)
|
37
36
|
@text = cleaner_class.new(text: @text, doc_type: args[:doc_type]).clean
|
38
37
|
end
|
@@ -40,7 +39,6 @@ module PragmaticSegmenter
|
|
40
39
|
|
41
40
|
def segment
|
42
41
|
return [] unless text
|
43
|
-
|
44
42
|
process_class.new(text: text, doc_type: doc_type).process
|
45
43
|
end
|
46
44
|
end
|
data/spec/pragmatic_segmenter_spec.rb
CHANGED
@@ -258,6 +258,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
|
|
258
258
|
ps = PragmaticSegmenter::Segmenter.new(text: "One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .", language: "en")
|
259
259
|
expect(ps.segment).to eq(["One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.", ". . . The practice was not abandoned. . . ."])
|
260
260
|
end
|
261
|
+
|
262
|
+
it "No whitespace in between sentences #052" do
|
263
|
+
ps = PragmaticSegmenter::Segmenter.new(text: "Hello world.Today is Tuesday.Mr. Smith went to the store and bought 1,000.That is a lot.", language: "en")
|
264
|
+
expect(ps.segment).to eq(["Hello world.", "Today is Tuesday.", "Mr. Smith went to the store and bought 1,000.", "That is a lot."])
|
265
|
+
end
|
261
266
|
end
|
262
267
|
|
263
268
|
context "Golden Rules (languages other than English)" do
|
@@ -873,6 +878,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
|
|
873
878
|
ps = PragmaticSegmenter::Segmenter.new(text: 'Hello World. \r\n Hello.', language: 'en')
|
874
879
|
expect(ps.segment).to eq(["Hello World.", "Hello."])
|
875
880
|
end
|
881
|
+
|
882
|
+
it 'correctly segments text #082' do
|
883
|
+
ps = PragmaticSegmenter::Segmenter.new(text: 'Hello World. \ r \ nHello.', language: 'en')
|
884
|
+
expect(ps.segment).to eq(["Hello World.", "Hello."])
|
885
|
+
end
|
876
886
|
end
|
877
887
|
end
|
878
888
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_segmenter
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-01-
|
11
|
+
date: 2015-01-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|