pragmatic_segmenter 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +1 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +730 -0
  7. data/Rakefile +4 -0
  8. data/lib/pragmatic_segmenter.rb +2 -0
  9. data/lib/pragmatic_segmenter/abbreviation.rb +22 -0
  10. data/lib/pragmatic_segmenter/abbreviation_replacer.rb +149 -0
  11. data/lib/pragmatic_segmenter/between_punctuation.rb +78 -0
  12. data/lib/pragmatic_segmenter/cleaner.rb +141 -0
  13. data/lib/pragmatic_segmenter/ellipsis.rb +36 -0
  14. data/lib/pragmatic_segmenter/exclamation_words.rb +19 -0
  15. data/lib/pragmatic_segmenter/languages/amharic.rb +33 -0
  16. data/lib/pragmatic_segmenter/languages/arabic.rb +83 -0
  17. data/lib/pragmatic_segmenter/languages/armenian.rb +33 -0
  18. data/lib/pragmatic_segmenter/languages/burmese.rb +33 -0
  19. data/lib/pragmatic_segmenter/languages/deutsch.rb +132 -0
  20. data/lib/pragmatic_segmenter/languages/english.rb +44 -0
  21. data/lib/pragmatic_segmenter/languages/french.rb +29 -0
  22. data/lib/pragmatic_segmenter/languages/greek.rb +29 -0
  23. data/lib/pragmatic_segmenter/languages/hindi.rb +33 -0
  24. data/lib/pragmatic_segmenter/languages/italian.rb +39 -0
  25. data/lib/pragmatic_segmenter/languages/japanese.rb +58 -0
  26. data/lib/pragmatic_segmenter/languages/persian.rb +56 -0
  27. data/lib/pragmatic_segmenter/languages/russian.rb +60 -0
  28. data/lib/pragmatic_segmenter/languages/spanish.rb +39 -0
  29. data/lib/pragmatic_segmenter/languages/urdu.rb +33 -0
  30. data/lib/pragmatic_segmenter/list.rb +169 -0
  31. data/lib/pragmatic_segmenter/number.rb +35 -0
  32. data/lib/pragmatic_segmenter/process.rb +126 -0
  33. data/lib/pragmatic_segmenter/punctuation.rb +12 -0
  34. data/lib/pragmatic_segmenter/punctuation_replacer.rb +62 -0
  35. data/lib/pragmatic_segmenter/rules.rb +38 -0
  36. data/lib/pragmatic_segmenter/segmenter.rb +81 -0
  37. data/lib/pragmatic_segmenter/sentence_boundary_punctuation.rb +17 -0
  38. data/lib/pragmatic_segmenter/single_letter_abbreviation.rb +37 -0
  39. data/lib/pragmatic_segmenter/types.rb +12 -0
  40. data/lib/pragmatic_segmenter/version.rb +3 -0
  41. data/pragmatic_segmenter.gemspec +25 -0
  42. data/spec/performance_spec.rb +24 -0
  43. data/spec/pragmatic_segmenter_spec.rb +1906 -0
  44. data/spec/spec_helper.rb +1 -0
  45. metadata +150 -0
@@ -0,0 +1,33 @@
module PragmaticSegmenter
  module Languages
    # Amharic overrides for the segmentation pipeline.
    class Amharic
      # Supplies the Amharic splitter and punctuation list to the hooks
      # inherited from PragmaticSegmenter::Process.
      class Process < PragmaticSegmenter::Process
        private

        # Splits +txt+ with Amharic::SentenceBoundaryPunctuation.
        def sentence_boundary_punctuation(txt)
          splitter = SentenceBoundaryPunctuation.new(text: txt)
          splitter.split
        end

        # Returns the list held in Amharic::Punctuation::PUNCT.
        def punctuation_array
          Punctuation.new.punct
        end
      end

      # Splits text on Amharic sentence-final marks.
      class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
        # Shortest run ending in one of ፧ ። ! ?, otherwise the run up to the
        # end of the line.
        SENTENCE_BOUNDARY = /.*?[፧።!\?]|.*?$/

        # Returns every SENTENCE_BOUNDARY match found in the text, in order.
        def split
          text.scan(SENTENCE_BOUNDARY)
        end
      end

      # Punctuation characters recognised for Amharic.
      class Punctuation < PragmaticSegmenter::Punctuation
        PUNCT = %w[። ፧ ? !]

        # Returns PUNCT.
        def punct
          PUNCT
        end
      end
    end
  end
end
@@ -0,0 +1,83 @@
module PragmaticSegmenter
  module Languages
    # Arabic overrides for the segmentation pipeline.
    class Arabic
      # Routes the hooks inherited from PragmaticSegmenter::Process to the
      # Arabic-specific classes defined below.
      class Process < PragmaticSegmenter::Process
        private

        # Splits +txt+ with Arabic::SentenceBoundaryPunctuation.
        def sentence_boundary_punctuation(txt)
          PragmaticSegmenter::Languages::Arabic::SentenceBoundaryPunctuation.new(text: txt).split
        end

        # Protects abbreviation periods in +txt+ via Arabic::AbbreviationReplacer.
        def replace_abbreviations(txt)
          PragmaticSegmenter::Languages::Arabic::AbbreviationReplacer.new(text: txt).replace
        end

        # Returns the list held in Arabic::Punctuation::PUNCT.
        def punctuation_array
          PragmaticSegmenter::Languages::Arabic::Punctuation.new.punct
        end
      end

      # Splits text on Arabic sentence boundaries after masking colons and
      # commas that do not end a sentence.
      class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
        # Shortest run ending in one of : . ! ? ؟ ،, otherwise the run up to
        # the end of the string or line.
        SENTENCE_BOUNDARY = /.*?[:\.!\?؟،]|.*?\z|.*?$/

        # Rubular: http://rubular.com/r/RX5HpdDIyv
        # A colon with a digit on both sides is replaced with '♭'.
        ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')

        # Rubular: http://rubular.com/r/kPRgApNHUg
        # An Arabic comma followed by whitespace, one word and another Arabic
        # comma is replaced with '♬'.
        ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')

        # Masks non-boundary punctuation, then returns every
        # SENTENCE_BOUNDARY match of the masked text.
        def split
          txt = replace_non_sentence_boundary_punctuation(text)
          txt.scan(SENTENCE_BOUNDARY)
        end

        private

        # Applies the colon rule and then the comma rule to +txt+.
        def replace_non_sentence_boundary_punctuation(txt)
          txt.apply(ReplaceColonBetweenNumbersRule)
             .apply(ReplaceNonSentenceBoundaryCommaRule)
        end
      end

      # Arabic abbreviation lists.
      class Abbreviation < PragmaticSegmenter::Abbreviation
        # NOTE(review): several entries are duplicated, and the last entry
        # appears to carry an invisible directional formatting character.
        # Both are kept unchanged here; confirm whether they are intended.
        ABBREVIATIONS = ['ا', 'ا. د', 'ا.د', 'ا.ش.ا', 'ا.ش.ا', 'إلخ', 'ت.ب', 'ت.ب', 'ج.ب', 'جم', 'ج.ب', 'ج.م.ع', 'ج.م.ع', 'س.ت', 'س.ت', 'سم', 'ص.ب.', 'ص.ب', 'كج.', 'كلم.', 'م', 'م.ب', 'م.ب', 'ه', 'د‪']

        # Returns ABBREVIATIONS.
        def all
          ABBREVIATIONS
        end

        # No prepositive abbreviations are defined for Arabic.
        def prepositive
          []
        end

        # No number abbreviations are defined for Arabic.
        def number
          []
        end
      end

      # Punctuation characters recognised for Arabic.
      class Punctuation < PragmaticSegmenter::Punctuation
        PUNCT = ['?', '!', ':', '.', '؟', '،']

        # Returns PUNCT.
        def punct
          PUNCT
        end
      end

      # Replaces the period that follows an abbreviation with '∯'.
      class AbbreviationReplacer < PragmaticSegmenter::AbbreviationReplacer
        private

        # Only +txt+ and +am+ are used; the remaining parameters are accepted
        # so the signature matches what the caller passes.
        def scan_for_replacements(txt, am, index, character_array, abbr)
          replace_abbr(txt, am)
        end

        # Returns a copy of +txt+ in which each period directly preceded by
        # +abbr+ is replaced with '∯'.
        def replace_abbr(txt, abbr)
          # Escape the abbreviation so characters such as the '.' inside
          # 'ا.د' match literally instead of acting as regex wildcards.
          # to_s keeps a nil +abbr+ behaving as it did under interpolation.
          txt.gsub(/(?<=#{Regexp.escape(abbr.to_s)})\./, '∯')
        end

        # Returns the Arabic abbreviation lists.
        def abbreviations
          PragmaticSegmenter::Languages::Arabic::Abbreviation.new
        end
      end
    end
  end
end
@@ -0,0 +1,33 @@
module PragmaticSegmenter
  module Languages
    # Armenian overrides for the segmentation pipeline.
    class Armenian
      # Supplies the Armenian splitter and punctuation list to the hooks
      # inherited from PragmaticSegmenter::Process.
      class Process < PragmaticSegmenter::Process
        private

        # Splits +txt+ with Armenian::SentenceBoundaryPunctuation.
        def sentence_boundary_punctuation(txt)
          splitter = SentenceBoundaryPunctuation.new(text: txt)
          splitter.split
        end

        # Returns the list held in Armenian::Punctuation::PUNCT.
        def punctuation_array
          Punctuation.new.punct
        end
      end

      # Splits text on Armenian sentence-final marks.
      class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
        # Shortest run ending in one of ։ ՜ :, otherwise the run up to the
        # end of the line.
        SENTENCE_BOUNDARY = /.*?[։՜:]|.*?$/

        # Returns every SENTENCE_BOUNDARY match found in the text, in order.
        def split
          text.scan(SENTENCE_BOUNDARY)
        end
      end

      # Punctuation characters recognised for Armenian.
      class Punctuation < PragmaticSegmenter::Punctuation
        PUNCT = %w[։ ՜ :]

        # Returns PUNCT.
        def punct
          PUNCT
        end
      end
    end
  end
end
@@ -0,0 +1,33 @@
module PragmaticSegmenter
  module Languages
    # Burmese overrides for the segmentation pipeline.
    class Burmese
      # Supplies the Burmese splitter and punctuation list to the hooks
      # inherited from PragmaticSegmenter::Process.
      class Process < PragmaticSegmenter::Process
        private

        # Splits +txt+ with Burmese::SentenceBoundaryPunctuation.
        def sentence_boundary_punctuation(txt)
          splitter = SentenceBoundaryPunctuation.new(text: txt)
          splitter.split
        end

        # Returns the list held in Burmese::Punctuation::PUNCT.
        def punctuation_array
          Punctuation.new.punct
        end
      end

      # Splits text on Burmese sentence-final marks.
      class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
        # Shortest run ending in one of ။ ၏ ! ?, otherwise the run up to the
        # end of the line.
        SENTENCE_BOUNDARY = /.*?[။၏!\?]|.*?$/

        # Returns every SENTENCE_BOUNDARY match found in the text, in order.
        def split
          text.scan(SENTENCE_BOUNDARY)
        end
      end

      # Punctuation characters recognised for Burmese.
      class Punctuation < PragmaticSegmenter::Punctuation
        PUNCT = %w[။ ၏ ? !]

        # Returns PUNCT.
        def punct
          PUNCT
        end
      end
    end
  end
end
@@ -0,0 +1,132 @@
module PragmaticSegmenter
  module Languages
    # German overrides for the segmentation pipeline.
    class Deutsch
      # Routes the hooks inherited from PragmaticSegmenter::Process to the
      # German-specific classes defined below.
      class Process < PragmaticSegmenter::Process
        private

        # NOTE(review): the method name is spelled "punctutation"; it is kept
        # unchanged because it presumably has to match the hook name used by
        # the base Process class - confirm before renaming.
        def between_punctutation(txt)
          PragmaticSegmenter::Languages::Deutsch::BetweenPunctuation.new(text: txt).replace
        end

        # Protects number-related periods in +txt+ via Deutsch::Number.
        def replace_numbers(txt)
          PragmaticSegmenter::Languages::Deutsch::Number.new(text: txt).replace
        end

        # Protects abbreviation periods in +txt+ via Deutsch::AbbreviationReplacer.
        def replace_abbreviations(txt)
          PragmaticSegmenter::Languages::Deutsch::AbbreviationReplacer.new(text: txt).replace
        end
      end

      # Adds German rules for a period that follows a one- or two-digit number.
      class Number < PragmaticSegmenter::Number
        # Rubular: http://rubular.com/r/hZxoyQwKT1
        # Period after whitespace plus a one- or two-digit number and before
        # whitespace is replaced with '∯'.
        NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯')

        # Rubular: http://rubular.com/r/ityNMwdghj
        # Same as above, but the number is preceded by '-'.
        NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯')

        # Runs the inherited replacement, then applies both German rules to
        # @formatted_text and returns the result.
        def replace
          super
          @formatted_text.apply(NumberPeriodSpaceRule).apply(NegativeNumberPeriodSpaceRule)
        end
      end

      # Adds German rules for a period that follows a single lowercase letter.
      class SingleLetterAbbreviation < PragmaticSegmenter::SingleLetterAbbreviation
        # Rubular: http://rubular.com/r/B4X33QKIL8
        # Period after whitespace plus one lowercase letter, before whitespace.
        SingleLowerCaseLetterRule = Rule.new(/(?<=\s[a-z])\.(?=\s)/, '∯')

        # Rubular: http://rubular.com/r/iUNSkCuso0
        # Period after one lowercase letter at the start of a line, before whitespace.
        SingleLowerCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[a-z])\.(?=\s)/, '∯')

        # Runs the inherited replacement, then applies the start-of-line rule
        # followed by the general single-letter rule.
        def replace
          super
          @formatted_text = replace_single_lowercase_letter_sol(@formatted_text)
          replace_single_lowercase_letter(@formatted_text)
        end

        private

        # Applies the start-of-line rule. Previously this helper applied the
        # general rule and its sibling applied this one; each name now matches
        # the rule it applies, and the application order is unchanged.
        def replace_single_lowercase_letter_sol(txt)
          txt.apply(SingleLowerCaseLetterAtStartOfLineRule)
        end

        # Applies the general single-lowercase-letter rule.
        def replace_single_lowercase_letter(txt)
          txt.apply(SingleLowerCaseLetterRule)
        end
      end

      # German abbreviation lists.
      class Abbreviation < PragmaticSegmenter::Abbreviation
        # NOTE(review): some entries end in a space ('dr ', 'ev ', 'kath ',
        # 'str ', 'u.a ', 'z.b ', 'z.t '); kept unchanged - confirm intent.
        ABBREVIATIONS = ['Ä', 'ä', 'adj', 'adm', 'adv', 'art', 'asst', 'b.a', 'b.s', 'bart', 'bldg', 'brig', 'bros', 'bse', 'buchst', 'bzgl', 'bzw', 'c.-à-d', 'ca', 'capt', 'chr', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'cpl', 'd.h', 'd.j', 'dergl', 'dgl', 'dkr', 'dr ', 'ens', 'etc', 'ev ', 'evtl', 'ff', 'g.g.a', 'g.u', 'gen', 'ggf', 'gov', 'hon', 'hosp', 'i.f', 'i.h.v', 'ii', 'iii', 'insp', 'iv', 'ix', 'jun', 'k.o', 'kath ', 'lfd', 'lt', 'ltd', 'm.e', 'maj', 'med', 'messrs', 'mio', 'mlle', 'mm', 'mme', 'mr', 'mrd', 'mrs', 'ms', 'msgr', 'mwst', 'no', 'nos', 'nr', 'o.ä', 'op', 'ord', 'pfc', 'ph', 'pp', 'prof', 'pvt', 'rep', 'reps', 'res', 'rev', 'rt', 's.p.a', 'sa', 'sen', 'sens', 'sfc', 'sgt', 'sog', 'sogen', 'spp', 'sr', 'st', 'std', 'str ', 'supt', 'surg', 'u.a ', 'u.e', 'u.s.w', 'u.u', 'u.ä', 'usf', 'usw', 'v', 'vgl', 'vi', 'vii', 'viii', 'vs', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'z.b ', 'z.t ', 'z.z', 'z.zt', 'zt', 'zzt']
        NUMBER_ABBREVIATIONS = ['art', 'ca', 'no', 'nos', 'nr', 'pp']

        # Returns ABBREVIATIONS.
        def all
          ABBREVIATIONS
        end

        # No prepositive abbreviations are defined for German.
        def prepositive
          []
        end

        # Returns NUMBER_ABBREVIATIONS.
        def number
          NUMBER_ABBREVIATIONS
        end
      end

      # German abbreviation pipeline: possessive rule, single-letter
      # abbreviations, abbreviation search, multi-period abbreviations, AM/PM
      # rules and finally abbreviations acting as sentence boundaries.
      class AbbreviationReplacer < PragmaticSegmenter::AbbreviationReplacer
        # Runs the steps above in order and returns the result of the last one.
        def replace
          @reformatted_text = text.apply(PossessiveAbbreviationRule)
          @reformatted_text = PragmaticSegmenter::Languages::Deutsch::SingleLetterAbbreviation.new(text: @reformatted_text).replace
          @reformatted_text = search_for_abbreviations_in_string(@reformatted_text, abbreviations)
          @reformatted_text = replace_multi_period_abbreviations(@reformatted_text)
          @reformatted_text = @reformatted_text.apply(AmPmRules::All)
          replace_abbreviation_as_sentence_boundary(@reformatted_text)
        end

        private

        # Only +txt+ and +am+ are used; the remaining parameters are accepted
        # so the signature matches what the caller passes.
        def scan_for_replacements(txt, am, index, character_array, abbr)
          replace_abbr(txt, am)
        end

        # Returns a copy of +txt+ in which each period that is directly
        # preceded by +abbr+ and followed by whitespace is replaced with '∯'.
        def replace_abbr(txt, abbr)
          # Escape the abbreviation so the '.' inside entries such as 'z.b'
          # matches literally instead of acting as a regex wildcard.
          # to_s keeps a nil +abbr+ behaving as it did under interpolation.
          txt.gsub(/(?<=#{Regexp.escape(abbr.to_s)})\.(?=\s)/, '∯')
        end

        # Returns the German abbreviation lists.
        def abbreviations
          PragmaticSegmenter::Languages::Deutsch::Abbreviation.new
        end
      end

      # Handles punctuation found inside German-style double quotes.
      class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
        # Rubular: http://rubular.com/r/OdcXBsub0w
        # Quote opened with two commas (,,) and closed with “.
        BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX = /,,(?>[^“\\]+|\\{2}|\\.)*“/

        # Rubular: http://rubular.com/r/2UskIupGgP
        # Quote opened with „ at the very start of the string and closed with “.
        SPLIT_DOUBLE_QUOTES_DE_REGEX = /\A„(?>[^“\\]+|\\{2}|\\.)*“/

        # Rubular: http://rubular.com/r/TkZomF9tTM
        # Quote opened with „ anywhere and closed with “.
        BETWEEN_DOUBLE_QUOTES_DE_REGEX = /„(?>[^“\\]+|\\{2}|\\.)*“/

        private

        # Collects the German quoted spans in +txt+ and hands them to
        # PunctuationReplacer together with the text.
        def sub_punctuation_between_double_quotes(txt)
          btwn_dbl_quote = sub_punctuation_between_double_quotes_de(txt)
          PragmaticSegmenter::PunctuationReplacer.new(
            matches_array: btwn_dbl_quote,
            text: txt
          ).replace
        end

        # Returns the quoted spans found in +txt+: the „…“ matches (with the
        # start-of-string matches appended) when +txt+ contains „, the ,,…“
        # matches when it contains ',,', and nil otherwise.
        # NOTE(review): the nil case is passed to PunctuationReplacer as
        # matches_array - confirm that the replacer accepts nil.
        def sub_punctuation_between_double_quotes_de(txt)
          if txt.include?('„')
            txt.scan(BETWEEN_DOUBLE_QUOTES_DE_REGEX)
               .concat(txt.scan(SPLIT_DOUBLE_QUOTES_DE_REGEX))
          elsif txt.include?(',,')
            txt.scan(BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,44 @@
module PragmaticSegmenter
  module Languages
    # English abbreviation data, text cleaning and abbreviation handling.
    class English
      # English abbreviation lists.
      class Abbreviation < PragmaticSegmenter::Abbreviation
        ABBREVIATIONS = [
          'adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug',
          'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col',
          'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det',
          'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb',
          'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id',
          'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la',
          'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn',
          'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr',
          'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde',
          'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'ref', 'rep', 'reps',
          'res', 'rev', 'rt', 'sask', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce',
          'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy',
          'wyo', 'yuk'
        ]
        PREPOSITIVE_ABBREVIATIONS = %w[
          adm attys brig capt cmdr col cpl det dr gen gov ing lt maj mr mrs ms mt messrs mssrs prof rep reps rev sen
          sens sgt st supt v vs
        ]
        NUMBER_ABBREVIATIONS = %w[art ext no nos p pp]

        # Returns ABBREVIATIONS.
        def all
          ABBREVIATIONS
        end

        # Returns PREPOSITIVE_ABBREVIATIONS.
        def prepositive
          PREPOSITIVE_ABBREVIATIONS
        end

        # Returns NUMBER_ABBREVIATIONS.
        def number
          NUMBER_ABBREVIATIONS
        end
      end

      # Extends the generic cleaner with backtick normalisation.
      class Cleaner < PragmaticSegmenter::Cleaner
        # Runs the inherited clean, then returns @clean_text with every
        # backtick turned into an apostrophe.
        def clean
          super
          clean_quotations(@clean_text)
        end

        private

        # Returns a copy of +txt+ with each ` replaced by '.
        def clean_quotations(txt)
          backtick = /`/
          txt.gsub(backtick, "'")
        end
      end

      # Abbreviation replacer backed by the English lists.
      class AbbreviationReplacer < PragmaticSegmenter::AbbreviationReplacer
        private

        # Returns the English abbreviation lists.
        def abbreviations
          Abbreviation.new
        end
      end
    end
  end
end
@@ -0,0 +1,29 @@
module PragmaticSegmenter
  module Languages
    # French abbreviation data and abbreviation handling.
    class French
      # French abbreviation lists.
      class Abbreviation < PragmaticSegmenter::Abbreviation
        # NOTE(review): 'co' and 'ltd' each appear twice; kept unchanged.
        ABBREVIATIONS = [
          'a.c.n', 'a.m', 'al', 'ann', 'apr', 'art', 'auj', 'av', 'b.p', 'boul', 'c.-à-d', 'c.n', 'c.n.s', 'c.p.i',
          'c.q.f.d', 'c.s', 'ca', 'cf', 'ch.-l', 'chap', 'co', 'co', 'contr', 'dir', 'e.g', 'e.v', 'env', 'etc', 'ex',
          'fasc', 'fig', 'fr', 'fém', 'hab', 'i.e', 'ibid', 'id', 'inf', 'l.d', 'lib', 'll.aa', 'll.aa.ii',
          'll.aa.rr', 'll.aa.ss', 'll.ee', 'll.mm', 'll.mm.ii.rr', 'loc.cit', 'ltd', 'ltd', 'masc', 'mm', 'ms', 'n.b',
          'n.d', 'n.d.a', 'n.d.l.r', 'n.d.t', 'n.p.a.i', 'n.s', 'n/réf', 'nn.ss', 'p.c.c', 'p.ex', 'p.j', 'p.s', 'pl',
          'pp', 'r.-v', 'r.a.s', 'r.i.p', 'r.p', 's.a', 's.a.i', 's.a.r', 's.a.s', 's.e', 's.m', 's.m.i.r', 's.s',
          'sec', 'sect', 'sing', 'sq', 'sqq', 'ss', 'suiv', 'sup', 'suppl', 't.s.v.p', 'tél', 'vb', 'vol', 'vs',
          'x.o', 'z.i', 'éd'
        ]

        # Returns ABBREVIATIONS.
        def all
          ABBREVIATIONS
        end

        # No prepositive abbreviations are defined for French.
        def prepositive
          []
        end

        # No number abbreviations are defined for French.
        def number
          []
        end
      end

      # Abbreviation replacer backed by the French lists.
      class AbbreviationReplacer < PragmaticSegmenter::AbbreviationReplacer
        private

        # Returns the French abbreviation lists.
        def abbreviations
          Abbreviation.new
        end
      end
    end
  end
end
@@ -0,0 +1,29 @@
module PragmaticSegmenter
  module Languages
    # Greek overrides for the segmentation pipeline.
    class Greek
      # Supplies the Greek splitter and punctuation list to the hooks
      # inherited from PragmaticSegmenter::Process.
      class Process < PragmaticSegmenter::Process
        private

        # Splits +txt+ with Greek::SentenceBoundaryPunctuation.
        def sentence_boundary_punctuation(txt)
          PragmaticSegmenter::Languages::Greek::SentenceBoundaryPunctuation.new(text: txt).split
        end

        # Returns the list held in Greek::Punctuation::PUNCT.
        # This override was missing, which left Greek::Punctuation (and its
        # ';' entry) unused by this class, unlike the other languages that
        # define their own Punctuation class.
        # NOTE(review): assumes the base Process consults punctuation_array,
        # as the sibling language overrides suggest - confirm.
        def punctuation_array
          PragmaticSegmenter::Languages::Greek::Punctuation.new.punct
        end
      end

      # Splits text on Greek sentence-final marks.
      class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
        # Shortest run ending in one of . ; ! ?, otherwise the run up to the
        # end of the line.
        SENTENCE_BOUNDARY = /.*?[\.;!\?]|.*?$/

        # Returns every SENTENCE_BOUNDARY match found in the text, in order.
        def split
          text.scan(SENTENCE_BOUNDARY)
        end
      end

      # Punctuation characters recognised for Greek.
      class Punctuation < PragmaticSegmenter::Punctuation
        PUNCT = ['.', '!', ';', '?']

        # Returns PUNCT.
        def punct
          PUNCT
        end
      end
    end
  end
end
@@ -0,0 +1,33 @@
module PragmaticSegmenter
  module Languages
    # Hindi overrides for the segmentation pipeline.
    class Hindi
      # Supplies the Hindi splitter and punctuation list to the hooks
      # inherited from PragmaticSegmenter::Process.
      class Process < PragmaticSegmenter::Process
        private

        # Splits +txt+ with Hindi::SentenceBoundaryPunctuation.
        def sentence_boundary_punctuation(txt)
          splitter = SentenceBoundaryPunctuation.new(text: txt)
          splitter.split
        end

        # Returns the list held in Hindi::Punctuation::PUNCT.
        def punctuation_array
          Punctuation.new.punct
        end
      end

      # Splits text on Hindi sentence-final marks.
      class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
        # Shortest run ending in one of । | ! ?, otherwise the run up to the
        # end of the line.
        # NOTE(review): '.' is listed in Punctuation::PUNCT but is not a
        # boundary character here - confirm this is intended.
        SENTENCE_BOUNDARY = /.*?[।\|!\?]|.*?$/

        # Returns every SENTENCE_BOUNDARY match found in the text, in order.
        def split
          text.scan(SENTENCE_BOUNDARY)
        end
      end

      # Punctuation characters recognised for Hindi.
      class Punctuation < PragmaticSegmenter::Punctuation
        PUNCT = %w[। | . ! ?]

        # Returns PUNCT.
        def punct
          PUNCT
        end
      end
    end
  end
end
+ end