pragmatic_segmenter 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +1 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +730 -0
- data/Rakefile +4 -0
- data/lib/pragmatic_segmenter.rb +2 -0
- data/lib/pragmatic_segmenter/abbreviation.rb +22 -0
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +149 -0
- data/lib/pragmatic_segmenter/between_punctuation.rb +78 -0
- data/lib/pragmatic_segmenter/cleaner.rb +141 -0
- data/lib/pragmatic_segmenter/ellipsis.rb +36 -0
- data/lib/pragmatic_segmenter/exclamation_words.rb +19 -0
- data/lib/pragmatic_segmenter/languages/amharic.rb +33 -0
- data/lib/pragmatic_segmenter/languages/arabic.rb +83 -0
- data/lib/pragmatic_segmenter/languages/armenian.rb +33 -0
- data/lib/pragmatic_segmenter/languages/burmese.rb +33 -0
- data/lib/pragmatic_segmenter/languages/deutsch.rb +132 -0
- data/lib/pragmatic_segmenter/languages/english.rb +44 -0
- data/lib/pragmatic_segmenter/languages/french.rb +29 -0
- data/lib/pragmatic_segmenter/languages/greek.rb +29 -0
- data/lib/pragmatic_segmenter/languages/hindi.rb +33 -0
- data/lib/pragmatic_segmenter/languages/italian.rb +39 -0
- data/lib/pragmatic_segmenter/languages/japanese.rb +58 -0
- data/lib/pragmatic_segmenter/languages/persian.rb +56 -0
- data/lib/pragmatic_segmenter/languages/russian.rb +60 -0
- data/lib/pragmatic_segmenter/languages/spanish.rb +39 -0
- data/lib/pragmatic_segmenter/languages/urdu.rb +33 -0
- data/lib/pragmatic_segmenter/list.rb +169 -0
- data/lib/pragmatic_segmenter/number.rb +35 -0
- data/lib/pragmatic_segmenter/process.rb +126 -0
- data/lib/pragmatic_segmenter/punctuation.rb +12 -0
- data/lib/pragmatic_segmenter/punctuation_replacer.rb +62 -0
- data/lib/pragmatic_segmenter/rules.rb +38 -0
- data/lib/pragmatic_segmenter/segmenter.rb +81 -0
- data/lib/pragmatic_segmenter/sentence_boundary_punctuation.rb +17 -0
- data/lib/pragmatic_segmenter/single_letter_abbreviation.rb +37 -0
- data/lib/pragmatic_segmenter/types.rb +12 -0
- data/lib/pragmatic_segmenter/version.rb +3 -0
- data/pragmatic_segmenter.gemspec +25 -0
- data/spec/performance_spec.rb +24 -0
- data/spec/pragmatic_segmenter_spec.rb +1906 -0
- data/spec/spec_helper.rb +1 -0
- metadata +150 -0
@@ -0,0 +1,33 @@
|
|
1
|
+
module PragmaticSegmenter
  module Languages
    # Amharic-specific pieces of the segmenter: a Process subclass, a
    # sentence splitter and the list of sentence terminators.
    class Amharic
      # Process subclass whose private methods route to the Amharic classes
      # defined in this namespace (presumably overriding hooks of the base
      # Process - confirm against PragmaticSegmenter::Process).
      class Process < PragmaticSegmenter::Process
        private

        # Builds an Amharic splitter for +txt+ and returns the result of
        # its #split.
        def sentence_boundary_punctuation(txt)
          splitter = Amharic::SentenceBoundaryPunctuation.new(text: txt)
          splitter.split
        end

        # Returns the Amharic terminator list (Punctuation::PUNCT).
        def punctuation_array
          Amharic::Punctuation.new.punct
        end
      end

      # Splits text on Amharic sentence terminators.
      class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
        # Either the shortest run ending in one of ፧ ። ! ? or, failing that,
        # whatever remains up to the end of the line.
        SENTENCE_BOUNDARY = /.*?[፧።!\?]|.*?$/

        # Returns every SENTENCE_BOUNDARY match found in +text+
        # (+text+ is expected to be supplied by the superclass).
        def split
          text.scan(SENTENCE_BOUNDARY)
        end
      end

      # Sentence-terminating characters for Amharic.
      class Punctuation < PragmaticSegmenter::Punctuation
        PUNCT = %w[። ፧ ? !]

        # Returns the PUNCT array itself (not a copy).
        def punct
          PUNCT
        end
      end
    end
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
module PragmaticSegmenter
  module Languages
    # Arabic-specific pieces of the segmenter: Process subclass, sentence
    # splitter, abbreviation list, terminator list and abbreviation replacer.
    class Arabic
      # Process subclass whose private methods route to the Arabic classes
      # defined in this namespace (presumably overriding hooks of the base
      # Process - confirm against PragmaticSegmenter::Process).
      class Process < PragmaticSegmenter::Process
        private

        # Builds an Arabic splitter for +txt+ and returns the result of
        # its #split.
        def sentence_boundary_punctuation(txt)
          splitter = Arabic::SentenceBoundaryPunctuation.new(text: txt)
          splitter.split
        end

        # Runs the Arabic abbreviation replacer over +txt+.
        def replace_abbreviations(txt)
          replacer = Arabic::AbbreviationReplacer.new(text: txt)
          replacer.replace
        end

        # Returns the Arabic terminator list (Punctuation::PUNCT).
        def punctuation_array
          Arabic::Punctuation.new.punct
        end
      end

      # Splits text on Arabic sentence terminators after masking colons and
      # commas that the two rules below identify as non-boundaries.
      class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
        # Shortest run ending in one of : . ! ? ؟ ، or, failing that, the
        # remainder up to end of string / end of line.
        SENTENCE_BOUNDARY = /.*?[:\.!\?؟،]|.*?\z|.*?$/

        # Rubular: http://rubular.com/r/RX5HpdDIyv
        # A colon sitting between two digits is swapped for '♭'.
        ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')

        # Rubular: http://rubular.com/r/kPRgApNHUg
        # An Arabic comma followed by whitespace, one word and another Arabic
        # comma is swapped for '♬'.
        ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')

        # Masks non-boundary punctuation in +text+, then returns every
        # SENTENCE_BOUNDARY match of the masked text.
        def split
          replace_non_sentence_boundary_punctuation(text).scan(SENTENCE_BOUNDARY)
        end

        private

        # Applies the colon rule first, then the comma rule, and returns the
        # resulting text.
        def replace_non_sentence_boundary_punctuation(txt)
          colons_masked = txt.apply(ReplaceColonBetweenNumbersRule)
          colons_masked.apply(ReplaceNonSentenceBoundaryCommaRule)
        end
      end

      # Known Arabic abbreviations.
      class Abbreviation < PragmaticSegmenter::Abbreviation
        # NOTE(review): several entries look duplicated; left untouched here.
        ABBREVIATIONS = ['ا', 'ا. د', 'ا.د', 'ا.ش.ا', 'ا.ش.ا', 'إلخ', 'ت.ب', 'ت.ب', 'ج.ب', 'جم', 'ج.ب', 'ج.م.ع', 'ج.م.ع', 'س.ت', 'س.ت', 'سم', 'ص.ب.', 'ص.ب', 'كج.', 'كلم.', 'م', 'م.ب', 'م.ب', 'ه', 'د']

        # Returns the full abbreviation list.
        def all
          ABBREVIATIONS
        end

        # No prepositive abbreviations are defined for Arabic.
        def prepositive
          []
        end

        # No number abbreviations are defined for Arabic.
        def number
          []
        end
      end

      # Sentence-terminating characters for Arabic.
      class Punctuation < PragmaticSegmenter::Punctuation
        PUNCT = ['?', '!', ':', '.', '؟', '،']

        # Returns the PUNCT array itself (not a copy).
        def punct
          PUNCT
        end
      end

      # Abbreviation replacer that uses the Arabic abbreviation list and a
      # simplified replacement step.
      class AbbreviationReplacer < PragmaticSegmenter::AbbreviationReplacer
        private

        # Delegates straight to #replace_abbr with +am+; the remaining
        # arguments (+index+, +character_array+, +abbr+) are not used here.
        def scan_for_replacements(txt, am, index, character_array, abbr)
          replace_abbr(txt, am)
        end

        # Returns a copy of +txt+ in which every period directly preceded by
        # +abbr+ is replaced with '∯'. +abbr+ is interpolated into the
        # pattern as-is (not escaped).
        def replace_abbr(txt, abbr)
          period_after_abbr = /(?<=#{abbr})\./
          txt.gsub(period_after_abbr, '∯')
        end

        # Supplies the Arabic abbreviation provider.
        def abbreviations
          Arabic::Abbreviation.new
        end
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module PragmaticSegmenter
  module Languages
    # Armenian-specific pieces of the segmenter: a Process subclass, a
    # sentence splitter and the list of sentence terminators.
    class Armenian
      # Process subclass whose private methods route to the Armenian classes
      # defined in this namespace (presumably overriding hooks of the base
      # Process - confirm against PragmaticSegmenter::Process).
      class Process < PragmaticSegmenter::Process
        private

        # Builds an Armenian splitter for +txt+ and returns the result of
        # its #split.
        def sentence_boundary_punctuation(txt)
          splitter = Armenian::SentenceBoundaryPunctuation.new(text: txt)
          splitter.split
        end

        # Returns the Armenian terminator list (Punctuation::PUNCT).
        def punctuation_array
          Armenian::Punctuation.new.punct
        end
      end

      # Splits text on Armenian sentence terminators.
      class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
        # Either the shortest run ending in one of ։ ՜ : or, failing that,
        # whatever remains up to the end of the line.
        SENTENCE_BOUNDARY = /.*?[։՜:]|.*?$/

        # Returns every SENTENCE_BOUNDARY match found in +text+
        # (+text+ is expected to be supplied by the superclass).
        def split
          text.scan(SENTENCE_BOUNDARY)
        end
      end

      # Sentence-terminating characters for Armenian.
      class Punctuation < PragmaticSegmenter::Punctuation
        PUNCT = %w[։ ՜ :]

        # Returns the PUNCT array itself (not a copy).
        def punct
          PUNCT
        end
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module PragmaticSegmenter
  module Languages
    # Burmese-specific pieces of the segmenter: a Process subclass, a
    # sentence splitter and the list of sentence terminators.
    class Burmese
      # Process subclass whose private methods route to the Burmese classes
      # defined in this namespace (presumably overriding hooks of the base
      # Process - confirm against PragmaticSegmenter::Process).
      class Process < PragmaticSegmenter::Process
        private

        # Builds a Burmese splitter for +txt+ and returns the result of
        # its #split.
        def sentence_boundary_punctuation(txt)
          splitter = Burmese::SentenceBoundaryPunctuation.new(text: txt)
          splitter.split
        end

        # Returns the Burmese terminator list (Punctuation::PUNCT).
        def punctuation_array
          Burmese::Punctuation.new.punct
        end
      end

      # Splits text on Burmese sentence terminators.
      class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
        # Either the shortest run ending in one of ။ ၏ ! ? or, failing that,
        # whatever remains up to the end of the line.
        SENTENCE_BOUNDARY = /.*?[။၏!\?]|.*?$/

        # Returns every SENTENCE_BOUNDARY match found in +text+
        # (+text+ is expected to be supplied by the superclass).
        def split
          text.scan(SENTENCE_BOUNDARY)
        end
      end

      # Sentence-terminating characters for Burmese.
      class Punctuation < PragmaticSegmenter::Punctuation
        PUNCT = %w[။ ၏ ? !]

        # Returns the PUNCT array itself (not a copy).
        def punct
          PUNCT
        end
      end
    end
  end
end
|
@@ -0,0 +1,132 @@
|
|
1
|
+
module PragmaticSegmenter
  module Languages
    # German-specific pieces of the segmenter: Process subclass, number and
    # single-letter handling, abbreviation list/replacer and quote handling.
    class Deutsch
      # Process subclass whose private methods route to the German classes
      # defined in this namespace (presumably overriding hooks of the base
      # Process - confirm against PragmaticSegmenter::Process).
      class Process < PragmaticSegmenter::Process
        private

        # NOTE(review): the method name is spelled "punctutation"; it is kept
        # as-is because it presumably has to match the name the base Process
        # calls - confirm before renaming.
        def between_punctutation(txt)
          PragmaticSegmenter::Languages::Deutsch::BetweenPunctuation.new(text: txt).replace
        end

        # Runs the German number replacer over +txt+.
        def replace_numbers(txt)
          PragmaticSegmenter::Languages::Deutsch::Number.new(text: txt).replace
        end

        # Runs the German abbreviation replacer over +txt+.
        def replace_abbreviations(txt)
          PragmaticSegmenter::Languages::Deutsch::AbbreviationReplacer.new(text: txt).replace
        end
      end

      # Adds two German rules on top of the base number handling: a period
      # after a one- or two-digit number followed by whitespace becomes '∯'.
      class Number < PragmaticSegmenter::Number
        # Rubular: http://rubular.com/r/hZxoyQwKT1
        NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯')

        # Rubular: http://rubular.com/r/ityNMwdghj
        NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯')

        # Runs the base replacement, then applies both German rules to
        # @formatted_text and returns the result.
        # NOTE(review): relies on super leaving its output in
        # @formatted_text - confirm in PragmaticSegmenter::Number.
        def replace
          super
          @formatted_text.apply(NumberPeriodSpaceRule).apply(NegativeNumberPeriodSpaceRule)
        end
      end

      # Adds handling of single lower-case letters followed by a period.
      class SingleLetterAbbreviation < PragmaticSegmenter::SingleLetterAbbreviation
        # Rubular: http://rubular.com/r/B4X33QKIL8
        SingleLowerCaseLetterRule = Rule.new(/(?<=\s[a-z])\.(?=\s)/, '∯')

        # Rubular: http://rubular.com/r/iUNSkCuso0
        SingleLowerCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[a-z])\.(?=\s)/, '∯')

        # Runs the base replacement, then the start-of-line rule (stored back
        # into @formatted_text), then the mid-line rule, whose result is
        # returned.
        # NOTE(review): relies on super leaving its output in
        # @formatted_text - confirm in the base class.
        def replace
          super
          @formatted_text = replace_single_lowercase_letter_sol(@formatted_text)
          replace_single_lowercase_letter(@formatted_text)
        end

        private

        # Start-of-line variant: a single lower-case letter at the beginning
        # of a line, followed by a period and whitespace.
        def replace_single_lowercase_letter_sol(txt)
          txt.apply(SingleLowerCaseLetterAtStartOfLineRule)
        end

        # Mid-line variant: whitespace, a single lower-case letter, a period
        # and whitespace.
        def replace_single_lowercase_letter(txt)
          txt.apply(SingleLowerCaseLetterRule)
        end
      end

      # Known German abbreviations.
      class Abbreviation < PragmaticSegmenter::Abbreviation
        # NOTE(review): a few entries carry a trailing space ('dr ', 'ev ',
        # 'kath ', 'str ', 'u.a ', 'z.b ', 'z.t '); they are kept verbatim -
        # confirm whether consumers strip them.
        ABBREVIATIONS = ['Ä', 'ä', 'adj', 'adm', 'adv', 'art', 'asst', 'b.a', 'b.s', 'bart', 'bldg', 'brig', 'bros', 'bse', 'buchst', 'bzgl', 'bzw', 'c.-à-d', 'ca', 'capt', 'chr', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'cpl', 'd.h', 'd.j', 'dergl', 'dgl', 'dkr', 'dr ', 'ens', 'etc', 'ev ', 'evtl', 'ff', 'g.g.a', 'g.u', 'gen', 'ggf', 'gov', 'hon', 'hosp', 'i.f', 'i.h.v', 'ii', 'iii', 'insp', 'iv', 'ix', 'jun', 'k.o', 'kath ', 'lfd', 'lt', 'ltd', 'm.e', 'maj', 'med', 'messrs', 'mio', 'mlle', 'mm', 'mme', 'mr', 'mrd', 'mrs', 'ms', 'msgr', 'mwst', 'no', 'nos', 'nr', 'o.ä', 'op', 'ord', 'pfc', 'ph', 'pp', 'prof', 'pvt', 'rep', 'reps', 'res', 'rev', 'rt', 's.p.a', 'sa', 'sen', 'sens', 'sfc', 'sgt', 'sog', 'sogen', 'spp', 'sr', 'st', 'std', 'str ', 'supt', 'surg', 'u.a ', 'u.e', 'u.s.w', 'u.u', 'u.ä', 'usf', 'usw', 'v', 'vgl', 'vi', 'vii', 'viii', 'vs', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'z.b ', 'z.t ', 'z.z', 'z.zt', 'zt', 'zzt']
        NUMBER_ABBREVIATIONS = ['art', 'ca', 'no', 'nos', 'nr', 'pp']

        # Returns the full abbreviation list.
        def all
          ABBREVIATIONS
        end

        # No prepositive abbreviations are defined for German.
        def prepositive
          []
        end

        # Returns the abbreviations listed in NUMBER_ABBREVIATIONS.
        def number
          NUMBER_ABBREVIATIONS
        end
      end

      # Abbreviation replacer wired to the German abbreviation list and the
      # German single-letter handling.
      class AbbreviationReplacer < PragmaticSegmenter::AbbreviationReplacer
        # Runs the replacement steps in order, each feeding the next through
        # @reformatted_text, and returns the result of the final step.
        def replace
          @reformatted_text = text.apply(PossessiveAbbreviationRule)
          @reformatted_text = PragmaticSegmenter::Languages::Deutsch::SingleLetterAbbreviation.new(text: @reformatted_text).replace
          @reformatted_text = search_for_abbreviations_in_string(@reformatted_text, abbreviations)
          @reformatted_text = replace_multi_period_abbreviations(@reformatted_text)
          @reformatted_text = @reformatted_text.apply(AmPmRules::All)
          replace_abbreviation_as_sentence_boundary(@reformatted_text)
        end

        private

        # Delegates straight to #replace_abbr with +am+; the remaining
        # arguments (+index+, +character_array+, +abbr+) are not used here.
        def scan_for_replacements(txt, am, index, character_array, abbr)
          replace_abbr(txt, am)
        end

        # Returns a copy of +txt+ in which every period that is directly
        # preceded by +abbr+ and followed by whitespace is replaced with '∯'.
        # +abbr+ is interpolated into the pattern as-is (not escaped).
        def replace_abbr(txt, abbr)
          txt.gsub(/(?<=#{abbr})\.(?=\s)/, '∯')
        end

        # Supplies the German abbreviation provider.
        def abbreviations
          PragmaticSegmenter::Languages::Deutsch::Abbreviation.new
        end
      end

      # Handles punctuation inside German-style double quotes.
      class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
        # Rubular: http://rubular.com/r/OdcXBsub0w
        BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX = /,,(?>[^“\\]+|\\{2}|\\.)*“/

        # Rubular: http://rubular.com/r/2UskIupGgP
        SPLIT_DOUBLE_QUOTES_DE_REGEX = /\A„(?>[^“\\]+|\\{2}|\\.)*“/

        # Rubular: http://rubular.com/r/TkZomF9tTM
        BETWEEN_DOUBLE_QUOTES_DE_REGEX = /„(?>[^“\\]+|\\{2}|\\.)*“/
        private

        # Collects the quoted spans found in +txt+ and hands them, together
        # with +txt+, to PunctuationReplacer; returns whatever its #replace
        # returns.
        def sub_punctuation_between_double_quotes(txt)
          btwn_dbl_quote = sub_punctuation_between_double_quotes_de(txt)
          PragmaticSegmenter::PunctuationReplacer.new(
            matches_array: btwn_dbl_quote,
            text: txt
          ).replace
        end

        # Returns the quoted spans of +txt+:
        # * if +txt+ contains '„': all „…“ spans, followed by the span
        #   anchored at the very start of the string (if any);
        # * else if +txt+ contains ',,': all ,,…“ spans;
        # * otherwise nil (deliberately preserved - the caller passes it on
        #   unchanged).
        def sub_punctuation_between_double_quotes_de(txt)
          if txt.include?('„')
            txt.scan(BETWEEN_DOUBLE_QUOTES_DE_REGEX) + txt.scan(SPLIT_DOUBLE_QUOTES_DE_REGEX)
          elsif txt.include?(',,')
            txt.scan(BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module PragmaticSegmenter
  module Languages
    # English-specific pieces of the segmenter: abbreviation lists, a text
    # cleaner and an abbreviation replacer.
    class English
      # Known English abbreviations, grouped into three lists.
      class Abbreviation < PragmaticSegmenter::Abbreviation
        ABBREVIATIONS = %w[
          adj adm adv al ala alta apr arc ariz ark art assn asst attys aug ave
          bart bld bldg blvd brig bros cal calif capt cl cmdr co col colo comdr
          con conn corp cpl cres ct d.phil dak dec del dept det dist dr dr.phil
          dr.philos drs e.g ens esp esq etc exp expy ext feb fed fla ft fwy fy
          ga gen gov hon hosp hr hway hwy i.e ia id ida ill inc ind ing insp is
          jan jr jul jun kan kans ken ky la lt ltd maj man mar mass may md me
          messrs mex mfg mich min minn miss mlle mm mme mo mont mr mrs ms msgr
          mssrs mt mtn neb nebr nev no nos nov nr oct ok okla ont op ord ore p
          pa pd pde penn penna pfc ph ph.d pl plz pp prof pvt que rd ref rep
          reps res rev rt sask sen sens sep sept sfc sgt sr st supt surg tce
          tenn tex univ usafa u.s ut va v ver vs vt wash wis wisc wy wyo yuk
        ]
        PREPOSITIVE_ABBREVIATIONS = %w[
          adm attys brig capt cmdr col cpl det dr gen gov ing lt maj mr mrs ms
          mt messrs mssrs prof rep reps rev sen sens sgt st supt v vs
        ]
        NUMBER_ABBREVIATIONS = %w[art ext no nos p pp]

        # Returns the full abbreviation list.
        def all
          ABBREVIATIONS
        end

        # Returns the abbreviations listed in PREPOSITIVE_ABBREVIATIONS.
        def prepositive
          PREPOSITIVE_ABBREVIATIONS
        end

        # Returns the abbreviations listed in NUMBER_ABBREVIATIONS.
        def number
          NUMBER_ABBREVIATIONS
        end
      end

      # Cleaner that additionally normalises backticks to apostrophes.
      class Cleaner < PragmaticSegmenter::Cleaner
        # Runs the base cleaning, then returns @clean_text with every
        # backtick replaced by an apostrophe.
        # NOTE(review): relies on super leaving its output in @clean_text -
        # confirm in PragmaticSegmenter::Cleaner.
        def clean
          super
          clean_quotations(@clean_text)
        end

        private

        # Returns a copy of +txt+ with each ` replaced by '.
        def clean_quotations(txt)
          backtick = /`/
          txt.gsub(backtick, "'")
        end
      end

      # Abbreviation replacer wired to the English abbreviation list.
      class AbbreviationReplacer < PragmaticSegmenter::AbbreviationReplacer
        private

        # Supplies the English abbreviation provider.
        def abbreviations
          English::Abbreviation.new
        end
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module PragmaticSegmenter
  module Languages
    # French-specific pieces of the segmenter: an abbreviation list and an
    # abbreviation replacer that uses it.
    class French
      # Known French abbreviations.
      class Abbreviation < PragmaticSegmenter::Abbreviation
        # NOTE(review): 'co' and 'ltd' each appear twice; kept verbatim.
        ABBREVIATIONS = %w[
          a.c.n a.m al ann apr art auj av b.p boul c.-à-d c.n c.n.s c.p.i
          c.q.f.d c.s ca cf ch.-l chap co co contr dir e.g e.v env etc ex fasc
          fig fr fém hab i.e ibid id inf l.d lib ll.aa ll.aa.ii ll.aa.rr
          ll.aa.ss ll.ee ll.mm ll.mm.ii.rr loc.cit ltd ltd masc mm ms n.b n.d
          n.d.a n.d.l.r n.d.t n.p.a.i n.s n/réf nn.ss p.c.c p.ex p.j p.s pl pp
          r.-v r.a.s r.i.p r.p s.a s.a.i s.a.r s.a.s s.e s.m s.m.i.r s.s sec
          sect sing sq sqq ss suiv sup suppl t.s.v.p tél vb vol vs x.o z.i éd
        ]

        # Returns the full abbreviation list.
        def all
          ABBREVIATIONS
        end

        # No prepositive abbreviations are defined for French.
        def prepositive
          []
        end

        # No number abbreviations are defined for French.
        def number
          []
        end
      end

      # Abbreviation replacer wired to the French abbreviation list.
      class AbbreviationReplacer < PragmaticSegmenter::AbbreviationReplacer
        private

        # Supplies the French abbreviation provider.
        def abbreviations
          French::Abbreviation.new
        end
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module PragmaticSegmenter
  module Languages
    # Greek-specific pieces of the segmenter: a Process subclass, a sentence
    # splitter and the list of sentence terminators.
    class Greek
      # Process subclass whose private methods route to the Greek classes
      # defined in this namespace (presumably overriding hooks of the base
      # Process - confirm against PragmaticSegmenter::Process).
      class Process < PragmaticSegmenter::Process
        private

        # Builds a Greek splitter for +txt+ and returns the result of
        # its #split.
        def sentence_boundary_punctuation(txt)
          PragmaticSegmenter::Languages::Greek::SentenceBoundaryPunctuation.new(text: txt).split
        end

        # Returns the Greek terminator list (Punctuation::PUNCT), which
        # includes ';'. Without this method the Punctuation class below is
        # never referenced from this file.
        # NOTE(review): assumes the base Process consults punctuation_array
        # when deciding whether/how to split - confirm in
        # PragmaticSegmenter::Process.
        def punctuation_array
          PragmaticSegmenter::Languages::Greek::Punctuation.new.punct
        end
      end

      # Splits text on Greek sentence terminators.
      class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
        # Either the shortest run ending in one of . ; ! ? or, failing that,
        # whatever remains up to the end of the line.
        SENTENCE_BOUNDARY = /.*?[\.;!\?]|.*?$/

        # Returns every SENTENCE_BOUNDARY match found in +text+
        # (+text+ is expected to be supplied by the superclass).
        def split
          text.scan(SENTENCE_BOUNDARY)
        end
      end

      # Sentence-terminating characters for Greek.
      class Punctuation < PragmaticSegmenter::Punctuation
        PUNCT = ['.', '!', ';', '?']

        # Returns the PUNCT array itself (not a copy).
        def punct
          PUNCT
        end
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module PragmaticSegmenter
  module Languages
    # Hindi-specific pieces of the segmenter: a Process subclass, a
    # sentence splitter and the list of sentence terminators.
    class Hindi
      # Process subclass whose private methods route to the Hindi classes
      # defined in this namespace (presumably overriding hooks of the base
      # Process - confirm against PragmaticSegmenter::Process).
      class Process < PragmaticSegmenter::Process
        private

        # Builds a Hindi splitter for +txt+ and returns the result of
        # its #split.
        def sentence_boundary_punctuation(txt)
          splitter = Hindi::SentenceBoundaryPunctuation.new(text: txt)
          splitter.split
        end

        # Returns the Hindi terminator list (Punctuation::PUNCT).
        def punctuation_array
          Hindi::Punctuation.new.punct
        end
      end

      # Splits text on Hindi sentence terminators.
      class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
        # Either the shortest run ending in one of । | ! ? or, failing that,
        # whatever remains up to the end of the line.
        SENTENCE_BOUNDARY = /.*?[।\|!\?]|.*?$/

        # Returns every SENTENCE_BOUNDARY match found in +text+
        # (+text+ is expected to be supplied by the superclass).
        def split
          text.scan(SENTENCE_BOUNDARY)
        end
      end

      # Sentence-terminating characters for Hindi.
      class Punctuation < PragmaticSegmenter::Punctuation
        # NOTE(review): '.' is listed here but SENTENCE_BOUNDARY does not
        # split on it - confirm whether that difference is intentional.
        PUNCT = %w[। | . ! ?]

        # Returns the PUNCT array itself (not a copy).
        def punct
          PUNCT
        end
      end
    end
  end
end
|