pragmatic_segmenter 0.3.1 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +8 -2
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +16 -51
- data/lib/pragmatic_segmenter/cleaner.rb +18 -99
- data/lib/pragmatic_segmenter/languages.rb +62 -0
- data/lib/pragmatic_segmenter/languages/amharic.rb +4 -30
- data/lib/pragmatic_segmenter/languages/arabic.rb +21 -64
- data/lib/pragmatic_segmenter/languages/armenian.rb +4 -30
- data/lib/pragmatic_segmenter/languages/burmese.rb +4 -30
- data/lib/pragmatic_segmenter/languages/chinese.rb +8 -0
- data/lib/pragmatic_segmenter/languages/common.rb +70 -1
- data/lib/pragmatic_segmenter/languages/deutsch.rb +49 -78
- data/lib/pragmatic_segmenter/languages/dutch.rb +5 -36
- data/lib/pragmatic_segmenter/languages/english.rb +3 -12
- data/lib/pragmatic_segmenter/languages/french.rb +5 -32
- data/lib/pragmatic_segmenter/languages/greek.rb +4 -26
- data/lib/pragmatic_segmenter/languages/hindi.rb +4 -30
- data/lib/pragmatic_segmenter/languages/italian.rb +3 -37
- data/lib/pragmatic_segmenter/languages/japanese.rb +6 -4
- data/lib/pragmatic_segmenter/languages/persian.rb +16 -40
- data/lib/pragmatic_segmenter/languages/polish.rb +6 -38
- data/lib/pragmatic_segmenter/languages/russian.rb +13 -33
- data/lib/pragmatic_segmenter/languages/spanish.rb +6 -31
- data/lib/pragmatic_segmenter/languages/urdu.rb +4 -30
- data/lib/pragmatic_segmenter/number.rb +5 -5
- data/lib/pragmatic_segmenter/process.rb +28 -49
- data/lib/pragmatic_segmenter/rules.rb +65 -1
- data/lib/pragmatic_segmenter/{ellipsis.rb → rules/ellipsis.rb} +0 -0
- data/lib/pragmatic_segmenter/rules/html.rb +13 -0
- data/lib/pragmatic_segmenter/segmenter.rb +12 -32
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter_spec.rb +6 -7
- metadata +6 -8
- data/lib/pragmatic_segmenter/abbreviation.rb +0 -22
- data/lib/pragmatic_segmenter/language_support.rb +0 -31
- data/lib/pragmatic_segmenter/punctuation.rb +0 -12
- data/lib/pragmatic_segmenter/sentence_boundary_punctuation.rb +0 -17
- data/lib/pragmatic_segmenter/single_letter_abbreviation.rb +0 -37
@@ -1,84 +1,41 @@
|
|
1
1
|
module PragmaticSegmenter
|
2
2
|
module Languages
|
3
|
-
|
4
|
-
|
5
|
-
private
|
6
|
-
|
7
|
-
def sentence_boundary_punctuation(txt)
|
8
|
-
PragmaticSegmenter::Languages::Arabic::SentenceBoundaryPunctuation.new(text: txt).split
|
9
|
-
end
|
3
|
+
module Arabic
|
4
|
+
include Languages::Common
|
10
5
|
|
11
|
-
|
12
|
-
|
13
|
-
end
|
6
|
+
Punctuations = ['?', '!', ':', '.', '؟', '،']
|
7
|
+
SENTENCE_BOUNDARY_REGEX = /.*?[:\.!\?؟،]|.*?\z|.*?$/
|
14
8
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
class Cleaner < PragmaticSegmenter::Cleaner
|
9
|
+
module Abbreviation
|
10
|
+
ABBREVIATIONS = ['ا', 'ا. د', 'ا.د', 'ا.ش.ا', 'ا.ش.ا', 'إلخ', 'ت.ب', 'ت.ب', 'ج.ب', 'جم', 'ج.ب', 'ج.م.ع', 'ج.م.ع', 'س.ت', 'س.ت', 'سم', 'ص.ب.', 'ص.ب', 'كج.', 'كلم.', 'م', 'م.ب', 'م.ب', 'ه', 'د']
|
11
|
+
PREPOSITIVE_ABBREVIATIONS = []
|
12
|
+
NUMBER_ABBREVIATIONS = []
|
21
13
|
end
|
22
14
|
|
23
|
-
|
24
|
-
|
15
|
+
# Rubular: http://rubular.com/r/RX5HpdDIyv
|
16
|
+
ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')
|
25
17
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
# Rubular: http://rubular.com/r/kPRgApNHUg
|
30
|
-
ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
|
31
|
-
|
32
|
-
def split
|
33
|
-
txt = replace_non_sentence_boundary_punctuation(text)
|
34
|
-
txt.scan(SENTENCE_BOUNDARY)
|
35
|
-
end
|
18
|
+
# Rubular: http://rubular.com/r/kPRgApNHUg
|
19
|
+
ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
|
36
20
|
|
21
|
+
class Process < Process
|
37
22
|
private
|
38
23
|
|
39
|
-
def
|
40
|
-
txt.apply(ReplaceColonBetweenNumbersRule)
|
41
|
-
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
|
-
class Abbreviation < PragmaticSegmenter::Abbreviation
|
46
|
-
ABBREVIATIONS = ['ا', 'ا. د', 'ا.د', 'ا.ش.ا', 'ا.ش.ا', 'إلخ', 'ت.ب', 'ت.ب', 'ج.ب', 'جم', 'ج.ب', 'ج.م.ع', 'ج.م.ع', 'س.ت', 'س.ت', 'سم', 'ص.ب.', 'ص.ب', 'كج.', 'كلم.', 'م', 'م.ب', 'م.ب', 'ه', 'د']
|
47
|
-
|
48
|
-
def all
|
49
|
-
ABBREVIATIONS
|
50
|
-
end
|
51
|
-
|
52
|
-
def prepositive
|
53
|
-
[]
|
24
|
+
def sentence_boundary_punctuation(txt)
|
25
|
+
txt = txt.apply(ReplaceColonBetweenNumbersRule, ReplaceNonSentenceBoundaryCommaRule)
|
26
|
+
txt.scan(SENTENCE_BOUNDARY_REGEX)
|
54
27
|
end
|
55
28
|
|
56
|
-
def
|
57
|
-
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
|
-
class Punctuation < PragmaticSegmenter::Punctuation
|
62
|
-
PUNCT = ['?', '!', ':', '.', '؟', '،']
|
63
|
-
|
64
|
-
def punct
|
65
|
-
PUNCT
|
29
|
+
def replace_abbreviations(txt)
|
30
|
+
AbbreviationReplacer.new(text: txt, language: Arabic).replace
|
66
31
|
end
|
67
32
|
end
|
68
33
|
|
69
|
-
class AbbreviationReplacer <
|
34
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
70
35
|
private
|
71
36
|
|
72
|
-
def scan_for_replacements(txt, am, index, character_array
|
73
|
-
|
74
|
-
end
|
75
|
-
|
76
|
-
def replace_abbr(txt, abbr)
|
77
|
-
txt.gsub(/(?<=#{abbr})\./, '∯')
|
78
|
-
end
|
79
|
-
|
80
|
-
def abbreviations
|
81
|
-
PragmaticSegmenter::Languages::Arabic::Abbreviation.new
|
37
|
+
def scan_for_replacements(txt, am, index, character_array)
|
38
|
+
txt.gsub(/(?<=#{am})\./, '∯')
|
82
39
|
end
|
83
40
|
end
|
84
41
|
end
|
@@ -1,36 +1,10 @@
|
|
1
1
|
module PragmaticSegmenter
|
2
2
|
module Languages
|
3
|
-
|
4
|
-
|
5
|
-
private
|
3
|
+
module Armenian
|
4
|
+
include Languages::Common
|
6
5
|
|
7
|
-
|
8
|
-
|
9
|
-
end
|
10
|
-
|
11
|
-
def punctuation_array
|
12
|
-
PragmaticSegmenter::Languages::Armenian::Punctuation.new.punct
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
16
|
-
class Cleaner < PragmaticSegmenter::Cleaner
|
17
|
-
end
|
18
|
-
|
19
|
-
class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
|
20
|
-
SENTENCE_BOUNDARY = /.*?[։՜:]|.*?$/
|
21
|
-
|
22
|
-
def split
|
23
|
-
text.scan(SENTENCE_BOUNDARY)
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
class Punctuation < PragmaticSegmenter::Punctuation
|
28
|
-
PUNCT = ['։', '՜', ':']
|
29
|
-
|
30
|
-
def punct
|
31
|
-
PUNCT
|
32
|
-
end
|
33
|
-
end
|
6
|
+
SENTENCE_BOUNDARY_REGEX = /.*?[։՜:]|.*?$/
|
7
|
+
Punctuations = ['։', '՜', ':']
|
34
8
|
end
|
35
9
|
end
|
36
10
|
end
|
@@ -1,36 +1,10 @@
|
|
1
1
|
module PragmaticSegmenter
|
2
2
|
module Languages
|
3
|
-
|
4
|
-
|
5
|
-
private
|
3
|
+
module Burmese
|
4
|
+
include Languages::Common
|
6
5
|
|
7
|
-
|
8
|
-
|
9
|
-
end
|
10
|
-
|
11
|
-
def punctuation_array
|
12
|
-
PragmaticSegmenter::Languages::Burmese::Punctuation.new.punct
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
16
|
-
class Cleaner < PragmaticSegmenter::Cleaner
|
17
|
-
end
|
18
|
-
|
19
|
-
class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
|
20
|
-
SENTENCE_BOUNDARY = /.*?[။၏!\?]|.*?$/
|
21
|
-
|
22
|
-
def split
|
23
|
-
text.scan(SENTENCE_BOUNDARY)
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
class Punctuation < PragmaticSegmenter::Punctuation
|
28
|
-
PUNCT = ['။', '၏', '?', '!']
|
29
|
-
|
30
|
-
def punct
|
31
|
-
PUNCT
|
32
|
-
end
|
33
|
-
end
|
6
|
+
SENTENCE_BOUNDARY_REGEX = /.*?[။၏!\?]|.*?$/
|
7
|
+
Punctuations = ['။', '၏', '?', '!']
|
34
8
|
end
|
35
9
|
end
|
36
10
|
end
|
@@ -1,6 +1,75 @@
|
|
1
1
|
module PragmaticSegmenter
|
2
2
|
module Languages
|
3
|
-
|
3
|
+
module Common
|
4
|
+
# This class holds the punctuation marks.
|
5
|
+
Punctuations = ['。', '．', '.', '！', '!', '?', '？']
|
6
|
+
|
7
|
+
# Defines the abbreviations for each language (if available)
|
8
|
+
module Abbreviation
|
9
|
+
ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']
|
10
|
+
PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs']
|
11
|
+
NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']
|
12
|
+
end
|
13
|
+
|
14
|
+
SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*[^,]'(?=\s[A-Z])|"(?:[^"])*[^,]"(?=\s[A-Z])|“(?:[^”])*[^,]”(?=\s[A-Z])|\S.*?[。．.！!?？ȸȹ☉☈☇☄]/
|
15
|
+
|
16
|
+
include Rules
|
17
|
+
# Rubular: http://rubular.com/r/NqCqv372Ix
|
18
|
+
QUOTATION_AT_END_OF_SENTENCE_REGEX = /[!?\.-][\"\'\u{201d}\u{201c}]\s{1}[A-Z]/
|
19
|
+
|
20
|
+
# Rubular: http://rubular.com/r/6flGnUMEVl
|
21
|
+
PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = /["”]\s\(.*\)\s["“]/
|
22
|
+
|
23
|
+
# Rubular: http://rubular.com/r/TYzr4qOW1Q
|
24
|
+
BETWEEN_DOUBLE_QUOTES_REGEX = /"(?:[^"])*[^,]"|“(?:[^”])*[^,]”/
|
25
|
+
|
26
|
+
# Rubular: http://rubular.com/r/JMjlZHAT4g
|
27
|
+
SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = /(?<=[!?\.-][\"\'\u{201d}\u{201c}])\s{1}(?=[A-Z])/
|
28
|
+
|
29
|
+
# Rubular: http://rubular.com/r/mQ8Es9bxtk
|
30
|
+
CONTINUOUS_PUNCTUATION_REGEX = /(?<=\S)(!|\?){3,}(?=(\s|\z|$))/
|
31
|
+
|
32
|
+
# Rubular: http://rubular.com/r/yqa4Rit8EY
|
33
|
+
PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')
|
34
|
+
|
35
|
+
# Rubular: http://rubular.com/r/NEv265G2X2
|
36
|
+
KommanditgesellschaftRule = Rule.new(/(?<=Co)\.(?=\sKG)/, '∯')
|
37
|
+
|
38
|
+
# Rubular: http://rubular.com/r/xDkpFZ0EgH
|
39
|
+
MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i
|
40
|
+
|
41
|
+
module AmPmRules
|
42
|
+
# Rubular: http://rubular.com/r/Vnx3m4Spc8
|
43
|
+
UpperCasePmRule = Rule.new(/(?<=P∯M)∯(?=\s[A-Z])/, '.')
|
44
|
+
|
45
|
+
# Rubular: http://rubular.com/r/AJMCotJVbW
|
46
|
+
UpperCaseAmRule = Rule.new(/(?<=A∯M)∯(?=\s[A-Z])/, '.')
|
47
|
+
|
48
|
+
# Rubular: http://rubular.com/r/13q7SnOhgA
|
49
|
+
LowerCasePmRule = Rule.new(/(?<=p∯m)∯(?=\s[A-Z])/, '.')
|
50
|
+
|
51
|
+
# Rubular: http://rubular.com/r/DgUDq4mLz5
|
52
|
+
LowerCaseAmRule = Rule.new(/(?<=a∯m)∯(?=\s[A-Z])/, '.')
|
53
|
+
|
54
|
+
All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]
|
55
|
+
end
|
56
|
+
|
57
|
+
# This class searches for periods within an abbreviation and
|
58
|
+
# replaces the periods.
|
59
|
+
module SingleLetterAbbreviationRules
|
60
|
+
# Rubular: http://rubular.com/r/e3H6kwnr6H
|
61
|
+
SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=\s)/, '∯')
|
62
|
+
|
63
|
+
# Rubular: http://rubular.com/r/gitvf0YWH4
|
64
|
+
SingleUpperCaseLetterRule = Rule.new(/(?<=\s[A-Z])\.(?=\s)/, '∯')
|
65
|
+
|
66
|
+
All = [
|
67
|
+
SingleUpperCaseLetterAtStartOfLineRule,
|
68
|
+
SingleUpperCaseLetterRule
|
69
|
+
]
|
70
|
+
end
|
71
|
+
|
72
|
+
|
4
73
|
class Process < PragmaticSegmenter::Process
|
5
74
|
end
|
6
75
|
class Cleaner < PragmaticSegmenter::Cleaner
|
@@ -1,19 +1,51 @@
|
|
1
1
|
module PragmaticSegmenter
|
2
2
|
module Languages
|
3
|
-
|
3
|
+
module Deutsch
|
4
|
+
include Languages::Common
|
5
|
+
|
6
|
+
module Abbreviation
|
7
|
+
ABBREVIATIONS = ['Ä', 'ä', 'adj', 'adm', 'adv', 'art', 'asst', 'b.a', 'b.s', 'bart', 'bldg', 'brig', 'bros', 'bse', 'buchst', 'bzgl', 'bzw', 'c.-à-d', 'ca', 'capt', 'chr', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'cpl', 'd.h', 'd.j', 'dergl', 'dgl', 'dkr', 'dr ', 'ens', 'etc', 'ev ', 'evtl', 'ff', 'g.g.a', 'g.u', 'gen', 'ggf', 'gov', 'hon', 'hosp', 'i.f', 'i.h.v', 'ii', 'iii', 'insp', 'iv', 'ix', 'jun', 'k.o', 'kath ', 'lfd', 'lt', 'ltd', 'm.e', 'maj', 'med', 'messrs', 'mio', 'mlle', 'mm', 'mme', 'mr', 'mrd', 'mrs', 'ms', 'msgr', 'mwst', 'no', 'nos', 'nr', 'o.ä', 'op', 'ord', 'pfc', 'ph', 'pp', 'prof', 'pvt', 'rep', 'reps', 'res', 'rev', 'rt', 's.p.a', 'sa', 'sen', 'sens', 'sfc', 'sgt', 'sog', 'sogen', 'spp', 'sr', 'st', 'std', 'str ', 'supt', 'surg', 'u.a ', 'u.e', 'u.s.w', 'u.u', 'u.ä', 'usf', 'usw', 'v', 'vgl', 'vi', 'vii', 'viii', 'vs', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'z.b', 'z.t', 'z.z', 'z.zt', 'zt', 'zzt']
|
8
|
+
NUMBER_ABBREVIATIONS = ['art', 'ca', 'no', 'nos', 'nr', 'pp']
|
9
|
+
PREPOSITIVE_ABBREVIATIONS = []
|
10
|
+
end
|
11
|
+
|
12
|
+
# Rubular: http://rubular.com/r/OdcXBsub0w
|
13
|
+
BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX = /,,(?>[^“\\]+|\\{2}|\\.)*“/
|
14
|
+
|
15
|
+
# Rubular: http://rubular.com/r/2UskIupGgP
|
16
|
+
SPLIT_DOUBLE_QUOTES_DE_REGEX = /\A„(?>[^“\\]+|\\{2}|\\.)*“/
|
17
|
+
|
18
|
+
# Rubular: http://rubular.com/r/TkZomF9tTM
|
19
|
+
BETWEEN_DOUBLE_QUOTES_DE_REGEX = /„(?>[^“\\]+|\\{2}|\\.)*“/
|
20
|
+
|
21
|
+
# Rubular: http://rubular.com/r/hZxoyQwKT1
|
22
|
+
NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯')
|
23
|
+
|
24
|
+
# Rubular: http://rubular.com/r/ityNMwdghj
|
25
|
+
NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯')
|
26
|
+
|
27
|
+
MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember']
|
28
|
+
|
29
|
+
# Rubular: http://rubular.com/r/B4X33QKIL8
|
30
|
+
SingleLowerCaseLetterRule = Rule.new(/(?<=\s[a-z])\.(?=\s)/, '∯')
|
31
|
+
|
32
|
+
# Rubular: http://rubular.com/r/iUNSkCuso0
|
33
|
+
SingleLowerCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[a-z])\.(?=\s)/, '∯')
|
34
|
+
|
35
|
+
|
4
36
|
class Process < PragmaticSegmenter::Process
|
5
37
|
private
|
6
38
|
|
7
39
|
def between_punctuation(txt)
|
8
|
-
|
40
|
+
BetweenPunctuation.new(text: txt).replace
|
9
41
|
end
|
10
42
|
|
11
43
|
def replace_numbers(txt)
|
12
|
-
|
44
|
+
Number.new(text: txt).replace
|
13
45
|
end
|
14
46
|
|
15
47
|
def replace_abbreviations(txt)
|
16
|
-
|
48
|
+
AbbreviationReplacer.new(text: txt, language: Deutsch).replace
|
17
49
|
end
|
18
50
|
end
|
19
51
|
|
@@ -21,27 +53,19 @@ module PragmaticSegmenter
|
|
21
53
|
private
|
22
54
|
|
23
55
|
def abbreviations
|
24
|
-
|
56
|
+
Abbreviation::ABBREVIATIONS
|
25
57
|
end
|
26
58
|
end
|
27
59
|
|
28
60
|
class Number < PragmaticSegmenter::Number
|
29
|
-
# Rubular: http://rubular.com/r/hZxoyQwKT1
|
30
|
-
NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯')
|
31
|
-
|
32
|
-
# Rubular: http://rubular.com/r/ityNMwdghj
|
33
|
-
NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯')
|
34
|
-
|
35
|
-
DE_MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember']
|
36
|
-
|
37
61
|
def replace
|
38
62
|
super
|
39
|
-
@text.apply(NumberPeriodSpaceRule
|
63
|
+
@text.apply(NumberPeriodSpaceRule, NegativeNumberPeriodSpaceRule)
|
40
64
|
replace_period_in_deutsch_dates(@text)
|
41
65
|
end
|
42
66
|
|
43
67
|
def replace_period_in_deutsch_dates(txt)
|
44
|
-
|
68
|
+
MONTHS.each do |month|
|
45
69
|
# Rubular: http://rubular.com/r/zlqgj7G5dA
|
46
70
|
txt.gsub!(/(?<=\d)\.(?=\s*#{Regexp.escape(month)})/, '∯')
|
47
71
|
end
|
@@ -49,81 +73,28 @@ module PragmaticSegmenter
|
|
49
73
|
end
|
50
74
|
end
|
51
75
|
|
52
|
-
class
|
53
|
-
# Rubular: http://rubular.com/r/B4X33QKIL8
|
54
|
-
SingleLowerCaseLetterRule = Rule.new(/(?<=\s[a-z])\.(?=\s)/, '∯')
|
55
|
-
|
56
|
-
# Rubular: http://rubular.com/r/iUNSkCuso0
|
57
|
-
SingleLowerCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[a-z])\.(?=\s)/, '∯')
|
58
|
-
|
76
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
59
77
|
def replace
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
private
|
78
|
+
@reformatted_text = text.apply(
|
79
|
+
@language::PossessiveAbbreviationRule,
|
80
|
+
@language::SingleLetterAbbreviationRules::All,
|
81
|
+
SingleLowerCaseLetterRule,
|
82
|
+
SingleLowerCaseLetterAtStartOfLineRule)
|
66
83
|
|
67
|
-
|
68
|
-
txt.apply(SingleLowerCaseLetterRule)
|
69
|
-
end
|
70
|
-
|
71
|
-
def replace_single_lowercase_letter(txt)
|
72
|
-
txt.apply(SingleLowerCaseLetterAtStartOfLineRule)
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
class Abbreviation < PragmaticSegmenter::Abbreviation
|
77
|
-
ABBREVIATIONS = ['Ä', 'ä', 'adj', 'adm', 'adv', 'art', 'asst', 'b.a', 'b.s', 'bart', 'bldg', 'brig', 'bros', 'bse', 'buchst', 'bzgl', 'bzw', 'c.-à-d', 'ca', 'capt', 'chr', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'cpl', 'd.h', 'd.j', 'dergl', 'dgl', 'dkr', 'dr ', 'ens', 'etc', 'ev ', 'evtl', 'ff', 'g.g.a', 'g.u', 'gen', 'ggf', 'gov', 'hon', 'hosp', 'i.f', 'i.h.v', 'ii', 'iii', 'insp', 'iv', 'ix', 'jun', 'k.o', 'kath ', 'lfd', 'lt', 'ltd', 'm.e', 'maj', 'med', 'messrs', 'mio', 'mlle', 'mm', 'mme', 'mr', 'mrd', 'mrs', 'ms', 'msgr', 'mwst', 'no', 'nos', 'nr', 'o.ä', 'op', 'ord', 'pfc', 'ph', 'pp', 'prof', 'pvt', 'rep', 'reps', 'res', 'rev', 'rt', 's.p.a', 'sa', 'sen', 'sens', 'sfc', 'sgt', 'sog', 'sogen', 'spp', 'sr', 'st', 'std', 'str ', 'supt', 'surg', 'u.a ', 'u.e', 'u.s.w', 'u.u', 'u.ä', 'usf', 'usw', 'v', 'vgl', 'vi', 'vii', 'viii', 'vs', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'z.b', 'z.t', 'z.z', 'z.zt', 'zt', 'zzt']
|
78
|
-
NUMBER_ABBREVIATIONS = ['art', 'ca', 'no', 'nos', 'nr', 'pp']
|
79
|
-
|
80
|
-
def all
|
81
|
-
ABBREVIATIONS
|
82
|
-
end
|
83
|
-
|
84
|
-
def prepositive
|
85
|
-
[]
|
86
|
-
end
|
87
|
-
|
88
|
-
def number
|
89
|
-
NUMBER_ABBREVIATIONS
|
90
|
-
end
|
91
|
-
end
|
92
|
-
|
93
|
-
class AbbreviationReplacer < PragmaticSegmenter::AbbreviationReplacer
|
94
|
-
def replace
|
95
|
-
@reformatted_text = text.apply(PossessiveAbbreviationRule)
|
96
|
-
@reformatted_text = PragmaticSegmenter::Languages::Deutsch::SingleLetterAbbreviation.new(text: @reformatted_text).replace
|
97
|
-
@reformatted_text = search_for_abbreviations_in_string(@reformatted_text, abbreviations)
|
84
|
+
@reformatted_text = search_for_abbreviations_in_string(@reformatted_text)
|
98
85
|
@reformatted_text = replace_multi_period_abbreviations(@reformatted_text)
|
99
|
-
@reformatted_text = @reformatted_text.apply(AmPmRules::All)
|
86
|
+
@reformatted_text = @reformatted_text.apply(Languages::Common::AmPmRules::All)
|
100
87
|
replace_abbreviation_as_sentence_boundary(@reformatted_text)
|
101
88
|
end
|
102
89
|
|
103
90
|
private
|
104
91
|
|
105
|
-
def scan_for_replacements(txt, am, index, character_array
|
106
|
-
|
107
|
-
end
|
108
|
-
|
109
|
-
def replace_abbr(txt, abbr)
|
110
|
-
txt.gsub(/(?<=#{abbr})\.(?=\s)/, '∯')
|
111
|
-
end
|
112
|
-
|
113
|
-
def abbreviations
|
114
|
-
PragmaticSegmenter::Languages::Deutsch::Abbreviation.new
|
92
|
+
def scan_for_replacements(txt, am, index, character_array)
|
93
|
+
txt.gsub(/(?<=#{am})\.(?=\s)/, '∯')
|
115
94
|
end
|
116
95
|
end
|
117
96
|
|
118
97
|
class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
|
119
|
-
# Rubular: http://rubular.com/r/OdcXBsub0w
|
120
|
-
BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX = /,,(?>[^“\\]+|\\{2}|\\.)*“/
|
121
|
-
|
122
|
-
# Rubular: http://rubular.com/r/2UskIupGgP
|
123
|
-
SPLIT_DOUBLE_QUOTES_DE_REGEX = /\A„(?>[^“\\]+|\\{2}|\\.)*“/
|
124
|
-
|
125
|
-
# Rubular: http://rubular.com/r/TkZomF9tTM
|
126
|
-
BETWEEN_DOUBLE_QUOTES_DE_REGEX = /„(?>[^“\\]+|\\{2}|\\.)*“/
|
127
98
|
private
|
128
99
|
|
129
100
|
def sub_punctuation_between_double_quotes(txt)
|