pragmatic_segmenter 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +8 -2
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +16 -51
- data/lib/pragmatic_segmenter/cleaner.rb +18 -99
- data/lib/pragmatic_segmenter/languages.rb +62 -0
- data/lib/pragmatic_segmenter/languages/amharic.rb +4 -30
- data/lib/pragmatic_segmenter/languages/arabic.rb +21 -64
- data/lib/pragmatic_segmenter/languages/armenian.rb +4 -30
- data/lib/pragmatic_segmenter/languages/burmese.rb +4 -30
- data/lib/pragmatic_segmenter/languages/chinese.rb +8 -0
- data/lib/pragmatic_segmenter/languages/common.rb +70 -1
- data/lib/pragmatic_segmenter/languages/deutsch.rb +49 -78
- data/lib/pragmatic_segmenter/languages/dutch.rb +5 -36
- data/lib/pragmatic_segmenter/languages/english.rb +3 -12
- data/lib/pragmatic_segmenter/languages/french.rb +5 -32
- data/lib/pragmatic_segmenter/languages/greek.rb +4 -26
- data/lib/pragmatic_segmenter/languages/hindi.rb +4 -30
- data/lib/pragmatic_segmenter/languages/italian.rb +3 -37
- data/lib/pragmatic_segmenter/languages/japanese.rb +6 -4
- data/lib/pragmatic_segmenter/languages/persian.rb +16 -40
- data/lib/pragmatic_segmenter/languages/polish.rb +6 -38
- data/lib/pragmatic_segmenter/languages/russian.rb +13 -33
- data/lib/pragmatic_segmenter/languages/spanish.rb +6 -31
- data/lib/pragmatic_segmenter/languages/urdu.rb +4 -30
- data/lib/pragmatic_segmenter/number.rb +5 -5
- data/lib/pragmatic_segmenter/process.rb +28 -49
- data/lib/pragmatic_segmenter/rules.rb +65 -1
- data/lib/pragmatic_segmenter/{ellipsis.rb → rules/ellipsis.rb} +0 -0
- data/lib/pragmatic_segmenter/rules/html.rb +13 -0
- data/lib/pragmatic_segmenter/segmenter.rb +12 -32
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter_spec.rb +6 -7
- metadata +6 -8
- data/lib/pragmatic_segmenter/abbreviation.rb +0 -22
- data/lib/pragmatic_segmenter/language_support.rb +0 -31
- data/lib/pragmatic_segmenter/punctuation.rb +0 -12
- data/lib/pragmatic_segmenter/sentence_boundary_punctuation.rb +0 -17
- data/lib/pragmatic_segmenter/single_letter_abbreviation.rb +0 -37
@@ -1,84 +1,41 @@
|
|
1
1
|
module PragmaticSegmenter
|
2
2
|
module Languages
|
3
|
-
|
4
|
-
|
5
|
-
private
|
6
|
-
|
7
|
-
def sentence_boundary_punctuation(txt)
|
8
|
-
PragmaticSegmenter::Languages::Arabic::SentenceBoundaryPunctuation.new(text: txt).split
|
9
|
-
end
|
3
|
+
module Arabic
|
4
|
+
include Languages::Common
|
10
5
|
|
11
|
-
|
12
|
-
|
13
|
-
end
|
6
|
+
Punctuations = ['?', '!', ':', '.', '؟', '،']
|
7
|
+
SENTENCE_BOUNDARY_REGEX = /.*?[:\.!\?؟،]|.*?\z|.*?$/
|
14
8
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
class Cleaner < PragmaticSegmenter::Cleaner
|
9
|
+
module Abbreviation
|
10
|
+
ABBREVIATIONS = ['ا', 'ا. د', 'ا.د', 'ا.ش.ا', 'ا.ش.ا', 'إلخ', 'ت.ب', 'ت.ب', 'ج.ب', 'جم', 'ج.ب', 'ج.م.ع', 'ج.م.ع', 'س.ت', 'س.ت', 'سم', 'ص.ب.', 'ص.ب', 'كج.', 'كلم.', 'م', 'م.ب', 'م.ب', 'ه', 'د']
|
11
|
+
PREPOSITIVE_ABBREVIATIONS = []
|
12
|
+
NUMBER_ABBREVIATIONS = []
|
21
13
|
end
|
22
14
|
|
23
|
-
|
24
|
-
|
15
|
+
# Rubular: http://rubular.com/r/RX5HpdDIyv
|
16
|
+
ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')
|
25
17
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
# Rubular: http://rubular.com/r/kPRgApNHUg
|
30
|
-
ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
|
31
|
-
|
32
|
-
def split
|
33
|
-
txt = replace_non_sentence_boundary_punctuation(text)
|
34
|
-
txt.scan(SENTENCE_BOUNDARY)
|
35
|
-
end
|
18
|
+
# Rubular: http://rubular.com/r/kPRgApNHUg
|
19
|
+
ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
|
36
20
|
|
21
|
+
class Process < Process
|
37
22
|
private
|
38
23
|
|
39
|
-
def
|
40
|
-
txt.apply(ReplaceColonBetweenNumbersRule)
|
41
|
-
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
|
-
class Abbreviation < PragmaticSegmenter::Abbreviation
|
46
|
-
ABBREVIATIONS = ['ا', 'ا. د', 'ا.د', 'ا.ش.ا', 'ا.ش.ا', 'إلخ', 'ت.ب', 'ت.ب', 'ج.ب', 'جم', 'ج.ب', 'ج.م.ع', 'ج.م.ع', 'س.ت', 'س.ت', 'سم', 'ص.ب.', 'ص.ب', 'كج.', 'كلم.', 'م', 'م.ب', 'م.ب', 'ه', 'د']
|
47
|
-
|
48
|
-
def all
|
49
|
-
ABBREVIATIONS
|
50
|
-
end
|
51
|
-
|
52
|
-
def prepositive
|
53
|
-
[]
|
24
|
+
def sentence_boundary_punctuation(txt)
|
25
|
+
txt = txt.apply(ReplaceColonBetweenNumbersRule, ReplaceNonSentenceBoundaryCommaRule)
|
26
|
+
txt.scan(SENTENCE_BOUNDARY_REGEX)
|
54
27
|
end
|
55
28
|
|
56
|
-
def
|
57
|
-
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
|
-
class Punctuation < PragmaticSegmenter::Punctuation
|
62
|
-
PUNCT = ['?', '!', ':', '.', '؟', '،']
|
63
|
-
|
64
|
-
def punct
|
65
|
-
PUNCT
|
29
|
+
def replace_abbreviations(txt)
|
30
|
+
AbbreviationReplacer.new(text: txt, language: Arabic).replace
|
66
31
|
end
|
67
32
|
end
|
68
33
|
|
69
|
-
class AbbreviationReplacer <
|
34
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
70
35
|
private
|
71
36
|
|
72
|
-
def scan_for_replacements(txt, am, index, character_array
|
73
|
-
|
74
|
-
end
|
75
|
-
|
76
|
-
def replace_abbr(txt, abbr)
|
77
|
-
txt.gsub(/(?<=#{abbr})\./, '∯')
|
78
|
-
end
|
79
|
-
|
80
|
-
def abbreviations
|
81
|
-
PragmaticSegmenter::Languages::Arabic::Abbreviation.new
|
37
|
+
def scan_for_replacements(txt, am, index, character_array)
|
38
|
+
txt.gsub(/(?<=#{am})\./, '∯')
|
82
39
|
end
|
83
40
|
end
|
84
41
|
end
|
@@ -1,36 +1,10 @@
|
|
1
1
|
module PragmaticSegmenter
|
2
2
|
module Languages
|
3
|
-
|
4
|
-
|
5
|
-
private
|
3
|
+
module Armenian
|
4
|
+
include Languages::Common
|
6
5
|
|
7
|
-
|
8
|
-
|
9
|
-
end
|
10
|
-
|
11
|
-
def punctuation_array
|
12
|
-
PragmaticSegmenter::Languages::Armenian::Punctuation.new.punct
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
16
|
-
class Cleaner < PragmaticSegmenter::Cleaner
|
17
|
-
end
|
18
|
-
|
19
|
-
class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
|
20
|
-
SENTENCE_BOUNDARY = /.*?[։՜:]|.*?$/
|
21
|
-
|
22
|
-
def split
|
23
|
-
text.scan(SENTENCE_BOUNDARY)
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
class Punctuation < PragmaticSegmenter::Punctuation
|
28
|
-
PUNCT = ['։', '՜', ':']
|
29
|
-
|
30
|
-
def punct
|
31
|
-
PUNCT
|
32
|
-
end
|
33
|
-
end
|
6
|
+
SENTENCE_BOUNDARY_REGEX = /.*?[։՜:]|.*?$/
|
7
|
+
Punctuations = ['։', '՜', ':']
|
34
8
|
end
|
35
9
|
end
|
36
10
|
end
|
@@ -1,36 +1,10 @@
|
|
1
1
|
module PragmaticSegmenter
|
2
2
|
module Languages
|
3
|
-
|
4
|
-
|
5
|
-
private
|
3
|
+
module Burmese
|
4
|
+
include Languages::Common
|
6
5
|
|
7
|
-
|
8
|
-
|
9
|
-
end
|
10
|
-
|
11
|
-
def punctuation_array
|
12
|
-
PragmaticSegmenter::Languages::Burmese::Punctuation.new.punct
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
16
|
-
class Cleaner < PragmaticSegmenter::Cleaner
|
17
|
-
end
|
18
|
-
|
19
|
-
class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
|
20
|
-
SENTENCE_BOUNDARY = /.*?[။၏!\?]|.*?$/
|
21
|
-
|
22
|
-
def split
|
23
|
-
text.scan(SENTENCE_BOUNDARY)
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
class Punctuation < PragmaticSegmenter::Punctuation
|
28
|
-
PUNCT = ['။', '၏', '?', '!']
|
29
|
-
|
30
|
-
def punct
|
31
|
-
PUNCT
|
32
|
-
end
|
33
|
-
end
|
6
|
+
SENTENCE_BOUNDARY_REGEX = /.*?[။၏!\?]|.*?$/
|
7
|
+
Punctuations = ['။', '၏', '?', '!']
|
34
8
|
end
|
35
9
|
end
|
36
10
|
end
|
@@ -1,6 +1,75 @@
|
|
1
1
|
module PragmaticSegmenter
|
2
2
|
module Languages
|
3
|
-
|
3
|
+
module Common
|
4
|
+
# This class holds the punctuation marks.
|
5
|
+
Punctuations = ['。', '．', '.', '！', '!', '?', '？']
|
6
|
+
|
7
|
+
# Defines the abbreviations for each language (if available)
|
8
|
+
module Abbreviation
|
9
|
+
ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']
|
10
|
+
PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs']
|
11
|
+
NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']
|
12
|
+
end
|
13
|
+
|
14
|
+
SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*[^,]'(?=\s[A-Z])|"(?:[^"])*[^,]"(?=\s[A-Z])|“(?:[^”])*[^,]”(?=\s[A-Z])|\S.*?[。．.！!?？ȸȹ☉☈☇☄]/
|
15
|
+
|
16
|
+
include Rules
|
17
|
+
# Rubular: http://rubular.com/r/NqCqv372Ix
|
18
|
+
QUOTATION_AT_END_OF_SENTENCE_REGEX = /[!?\.-][\"\'\u{201d}\u{201c}]\s{1}[A-Z]/
|
19
|
+
|
20
|
+
# Rubular: http://rubular.com/r/6flGnUMEVl
|
21
|
+
PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = /["”]\s\(.*\)\s["“]/
|
22
|
+
|
23
|
+
# Rubular: http://rubular.com/r/TYzr4qOW1Q
|
24
|
+
BETWEEN_DOUBLE_QUOTES_REGEX = /"(?:[^"])*[^,]"|“(?:[^”])*[^,]”/
|
25
|
+
|
26
|
+
# Rubular: http://rubular.com/r/JMjlZHAT4g
|
27
|
+
SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = /(?<=[!?\.-][\"\'\u{201d}\u{201c}])\s{1}(?=[A-Z])/
|
28
|
+
|
29
|
+
# Rubular: http://rubular.com/r/mQ8Es9bxtk
|
30
|
+
CONTINUOUS_PUNCTUATION_REGEX = /(?<=\S)(!|\?){3,}(?=(\s|\z|$))/
|
31
|
+
|
32
|
+
# Rubular: http://rubular.com/r/yqa4Rit8EY
|
33
|
+
PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')
|
34
|
+
|
35
|
+
# Rubular: http://rubular.com/r/NEv265G2X2
|
36
|
+
KommanditgesellschaftRule = Rule.new(/(?<=Co)\.(?=\sKG)/, '∯')
|
37
|
+
|
38
|
+
# Rubular: http://rubular.com/r/xDkpFZ0EgH
|
39
|
+
MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i
|
40
|
+
|
41
|
+
module AmPmRules
|
42
|
+
# Rubular: http://rubular.com/r/Vnx3m4Spc8
|
43
|
+
UpperCasePmRule = Rule.new(/(?<=P∯M)∯(?=\s[A-Z])/, '.')
|
44
|
+
|
45
|
+
# Rubular: http://rubular.com/r/AJMCotJVbW
|
46
|
+
UpperCaseAmRule = Rule.new(/(?<=A∯M)∯(?=\s[A-Z])/, '.')
|
47
|
+
|
48
|
+
# Rubular: http://rubular.com/r/13q7SnOhgA
|
49
|
+
LowerCasePmRule = Rule.new(/(?<=p∯m)∯(?=\s[A-Z])/, '.')
|
50
|
+
|
51
|
+
# Rubular: http://rubular.com/r/DgUDq4mLz5
|
52
|
+
LowerCaseAmRule = Rule.new(/(?<=a∯m)∯(?=\s[A-Z])/, '.')
|
53
|
+
|
54
|
+
All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]
|
55
|
+
end
|
56
|
+
|
57
|
+
# This class searches for periods within an abbreviation and
|
58
|
+
# replaces the periods.
|
59
|
+
module SingleLetterAbbreviationRules
|
60
|
+
# Rubular: http://rubular.com/r/e3H6kwnr6H
|
61
|
+
SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=\s)/, '∯')
|
62
|
+
|
63
|
+
# Rubular: http://rubular.com/r/gitvf0YWH4
|
64
|
+
SingleUpperCaseLetterRule = Rule.new(/(?<=\s[A-Z])\.(?=\s)/, '∯')
|
65
|
+
|
66
|
+
All = [
|
67
|
+
SingleUpperCaseLetterAtStartOfLineRule,
|
68
|
+
SingleUpperCaseLetterRule
|
69
|
+
]
|
70
|
+
end
|
71
|
+
|
72
|
+
|
4
73
|
class Process < PragmaticSegmenter::Process
|
5
74
|
end
|
6
75
|
class Cleaner < PragmaticSegmenter::Cleaner
|
@@ -1,19 +1,51 @@
|
|
1
1
|
module PragmaticSegmenter
|
2
2
|
module Languages
|
3
|
-
|
3
|
+
module Deutsch
|
4
|
+
include Languages::Common
|
5
|
+
|
6
|
+
module Abbreviation
|
7
|
+
ABBREVIATIONS = ['Ä', 'ä', 'adj', 'adm', 'adv', 'art', 'asst', 'b.a', 'b.s', 'bart', 'bldg', 'brig', 'bros', 'bse', 'buchst', 'bzgl', 'bzw', 'c.-à-d', 'ca', 'capt', 'chr', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'cpl', 'd.h', 'd.j', 'dergl', 'dgl', 'dkr', 'dr ', 'ens', 'etc', 'ev ', 'evtl', 'ff', 'g.g.a', 'g.u', 'gen', 'ggf', 'gov', 'hon', 'hosp', 'i.f', 'i.h.v', 'ii', 'iii', 'insp', 'iv', 'ix', 'jun', 'k.o', 'kath ', 'lfd', 'lt', 'ltd', 'm.e', 'maj', 'med', 'messrs', 'mio', 'mlle', 'mm', 'mme', 'mr', 'mrd', 'mrs', 'ms', 'msgr', 'mwst', 'no', 'nos', 'nr', 'o.ä', 'op', 'ord', 'pfc', 'ph', 'pp', 'prof', 'pvt', 'rep', 'reps', 'res', 'rev', 'rt', 's.p.a', 'sa', 'sen', 'sens', 'sfc', 'sgt', 'sog', 'sogen', 'spp', 'sr', 'st', 'std', 'str ', 'supt', 'surg', 'u.a ', 'u.e', 'u.s.w', 'u.u', 'u.ä', 'usf', 'usw', 'v', 'vgl', 'vi', 'vii', 'viii', 'vs', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'z.b', 'z.t', 'z.z', 'z.zt', 'zt', 'zzt']
|
8
|
+
NUMBER_ABBREVIATIONS = ['art', 'ca', 'no', 'nos', 'nr', 'pp']
|
9
|
+
PREPOSITIVE_ABBREVIATIONS = []
|
10
|
+
end
|
11
|
+
|
12
|
+
# Rubular: http://rubular.com/r/OdcXBsub0w
|
13
|
+
BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX = /,,(?>[^“\\]+|\\{2}|\\.)*“/
|
14
|
+
|
15
|
+
# Rubular: http://rubular.com/r/2UskIupGgP
|
16
|
+
SPLIT_DOUBLE_QUOTES_DE_REGEX = /\A„(?>[^“\\]+|\\{2}|\\.)*“/
|
17
|
+
|
18
|
+
# Rubular: http://rubular.com/r/TkZomF9tTM
|
19
|
+
BETWEEN_DOUBLE_QUOTES_DE_REGEX = /„(?>[^“\\]+|\\{2}|\\.)*“/
|
20
|
+
|
21
|
+
# Rubular: http://rubular.com/r/hZxoyQwKT1
|
22
|
+
NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯')
|
23
|
+
|
24
|
+
# Rubular: http://rubular.com/r/ityNMwdghj
|
25
|
+
NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯')
|
26
|
+
|
27
|
+
MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember']
|
28
|
+
|
29
|
+
# Rubular: http://rubular.com/r/B4X33QKIL8
|
30
|
+
SingleLowerCaseLetterRule = Rule.new(/(?<=\s[a-z])\.(?=\s)/, '∯')
|
31
|
+
|
32
|
+
# Rubular: http://rubular.com/r/iUNSkCuso0
|
33
|
+
SingleLowerCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[a-z])\.(?=\s)/, '∯')
|
34
|
+
|
35
|
+
|
4
36
|
class Process < PragmaticSegmenter::Process
|
5
37
|
private
|
6
38
|
|
7
39
|
def between_punctuation(txt)
|
8
|
-
|
40
|
+
BetweenPunctuation.new(text: txt).replace
|
9
41
|
end
|
10
42
|
|
11
43
|
def replace_numbers(txt)
|
12
|
-
|
44
|
+
Number.new(text: txt).replace
|
13
45
|
end
|
14
46
|
|
15
47
|
def replace_abbreviations(txt)
|
16
|
-
|
48
|
+
AbbreviationReplacer.new(text: txt, language: Deutsch).replace
|
17
49
|
end
|
18
50
|
end
|
19
51
|
|
@@ -21,27 +53,19 @@ module PragmaticSegmenter
|
|
21
53
|
private
|
22
54
|
|
23
55
|
def abbreviations
|
24
|
-
|
56
|
+
Abbreviation::ABBREVIATIONS
|
25
57
|
end
|
26
58
|
end
|
27
59
|
|
28
60
|
class Number < PragmaticSegmenter::Number
|
29
|
-
# Rubular: http://rubular.com/r/hZxoyQwKT1
|
30
|
-
NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯')
|
31
|
-
|
32
|
-
# Rubular: http://rubular.com/r/ityNMwdghj
|
33
|
-
NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯')
|
34
|
-
|
35
|
-
DE_MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember']
|
36
|
-
|
37
61
|
def replace
|
38
62
|
super
|
39
|
-
@text.apply(NumberPeriodSpaceRule
|
63
|
+
@text.apply(NumberPeriodSpaceRule, NegativeNumberPeriodSpaceRule)
|
40
64
|
replace_period_in_deutsch_dates(@text)
|
41
65
|
end
|
42
66
|
|
43
67
|
def replace_period_in_deutsch_dates(txt)
|
44
|
-
|
68
|
+
MONTHS.each do |month|
|
45
69
|
# Rubular: http://rubular.com/r/zlqgj7G5dA
|
46
70
|
txt.gsub!(/(?<=\d)\.(?=\s*#{Regexp.escape(month)})/, '∯')
|
47
71
|
end
|
@@ -49,81 +73,28 @@ module PragmaticSegmenter
|
|
49
73
|
end
|
50
74
|
end
|
51
75
|
|
52
|
-
class
|
53
|
-
# Rubular: http://rubular.com/r/B4X33QKIL8
|
54
|
-
SingleLowerCaseLetterRule = Rule.new(/(?<=\s[a-z])\.(?=\s)/, '∯')
|
55
|
-
|
56
|
-
# Rubular: http://rubular.com/r/iUNSkCuso0
|
57
|
-
SingleLowerCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[a-z])\.(?=\s)/, '∯')
|
58
|
-
|
76
|
+
class AbbreviationReplacer < AbbreviationReplacer
|
59
77
|
def replace
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
private
|
78
|
+
@reformatted_text = text.apply(
|
79
|
+
@language::PossessiveAbbreviationRule,
|
80
|
+
@language::SingleLetterAbbreviationRules::All,
|
81
|
+
SingleLowerCaseLetterRule,
|
82
|
+
SingleLowerCaseLetterAtStartOfLineRule)
|
66
83
|
|
67
|
-
|
68
|
-
txt.apply(SingleLowerCaseLetterRule)
|
69
|
-
end
|
70
|
-
|
71
|
-
def replace_single_lowercase_letter(txt)
|
72
|
-
txt.apply(SingleLowerCaseLetterAtStartOfLineRule)
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
class Abbreviation < PragmaticSegmenter::Abbreviation
|
77
|
-
ABBREVIATIONS = ['Ä', 'ä', 'adj', 'adm', 'adv', 'art', 'asst', 'b.a', 'b.s', 'bart', 'bldg', 'brig', 'bros', 'bse', 'buchst', 'bzgl', 'bzw', 'c.-à-d', 'ca', 'capt', 'chr', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'cpl', 'd.h', 'd.j', 'dergl', 'dgl', 'dkr', 'dr ', 'ens', 'etc', 'ev ', 'evtl', 'ff', 'g.g.a', 'g.u', 'gen', 'ggf', 'gov', 'hon', 'hosp', 'i.f', 'i.h.v', 'ii', 'iii', 'insp', 'iv', 'ix', 'jun', 'k.o', 'kath ', 'lfd', 'lt', 'ltd', 'm.e', 'maj', 'med', 'messrs', 'mio', 'mlle', 'mm', 'mme', 'mr', 'mrd', 'mrs', 'ms', 'msgr', 'mwst', 'no', 'nos', 'nr', 'o.ä', 'op', 'ord', 'pfc', 'ph', 'pp', 'prof', 'pvt', 'rep', 'reps', 'res', 'rev', 'rt', 's.p.a', 'sa', 'sen', 'sens', 'sfc', 'sgt', 'sog', 'sogen', 'spp', 'sr', 'st', 'std', 'str ', 'supt', 'surg', 'u.a ', 'u.e', 'u.s.w', 'u.u', 'u.ä', 'usf', 'usw', 'v', 'vgl', 'vi', 'vii', 'viii', 'vs', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'z.b', 'z.t', 'z.z', 'z.zt', 'zt', 'zzt']
|
78
|
-
NUMBER_ABBREVIATIONS = ['art', 'ca', 'no', 'nos', 'nr', 'pp']
|
79
|
-
|
80
|
-
def all
|
81
|
-
ABBREVIATIONS
|
82
|
-
end
|
83
|
-
|
84
|
-
def prepositive
|
85
|
-
[]
|
86
|
-
end
|
87
|
-
|
88
|
-
def number
|
89
|
-
NUMBER_ABBREVIATIONS
|
90
|
-
end
|
91
|
-
end
|
92
|
-
|
93
|
-
class AbbreviationReplacer < PragmaticSegmenter::AbbreviationReplacer
|
94
|
-
def replace
|
95
|
-
@reformatted_text = text.apply(PossessiveAbbreviationRule)
|
96
|
-
@reformatted_text = PragmaticSegmenter::Languages::Deutsch::SingleLetterAbbreviation.new(text: @reformatted_text).replace
|
97
|
-
@reformatted_text = search_for_abbreviations_in_string(@reformatted_text, abbreviations)
|
84
|
+
@reformatted_text = search_for_abbreviations_in_string(@reformatted_text)
|
98
85
|
@reformatted_text = replace_multi_period_abbreviations(@reformatted_text)
|
99
|
-
@reformatted_text = @reformatted_text.apply(AmPmRules::All)
|
86
|
+
@reformatted_text = @reformatted_text.apply(Languages::Common::AmPmRules::All)
|
100
87
|
replace_abbreviation_as_sentence_boundary(@reformatted_text)
|
101
88
|
end
|
102
89
|
|
103
90
|
private
|
104
91
|
|
105
|
-
def scan_for_replacements(txt, am, index, character_array
|
106
|
-
|
107
|
-
end
|
108
|
-
|
109
|
-
def replace_abbr(txt, abbr)
|
110
|
-
txt.gsub(/(?<=#{abbr})\.(?=\s)/, '∯')
|
111
|
-
end
|
112
|
-
|
113
|
-
def abbreviations
|
114
|
-
PragmaticSegmenter::Languages::Deutsch::Abbreviation.new
|
92
|
+
def scan_for_replacements(txt, am, index, character_array)
|
93
|
+
txt.gsub(/(?<=#{am})\.(?=\s)/, '∯')
|
115
94
|
end
|
116
95
|
end
|
117
96
|
|
118
97
|
class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
|
119
|
-
# Rubular: http://rubular.com/r/OdcXBsub0w
|
120
|
-
BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX = /,,(?>[^“\\]+|\\{2}|\\.)*“/
|
121
|
-
|
122
|
-
# Rubular: http://rubular.com/r/2UskIupGgP
|
123
|
-
SPLIT_DOUBLE_QUOTES_DE_REGEX = /\A„(?>[^“\\]+|\\{2}|\\.)*“/
|
124
|
-
|
125
|
-
# Rubular: http://rubular.com/r/TkZomF9tTM
|
126
|
-
BETWEEN_DOUBLE_QUOTES_DE_REGEX = /„(?>[^“\\]+|\\{2}|\\.)*“/
|
127
98
|
private
|
128
99
|
|
129
100
|
def sub_punctuation_between_double_quotes(txt)
|