pragmatic_segmenter 0.3.1 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +8 -2
  3. data/lib/pragmatic_segmenter/abbreviation_replacer.rb +16 -51
  4. data/lib/pragmatic_segmenter/cleaner.rb +18 -99
  5. data/lib/pragmatic_segmenter/languages.rb +62 -0
  6. data/lib/pragmatic_segmenter/languages/amharic.rb +4 -30
  7. data/lib/pragmatic_segmenter/languages/arabic.rb +21 -64
  8. data/lib/pragmatic_segmenter/languages/armenian.rb +4 -30
  9. data/lib/pragmatic_segmenter/languages/burmese.rb +4 -30
  10. data/lib/pragmatic_segmenter/languages/chinese.rb +8 -0
  11. data/lib/pragmatic_segmenter/languages/common.rb +70 -1
  12. data/lib/pragmatic_segmenter/languages/deutsch.rb +49 -78
  13. data/lib/pragmatic_segmenter/languages/dutch.rb +5 -36
  14. data/lib/pragmatic_segmenter/languages/english.rb +3 -12
  15. data/lib/pragmatic_segmenter/languages/french.rb +5 -32
  16. data/lib/pragmatic_segmenter/languages/greek.rb +4 -26
  17. data/lib/pragmatic_segmenter/languages/hindi.rb +4 -30
  18. data/lib/pragmatic_segmenter/languages/italian.rb +3 -37
  19. data/lib/pragmatic_segmenter/languages/japanese.rb +6 -4
  20. data/lib/pragmatic_segmenter/languages/persian.rb +16 -40
  21. data/lib/pragmatic_segmenter/languages/polish.rb +6 -38
  22. data/lib/pragmatic_segmenter/languages/russian.rb +13 -33
  23. data/lib/pragmatic_segmenter/languages/spanish.rb +6 -31
  24. data/lib/pragmatic_segmenter/languages/urdu.rb +4 -30
  25. data/lib/pragmatic_segmenter/number.rb +5 -5
  26. data/lib/pragmatic_segmenter/process.rb +28 -49
  27. data/lib/pragmatic_segmenter/rules.rb +65 -1
  28. data/lib/pragmatic_segmenter/{ellipsis.rb → rules/ellipsis.rb} +0 -0
  29. data/lib/pragmatic_segmenter/rules/html.rb +13 -0
  30. data/lib/pragmatic_segmenter/segmenter.rb +12 -32
  31. data/lib/pragmatic_segmenter/version.rb +1 -1
  32. data/spec/pragmatic_segmenter_spec.rb +6 -7
  33. metadata +6 -8
  34. data/lib/pragmatic_segmenter/abbreviation.rb +0 -22
  35. data/lib/pragmatic_segmenter/language_support.rb +0 -31
  36. data/lib/pragmatic_segmenter/punctuation.rb +0 -12
  37. data/lib/pragmatic_segmenter/sentence_boundary_punctuation.rb +0 -17
  38. data/lib/pragmatic_segmenter/single_letter_abbreviation.rb +0 -37
@@ -1,84 +1,41 @@
1
1
  module PragmaticSegmenter
2
2
  module Languages
3
- class Arabic
4
- class Process < PragmaticSegmenter::Process
5
- private
6
-
7
- def sentence_boundary_punctuation(txt)
8
- PragmaticSegmenter::Languages::Arabic::SentenceBoundaryPunctuation.new(text: txt).split
9
- end
3
+ module Arabic
4
+ include Languages::Common
10
5
 
11
- def replace_abbreviations(txt)
12
- PragmaticSegmenter::Languages::Arabic::AbbreviationReplacer.new(text: txt).replace
13
- end
6
+ Punctuations = ['?', '!', ':', '.', '؟', '،']
7
+ SENTENCE_BOUNDARY_REGEX = /.*?[:\.!\?؟،]|.*?\z|.*?$/
14
8
 
15
- def punctuation_array
16
- PragmaticSegmenter::Languages::Arabic::Punctuation.new.punct
17
- end
18
- end
19
-
20
- class Cleaner < PragmaticSegmenter::Cleaner
9
+ module Abbreviation
10
+ ABBREVIATIONS = ['ا', 'ا. د', 'ا.د', 'ا.ش.ا', 'ا.ش.ا', 'إلخ', 'ت.ب', 'ت.ب', 'ج.ب', 'جم', 'ج.ب', 'ج.م.ع', 'ج.م.ع', 'س.ت', 'س.ت', 'سم', 'ص.ب.', 'ص.ب', 'كج.', 'كلم.', 'م', 'م.ب', 'م.ب', 'ه', 'د‪']
11
+ PREPOSITIVE_ABBREVIATIONS = []
12
+ NUMBER_ABBREVIATIONS = []
21
13
  end
22
14
 
23
- class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
24
- SENTENCE_BOUNDARY = /.*?[:\.!\?؟،]|.*?\z|.*?$/
15
+ # Rubular: http://rubular.com/r/RX5HpdDIyv
16
+ ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')
25
17
 
26
- # Rubular: http://rubular.com/r/RX5HpdDIyv
27
- ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')
28
-
29
- # Rubular: http://rubular.com/r/kPRgApNHUg
30
- ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
31
-
32
- def split
33
- txt = replace_non_sentence_boundary_punctuation(text)
34
- txt.scan(SENTENCE_BOUNDARY)
35
- end
18
+ # Rubular: http://rubular.com/r/kPRgApNHUg
19
+ ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
36
20
 
21
+ class Process < Process
37
22
  private
38
23
 
39
- def replace_non_sentence_boundary_punctuation(txt)
40
- txt.apply(ReplaceColonBetweenNumbersRule).
41
- apply(ReplaceNonSentenceBoundaryCommaRule)
42
- end
43
- end
44
-
45
- class Abbreviation < PragmaticSegmenter::Abbreviation
46
- ABBREVIATIONS = ['ا', 'ا. د', 'ا.د', 'ا.ش.ا', 'ا.ش.ا', 'إلخ', 'ت.ب', 'ت.ب', 'ج.ب', 'جم', 'ج.ب', 'ج.م.ع', 'ج.م.ع', 'س.ت', 'س.ت', 'سم', 'ص.ب.', 'ص.ب', 'كج.', 'كلم.', 'م', 'م.ب', 'م.ب', 'ه', 'د‪']
47
-
48
- def all
49
- ABBREVIATIONS
50
- end
51
-
52
- def prepositive
53
- []
24
+ def sentence_boundary_punctuation(txt)
25
+ txt = txt.apply(ReplaceColonBetweenNumbersRule, ReplaceNonSentenceBoundaryCommaRule)
26
+ txt.scan(SENTENCE_BOUNDARY_REGEX)
54
27
  end
55
28
 
56
- def number
57
- []
58
- end
59
- end
60
-
61
- class Punctuation < PragmaticSegmenter::Punctuation
62
- PUNCT = ['?', '!', ':', '.', '؟', '،']
63
-
64
- def punct
65
- PUNCT
29
+ def replace_abbreviations(txt)
30
+ AbbreviationReplacer.new(text: txt, language: Arabic).replace
66
31
  end
67
32
  end
68
33
 
69
- class AbbreviationReplacer < PragmaticSegmenter::AbbreviationReplacer
34
+ class AbbreviationReplacer < AbbreviationReplacer
70
35
  private
71
36
 
72
- def scan_for_replacements(txt, am, index, character_array, abbr)
73
- replace_abbr(txt, am)
74
- end
75
-
76
- def replace_abbr(txt, abbr)
77
- txt.gsub(/(?<=#{abbr})\./, '∯')
78
- end
79
-
80
- def abbreviations
81
- PragmaticSegmenter::Languages::Arabic::Abbreviation.new
37
+ def scan_for_replacements(txt, am, index, character_array)
38
+ txt.gsub(/(?<=#{am})\./, '∯')
82
39
  end
83
40
  end
84
41
  end
@@ -1,36 +1,10 @@
1
1
  module PragmaticSegmenter
2
2
  module Languages
3
- class Armenian
4
- class Process < PragmaticSegmenter::Process
5
- private
3
+ module Armenian
4
+ include Languages::Common
6
5
 
7
- def sentence_boundary_punctuation(txt)
8
- PragmaticSegmenter::Languages::Armenian::SentenceBoundaryPunctuation.new(text: txt).split
9
- end
10
-
11
- def punctuation_array
12
- PragmaticSegmenter::Languages::Armenian::Punctuation.new.punct
13
- end
14
- end
15
-
16
- class Cleaner < PragmaticSegmenter::Cleaner
17
- end
18
-
19
- class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
20
- SENTENCE_BOUNDARY = /.*?[։՜:]|.*?$/
21
-
22
- def split
23
- text.scan(SENTENCE_BOUNDARY)
24
- end
25
- end
26
-
27
- class Punctuation < PragmaticSegmenter::Punctuation
28
- PUNCT = ['։', '՜', ':']
29
-
30
- def punct
31
- PUNCT
32
- end
33
- end
6
+ SENTENCE_BOUNDARY_REGEX = /.*?[։՜:]|.*?$/
7
+ Punctuations = ['։', '՜', ':']
34
8
  end
35
9
  end
36
10
  end
@@ -1,36 +1,10 @@
1
1
  module PragmaticSegmenter
2
2
  module Languages
3
- class Burmese
4
- class Process < PragmaticSegmenter::Process
5
- private
3
+ module Burmese
4
+ include Languages::Common
6
5
 
7
- def sentence_boundary_punctuation(txt)
8
- PragmaticSegmenter::Languages::Burmese::SentenceBoundaryPunctuation.new(text: txt).split
9
- end
10
-
11
- def punctuation_array
12
- PragmaticSegmenter::Languages::Burmese::Punctuation.new.punct
13
- end
14
- end
15
-
16
- class Cleaner < PragmaticSegmenter::Cleaner
17
- end
18
-
19
- class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
20
- SENTENCE_BOUNDARY = /.*?[။၏!\?]|.*?$/
21
-
22
- def split
23
- text.scan(SENTENCE_BOUNDARY)
24
- end
25
- end
26
-
27
- class Punctuation < PragmaticSegmenter::Punctuation
28
- PUNCT = ['။', '၏', '?', '!']
29
-
30
- def punct
31
- PUNCT
32
- end
33
- end
6
+ SENTENCE_BOUNDARY_REGEX = /.*?[။၏!\?]|.*?$/
7
+ Punctuations = ['။', '၏', '?', '!']
34
8
  end
35
9
  end
36
10
  end
@@ -0,0 +1,8 @@
1
+ module PragmaticSegmenter
2
+ module Languages
3
+ module Chinese
4
+ include Languages::Common
5
+
6
+ end
7
+ end
8
+ end
@@ -1,6 +1,75 @@
1
1
  module PragmaticSegmenter
2
2
  module Languages
3
- class Common
3
+ module Common
4
+ # This class holds the punctuation marks.
5
+ Punctuations = ['。', '．', '.', '！', '!', '?', '？']
6
+
7
+ # Defines the abbreviations for each language (if available)
8
+ module Abbreviation
9
+ ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']
10
+ PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs']
11
+ NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']
12
+ end
13
+
14
+ SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*[^,]'(?=\s[A-Z])|"(?:[^"])*[^,]"(?=\s[A-Z])|“(?:[^”])*[^,]”(?=\s[A-Z])|\S.*?[。．.！!?？ȸȹ☉☈☇☄]/
15
+
16
+ include Rules
17
+ # Rubular: http://rubular.com/r/NqCqv372Ix
18
+ QUOTATION_AT_END_OF_SENTENCE_REGEX = /[!?\.-][\"\'\u{201d}\u{201c}]\s{1}[A-Z]/
19
+
20
+ # Rubular: http://rubular.com/r/6flGnUMEVl
21
+ PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = /["”]\s\(.*\)\s["“]/
22
+
23
+ # Rubular: http://rubular.com/r/TYzr4qOW1Q
24
+ BETWEEN_DOUBLE_QUOTES_REGEX = /"(?:[^"])*[^,]"|“(?:[^”])*[^,]”/
25
+
26
+ # Rubular: http://rubular.com/r/JMjlZHAT4g
27
+ SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = /(?<=[!?\.-][\"\'\u{201d}\u{201c}])\s{1}(?=[A-Z])/
28
+
29
+ # Rubular: http://rubular.com/r/mQ8Es9bxtk
30
+ CONTINUOUS_PUNCTUATION_REGEX = /(?<=\S)(!|\?){3,}(?=(\s|\z|$))/
31
+
32
+ # Rubular: http://rubular.com/r/yqa4Rit8EY
33
+ PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')
34
+
35
+ # Rubular: http://rubular.com/r/NEv265G2X2
36
+ KommanditgesellschaftRule = Rule.new(/(?<=Co)\.(?=\sKG)/, '∯')
37
+
38
+ # Rubular: http://rubular.com/r/xDkpFZ0EgH
39
+ MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i
40
+
41
+ module AmPmRules
42
+ # Rubular: http://rubular.com/r/Vnx3m4Spc8
43
+ UpperCasePmRule = Rule.new(/(?<=P∯M)∯(?=\s[A-Z])/, '.')
44
+
45
+ # Rubular: http://rubular.com/r/AJMCotJVbW
46
+ UpperCaseAmRule = Rule.new(/(?<=A∯M)∯(?=\s[A-Z])/, '.')
47
+
48
+ # Rubular: http://rubular.com/r/13q7SnOhgA
49
+ LowerCasePmRule = Rule.new(/(?<=p∯m)∯(?=\s[A-Z])/, '.')
50
+
51
+ # Rubular: http://rubular.com/r/DgUDq4mLz5
52
+ LowerCaseAmRule = Rule.new(/(?<=a∯m)∯(?=\s[A-Z])/, '.')
53
+
54
+ All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]
55
+ end
56
+
57
+ # This class searches for periods within an abbreviation and
58
+ # replaces the periods.
59
+ module SingleLetterAbbreviationRules
60
+ # Rubular: http://rubular.com/r/e3H6kwnr6H
61
+ SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=\s)/, '∯')
62
+
63
+ # Rubular: http://rubular.com/r/gitvf0YWH4
64
+ SingleUpperCaseLetterRule = Rule.new(/(?<=\s[A-Z])\.(?=\s)/, '∯')
65
+
66
+ All = [
67
+ SingleUpperCaseLetterAtStartOfLineRule,
68
+ SingleUpperCaseLetterRule
69
+ ]
70
+ end
71
+
72
+
4
73
  class Process < PragmaticSegmenter::Process
5
74
  end
6
75
  class Cleaner < PragmaticSegmenter::Cleaner
@@ -1,19 +1,51 @@
1
1
  module PragmaticSegmenter
2
2
  module Languages
3
- class Deutsch
3
+ module Deutsch
4
+ include Languages::Common
5
+
6
+ module Abbreviation
7
+ ABBREVIATIONS = ['Ä', 'ä', 'adj', 'adm', 'adv', 'art', 'asst', 'b.a', 'b.s', 'bart', 'bldg', 'brig', 'bros', 'bse', 'buchst', 'bzgl', 'bzw', 'c.-à-d', 'ca', 'capt', 'chr', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'cpl', 'd.h', 'd.j', 'dergl', 'dgl', 'dkr', 'dr ', 'ens', 'etc', 'ev ', 'evtl', 'ff', 'g.g.a', 'g.u', 'gen', 'ggf', 'gov', 'hon', 'hosp', 'i.f', 'i.h.v', 'ii', 'iii', 'insp', 'iv', 'ix', 'jun', 'k.o', 'kath ', 'lfd', 'lt', 'ltd', 'm.e', 'maj', 'med', 'messrs', 'mio', 'mlle', 'mm', 'mme', 'mr', 'mrd', 'mrs', 'ms', 'msgr', 'mwst', 'no', 'nos', 'nr', 'o.ä', 'op', 'ord', 'pfc', 'ph', 'pp', 'prof', 'pvt', 'rep', 'reps', 'res', 'rev', 'rt', 's.p.a', 'sa', 'sen', 'sens', 'sfc', 'sgt', 'sog', 'sogen', 'spp', 'sr', 'st', 'std', 'str ', 'supt', 'surg', 'u.a ', 'u.e', 'u.s.w', 'u.u', 'u.ä', 'usf', 'usw', 'v', 'vgl', 'vi', 'vii', 'viii', 'vs', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'z.b', 'z.t', 'z.z', 'z.zt', 'zt', 'zzt']
8
+ NUMBER_ABBREVIATIONS = ['art', 'ca', 'no', 'nos', 'nr', 'pp']
9
+ PREPOSITIVE_ABBREVIATIONS = []
10
+ end
11
+
12
+ # Rubular: http://rubular.com/r/OdcXBsub0w
13
+ BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX = /,,(?>[^“\\]+|\\{2}|\\.)*“/
14
+
15
+ # Rubular: http://rubular.com/r/2UskIupGgP
16
+ SPLIT_DOUBLE_QUOTES_DE_REGEX = /\A„(?>[^“\\]+|\\{2}|\\.)*“/
17
+
18
+ # Rubular: http://rubular.com/r/TkZomF9tTM
19
+ BETWEEN_DOUBLE_QUOTES_DE_REGEX = /„(?>[^“\\]+|\\{2}|\\.)*“/
20
+
21
+ # Rubular: http://rubular.com/r/hZxoyQwKT1
22
+ NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯')
23
+
24
+ # Rubular: http://rubular.com/r/ityNMwdghj
25
+ NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯')
26
+
27
+ MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember']
28
+
29
+ # Rubular: http://rubular.com/r/B4X33QKIL8
30
+ SingleLowerCaseLetterRule = Rule.new(/(?<=\s[a-z])\.(?=\s)/, '∯')
31
+
32
+ # Rubular: http://rubular.com/r/iUNSkCuso0
33
+ SingleLowerCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[a-z])\.(?=\s)/, '∯')
34
+
35
+
4
36
  class Process < PragmaticSegmenter::Process
5
37
  private
6
38
 
7
39
  def between_punctuation(txt)
8
- PragmaticSegmenter::Languages::Deutsch::BetweenPunctuation.new(text: txt).replace
40
+ BetweenPunctuation.new(text: txt).replace
9
41
  end
10
42
 
11
43
  def replace_numbers(txt)
12
- PragmaticSegmenter::Languages::Deutsch::Number.new(text: txt).replace
44
+ Number.new(text: txt).replace
13
45
  end
14
46
 
15
47
  def replace_abbreviations(txt)
16
- PragmaticSegmenter::Languages::Deutsch::AbbreviationReplacer.new(text: txt).replace
48
+ AbbreviationReplacer.new(text: txt, language: Deutsch).replace
17
49
  end
18
50
  end
19
51
 
@@ -21,27 +53,19 @@ module PragmaticSegmenter
21
53
  private
22
54
 
23
55
  def abbreviations
24
- PragmaticSegmenter::Languages::Deutsch::Abbreviation.new.all
56
+ Abbreviation::ABBREVIATIONS
25
57
  end
26
58
  end
27
59
 
28
60
  class Number < PragmaticSegmenter::Number
29
- # Rubular: http://rubular.com/r/hZxoyQwKT1
30
- NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯')
31
-
32
- # Rubular: http://rubular.com/r/ityNMwdghj
33
- NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯')
34
-
35
- DE_MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember']
36
-
37
61
  def replace
38
62
  super
39
- @text.apply(NumberPeriodSpaceRule).apply(NegativeNumberPeriodSpaceRule)
63
+ @text.apply(NumberPeriodSpaceRule, NegativeNumberPeriodSpaceRule)
40
64
  replace_period_in_deutsch_dates(@text)
41
65
  end
42
66
 
43
67
  def replace_period_in_deutsch_dates(txt)
44
- DE_MONTHS.each do |month|
68
+ MONTHS.each do |month|
45
69
  # Rubular: http://rubular.com/r/zlqgj7G5dA
46
70
  txt.gsub!(/(?<=\d)\.(?=\s*#{Regexp.escape(month)})/, '∯')
47
71
  end
@@ -49,81 +73,28 @@ module PragmaticSegmenter
49
73
  end
50
74
  end
51
75
 
52
- class SingleLetterAbbreviation < PragmaticSegmenter::SingleLetterAbbreviation
53
- # Rubular: http://rubular.com/r/B4X33QKIL8
54
- SingleLowerCaseLetterRule = Rule.new(/(?<=\s[a-z])\.(?=\s)/, '∯')
55
-
56
- # Rubular: http://rubular.com/r/iUNSkCuso0
57
- SingleLowerCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[a-z])\.(?=\s)/, '∯')
58
-
76
+ class AbbreviationReplacer < AbbreviationReplacer
59
77
  def replace
60
- super
61
- @formatted_text = replace_single_lowercase_letter(@formatted_text)
62
- replace_single_lowercase_letter_sol(@formatted_text)
63
- end
64
-
65
- private
78
+ @reformatted_text = text.apply(
79
+ @language::PossessiveAbbreviationRule,
80
+ @language::SingleLetterAbbreviationRules::All,
81
+ SingleLowerCaseLetterRule,
82
+ SingleLowerCaseLetterAtStartOfLineRule)
66
83
 
67
- def replace_single_lowercase_letter_sol(txt)
68
- txt.apply(SingleLowerCaseLetterRule)
69
- end
70
-
71
- def replace_single_lowercase_letter(txt)
72
- txt.apply(SingleLowerCaseLetterAtStartOfLineRule)
73
- end
74
- end
75
-
76
- class Abbreviation < PragmaticSegmenter::Abbreviation
77
- ABBREVIATIONS = ['Ä', 'ä', 'adj', 'adm', 'adv', 'art', 'asst', 'b.a', 'b.s', 'bart', 'bldg', 'brig', 'bros', 'bse', 'buchst', 'bzgl', 'bzw', 'c.-à-d', 'ca', 'capt', 'chr', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'cpl', 'd.h', 'd.j', 'dergl', 'dgl', 'dkr', 'dr ', 'ens', 'etc', 'ev ', 'evtl', 'ff', 'g.g.a', 'g.u', 'gen', 'ggf', 'gov', 'hon', 'hosp', 'i.f', 'i.h.v', 'ii', 'iii', 'insp', 'iv', 'ix', 'jun', 'k.o', 'kath ', 'lfd', 'lt', 'ltd', 'm.e', 'maj', 'med', 'messrs', 'mio', 'mlle', 'mm', 'mme', 'mr', 'mrd', 'mrs', 'ms', 'msgr', 'mwst', 'no', 'nos', 'nr', 'o.ä', 'op', 'ord', 'pfc', 'ph', 'pp', 'prof', 'pvt', 'rep', 'reps', 'res', 'rev', 'rt', 's.p.a', 'sa', 'sen', 'sens', 'sfc', 'sgt', 'sog', 'sogen', 'spp', 'sr', 'st', 'std', 'str ', 'supt', 'surg', 'u.a ', 'u.e', 'u.s.w', 'u.u', 'u.ä', 'usf', 'usw', 'v', 'vgl', 'vi', 'vii', 'viii', 'vs', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'z.b', 'z.t', 'z.z', 'z.zt', 'zt', 'zzt']
78
- NUMBER_ABBREVIATIONS = ['art', 'ca', 'no', 'nos', 'nr', 'pp']
79
-
80
- def all
81
- ABBREVIATIONS
82
- end
83
-
84
- def prepositive
85
- []
86
- end
87
-
88
- def number
89
- NUMBER_ABBREVIATIONS
90
- end
91
- end
92
-
93
- class AbbreviationReplacer < PragmaticSegmenter::AbbreviationReplacer
94
- def replace
95
- @reformatted_text = text.apply(PossessiveAbbreviationRule)
96
- @reformatted_text = PragmaticSegmenter::Languages::Deutsch::SingleLetterAbbreviation.new(text: @reformatted_text).replace
97
- @reformatted_text = search_for_abbreviations_in_string(@reformatted_text, abbreviations)
84
+ @reformatted_text = search_for_abbreviations_in_string(@reformatted_text)
98
85
  @reformatted_text = replace_multi_period_abbreviations(@reformatted_text)
99
- @reformatted_text = @reformatted_text.apply(AmPmRules::All)
86
+ @reformatted_text = @reformatted_text.apply(Languages::Common::AmPmRules::All)
100
87
  replace_abbreviation_as_sentence_boundary(@reformatted_text)
101
88
  end
102
89
 
103
90
  private
104
91
 
105
- def scan_for_replacements(txt, am, index, character_array, abbr)
106
- replace_abbr(txt, am)
107
- end
108
-
109
- def replace_abbr(txt, abbr)
110
- txt.gsub(/(?<=#{abbr})\.(?=\s)/, '∯')
111
- end
112
-
113
- def abbreviations
114
- PragmaticSegmenter::Languages::Deutsch::Abbreviation.new
92
+ def scan_for_replacements(txt, am, index, character_array)
93
+ txt.gsub(/(?<=#{am})\.(?=\s)/, '∯')
115
94
  end
116
95
  end
117
96
 
118
97
  class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
119
- # Rubular: http://rubular.com/r/OdcXBsub0w
120
- BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX = /,,(?>[^“\\]+|\\{2}|\\.)*“/
121
-
122
- # Rubular: http://rubular.com/r/2UskIupGgP
123
- SPLIT_DOUBLE_QUOTES_DE_REGEX = /\A„(?>[^“\\]+|\\{2}|\\.)*“/
124
-
125
- # Rubular: http://rubular.com/r/TkZomF9tTM
126
- BETWEEN_DOUBLE_QUOTES_DE_REGEX = /„(?>[^“\\]+|\\{2}|\\.)*“/
127
98
  private
128
99
 
129
100
  def sub_punctuation_between_double_quotes(txt)