pragmatic_segmenter 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/README.md +8 -2
 - data/lib/pragmatic_segmenter/abbreviation_replacer.rb +16 -51
 - data/lib/pragmatic_segmenter/cleaner.rb +18 -99
 - data/lib/pragmatic_segmenter/languages.rb +62 -0
 - data/lib/pragmatic_segmenter/languages/amharic.rb +4 -30
 - data/lib/pragmatic_segmenter/languages/arabic.rb +21 -64
 - data/lib/pragmatic_segmenter/languages/armenian.rb +4 -30
 - data/lib/pragmatic_segmenter/languages/burmese.rb +4 -30
 - data/lib/pragmatic_segmenter/languages/chinese.rb +8 -0
 - data/lib/pragmatic_segmenter/languages/common.rb +70 -1
 - data/lib/pragmatic_segmenter/languages/deutsch.rb +49 -78
 - data/lib/pragmatic_segmenter/languages/dutch.rb +5 -36
 - data/lib/pragmatic_segmenter/languages/english.rb +3 -12
 - data/lib/pragmatic_segmenter/languages/french.rb +5 -32
 - data/lib/pragmatic_segmenter/languages/greek.rb +4 -26
 - data/lib/pragmatic_segmenter/languages/hindi.rb +4 -30
 - data/lib/pragmatic_segmenter/languages/italian.rb +3 -37
 - data/lib/pragmatic_segmenter/languages/japanese.rb +6 -4
 - data/lib/pragmatic_segmenter/languages/persian.rb +16 -40
 - data/lib/pragmatic_segmenter/languages/polish.rb +6 -38
 - data/lib/pragmatic_segmenter/languages/russian.rb +13 -33
 - data/lib/pragmatic_segmenter/languages/spanish.rb +6 -31
 - data/lib/pragmatic_segmenter/languages/urdu.rb +4 -30
 - data/lib/pragmatic_segmenter/number.rb +5 -5
 - data/lib/pragmatic_segmenter/process.rb +28 -49
 - data/lib/pragmatic_segmenter/rules.rb +65 -1
 - data/lib/pragmatic_segmenter/{ellipsis.rb → rules/ellipsis.rb} +0 -0
 - data/lib/pragmatic_segmenter/rules/html.rb +13 -0
 - data/lib/pragmatic_segmenter/segmenter.rb +12 -32
 - data/lib/pragmatic_segmenter/version.rb +1 -1
 - data/spec/pragmatic_segmenter_spec.rb +6 -7
 - metadata +6 -8
 - data/lib/pragmatic_segmenter/abbreviation.rb +0 -22
 - data/lib/pragmatic_segmenter/language_support.rb +0 -31
 - data/lib/pragmatic_segmenter/punctuation.rb +0 -12
 - data/lib/pragmatic_segmenter/sentence_boundary_punctuation.rb +0 -17
 - data/lib/pragmatic_segmenter/single_letter_abbreviation.rb +0 -37
 
| 
         @@ -1,84 +1,41 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module PragmaticSegmenter
         
     | 
| 
       2 
2 
     | 
    
         
             
              module Languages
         
     | 
| 
       3 
     | 
    
         
            -
                 
     | 
| 
       4 
     | 
    
         
            -
                   
     | 
| 
       5 
     | 
    
         
            -
                    private
         
     | 
| 
       6 
     | 
    
         
            -
             
     | 
| 
       7 
     | 
    
         
            -
                    def sentence_boundary_punctuation(txt)
         
     | 
| 
       8 
     | 
    
         
            -
                      PragmaticSegmenter::Languages::Arabic::SentenceBoundaryPunctuation.new(text: txt).split
         
     | 
| 
       9 
     | 
    
         
            -
                    end
         
     | 
| 
      
 3 
     | 
    
         
            +
                module Arabic
         
     | 
| 
      
 4 
     | 
    
         
            +
                  include Languages::Common
         
     | 
| 
       10 
5 
     | 
    
         | 
| 
       11 
     | 
    
         
            -
             
     | 
| 
       12 
     | 
    
         
            -
             
     | 
| 
       13 
     | 
    
         
            -
                    end
         
     | 
| 
      
 6 
     | 
    
         
            +
                  Punctuations = ['?', '!', ':', '.', '؟', '،']
         
     | 
| 
      
 7 
     | 
    
         
            +
                  SENTENCE_BOUNDARY_REGEX = /.*?[:\.!\?؟،]|.*?\z|.*?$/
         
     | 
| 
       14 
8 
     | 
    
         | 
| 
       15 
     | 
    
         
            -
             
     | 
| 
       16 
     | 
    
         
            -
             
     | 
| 
       17 
     | 
    
         
            -
                     
     | 
| 
       18 
     | 
    
         
            -
             
     | 
| 
       19 
     | 
    
         
            -
             
     | 
| 
       20 
     | 
    
         
            -
                  class Cleaner < PragmaticSegmenter::Cleaner
         
     | 
| 
      
 9 
     | 
    
         
            +
                  module Abbreviation
         
     | 
| 
      
 10 
     | 
    
         
            +
                    ABBREVIATIONS = ['ا', 'ا. د', 'ا.د', 'ا.ش.ا', 'ا.ش.ا', 'إلخ', 'ت.ب', 'ت.ب', 'ج.ب', 'جم', 'ج.ب', 'ج.م.ع', 'ج.م.ع', 'س.ت', 'س.ت', 'سم', 'ص.ب.', 'ص.ب', 'كج.', 'كلم.', 'م', 'م.ب', 'م.ب', 'ه', 'د']
         
     | 
| 
      
 11 
     | 
    
         
            +
                    PREPOSITIVE_ABBREVIATIONS = []
         
     | 
| 
      
 12 
     | 
    
         
            +
                    NUMBER_ABBREVIATIONS = []
         
     | 
| 
       21 
13 
     | 
    
         
             
                  end
         
     | 
| 
       22 
14 
     | 
    
         | 
| 
       23 
     | 
    
         
            -
                   
     | 
| 
       24 
     | 
    
         
            -
             
     | 
| 
      
 15 
     | 
    
         
            +
                  # Rubular: http://rubular.com/r/RX5HpdDIyv
         
     | 
| 
      
 16 
     | 
    
         
            +
                  ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')
         
     | 
| 
       25 
17 
     | 
    
         | 
| 
       26 
     | 
    
         
            -
             
     | 
| 
       27 
     | 
    
         
            -
             
     | 
| 
       28 
     | 
    
         
            -
             
     | 
| 
       29 
     | 
    
         
            -
                    # Rubular: http://rubular.com/r/kPRgApNHUg
         
     | 
| 
       30 
     | 
    
         
            -
                    ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
         
     | 
| 
       31 
     | 
    
         
            -
             
     | 
| 
       32 
     | 
    
         
            -
                    def split
         
     | 
| 
       33 
     | 
    
         
            -
                      txt = replace_non_sentence_boundary_punctuation(text)
         
     | 
| 
       34 
     | 
    
         
            -
                      txt.scan(SENTENCE_BOUNDARY)
         
     | 
| 
       35 
     | 
    
         
            -
                    end
         
     | 
| 
      
 18 
     | 
    
         
            +
                  # Rubular: http://rubular.com/r/kPRgApNHUg
         
     | 
| 
      
 19 
     | 
    
         
            +
                  ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
         
     | 
| 
       36 
20 
     | 
    
         | 
| 
      
 21 
     | 
    
         
            +
                  class Process < Process
         
     | 
| 
       37 
22 
     | 
    
         
             
                    private
         
     | 
| 
       38 
23 
     | 
    
         | 
| 
       39 
     | 
    
         
            -
                    def  
     | 
| 
       40 
     | 
    
         
            -
                      txt.apply(ReplaceColonBetweenNumbersRule) 
     | 
| 
       41 
     | 
    
         
            -
             
     | 
| 
       42 
     | 
    
         
            -
                    end
         
     | 
| 
       43 
     | 
    
         
            -
                  end
         
     | 
| 
       44 
     | 
    
         
            -
             
     | 
| 
       45 
     | 
    
         
            -
                  class Abbreviation < PragmaticSegmenter::Abbreviation
         
     | 
| 
       46 
     | 
    
         
            -
                    ABBREVIATIONS = ['ا', 'ا. د', 'ا.د', 'ا.ش.ا', 'ا.ش.ا', 'إلخ', 'ت.ب', 'ت.ب', 'ج.ب', 'جم', 'ج.ب', 'ج.م.ع', 'ج.م.ع', 'س.ت', 'س.ت', 'سم', 'ص.ب.', 'ص.ب', 'كج.', 'كلم.', 'م', 'م.ب', 'م.ب', 'ه', 'د']
         
     | 
| 
       47 
     | 
    
         
            -
             
     | 
| 
       48 
     | 
    
         
            -
                    def all
         
     | 
| 
       49 
     | 
    
         
            -
                      ABBREVIATIONS
         
     | 
| 
       50 
     | 
    
         
            -
                    end
         
     | 
| 
       51 
     | 
    
         
            -
             
     | 
| 
       52 
     | 
    
         
            -
                    def prepositive
         
     | 
| 
       53 
     | 
    
         
            -
                      []
         
     | 
| 
      
 24 
     | 
    
         
            +
                    def sentence_boundary_punctuation(txt)
         
     | 
| 
      
 25 
     | 
    
         
            +
                      txt = txt.apply(ReplaceColonBetweenNumbersRule, ReplaceNonSentenceBoundaryCommaRule)
         
     | 
| 
      
 26 
     | 
    
         
            +
                      txt.scan(SENTENCE_BOUNDARY_REGEX)
         
     | 
| 
       54 
27 
     | 
    
         
             
                    end
         
     | 
| 
       55 
28 
     | 
    
         | 
| 
       56 
     | 
    
         
            -
                    def  
     | 
| 
       57 
     | 
    
         
            -
                       
     | 
| 
       58 
     | 
    
         
            -
                    end
         
     | 
| 
       59 
     | 
    
         
            -
                  end
         
     | 
| 
       60 
     | 
    
         
            -
             
     | 
| 
       61 
     | 
    
         
            -
                  class Punctuation < PragmaticSegmenter::Punctuation
         
     | 
| 
       62 
     | 
    
         
            -
                    PUNCT = ['?', '!', ':', '.', '؟', '،']
         
     | 
| 
       63 
     | 
    
         
            -
             
     | 
| 
       64 
     | 
    
         
            -
                    def punct
         
     | 
| 
       65 
     | 
    
         
            -
                      PUNCT
         
     | 
| 
      
 29 
     | 
    
         
            +
                    def replace_abbreviations(txt)
         
     | 
| 
      
 30 
     | 
    
         
            +
                      AbbreviationReplacer.new(text: txt, language: Arabic).replace
         
     | 
| 
       66 
31 
     | 
    
         
             
                    end
         
     | 
| 
       67 
32 
     | 
    
         
             
                  end
         
     | 
| 
       68 
33 
     | 
    
         | 
| 
       69 
     | 
    
         
            -
                  class AbbreviationReplacer  <  
     | 
| 
      
 34 
     | 
    
         
            +
                  class AbbreviationReplacer  < AbbreviationReplacer
         
     | 
| 
       70 
35 
     | 
    
         
             
                    private
         
     | 
| 
       71 
36 
     | 
    
         | 
| 
       72 
     | 
    
         
            -
                    def scan_for_replacements(txt, am, index, character_array 
     | 
| 
       73 
     | 
    
         
            -
                       
     | 
| 
       74 
     | 
    
         
            -
                    end
         
     | 
| 
       75 
     | 
    
         
            -
             
     | 
| 
       76 
     | 
    
         
            -
                    def replace_abbr(txt, abbr)
         
     | 
| 
       77 
     | 
    
         
            -
                      txt.gsub(/(?<=#{abbr})\./, '∯')
         
     | 
| 
       78 
     | 
    
         
            -
                    end
         
     | 
| 
       79 
     | 
    
         
            -
             
     | 
| 
       80 
     | 
    
         
            -
                    def abbreviations
         
     | 
| 
       81 
     | 
    
         
            -
                      PragmaticSegmenter::Languages::Arabic::Abbreviation.new
         
     | 
| 
      
 37 
     | 
    
         
            +
                    def scan_for_replacements(txt, am, index, character_array)
         
     | 
| 
      
 38 
     | 
    
         
            +
                      txt.gsub(/(?<=#{am})\./, '∯')
         
     | 
| 
       82 
39 
     | 
    
         
             
                    end
         
     | 
| 
       83 
40 
     | 
    
         
             
                  end
         
     | 
| 
       84 
41 
     | 
    
         
             
                end
         
     | 
| 
         @@ -1,36 +1,10 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module PragmaticSegmenter
         
     | 
| 
       2 
2 
     | 
    
         
             
              module Languages
         
     | 
| 
       3 
     | 
    
         
            -
                 
     | 
| 
       4 
     | 
    
         
            -
                   
     | 
| 
       5 
     | 
    
         
            -
                    private
         
     | 
| 
      
 3 
     | 
    
         
            +
                module Armenian
         
     | 
| 
      
 4 
     | 
    
         
            +
                  include Languages::Common
         
     | 
| 
       6 
5 
     | 
    
         | 
| 
       7 
     | 
    
         
            -
             
     | 
| 
       8 
     | 
    
         
            -
             
     | 
| 
       9 
     | 
    
         
            -
                    end
         
     | 
| 
       10 
     | 
    
         
            -
             
     | 
| 
       11 
     | 
    
         
            -
                    def punctuation_array
         
     | 
| 
       12 
     | 
    
         
            -
                      PragmaticSegmenter::Languages::Armenian::Punctuation.new.punct
         
     | 
| 
       13 
     | 
    
         
            -
                    end
         
     | 
| 
       14 
     | 
    
         
            -
                  end
         
     | 
| 
       15 
     | 
    
         
            -
             
     | 
| 
       16 
     | 
    
         
            -
                  class Cleaner < PragmaticSegmenter::Cleaner
         
     | 
| 
       17 
     | 
    
         
            -
                  end
         
     | 
| 
       18 
     | 
    
         
            -
             
     | 
| 
       19 
     | 
    
         
            -
                  class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
         
     | 
| 
       20 
     | 
    
         
            -
                    SENTENCE_BOUNDARY = /.*?[։՜:]|.*?$/
         
     | 
| 
       21 
     | 
    
         
            -
             
     | 
| 
       22 
     | 
    
         
            -
                    def split
         
     | 
| 
       23 
     | 
    
         
            -
                      text.scan(SENTENCE_BOUNDARY)
         
     | 
| 
       24 
     | 
    
         
            -
                    end
         
     | 
| 
       25 
     | 
    
         
            -
                  end
         
     | 
| 
       26 
     | 
    
         
            -
             
     | 
| 
       27 
     | 
    
         
            -
                  class Punctuation < PragmaticSegmenter::Punctuation
         
     | 
| 
       28 
     | 
    
         
            -
                    PUNCT = ['։', '՜', ':']
         
     | 
| 
       29 
     | 
    
         
            -
             
     | 
| 
       30 
     | 
    
         
            -
                    def punct
         
     | 
| 
       31 
     | 
    
         
            -
                      PUNCT
         
     | 
| 
       32 
     | 
    
         
            -
                    end
         
     | 
| 
       33 
     | 
    
         
            -
                  end
         
     | 
| 
      
 6 
     | 
    
         
            +
                  SENTENCE_BOUNDARY_REGEX = /.*?[։՜:]|.*?$/
         
     | 
| 
      
 7 
     | 
    
         
            +
                  Punctuations = ['։', '՜', ':']
         
     | 
| 
       34 
8 
     | 
    
         
             
                end
         
     | 
| 
       35 
9 
     | 
    
         
             
              end
         
     | 
| 
       36 
10 
     | 
    
         
             
            end
         
     | 
| 
         @@ -1,36 +1,10 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module PragmaticSegmenter
         
     | 
| 
       2 
2 
     | 
    
         
             
              module Languages
         
     | 
| 
       3 
     | 
    
         
            -
                 
     | 
| 
       4 
     | 
    
         
            -
                   
     | 
| 
       5 
     | 
    
         
            -
                    private
         
     | 
| 
      
 3 
     | 
    
         
            +
                module Burmese
         
     | 
| 
      
 4 
     | 
    
         
            +
                  include Languages::Common
         
     | 
| 
       6 
5 
     | 
    
         | 
| 
       7 
     | 
    
         
            -
             
     | 
| 
       8 
     | 
    
         
            -
             
     | 
| 
       9 
     | 
    
         
            -
                    end
         
     | 
| 
       10 
     | 
    
         
            -
             
     | 
| 
       11 
     | 
    
         
            -
                    def punctuation_array
         
     | 
| 
       12 
     | 
    
         
            -
                      PragmaticSegmenter::Languages::Burmese::Punctuation.new.punct
         
     | 
| 
       13 
     | 
    
         
            -
                    end
         
     | 
| 
       14 
     | 
    
         
            -
                  end
         
     | 
| 
       15 
     | 
    
         
            -
             
     | 
| 
       16 
     | 
    
         
            -
                  class Cleaner < PragmaticSegmenter::Cleaner
         
     | 
| 
       17 
     | 
    
         
            -
                  end
         
     | 
| 
       18 
     | 
    
         
            -
             
     | 
| 
       19 
     | 
    
         
            -
                  class SentenceBoundaryPunctuation < PragmaticSegmenter::SentenceBoundaryPunctuation
         
     | 
| 
       20 
     | 
    
         
            -
                    SENTENCE_BOUNDARY = /.*?[။၏!\?]|.*?$/
         
     | 
| 
       21 
     | 
    
         
            -
             
     | 
| 
       22 
     | 
    
         
            -
                    def split
         
     | 
| 
       23 
     | 
    
         
            -
                      text.scan(SENTENCE_BOUNDARY)
         
     | 
| 
       24 
     | 
    
         
            -
                    end
         
     | 
| 
       25 
     | 
    
         
            -
                  end
         
     | 
| 
       26 
     | 
    
         
            -
             
     | 
| 
       27 
     | 
    
         
            -
                  class Punctuation < PragmaticSegmenter::Punctuation
         
     | 
| 
       28 
     | 
    
         
            -
                    PUNCT = ['။', '၏', '?', '!']
         
     | 
| 
       29 
     | 
    
         
            -
             
     | 
| 
       30 
     | 
    
         
            -
                    def punct
         
     | 
| 
       31 
     | 
    
         
            -
                      PUNCT
         
     | 
| 
       32 
     | 
    
         
            -
                    end
         
     | 
| 
       33 
     | 
    
         
            -
                  end
         
     | 
| 
      
 6 
     | 
    
         
            +
                  SENTENCE_BOUNDARY_REGEX = /.*?[။၏!\?]|.*?$/
         
     | 
| 
      
 7 
     | 
    
         
            +
                  Punctuations = ['။', '၏', '?', '!']
         
     | 
| 
       34 
8 
     | 
    
         
             
                end
         
     | 
| 
       35 
9 
     | 
    
         
             
              end
         
     | 
| 
       36 
10 
     | 
    
         
             
            end
         
     | 
| 
         @@ -1,6 +1,75 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module PragmaticSegmenter
         
     | 
| 
       2 
2 
     | 
    
         
             
              module Languages
         
     | 
| 
       3 
     | 
    
         
            -
                 
     | 
| 
      
 3 
     | 
    
         
            +
                module Common
         
     | 
| 
      
 4 
     | 
    
         
            +
                  # This class holds the punctuation marks.
         
     | 
| 
      
 5 
     | 
    
         
            +
                  Punctuations = ['。', '.', '.', '!', '!', '?', '?']
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
                  # Defines the abbreviations for each language (if available)
         
     | 
| 
      
 8 
     | 
    
         
            +
                  module Abbreviation
         
     | 
| 
      
 9 
     | 
    
         
            +
                    ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']
         
     | 
| 
      
 10 
     | 
    
         
            +
                    PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs']
         
     | 
| 
      
 11 
     | 
    
         
            +
                    NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']
         
     | 
| 
      
 12 
     | 
    
         
            +
                  end
         
     | 
| 
      
 13 
     | 
    
         
            +
             
     | 
| 
      
 14 
     | 
    
         
            +
                  SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*[^,]'(?=\s[A-Z])|"(?:[^"])*[^,]"(?=\s[A-Z])|“(?:[^”])*[^,]”(?=\s[A-Z])|\S.*?[。..!!??ȸȹ☉☈☇☄]/
         
     | 
| 
      
 15 
     | 
    
         
            +
             
     | 
| 
      
 16 
     | 
    
         
            +
                  include Rules
         
     | 
| 
      
 17 
     | 
    
         
            +
                  # Rubular: http://rubular.com/r/NqCqv372Ix
         
     | 
| 
      
 18 
     | 
    
         
            +
                  QUOTATION_AT_END_OF_SENTENCE_REGEX = /[!?\.-][\"\'\u{201d}\u{201c}]\s{1}[A-Z]/
         
     | 
| 
      
 19 
     | 
    
         
            +
             
     | 
| 
      
 20 
     | 
    
         
            +
                  # Rubular: http://rubular.com/r/6flGnUMEVl
         
     | 
| 
      
 21 
     | 
    
         
            +
                  PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = /["”]\s\(.*\)\s["“]/
         
     | 
| 
      
 22 
     | 
    
         
            +
             
     | 
| 
      
 23 
     | 
    
         
            +
                  # Rubular: http://rubular.com/r/TYzr4qOW1Q
         
     | 
| 
      
 24 
     | 
    
         
            +
                  BETWEEN_DOUBLE_QUOTES_REGEX = /"(?:[^"])*[^,]"|“(?:[^”])*[^,]”/
         
     | 
| 
      
 25 
     | 
    
         
            +
             
     | 
| 
      
 26 
     | 
    
         
            +
                  # Rubular: http://rubular.com/r/JMjlZHAT4g
         
     | 
| 
      
 27 
     | 
    
         
            +
                  SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = /(?<=[!?\.-][\"\'\u{201d}\u{201c}])\s{1}(?=[A-Z])/
         
     | 
| 
      
 28 
     | 
    
         
            +
             
     | 
| 
      
 29 
     | 
    
         
            +
                  # Rubular: http://rubular.com/r/mQ8Es9bxtk
         
     | 
| 
      
 30 
     | 
    
         
            +
                  CONTINUOUS_PUNCTUATION_REGEX = /(?<=\S)(!|\?){3,}(?=(\s|\z|$))/
         
     | 
| 
      
 31 
     | 
    
         
            +
             
     | 
| 
      
 32 
     | 
    
         
            +
                  # Rubular: http://rubular.com/r/yqa4Rit8EY
         
     | 
| 
      
 33 
     | 
    
         
            +
                  PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')
         
     | 
| 
      
 34 
     | 
    
         
            +
             
     | 
| 
      
 35 
     | 
    
         
            +
                  # Rubular: http://rubular.com/r/NEv265G2X2
         
     | 
| 
      
 36 
     | 
    
         
            +
                  KommanditgesellschaftRule = Rule.new(/(?<=Co)\.(?=\sKG)/, '∯')
         
     | 
| 
      
 37 
     | 
    
         
            +
             
     | 
| 
      
 38 
     | 
    
         
            +
                  # Rubular: http://rubular.com/r/xDkpFZ0EgH
         
     | 
| 
      
 39 
     | 
    
         
            +
                  MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i
         
     | 
| 
      
 40 
     | 
    
         
            +
             
     | 
| 
      
 41 
     | 
    
         
            +
                  module AmPmRules
         
     | 
| 
      
 42 
     | 
    
         
            +
                    # Rubular: http://rubular.com/r/Vnx3m4Spc8
         
     | 
| 
      
 43 
     | 
    
         
            +
                    UpperCasePmRule = Rule.new(/(?<=P∯M)∯(?=\s[A-Z])/, '.')
         
     | 
| 
      
 44 
     | 
    
         
            +
             
     | 
| 
      
 45 
     | 
    
         
            +
                    # Rubular: http://rubular.com/r/AJMCotJVbW
         
     | 
| 
      
 46 
     | 
    
         
            +
                    UpperCaseAmRule = Rule.new(/(?<=A∯M)∯(?=\s[A-Z])/, '.')
         
     | 
| 
      
 47 
     | 
    
         
            +
             
     | 
| 
      
 48 
     | 
    
         
            +
                    # Rubular: http://rubular.com/r/13q7SnOhgA
         
     | 
| 
      
 49 
     | 
    
         
            +
                    LowerCasePmRule = Rule.new(/(?<=p∯m)∯(?=\s[A-Z])/, '.')
         
     | 
| 
      
 50 
     | 
    
         
            +
             
     | 
| 
      
 51 
     | 
    
         
            +
                    # Rubular: http://rubular.com/r/DgUDq4mLz5
         
     | 
| 
      
 52 
     | 
    
         
            +
                    LowerCaseAmRule = Rule.new(/(?<=a∯m)∯(?=\s[A-Z])/, '.')
         
     | 
| 
      
 53 
     | 
    
         
            +
             
     | 
| 
      
 54 
     | 
    
         
            +
                    All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]
         
     | 
| 
      
 55 
     | 
    
         
            +
                  end
         
     | 
| 
      
 56 
     | 
    
         
            +
             
     | 
| 
      
 57 
     | 
    
         
            +
                  # This class searches for periods within an abbreviation and
         
     | 
| 
      
 58 
     | 
    
         
            +
                  # replaces the periods.
         
     | 
| 
      
 59 
     | 
    
         
            +
                  module SingleLetterAbbreviationRules
         
     | 
| 
      
 60 
     | 
    
         
            +
                    # Rubular: http://rubular.com/r/e3H6kwnr6H
         
     | 
| 
      
 61 
     | 
    
         
            +
                    SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=\s)/, '∯')
         
     | 
| 
      
 62 
     | 
    
         
            +
             
     | 
| 
      
 63 
     | 
    
         
            +
                    # Rubular: http://rubular.com/r/gitvf0YWH4
         
     | 
| 
      
 64 
     | 
    
         
            +
                    SingleUpperCaseLetterRule = Rule.new(/(?<=\s[A-Z])\.(?=\s)/, '∯')
         
     | 
| 
      
 65 
     | 
    
         
            +
             
     | 
| 
      
 66 
     | 
    
         
            +
                    All = [
         
     | 
| 
      
 67 
     | 
    
         
            +
                      SingleUpperCaseLetterAtStartOfLineRule,
         
     | 
| 
      
 68 
     | 
    
         
            +
                      SingleUpperCaseLetterRule
         
     | 
| 
      
 69 
     | 
    
         
            +
                    ]
         
     | 
| 
      
 70 
     | 
    
         
            +
                  end
         
     | 
| 
      
 71 
     | 
    
         
            +
             
     | 
| 
      
 72 
     | 
    
         
            +
             
     | 
| 
       4 
73 
     | 
    
         
             
                  class Process < PragmaticSegmenter::Process
         
     | 
| 
       5 
74 
     | 
    
         
             
                  end
         
     | 
| 
       6 
75 
     | 
    
         
             
                  class Cleaner < PragmaticSegmenter::Cleaner
         
     | 
| 
         @@ -1,19 +1,51 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module PragmaticSegmenter
         
     | 
| 
       2 
2 
     | 
    
         
             
              module Languages
         
     | 
| 
       3 
     | 
    
         
            -
                 
     | 
| 
      
 3 
     | 
    
         
            +
                module Deutsch
         
     | 
| 
      
 4 
     | 
    
         
            +
                  include Languages::Common
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
                  module Abbreviation
         
     | 
| 
      
 7 
     | 
    
         
            +
                    ABBREVIATIONS = ['Ä', 'ä', 'adj', 'adm', 'adv', 'art', 'asst', 'b.a', 'b.s', 'bart', 'bldg', 'brig', 'bros', 'bse', 'buchst', 'bzgl', 'bzw', 'c.-à-d', 'ca', 'capt', 'chr', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'cpl', 'd.h', 'd.j', 'dergl', 'dgl', 'dkr', 'dr ', 'ens', 'etc', 'ev ', 'evtl', 'ff', 'g.g.a', 'g.u', 'gen', 'ggf', 'gov', 'hon', 'hosp', 'i.f', 'i.h.v', 'ii', 'iii', 'insp', 'iv', 'ix', 'jun', 'k.o', 'kath ', 'lfd', 'lt', 'ltd', 'm.e', 'maj', 'med', 'messrs', 'mio', 'mlle', 'mm', 'mme', 'mr', 'mrd', 'mrs', 'ms', 'msgr', 'mwst', 'no', 'nos', 'nr', 'o.ä', 'op', 'ord', 'pfc', 'ph', 'pp', 'prof', 'pvt', 'rep', 'reps', 'res', 'rev', 'rt', 's.p.a', 'sa', 'sen', 'sens', 'sfc', 'sgt', 'sog', 'sogen', 'spp', 'sr', 'st', 'std', 'str  ', 'supt', 'surg', 'u.a  ', 'u.e', 'u.s.w', 'u.u', 'u.ä', 'usf', 'usw', 'v', 'vgl', 'vi', 'vii', 'viii', 'vs', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'z.b', 'z.t', 'z.z', 'z.zt', 'zt', 'zzt']
         
     | 
| 
      
 8 
     | 
    
         
            +
                    NUMBER_ABBREVIATIONS = ['art', 'ca', 'no', 'nos', 'nr', 'pp']
         
     | 
| 
      
 9 
     | 
    
         
            +
                    PREPOSITIVE_ABBREVIATIONS = []
         
     | 
| 
      
 10 
     | 
    
         
            +
                  end
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
                  # Rubular: http://rubular.com/r/OdcXBsub0w
         
     | 
| 
      
 13 
     | 
    
         
            +
                  BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX = /,,(?>[^“\\]+|\\{2}|\\.)*“/
         
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
      
 15 
     | 
    
         
            +
                  # Rubular: http://rubular.com/r/2UskIupGgP
         
     | 
| 
      
 16 
     | 
    
         
            +
                  SPLIT_DOUBLE_QUOTES_DE_REGEX = /\A„(?>[^“\\]+|\\{2}|\\.)*“/
         
     | 
| 
      
 17 
     | 
    
         
            +
             
     | 
| 
      
 18 
     | 
    
         
            +
                  # Rubular: http://rubular.com/r/TkZomF9tTM
         
     | 
| 
      
 19 
     | 
    
         
            +
                  BETWEEN_DOUBLE_QUOTES_DE_REGEX = /„(?>[^“\\]+|\\{2}|\\.)*“/
         
     | 
| 
      
 20 
     | 
    
         
            +
             
     | 
| 
      
 21 
     | 
    
         
            +
                  # Rubular: http://rubular.com/r/hZxoyQwKT1
         
     | 
| 
      
 22 
     | 
    
         
            +
                  NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯')
         
     | 
| 
      
 23 
     | 
    
         
            +
             
     | 
| 
      
 24 
     | 
    
         
            +
                  # Rubular: http://rubular.com/r/ityNMwdghj
         
     | 
| 
      
 25 
     | 
    
         
            +
                  NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯')
         
     | 
| 
      
 26 
     | 
    
         
            +
             
     | 
| 
      
 27 
     | 
    
         
            +
                  MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember']
         
     | 
| 
      
 28 
     | 
    
         
            +
             
     | 
| 
      
 29 
     | 
    
         
            +
                  # Rubular: http://rubular.com/r/B4X33QKIL8
         
     | 
| 
      
 30 
     | 
    
         
            +
                  SingleLowerCaseLetterRule = Rule.new(/(?<=\s[a-z])\.(?=\s)/, '∯')
         
     | 
| 
      
 31 
     | 
    
         
            +
             
     | 
| 
      
 32 
     | 
    
         
            +
                  # Rubular: http://rubular.com/r/iUNSkCuso0
         
     | 
| 
      
 33 
     | 
    
         
            +
                  SingleLowerCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[a-z])\.(?=\s)/, '∯')
         
     | 
| 
      
 34 
     | 
    
         
            +
             
     | 
| 
      
 35 
     | 
    
         
            +
             
     | 
| 
       4 
36 
     | 
    
         
             
                  class Process < PragmaticSegmenter::Process
         
     | 
| 
       5 
37 
     | 
    
         
             
                    private
         
     | 
| 
       6 
38 
     | 
    
         | 
| 
       7 
39 
     | 
    
         
             
                    def between_punctuation(txt)
         
     | 
| 
       8 
     | 
    
         
            -
                       
     | 
| 
      
 40 
     | 
    
         
            +
                      BetweenPunctuation.new(text: txt).replace
         
     | 
| 
       9 
41 
     | 
    
         
             
                    end
         
     | 
| 
       10 
42 
     | 
    
         | 
| 
       11 
43 
     | 
    
         
             
                    def replace_numbers(txt)
         
     | 
| 
       12 
     | 
    
         
            -
                       
     | 
| 
      
 44 
     | 
    
         
            +
                      Number.new(text: txt).replace
         
     | 
| 
       13 
45 
     | 
    
         
             
                    end
         
     | 
| 
       14 
46 
     | 
    
         | 
| 
       15 
47 
     | 
    
         
             
                    def replace_abbreviations(txt)
         
     | 
| 
       16 
     | 
    
         
            -
                       
     | 
| 
      
 48 
     | 
    
         
            +
                      AbbreviationReplacer.new(text: txt, language: Deutsch).replace
         
     | 
| 
       17 
49 
     | 
    
         
             
                    end
         
     | 
| 
       18 
50 
     | 
    
         
             
                  end
         
     | 
| 
       19 
51 
     | 
    
         | 
| 
         @@ -21,27 +53,19 @@ module PragmaticSegmenter 
     | 
|
| 
       21 
53 
     | 
    
         
             
                    private
         
     | 
| 
       22 
54 
     | 
    
         | 
| 
       23 
55 
     | 
    
         
             
                    def abbreviations
         
     | 
| 
       24 
     | 
    
         
            -
                       
     | 
| 
      
 56 
     | 
    
         
            +
                      Abbreviation::ABBREVIATIONS
         
     | 
| 
       25 
57 
     | 
    
         
             
                    end
         
     | 
| 
       26 
58 
     | 
    
         
             
                  end
         
     | 
| 
       27 
59 
     | 
    
         | 
| 
       28 
60 
     | 
    
         
             
                  class Number < PragmaticSegmenter::Number
         
     | 
| 
       29 
     | 
    
         
            -
                    # Rubular: http://rubular.com/r/hZxoyQwKT1
         
     | 
| 
       30 
     | 
    
         
            -
                    NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯')
         
     | 
| 
       31 
     | 
    
         
            -
             
     | 
| 
       32 
     | 
    
         
            -
                    # Rubular: http://rubular.com/r/ityNMwdghj
         
     | 
| 
       33 
     | 
    
         
            -
                    NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯')
         
     | 
| 
       34 
     | 
    
         
            -
             
     | 
| 
       35 
     | 
    
         
            -
                    DE_MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember']
         
     | 
| 
       36 
     | 
    
         
            -
             
     | 
| 
       37 
61 
     | 
    
         
             
                    def replace
         
     | 
| 
       38 
62 
     | 
    
         
             
                      super
         
     | 
| 
       39 
     | 
    
         
            -
                      @text.apply(NumberPeriodSpaceRule 
     | 
| 
      
 63 
     | 
    
         
            +
                      @text.apply(NumberPeriodSpaceRule, NegativeNumberPeriodSpaceRule)
         
     | 
| 
       40 
64 
     | 
    
         
             
                      replace_period_in_deutsch_dates(@text)
         
     | 
| 
       41 
65 
     | 
    
         
             
                    end
         
     | 
| 
       42 
66 
     | 
    
         | 
| 
       43 
67 
     | 
    
         
             
                    def replace_period_in_deutsch_dates(txt)
         
     | 
| 
       44 
     | 
    
         
            -
                       
     | 
| 
      
 68 
     | 
    
         
            +
                      MONTHS.each do |month|
         
     | 
| 
       45 
69 
     | 
    
         
             
                        # Rubular: http://rubular.com/r/zlqgj7G5dA
         
     | 
| 
       46 
70 
     | 
    
         
             
                        txt.gsub!(/(?<=\d)\.(?=\s*#{Regexp.escape(month)})/, '∯')
         
     | 
| 
       47 
71 
     | 
    
         
             
                      end
         
     | 
| 
         @@ -49,81 +73,28 @@ module PragmaticSegmenter 
     | 
|
| 
       49 
73 
     | 
    
         
             
                    end
         
     | 
| 
       50 
74 
     | 
    
         
             
                  end
         
     | 
| 
       51 
75 
     | 
    
         | 
| 
       52 
     | 
    
         
            -
                  class  
     | 
| 
       53 
     | 
    
         
            -
                    # Rubular: http://rubular.com/r/B4X33QKIL8
         
     | 
| 
       54 
     | 
    
         
            -
                    SingleLowerCaseLetterRule = Rule.new(/(?<=\s[a-z])\.(?=\s)/, '∯')
         
     | 
| 
       55 
     | 
    
         
            -
             
     | 
| 
       56 
     | 
    
         
            -
                    # Rubular: http://rubular.com/r/iUNSkCuso0
         
     | 
| 
       57 
     | 
    
         
            -
                    SingleLowerCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[a-z])\.(?=\s)/, '∯')
         
     | 
| 
       58 
     | 
    
         
            -
             
     | 
| 
      
 76 
     | 
    
         
            +
                  class AbbreviationReplacer  < AbbreviationReplacer
         
     | 
| 
       59 
77 
     | 
    
         
             
                    def replace
         
     | 
| 
       60 
     | 
    
         
            -
                       
     | 
| 
       61 
     | 
    
         
            -
             
     | 
| 
       62 
     | 
    
         
            -
             
     | 
| 
       63 
     | 
    
         
            -
             
     | 
| 
       64 
     | 
    
         
            -
             
     | 
| 
       65 
     | 
    
         
            -
                    private
         
     | 
| 
      
 78 
     | 
    
         
            +
                      @reformatted_text = text.apply(
         
     | 
| 
      
 79 
     | 
    
         
            +
                        @language::PossessiveAbbreviationRule,
         
     | 
| 
      
 80 
     | 
    
         
            +
                        @language::SingleLetterAbbreviationRules::All,
         
     | 
| 
      
 81 
     | 
    
         
            +
                        SingleLowerCaseLetterRule,
         
     | 
| 
      
 82 
     | 
    
         
            +
                        SingleLowerCaseLetterAtStartOfLineRule)
         
     | 
| 
       66 
83 
     | 
    
         | 
| 
       67 
     | 
    
         
            -
             
     | 
| 
       68 
     | 
    
         
            -
                      txt.apply(SingleLowerCaseLetterRule)
         
     | 
| 
       69 
     | 
    
         
            -
                    end
         
     | 
| 
       70 
     | 
    
         
            -
             
     | 
| 
       71 
     | 
    
         
            -
                    def replace_single_lowercase_letter(txt)
         
     | 
| 
       72 
     | 
    
         
            -
                      txt.apply(SingleLowerCaseLetterAtStartOfLineRule)
         
     | 
| 
       73 
     | 
    
         
            -
                    end
         
     | 
| 
       74 
     | 
    
         
            -
                  end
         
     | 
| 
       75 
     | 
    
         
            -
             
     | 
| 
       76 
     | 
    
         
            -
                  class Abbreviation < PragmaticSegmenter::Abbreviation
         
     | 
| 
       77 
     | 
    
         
            -
                    ABBREVIATIONS = ['Ä', 'ä', 'adj', 'adm', 'adv', 'art', 'asst', 'b.a', 'b.s', 'bart', 'bldg', 'brig', 'bros', 'bse', 'buchst', 'bzgl', 'bzw', 'c.-à-d', 'ca', 'capt', 'chr', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'cpl', 'd.h', 'd.j', 'dergl', 'dgl', 'dkr', 'dr ', 'ens', 'etc', 'ev ', 'evtl', 'ff', 'g.g.a', 'g.u', 'gen', 'ggf', 'gov', 'hon', 'hosp', 'i.f', 'i.h.v', 'ii', 'iii', 'insp', 'iv', 'ix', 'jun', 'k.o', 'kath ', 'lfd', 'lt', 'ltd', 'm.e', 'maj', 'med', 'messrs', 'mio', 'mlle', 'mm', 'mme', 'mr', 'mrd', 'mrs', 'ms', 'msgr', 'mwst', 'no', 'nos', 'nr', 'o.ä', 'op', 'ord', 'pfc', 'ph', 'pp', 'prof', 'pvt', 'rep', 'reps', 'res', 'rev', 'rt', 's.p.a', 'sa', 'sen', 'sens', 'sfc', 'sgt', 'sog', 'sogen', 'spp', 'sr', 'st', 'std', 'str  ', 'supt', 'surg', 'u.a  ', 'u.e', 'u.s.w', 'u.u', 'u.ä', 'usf', 'usw', 'v', 'vgl', 'vi', 'vii', 'viii', 'vs', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'z.b', 'z.t', 'z.z', 'z.zt', 'zt', 'zzt']
         
     | 
| 
       78 
     | 
    
         
            -
                    NUMBER_ABBREVIATIONS = ['art', 'ca', 'no', 'nos', 'nr', 'pp']
         
     | 
| 
       79 
     | 
    
         
            -
             
     | 
| 
       80 
     | 
    
         
            -
                    def all
         
     | 
| 
       81 
     | 
    
         
            -
                      ABBREVIATIONS
         
     | 
| 
       82 
     | 
    
         
            -
                    end
         
     | 
| 
       83 
     | 
    
         
            -
             
     | 
| 
       84 
     | 
    
         
            -
                    def prepositive
         
     | 
| 
       85 
     | 
    
         
            -
                      []
         
     | 
| 
       86 
     | 
    
         
            -
                    end
         
     | 
| 
       87 
     | 
    
         
            -
             
     | 
| 
       88 
     | 
    
         
            -
                    def number
         
     | 
| 
       89 
     | 
    
         
            -
                      NUMBER_ABBREVIATIONS
         
     | 
| 
       90 
     | 
    
         
            -
                    end
         
     | 
| 
       91 
     | 
    
         
            -
                  end
         
     | 
| 
       92 
     | 
    
         
            -
             
     | 
| 
       93 
     | 
    
         
            -
                  class AbbreviationReplacer  < PragmaticSegmenter::AbbreviationReplacer
         
     | 
| 
       94 
     | 
    
         
            -
                    def replace
         
     | 
| 
       95 
     | 
    
         
            -
                      @reformatted_text = text.apply(PossessiveAbbreviationRule)
         
     | 
| 
       96 
     | 
    
         
            -
                      @reformatted_text = PragmaticSegmenter::Languages::Deutsch::SingleLetterAbbreviation.new(text: @reformatted_text).replace
         
     | 
| 
       97 
     | 
    
         
            -
                      @reformatted_text = search_for_abbreviations_in_string(@reformatted_text, abbreviations)
         
     | 
| 
      
 84 
     | 
    
         
            +
                      @reformatted_text = search_for_abbreviations_in_string(@reformatted_text)
         
     | 
| 
       98 
85 
     | 
    
         
             
                      @reformatted_text = replace_multi_period_abbreviations(@reformatted_text)
         
     | 
| 
       99 
     | 
    
         
            -
                      @reformatted_text = @reformatted_text.apply(AmPmRules::All)
         
     | 
| 
      
 86 
     | 
    
         
            +
                      @reformatted_text = @reformatted_text.apply(Languages::Common::AmPmRules::All)
         
     | 
| 
       100 
87 
     | 
    
         
             
                      replace_abbreviation_as_sentence_boundary(@reformatted_text)
         
     | 
| 
       101 
88 
     | 
    
         
             
                    end
         
     | 
| 
       102 
89 
     | 
    
         | 
| 
       103 
90 
     | 
    
         
             
                    private
         
     | 
| 
       104 
91 
     | 
    
         | 
| 
       105 
     | 
    
         
            -
                    def scan_for_replacements(txt, am, index, character_array 
     | 
| 
       106 
     | 
    
         
            -
                       
     | 
| 
       107 
     | 
    
         
            -
                    end
         
     | 
| 
       108 
     | 
    
         
            -
             
     | 
| 
       109 
     | 
    
         
            -
                    def replace_abbr(txt, abbr)
         
     | 
| 
       110 
     | 
    
         
            -
                      txt.gsub(/(?<=#{abbr})\.(?=\s)/, '∯')
         
     | 
| 
       111 
     | 
    
         
            -
                    end
         
     | 
| 
       112 
     | 
    
         
            -
             
     | 
| 
       113 
     | 
    
         
            -
                    def abbreviations
         
     | 
| 
       114 
     | 
    
         
            -
                      PragmaticSegmenter::Languages::Deutsch::Abbreviation.new
         
     | 
| 
      
 92 
     | 
    
         
            +
                    def scan_for_replacements(txt, am, index, character_array)
         
     | 
| 
      
 93 
     | 
    
         
            +
                      txt.gsub(/(?<=#{am})\.(?=\s)/, '∯')
         
     | 
| 
       115 
94 
     | 
    
         
             
                    end
         
     | 
| 
       116 
95 
     | 
    
         
             
                  end
         
     | 
| 
       117 
96 
     | 
    
         | 
| 
       118 
97 
     | 
    
         
             
                  class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
         
     | 
| 
       119 
     | 
    
         
            -
                    # Rubular: http://rubular.com/r/OdcXBsub0w
         
     | 
| 
       120 
     | 
    
         
            -
                    BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX = /,,(?>[^“\\]+|\\{2}|\\.)*“/
         
     | 
| 
       121 
     | 
    
         
            -
             
     | 
| 
       122 
     | 
    
         
            -
                    # Rubular: http://rubular.com/r/2UskIupGgP
         
     | 
| 
       123 
     | 
    
         
            -
                    SPLIT_DOUBLE_QUOTES_DE_REGEX = /\A„(?>[^“\\]+|\\{2}|\\.)*“/
         
     | 
| 
       124 
     | 
    
         
            -
             
     | 
| 
       125 
     | 
    
         
            -
                    # Rubular: http://rubular.com/r/TkZomF9tTM
         
     | 
| 
       126 
     | 
    
         
            -
                    BETWEEN_DOUBLE_QUOTES_DE_REGEX = /„(?>[^“\\]+|\\{2}|\\.)*“/
         
     | 
| 
       127 
98 
     | 
    
         
             
                    private
         
     | 
| 
       128 
99 
     | 
    
         | 
| 
       129 
100 
     | 
    
         
             
                    def sub_punctuation_between_double_quotes(txt)
         
     |