pragmatic_segmenter 0.0.6 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +39 -8
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +3 -1
- data/lib/pragmatic_segmenter/list.rb +34 -14
- data/lib/pragmatic_segmenter/rules.rb +17 -17
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/spec/pragmatic_segmenter_spec.rb +30 -0
- metadata +2 -2
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA1:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 0f7d9ee20797db1385cc6b19dd6a9029f51355bc
         | 
| 4 | 
            +
              data.tar.gz: 44a2066e7e3cc3a08e7a53a0a31b35d49687471a
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 6bc171f4cda7cddce161dc2ce1f7acddf0c90a9602316530a7378d48ec26fc41e335c34bd560ecc726c1f6cd16b363d37e153feac0ce11c04b01b67f89983522
         | 
| 7 | 
            +
              data.tar.gz: 498ec2b1e6b8ef8b7f6f07f482e6805ab98cfc5c06d5a53cf074929b6928461876f46457f093e572342f2e716dcdc2914ce040562a20a945b46c48ca0c4af3ef
         | 
    
        data/README.md
    CHANGED
    
    | @@ -641,14 +641,14 @@ Hola Srta. Ledesma. Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre | |
| 641 641 |  | 
| 642 642 | 
             
            Name                                                                 | Programming Language | License                                             | GRS (English) | GRS (Other Languages)† | Speed‡
         | 
| 643 643 | 
             
            ---------------------------------------------------------------------| -------------------- | --------------------------------------------------- | ------------- | ---------------------- | -------
         | 
| 644 | 
            -
            Pragmatic Segmenter                                                  | Ruby                 | [MIT](http://opensource.org/licenses/MIT)           | 98. | 
| 645 | 
            -
            [TactfulTokenizer](https://github.com/zencephalon/Tactful_Tokenizer) | Ruby                 | [GNU GPLv3](http://www.gnu.org/copyleft/gpl.html)   |  | 
| 646 | 
            -
            [OpenNLP](https://opennlp.apache.org/)                               | Java                 | [APLv2](http://www.apache.org/licenses/LICENSE-2.0) |  | 
| 647 | 
            -
            [Standford CoreNLP](http://nlp.stanford.edu/software/corenlp.shtml)  | Java                 | [GNU GPLv3](http://www.gnu.org/copyleft/gpl.html)   |  | 
| 648 | 
            -
            [Splitta](http://www.nltk.org/_modules/nltk/tokenize/punkt.html)     | Python               | [APLv2](http://www.apache.org/licenses/LICENSE-2.0) |  | 
| 649 | 
            -
            [Punkt](http://www.nltk.org/_modules/nltk/tokenize/punkt.html)       | Python               | [APLv2](http://www.apache.org/licenses/LICENSE-2.0) |  | 
| 650 | 
            -
            [SRX English](https://github.com/apohllo/srx-english)                | Ruby                 | [GNU GPLv3](http://www.gnu.org/copyleft/gpl.html)   |  | 
| 651 | 
            -
            [Scapel](https://github.com/louismullie/scalpel)                     | Ruby                 | [GNU GPLv3](http://www.gnu.org/copyleft/gpl.html)   |  | 
| 644 | 
            +
            Pragmatic Segmenter                                                  | Ruby                 | [MIT](http://opensource.org/licenses/MIT)           | 98.08%        | 100.00%                | 3.84 s
         | 
| 645 | 
            +
            [TactfulTokenizer](https://github.com/zencephalon/Tactful_Tokenizer) | Ruby                 | [GNU GPLv3](http://www.gnu.org/copyleft/gpl.html)   | 65.38%        | 45.45%                 | 46.32 s
         | 
| 646 | 
            +
            [OpenNLP](https://opennlp.apache.org/)                               | Java                 | [APLv2](http://www.apache.org/licenses/LICENSE-2.0) | 59.62%        | 42.42%                 | 1.27 s
         | 
| 647 | 
            +
            [Stanford CoreNLP](http://nlp.stanford.edu/software/corenlp.shtml)   | Java                 | [GNU GPLv3](http://www.gnu.org/copyleft/gpl.html)   | 59.62%        | 27.27%                 | 0.92 s
         | 
| 648 | 
            +
            [Splitta](http://www.nltk.org/_modules/nltk/tokenize/punkt.html)     | Python               | [APLv2](http://www.apache.org/licenses/LICENSE-2.0) | 55.77%        | 33.33%                 | N/A
         | 
| 649 | 
            +
            [Punkt](http://www.nltk.org/_modules/nltk/tokenize/punkt.html)       | Python               | [APLv2](http://www.apache.org/licenses/LICENSE-2.0) | 46.15%        | 45.45%                 | 1.79 s
         | 
| 650 | 
            +
            [SRX English](https://github.com/apohllo/srx-english)                | Ruby                 | [GNU GPLv3](http://www.gnu.org/copyleft/gpl.html)   | 30.77%        | 24.24%                 | 6.19 s
         | 
| 651 | 
            +
            [Scalpel](https://github.com/louismullie/scalpel)                    | Ruby                 | [GNU GPLv3](http://www.gnu.org/copyleft/gpl.html)   | 28.85%        | 15.15%                 | 0.13 s
         | 
| 652 652 |  | 
| 653 653 | 
             
            †GRS (Other Languages) is the total of the Golden Rules listed above for all languages other than English. This metric by no means includes all languages, only the ones that have Golden Rules listed above.  
         | 
| 654 654 | 
             
            ‡ Speed is based on the performance benchmark results detailed in the section "Speed Performance Benchmarks" below. The number is an average of 10 runs.
         | 
| @@ -707,6 +707,37 @@ To test the relative performance of different segmentation tools and libraries I | |
| 707 707 | 
             
            * Add abbreviation lists for any languages that do not currently have one (only relevant for languages that have the concept of abbreviations with periods)
         | 
| 708 708 | 
             
            * Get Golden Rule #18 passing - Handling of a.m. or p.m. followed by a capitalized non sentence starter (ex. "At 5 p.m. Mr. Smith went to the bank. He left the bank at 6 p.m. Next he went to the store." --> ["At 5 p.m. Mr. Smith went to the bank.", "He left the bank at 6 p.m.", "Next he went to the store."])
         | 
| 709 709 |  | 
| 710 | 
            +
            ## Change Log
         | 
| 711 | 
            +
             | 
| 712 | 
            +
            **Version 0.0.1**  
         | 
| 713 | 
            +
            * Initial Release  
         | 
| 714 | 
            +
             | 
| 715 | 
            +
            **Version 0.0.2**  
         | 
| 716 | 
            +
            * Major design refactor  
         | 
| 717 | 
            +
             | 
| 718 | 
            +
            **Version 0.0.3**
         | 
| 719 | 
            +
            * Add travis.yml  
         | 
| 720 | 
            +
            * Add Code Climate  
         | 
| 721 | 
            +
            * Update README  
         | 
| 722 | 
            +
             | 
| 723 | 
            +
            **Version 0.0.4**  
         | 
| 724 | 
            +
            * Add `ConsecutiveForwardSlashRule` to cleaner  
         | 
| 725 | 
            +
            * Refactor `segmenter.rb` and `process.rb`  
         | 
| 726 | 
            +
             | 
| 727 | 
            +
            **Version 0.0.5**  
         | 
| 728 | 
            +
            * Make symbol substitution safer  
         | 
| 729 | 
            +
            * Refactor `process.rb`  
         | 
| 730 | 
            +
            * Update cleaner with escaped newline rules  
         | 
| 731 | 
            +
             | 
| 732 | 
            +
            **Version 0.0.6**  
         | 
| 733 | 
            +
            * Add rule for escaped newlines that include a space between the slash and character   
         | 
| 734 | 
            +
            * Add Golden Rule #52 and code to make it pass  
         | 
| 735 | 
            +
             | 
| 736 | 
            +
            **Version 0.0.7**  
         | 
| 737 | 
            +
            * Add change log to README  
         | 
| 738 | 
            +
            * Add passing spec for new end of sentence abbreviation (EN)  
         | 
| 739 | 
            +
            * Add roman numeral list support  
         | 
| 740 | 
            +
             | 
| 710 741 | 
             
            ## Contributing
         | 
| 711 742 |  | 
| 712 743 | 
             
            If you find a text that is incorrectly segmented using this gem, please submit an issue.
         | 
| @@ -28,7 +28,7 @@ module PragmaticSegmenter | |
| 28 28 | 
             
                  All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]
         | 
| 29 29 | 
             
                end
         | 
| 30 30 |  | 
| 31 | 
            -
                SENTENCE_STARTERS = %w(A Being Did For He How However I In Millions More She That The There They We What When Where Who Why)
         | 
| 31 | 
            +
                SENTENCE_STARTERS = %w(A Being Did For He How However I In It Millions More She That The There They We What When Where Who Why)
         | 
| 32 32 |  | 
| 33 33 | 
             
                attr_reader :text
         | 
| 34 34 | 
             
                def initialize(text:)
         | 
| @@ -109,6 +109,8 @@ module PragmaticSegmenter | |
| 109 109 | 
             
                          .gsub(/U∯S∯A∯\s#{Regexp.escape(word)}\s/, "U∯S∯A\.\s#{Regexp.escape(word)}\s")
         | 
| 110 110 | 
             
                          .gsub(/U\.S\.A∯\s#{Regexp.escape(word)}\s/, "U\.S\.A\.\s#{Regexp.escape(word)}\s")
         | 
| 111 111 | 
             
                          .gsub(/I∯\s#{Regexp.escape(word)}\s/, "I\.\s#{Regexp.escape(word)}\s")
         | 
| 112 | 
            +
                          .gsub(/i.v∯\s#{Regexp.escape(word)}\s/, "i\.v\.\s#{Regexp.escape(word)}\s")
         | 
| 113 | 
            +
                          .gsub(/I.V∯\s#{Regexp.escape(word)}\s/, "I\.V\.\s#{Regexp.escape(word)}\s")
         | 
| 112 114 | 
             
                  end
         | 
| 113 115 | 
             
                  txt
         | 
| 114 116 | 
             
                end
         | 
| @@ -6,11 +6,11 @@ module PragmaticSegmenter | |
| 6 6 | 
             
              class List
         | 
| 7 7 | 
             
                # Rubular: http://rubular.com/r/XcpaJKH0sz
         | 
| 8 8 | 
             
                ALPHABETICAL_LIST_WITH_PERIODS =
         | 
| 9 | 
            -
                  /(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)/ | 
| 9 | 
            +
                  /(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)/
         | 
| 10 10 |  | 
| 11 | 
            -
                # Rubular: http://rubular.com/r/ | 
| 11 | 
            +
                # Rubular: http://rubular.com/r/Gu5rQapywf
         | 
| 12 12 | 
             
                ALPHABETICAL_LIST_WITH_PARENS =
         | 
| 13 | 
            -
                  /(?<=^)[a-z](?=\))|(?<=\A)[a-z](?=\))|(?<=\s)[a-z](?=\))/i
         | 
| 13 | 
            +
                  /(?<=\()[a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))/i
         | 
| 14 14 |  | 
| 15 15 | 
             
                SubstituteListPeriodRule = Rule.new(/♨/, '∯')
         | 
| 16 16 | 
             
                ListMarkerRule = Rule.new(/☝/, '')
         | 
| @@ -30,9 +30,9 @@ module PragmaticSegmenter | |
| 30 30 | 
             
                  /(?<=\s)\d+\.(?=\s)|^\d+\.(?=\s)|(?<=\s)\d+\.(?=\))|^\d+\.(?=\))|(?<=\s\-)\d+\.(?=\s)|(?<=^\-)\d+\.(?=\s)|(?<=\s\⁃)\d+\.(?=\s)|(?<=^\⁃)\d+\.(?=\s)|(?<=\s\-)\d+\.(?=\))|(?<=^\-)\d+\.(?=\))|(?<=\s\⁃)\d+\.(?=\))|(?<=^\⁃)\d+\.(?=\))/
         | 
| 31 31 | 
             
                NUMBERED_LIST_PARENS_REGEX = /\d+(?=\)\s)/
         | 
| 32 32 |  | 
| 33 | 
            -
                # Rubular: http://rubular.com/r/ | 
| 33 | 
            +
                # Rubular: http://rubular.com/r/NsNFSqrNvJ
         | 
| 34 34 | 
             
                EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX =
         | 
| 35 | 
            -
                  /(?<=^)[a-z](?=\))|(?<=\A)[a-z](?=\))|(?<=\s)[a-z](?=\))/i
         | 
| 35 | 
            +
                  /(?<=\()[a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))/i
         | 
| 36 36 |  | 
| 37 37 | 
             
                # Rubular: http://rubular.com/r/wMpnVedEIb
         | 
| 38 38 | 
             
                ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX =
         | 
| @@ -45,6 +45,7 @@ module PragmaticSegmenter | |
| 45 45 |  | 
| 46 46 | 
             
                def add_line_break
         | 
| 47 47 | 
             
                  formatted_text = format_alphabetical_lists(text)
         | 
| 48 | 
            +
                  formatted_text = format_roman_numeral_lists(formatted_text)
         | 
| 48 49 | 
             
                  formatted_text = format_numbered_list_with_periods(formatted_text)
         | 
| 49 50 | 
             
                  format_numbered_list_with_parens(formatted_text)
         | 
| 50 51 | 
             
                end
         | 
| @@ -64,8 +65,13 @@ module PragmaticSegmenter | |
| 64 65 | 
             
                end
         | 
| 65 66 |  | 
| 66 67 | 
             
                def format_alphabetical_lists(txt)
         | 
| 67 | 
            -
                  new_txt = add_line_breaks_for_alphabetical_list_with_periods(txt)
         | 
| 68 | 
            -
                  add_line_breaks_for_alphabetical_list_with_parens(new_txt)
         | 
| 68 | 
            +
                  new_txt = add_line_breaks_for_alphabetical_list_with_periods(txt, false)
         | 
| 69 | 
            +
                  add_line_breaks_for_alphabetical_list_with_parens(new_txt, false)
         | 
| 70 | 
            +
                end
         | 
| 71 | 
            +
             | 
| 72 | 
            +
                def format_roman_numeral_lists(txt)
         | 
| 73 | 
            +
                  new_txt = add_line_breaks_for_alphabetical_list_with_periods(txt, true)
         | 
| 74 | 
            +
                  add_line_breaks_for_alphabetical_list_with_parens(new_txt, true)
         | 
| 69 75 | 
             
                end
         | 
| 70 76 |  | 
| 71 77 | 
             
                def replace_periods_in_numbered_list(txt)
         | 
| @@ -112,12 +118,12 @@ module PragmaticSegmenter | |
| 112 118 | 
             
                  end
         | 
| 113 119 | 
             
                end
         | 
| 114 120 |  | 
| 115 | 
            -
                def add_line_breaks_for_alphabetical_list_with_periods(txt)
         | 
| 116 | 
            -
                  iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PERIODS, false, txt)
         | 
| 121 | 
            +
                def add_line_breaks_for_alphabetical_list_with_periods(txt, roman_numeral)
         | 
| 122 | 
            +
                  iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PERIODS, false, txt, roman_numeral)
         | 
| 117 123 | 
             
                end
         | 
| 118 124 |  | 
| 119 | 
            -
                def add_line_breaks_for_alphabetical_list_with_parens(txt)
         | 
| 120 | 
            -
                  iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PARENS, true, txt)
         | 
| 125 | 
            +
                def add_line_breaks_for_alphabetical_list_with_parens(txt, roman_numeral)
         | 
| 126 | 
            +
                  iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PARENS, true, txt, roman_numeral)
         | 
| 121 127 | 
             
                end
         | 
| 122 128 |  | 
| 123 129 | 
             
                def replace_alphabet_list(a, txt)
         | 
| @@ -128,7 +134,11 @@ module PragmaticSegmenter | |
| 128 134 |  | 
| 129 135 | 
             
                def replace_alphabet_list_parens(a, txt)
         | 
| 130 136 | 
             
                  txt.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX).with_index do |m|
         | 
| 131 | 
            -
                     | 
| 137 | 
            +
                    if txt =~ /\(#{Regexp.escape(m.to_s)}\)/i
         | 
| 138 | 
            +
                      a.eql?(m.dup.downcase) ? "\rȸ(#{Regexp.escape(m.to_s)}" : "#{m}"
         | 
| 139 | 
            +
                    else
         | 
| 140 | 
            +
                      a.eql?(m.dup.downcase) ? "\r#{Regexp.escape(m.to_s)}" : "#{m}"
         | 
| 141 | 
            +
                    end
         | 
| 132 142 | 
             
                  end
         | 
| 133 143 | 
             
                end
         | 
| 134 144 |  | 
| @@ -141,19 +151,29 @@ module PragmaticSegmenter | |
| 141 151 | 
             
                end
         | 
| 142 152 |  | 
| 143 153 | 
             
                def last_array_item_replacement(a, i, alphabet, list_array, txt, parens)
         | 
| 154 | 
            +
                  return if alphabet & list_array == [] ||
         | 
| 155 | 
            +
                    !alphabet.include?(list_array[i - 1]) ||
         | 
| 156 | 
            +
                    !alphabet.include?(a)
         | 
| 144 157 | 
             
                  return if (alphabet.index(list_array[i - 1]) - alphabet.index(a)).abs != 1
         | 
| 145 158 | 
             
                  replace_correct_alphabet_list(a, txt, parens)
         | 
| 146 159 | 
             
                end
         | 
| 147 160 |  | 
| 148 161 | 
             
                def other_items_replacement(a, i, alphabet, list_array, txt, parens)
         | 
| 162 | 
            +
                  return if alphabet & list_array == [] ||
         | 
| 163 | 
            +
                    !alphabet.include?(list_array[i - 1]) ||
         | 
| 164 | 
            +
                    !alphabet.include?(a)
         | 
| 149 165 | 
             
                  return if alphabet.index(list_array[i + 1]) - alphabet.index(a) != 1 &&
         | 
| 150 166 | 
             
                            (alphabet.index(list_array[i - 1]) - alphabet.index(a)).abs != 1
         | 
| 151 167 | 
             
                  replace_correct_alphabet_list(a, txt, parens)
         | 
| 152 168 | 
             
                end
         | 
| 153 169 |  | 
| 154 | 
            -
                def iterate_alphabet_array(regex, parens, txt)
         | 
| 170 | 
            +
                def iterate_alphabet_array(regex, parens, txt, roman_numeral)
         | 
| 155 171 | 
             
                  list_array = txt.scan(regex).map(&:downcase)
         | 
| 156 | 
            -
                   | 
| 172 | 
            +
                  if roman_numeral
         | 
| 173 | 
            +
                    alphabet = %w(i ii iii iv v vi vii viii ix x xi xii xiii xiv x xi xii xiii xv xvi xvii xviii xix xx)
         | 
| 174 | 
            +
                  else
         | 
| 175 | 
            +
                    alphabet = ('a'..'z').to_a
         | 
| 176 | 
            +
                  end
         | 
| 157 177 | 
             
                  list_array.each_with_index do |a, i|
         | 
| 158 178 | 
             
                    if i.eql?(list_array.length - 1)
         | 
| 159 179 | 
             
                      last_array_item_replacement(a, i, alphabet, list_array, txt, parens)
         | 
| @@ -36,15 +36,15 @@ module PragmaticSegmenter | |
| 36 36 | 
             
                end
         | 
| 37 37 |  | 
| 38 38 | 
             
                module ReinsertEllipsisRules
         | 
| 39 | 
            -
                   | 
| 40 | 
            -
                   | 
| 41 | 
            -
                   | 
| 42 | 
            -
                   | 
| 43 | 
            -
                   | 
| 44 | 
            -
             | 
| 45 | 
            -
                  All = [  | 
| 46 | 
            -
                           | 
| 47 | 
            -
                           | 
| 39 | 
            +
                  SubThreeConsecutivePeriod = Rule.new(/ƪ/, '...')
         | 
| 40 | 
            +
                  SubThreeSpacePeriod = Rule.new(/♟/, ' . . . ')
         | 
| 41 | 
            +
                  SubFourSpacePeriod = Rule.new(/♝/, '. . . .')
         | 
| 42 | 
            +
                  SubTwoConsecutivePeriod = Rule.new(/☏/, '..')
         | 
| 43 | 
            +
                  SubOnePeriod = Rule.new(/∮/, '.')
         | 
| 44 | 
            +
             | 
| 45 | 
            +
                  All = [ SubThreeConsecutivePeriod, SubThreeSpacePeriod,
         | 
| 46 | 
            +
                          SubFourSpacePeriod, SubTwoConsecutivePeriod,
         | 
| 47 | 
            +
                          SubOnePeriod ]
         | 
| 48 48 | 
             
                end
         | 
| 49 49 |  | 
| 50 50 | 
             
                module SubSymbolsRules
         | 
| @@ -86,14 +86,14 @@ module PragmaticSegmenter | |
| 86 86 | 
             
                end
         | 
| 87 87 |  | 
| 88 88 | 
             
                module SubEscapedRegexReservedCharacters
         | 
| 89 | 
            -
                   | 
| 90 | 
            -
                   | 
| 91 | 
            -
                   | 
| 92 | 
            -
                   | 
| 93 | 
            -
                   | 
| 94 | 
            -
             | 
| 95 | 
            -
                  All = [  | 
| 96 | 
            -
                           | 
| 89 | 
            +
                  SubLeftParen = Rule.new('\\(', '(')
         | 
| 90 | 
            +
                  SubRightParen = Rule.new('\\)', ')')
         | 
| 91 | 
            +
                  SubLeftBracket = Rule.new('\\[', '[')
         | 
| 92 | 
            +
                  SubRightBracket = Rule.new('\\]', ']')
         | 
| 93 | 
            +
                  SubDash = Rule.new('\\-', '-')
         | 
| 94 | 
            +
             | 
| 95 | 
            +
                  All = [ SubLeftParen, SubRightParen,
         | 
| 96 | 
            +
                          SubLeftBracket, SubRightBracket, SubDash ]
         | 
| 97 97 | 
             
                end
         | 
| 98 98 | 
             
              end
         | 
| 99 99 | 
             
            end
         | 
| @@ -883,6 +883,36 @@ RSpec.describe PragmaticSegmenter::Segmenter do | |
| 883 883 | 
             
                    ps = PragmaticSegmenter::Segmenter.new(text: 'Hello World. \ r \ nHello.', language: 'en')
         | 
| 884 884 | 
             
                    expect(ps.segment).to eq(["Hello World.", "Hello."])
         | 
| 885 885 | 
             
                  end
         | 
| 886 | 
            +
             | 
| 887 | 
            +
                  it "correctly segments text #083" do
         | 
| 888 | 
            +
                    ps = PragmaticSegmenter::Segmenter.new(text: "The nurse gave him the i.v. in his vein. She gave him the i.v. It was a great I.V. that she gave him. She gave him the I.V. It was night.", language: "en")
         | 
| 889 | 
            +
                    expect(ps.segment).to eq(["The nurse gave him the i.v. in his vein.", "She gave him the i.v.", "It was a great I.V. that she gave him.", "She gave him the I.V.", "It was night."])
         | 
| 890 | 
            +
                  end
         | 
| 891 | 
            +
             | 
| 892 | 
            +
                  it "correctly segments text #084" do
         | 
| 893 | 
            +
                    ps = PragmaticSegmenter::Segmenter.new(text: "(i) Hello world. \n(ii) Hello world.\n(iii) Hello world.\n(iv) Hello world.\n(v) Hello world.\n(vi) Hello world.", language: "en")
         | 
| 894 | 
            +
                    expect(ps.segment).to eq(["(i) Hello world.", "(ii) Hello world.", "(iii) Hello world.", "(iv) Hello world.", "(v) Hello world.", "(vi) Hello world."])
         | 
| 895 | 
            +
                  end
         | 
| 896 | 
            +
             | 
| 897 | 
            +
                  it "correctly segments text #085" do
         | 
| 898 | 
            +
                    ps = PragmaticSegmenter::Segmenter.new(text: "i) Hello world. \nii) Hello world.\niii) Hello world.\niv) Hello world.\nv) Hello world.\nvi) Hello world.", language: "en")
         | 
| 899 | 
            +
                    expect(ps.segment).to eq(["i) Hello world.", "ii) Hello world.", "iii) Hello world.", "iv) Hello world.", "v) Hello world.", "vi) Hello world."])
         | 
| 900 | 
            +
                  end
         | 
| 901 | 
            +
             | 
| 902 | 
            +
                  it "correctly segments text #086" do
         | 
| 903 | 
            +
                    ps = PragmaticSegmenter::Segmenter.new(text: "(a) Hello world. \n(b) Hello world.\n(c) Hello world.\n(d) Hello world.\n(e) Hello world.\n(f) Hello world.", language: "en")
         | 
| 904 | 
            +
                    expect(ps.segment).to eq(["(a) Hello world.", "(b) Hello world.", "(c) Hello world.", "(d) Hello world.", "(e) Hello world.", "(f) Hello world."])
         | 
| 905 | 
            +
                  end
         | 
| 906 | 
            +
             | 
| 907 | 
            +
                  it "correctly segments text #087" do
         | 
| 908 | 
            +
                    ps = PragmaticSegmenter::Segmenter.new(text: "(A) Hello world. \n(B) Hello world.\n(C) Hello world.\n(D) Hello world.\n(E) Hello world.\n(F) Hello world.", language: "en")
         | 
| 909 | 
            +
                    expect(ps.segment).to eq(["(A) Hello world.", "(B) Hello world.", "(C) Hello world.", "(D) Hello world.", "(E) Hello world.", "(F) Hello world."])
         | 
| 910 | 
            +
                  end
         | 
| 911 | 
            +
             | 
| 912 | 
            +
                  it "correctly segments text #088" do
         | 
| 913 | 
            +
                    ps = PragmaticSegmenter::Segmenter.new(text: "A) Hello world. \nB) Hello world.\nC) Hello world.\nD) Hello world.\nE) Hello world.\nF) Hello world.", language: "en")
         | 
| 914 | 
            +
                    expect(ps.segment).to eq(["A) Hello world.", "B) Hello world.", "C) Hello world.", "D) Hello world.", "E) Hello world.", "F) Hello world."])
         | 
| 915 | 
            +
                  end
         | 
| 886 916 | 
             
                end
         | 
| 887 917 | 
             
              end
         | 
| 888 918 |  | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: pragmatic_segmenter
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.0. | 
| 4 | 
            +
              version: 0.0.7
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Kevin S. Dias
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2015-01- | 
| 11 | 
            +
            date: 2015-01-12 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: bundler
         |