pragmatic_segmenter 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +1 -0
  3. data/lib/pragmatic_segmenter/abbreviation_replacer.rb +6 -6
  4. data/lib/pragmatic_segmenter/between_punctuation.rb +6 -4
  5. data/lib/pragmatic_segmenter/cleaner.rb +51 -47
  6. data/lib/pragmatic_segmenter/cleaner/rules.rb +86 -0
  7. data/lib/pragmatic_segmenter/languages.rb +21 -30
  8. data/lib/pragmatic_segmenter/languages/arabic.rb +0 -13
  9. data/lib/pragmatic_segmenter/languages/common.rb +67 -44
  10. data/lib/pragmatic_segmenter/languages/common/ellipsis.rb +37 -0
  11. data/lib/pragmatic_segmenter/languages/common/numbers.rb +90 -0
  12. data/lib/pragmatic_segmenter/languages/deutsch.rb +25 -48
  13. data/lib/pragmatic_segmenter/languages/english.rb +3 -3
  14. data/lib/pragmatic_segmenter/languages/japanese.rb +5 -13
  15. data/lib/pragmatic_segmenter/languages/persian.rb +0 -14
  16. data/lib/pragmatic_segmenter/languages/russian.rb +0 -25
  17. data/lib/pragmatic_segmenter/languages/spanish.rb +0 -9
  18. data/lib/pragmatic_segmenter/list.rb +60 -58
  19. data/lib/pragmatic_segmenter/{process.rb → processor.rb} +47 -26
  20. data/lib/pragmatic_segmenter/punctuation_replacer.rb +41 -20
  21. data/lib/pragmatic_segmenter/segmenter.rb +19 -5
  22. data/lib/pragmatic_segmenter/version.rb +1 -1
  23. data/pragmatic_segmenter.gemspec +1 -0
  24. data/spec/pragmatic_segmenter/languages/amharic_spec.rb +18 -0
  25. data/spec/pragmatic_segmenter/languages/arabic_spec.rb +59 -0
  26. data/spec/pragmatic_segmenter/languages/armenian_spec.rb +160 -0
  27. data/spec/pragmatic_segmenter/languages/burmese_spec.rb +18 -0
  28. data/spec/pragmatic_segmenter/languages/chinese_spec.rb +11 -0
  29. data/spec/pragmatic_segmenter/languages/deutsch_spec.rb +189 -0
  30. data/spec/pragmatic_segmenter/languages/dutch_spec.rb +23 -0
  31. data/spec/pragmatic_segmenter/languages/english_spec.rb +1348 -0
  32. data/spec/pragmatic_segmenter/languages/french_spec.rb +31 -0
  33. data/spec/pragmatic_segmenter/languages/greek_spec.rb +18 -0
  34. data/spec/pragmatic_segmenter/languages/hindi_spec.rb +18 -0
  35. data/spec/pragmatic_segmenter/languages/italian_spec.rb +190 -0
  36. data/spec/pragmatic_segmenter/languages/japanese_spec.rb +53 -0
  37. data/spec/pragmatic_segmenter/languages/persian_spec.rb +18 -0
  38. data/spec/pragmatic_segmenter/languages/polish_spec.rb +11 -0
  39. data/spec/pragmatic_segmenter/languages/russian_spec.rb +219 -0
  40. data/spec/pragmatic_segmenter/languages/spanish_spec.rb +189 -0
  41. data/spec/pragmatic_segmenter/languages/urdu_spec.rb +18 -0
  42. data/spec/pragmatic_segmenter/languages_spec.rb +31 -0
  43. data/spec/pragmatic_segmenter_spec.rb +24 -2583
  44. metadata +59 -8
  45. data/lib/pragmatic_segmenter/number.rb +0 -35
  46. data/lib/pragmatic_segmenter/rules.rb +0 -168
  47. data/lib/pragmatic_segmenter/rules/ellipsis.rb +0 -35
  48. data/lib/pragmatic_segmenter/rules/html.rb +0 -13
@@ -0,0 +1,37 @@
1
+ # -*- encoding : utf-8 -*-
2
+
3
+ module PragmaticSegmenter
4
+ module Languages
5
+ module Common
6
+ # This class searches for ellipses within a string and
7
+ # replaces the periods.
8
+
9
+ # http://www.dailywritingtips.com/in-search-of-a-4-dot-ellipsis/
10
+ # http://www.thepunctuationguide.com/ellipses.html
11
+
12
+ module EllipsisRules
13
+ # Rubular: http://rubular.com/r/i60hCK81fz
14
+ ThreeConsecutiveRule = Rule.new(/\.\.\.(?=\s+[A-Z])/, '☏.')
15
+
16
+ # Rubular: http://rubular.com/r/Hdqpd90owl
17
+ FourConsecutiveRule = Rule.new(/(?<=\S)\.{3}(?=\.\s[A-Z])/, 'ƪ')
18
+
19
+ # Rubular: http://rubular.com/r/YBG1dIHTRu
20
+ ThreeSpaceRule = Rule.new(/(\s\.){3}\s/, '♟')
21
+
22
+ # Rubular: http://rubular.com/r/2VvZ8wRbd8
23
+ FourSpaceRule = Rule.new(/(?<=[a-z])(\.\s){3}\.(\z|$|\n)/, '♝')
24
+
25
+ OtherThreePeriodRule = Rule.new(/\.\.\./, 'ƪ')
26
+
27
+ All = [
28
+ ThreeSpaceRule,
29
+ FourSpaceRule,
30
+ FourConsecutiveRule,
31
+ ThreeConsecutiveRule,
32
+ OtherThreePeriodRule
33
+ ]
34
+ end
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,90 @@
1
+ # -*- encoding : utf-8 -*-
2
+
3
+ module PragmaticSegmenter
4
+ module Languages
5
+ module Common
6
+ module Numbers
7
+ # Rubular: http://rubular.com/r/oNyxBOqbyy
8
+ PeriodBeforeNumberRule = Rule.new(/\.(?=\d)/, '∯')
9
+
10
+ # Rubular: http://rubular.com/r/EMk5MpiUzt
11
+ NumberAfterPeriodBeforeLetterRule = Rule.new(/(?<=\d)\.(?=\S)/, '∯')
12
+
13
+ # Rubular: http://rubular.com/r/rf4l1HjtjG
14
+ NewLineNumberPeriodSpaceLetterRule = Rule.new(/(?<=\r\d)\.(?=(\s\S)|\))/, '∯')
15
+
16
+ # Rubular: http://rubular.com/r/HPa4sdc6b9
17
+ StartLineNumberPeriodRule = Rule.new(/(?<=^\d)\.(?=(\s\S)|\))/, '∯')
18
+
19
+ # Rubular: http://rubular.com/r/NuvWnKleFl
20
+ StartLineTwoDigitNumberPeriodRule = Rule.new(/(?<=^\d\d)\.(?=(\s\S)|\))/, '∯')
21
+
22
+ All = [
23
+ PeriodBeforeNumberRule,
24
+ NumberAfterPeriodBeforeLetterRule,
25
+ NewLineNumberPeriodSpaceLetterRule,
26
+ StartLineNumberPeriodRule,
27
+ StartLineTwoDigitNumberPeriodRule
28
+ ]
29
+ end
30
+
31
+
32
+ SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*[^,]'(?=\s[A-Z])|"(?:[^"])*[^,]"(?=\s[A-Z])|“(?:[^”])*[^,]”(?=\s[A-Z])|\S.*?[。..!!??ȸȹ☉☈☇☄]/
33
+
34
+ # Rubular: http://rubular.com/r/NqCqv372Ix
35
+ QUOTATION_AT_END_OF_SENTENCE_REGEX = /[!?\.-][\"\'\u{201d}\u{201c}]\s{1}[A-Z]/
36
+
37
+ # Rubular: http://rubular.com/r/6flGnUMEVl
38
+ PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = /["”]\s\(.*\)\s["“]/
39
+
40
+ # Rubular: http://rubular.com/r/TYzr4qOW1Q
41
+ BETWEEN_DOUBLE_QUOTES_REGEX = /"(?:[^"])*[^,]"|“(?:[^”])*[^,]”/
42
+
43
+ # Rubular: http://rubular.com/r/JMjlZHAT4g
44
+ SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = /(?<=[!?\.-][\"\'\u{201d}\u{201c}])\s{1}(?=[A-Z])/
45
+
46
+ # Rubular: http://rubular.com/r/mQ8Es9bxtk
47
+ CONTINUOUS_PUNCTUATION_REGEX = /(?<=\S)(!|\?){3,}(?=(\s|\z|$))/
48
+
49
+ # Rubular: http://rubular.com/r/yqa4Rit8EY
50
+ PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')
51
+
52
+ # Rubular: http://rubular.com/r/NEv265G2X2
53
+ KommanditgesellschaftRule = Rule.new(/(?<=Co)\.(?=\sKG)/, '∯')
54
+
55
+ # Rubular: http://rubular.com/r/xDkpFZ0EgH
56
+ MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i
57
+
58
+ module AmPmRules
59
+ # Rubular: http://rubular.com/r/Vnx3m4Spc8
60
+ UpperCasePmRule = Rule.new(/(?<=P∯M)∯(?=\s[A-Z])/, '.')
61
+
62
+ # Rubular: http://rubular.com/r/AJMCotJVbW
63
+ UpperCaseAmRule = Rule.new(/(?<=A∯M)∯(?=\s[A-Z])/, '.')
64
+
65
+ # Rubular: http://rubular.com/r/13q7SnOhgA
66
+ LowerCasePmRule = Rule.new(/(?<=p∯m)∯(?=\s[A-Z])/, '.')
67
+
68
+ # Rubular: http://rubular.com/r/DgUDq4mLz5
69
+ LowerCaseAmRule = Rule.new(/(?<=a∯m)∯(?=\s[A-Z])/, '.')
70
+
71
+ All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]
72
+ end
73
+
74
+ # This class searches for periods within an abbreviation and
75
+ # replaces the periods.
76
+ module SingleLetterAbbreviationRules
77
+ # Rubular: http://rubular.com/r/e3H6kwnr6H
78
+ SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=\s)/, '∯')
79
+
80
+ # Rubular: http://rubular.com/r/gitvf0YWH4
81
+ SingleUpperCaseLetterRule = Rule.new(/(?<=\s[A-Z])\.(?=\s)/, '∯')
82
+
83
+ All = [
84
+ SingleUpperCaseLetterAtStartOfLineRule,
85
+ SingleUpperCaseLetterRule
86
+ ]
87
+ end
88
+ end
89
+ end
90
+ end
@@ -18,11 +18,20 @@ module PragmaticSegmenter
18
18
  # Rubular: http://rubular.com/r/TkZomF9tTM
19
19
  BETWEEN_DOUBLE_QUOTES_DE_REGEX = /„(?>[^“\\]+|\\{2}|\\.)*“/
20
20
 
21
- # Rubular: http://rubular.com/r/hZxoyQwKT1
22
- NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯')
23
21
 
24
- # Rubular: http://rubular.com/r/ityNMwdghj
25
- NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯')
22
+ module Numbers
23
+ # Rubular: http://rubular.com/r/hZxoyQwKT1
24
+ NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯')
25
+
26
+ # Rubular: http://rubular.com/r/ityNMwdghj
27
+ NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯')
28
+
29
+ All = [
30
+ Common::Numbers::All,
31
+ NumberPeriodSpaceRule,
32
+ NegativeNumberPeriodSpaceRule
33
+ ]
34
+ end
26
35
 
27
36
  MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember']
28
37
 
@@ -32,59 +41,35 @@ module PragmaticSegmenter
32
41
  # Rubular: http://rubular.com/r/iUNSkCuso0
33
42
  SingleLowerCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[a-z])\.(?=\s)/, '∯')
34
43
 
35
-
36
- class Process < PragmaticSegmenter::Process
37
- private
38
-
39
- def between_punctuation(txt)
40
- BetweenPunctuation.new(text: txt).replace
41
- end
42
-
43
- def replace_numbers(txt)
44
- Number.new(text: txt).replace
45
- end
46
-
47
- def replace_abbreviations(txt)
48
- AbbreviationReplacer.new(text: txt, language: Deutsch).replace
49
- end
50
- end
51
-
52
- class Cleaner < PragmaticSegmenter::Cleaner
44
+ class Processor < PragmaticSegmenter::Processor
53
45
  private
54
46
 
55
- def abbreviations
56
- Abbreviation::ABBREVIATIONS
57
- end
58
- end
47
+ def replace_numbers
48
+ @text.apply Numbers::All
59
49
 
60
- class Number < PragmaticSegmenter::Number
61
- def replace
62
- super
63
- @text.apply(NumberPeriodSpaceRule, NegativeNumberPeriodSpaceRule)
64
- replace_period_in_deutsch_dates(@text)
50
+ replace_period_in_deutsch_dates
65
51
  end
66
52
 
67
- def replace_period_in_deutsch_dates(txt)
53
+ def replace_period_in_deutsch_dates
68
54
  MONTHS.each do |month|
69
55
  # Rubular: http://rubular.com/r/zlqgj7G5dA
70
- txt.gsub!(/(?<=\d)\.(?=\s*#{Regexp.escape(month)})/, '∯')
56
+ @text.gsub!(/(?<=\d)\.(?=\s*#{Regexp.escape(month)})/, '∯')
71
57
  end
72
- txt
73
58
  end
74
59
  end
75
60
 
76
61
  class AbbreviationReplacer < AbbreviationReplacer
77
62
  def replace
78
- @reformatted_text = text.apply(
63
+ @text = text.apply(
79
64
  @language::PossessiveAbbreviationRule,
80
65
  @language::SingleLetterAbbreviationRules::All,
81
66
  SingleLowerCaseLetterRule,
82
67
  SingleLowerCaseLetterAtStartOfLineRule)
83
68
 
84
- @reformatted_text = search_for_abbreviations_in_string(@reformatted_text)
85
- @reformatted_text = replace_multi_period_abbreviations(@reformatted_text)
86
- @reformatted_text = @reformatted_text.apply(Languages::Common::AmPmRules::All)
87
- replace_abbreviation_as_sentence_boundary(@reformatted_text)
69
+ @text = search_for_abbreviations_in_string(@text)
70
+ @text = replace_multi_period_abbreviations(@text)
71
+ @text.apply(Languages::Common::AmPmRules::All)
72
+ replace_abbreviation_as_sentence_boundary(@text)
88
73
  end
89
74
 
90
75
  private
@@ -97,15 +82,7 @@ module PragmaticSegmenter
97
82
  class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
98
83
  private
99
84
 
100
- def sub_punctuation_between_double_quotes(txt)
101
- btwn_dbl_quote = sub_punctuation_between_double_quotes_de(txt)
102
- PragmaticSegmenter::PunctuationReplacer.new(
103
- matches_array: btwn_dbl_quote,
104
- text: txt
105
- ).replace
106
- end
107
-
108
- def sub_punctuation_between_double_quotes_de(txt)
85
+ def btwn_dbl_quote(txt)
109
86
  if txt.include?('„')
110
87
  btwn_dbl_quote = txt.scan(BETWEEN_DOUBLE_QUOTES_DE_REGEX)
111
88
  txt.scan(SPLIT_DOUBLE_QUOTES_DE_REGEX).each do |q|
@@ -6,13 +6,13 @@ module PragmaticSegmenter
6
6
  class Cleaner < Cleaner
7
7
  def clean
8
8
  super
9
- clean_quotations(@clean_text)
9
+ clean_quotations
10
10
  end
11
11
 
12
12
  private
13
13
 
14
- def clean_quotations(txt)
15
- txt.gsub(/`/, "'")
14
+ def clean_quotations
15
+ @text.gsub(/`/, "'")
16
16
  end
17
17
 
18
18
  def abbreviations
@@ -3,27 +3,19 @@ module PragmaticSegmenter
3
3
  module Japanese
4
4
  include Languages::Common
5
5
 
6
- class Process < Process
7
- private
8
-
9
- def between_punctuation(txt)
10
- BetweenPunctuation.new(text: txt).replace
11
- end
12
- end
13
-
14
6
  class Cleaner < PragmaticSegmenter::Cleaner
15
7
  # Rubular: http://rubular.com/r/N4kPuJgle7
16
8
  NewLineInMiddleOfWordRule = Rule.new(/(?<=の)\n(?=\S)/, '')
17
9
 
18
10
  def clean
19
11
  super
20
- @clean_text = remove_newline_in_middle_of_word(@clean_text)
12
+ remove_newline_in_middle_of_word
21
13
  end
22
14
 
23
15
  private
24
16
 
25
- def remove_newline_in_middle_of_word(txt)
26
- txt.apply(NewLineInMiddleOfWordRule)
17
+ def remove_newline_in_middle_of_word
18
+ @text.apply NewLineInMiddleOfWordRule
27
19
  end
28
20
  end
29
21
 
@@ -42,14 +34,14 @@ module PragmaticSegmenter
42
34
  end
43
35
 
44
36
  def sub_punctuation_between_quotes_ja(txt)
45
- PragmaticSegmenter::PunctuationReplacer.new(
37
+ PunctuationReplacer.new(
46
38
  matches_array: txt.scan(BETWEEN_QUOTE_JA_REGEX),
47
39
  text: txt
48
40
  ).replace
49
41
  end
50
42
 
51
43
  def sub_punctuation_between_parens_ja(txt)
52
- PragmaticSegmenter::PunctuationReplacer.new(
44
+ PunctuationReplacer.new(
53
45
  matches_array: txt.scan(BETWEEN_PARENS_JA_REGEX),
54
46
  text: txt
55
47
  ).replace
@@ -9,20 +9,6 @@ module PragmaticSegmenter
9
9
  ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')
10
10
  ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
11
11
 
12
- class Process < Process
13
- private
14
-
15
- def sentence_boundary_punctuation(txt)
16
- txt = txt.apply ReplaceColonBetweenNumbersRule,
17
- ReplaceNonSentenceBoundaryCommaRule
18
- txt.scan(SENTENCE_BOUNDARY_REGEX)
19
- end
20
-
21
- def replace_abbreviations(txt)
22
- AbbreviationReplacer.new(text: txt).replace
23
- end
24
- end
25
-
26
12
  class AbbreviationReplacer < AbbreviationReplacer
27
13
  private
28
14
 
@@ -9,34 +9,9 @@ module PragmaticSegmenter
9
9
  NUMBER_ABBREVIATIONS = []
10
10
  end
11
11
 
12
- class Process < Process
13
- private
14
-
15
- def replace_abbreviations(txt)
16
- AbbreviationReplacer.new(text: txt, language: Russian).replace
17
- end
18
- end
19
-
20
12
  class AbbreviationReplacer < AbbreviationReplacer
21
13
  private
22
14
 
23
- def scan_for_replacements(txt, am, index, character_array)
24
- character = character_array[index]
25
- prepositive = @language::Abbreviation::PREPOSITIVE_ABBREVIATIONS
26
- number_abbr = @language::Abbreviation::NUMBER_ABBREVIATIONS
27
- upper = /[[:upper:]]/.match(character.to_s)
28
- if upper.nil? || prepositive.include?(am.downcase.strip)
29
- if prepositive.include?(am.downcase.strip)
30
- txt = replace_prepositive_abbr(txt, am)
31
- elsif number_abbr.include?(am.downcase.strip)
32
- txt = replace_pre_number_abbr(txt, am)
33
- else
34
- txt = replace_period_of_abbr(txt, am)
35
- end
36
- end
37
- txt
38
- end
39
-
40
15
  def replace_period_of_abbr(txt, abbr)
41
16
  txt.gsub(/(?<=\s#{abbr.strip})\./, '∯')
42
17
  .gsub(/(?<=\A#{abbr.strip})\./, '∯')
@@ -8,15 +8,6 @@ module PragmaticSegmenter
8
8
  PREPOSITIVE_ABBREVIATIONS = ['a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'ee', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'mt', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'prof', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'sra', 'srta', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']
9
9
  NUMBER_ABBREVIATIONS = ['cra', 'ext', 'no', 'nos', 'p', 'pp', 'tel']
10
10
  end
11
-
12
- class Cleaner < Cleaner
13
- private
14
-
15
- def abbreviations
16
- Abbreviation::ABBREVIATIONS
17
- end
18
- end
19
-
20
11
  end
21
12
  end
22
13
  end
@@ -5,6 +5,8 @@ module PragmaticSegmenter
5
5
  # newlines before each list item.
6
6
  class List
7
7
  ROMAN_NUMERALS = %w(i ii iii iv v vi vii viii ix x xi xii xiii xiv x xi xii xiii xv xvi xvii xviii xix xx)
8
+ LATIN_NUMERALS = ('a'..'z').to_a
9
+
8
10
  # Rubular: http://rubular.com/r/XcpaJKH0sz
9
11
  ALPHABETICAL_LIST_WITH_PERIODS =
10
12
  /(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)/
@@ -45,10 +47,10 @@ module PragmaticSegmenter
45
47
  end
46
48
 
47
49
  def add_line_break
48
- formatted_text = format_alphabetical_lists(text)
49
- formatted_text = format_roman_numeral_lists(formatted_text)
50
- formatted_text = format_numbered_list_with_periods(formatted_text)
51
- format_numbered_list_with_parens(formatted_text)
50
+ format_alphabetical_lists
51
+ format_roman_numeral_lists
52
+ format_numbered_list_with_periods
53
+ format_numbered_list_with_parens
52
54
  end
53
55
 
54
56
  def replace_parens
@@ -63,64 +65,63 @@ module PragmaticSegmenter
63
65
 
64
66
  private
65
67
 
66
- def format_numbered_list_with_parens(txt)
67
- new_txt = replace_parens_in_numbered_list(txt)
68
- new_txt = add_line_breaks_for_numbered_list_with_parens(new_txt)
69
- new_txt.apply(ListMarkerRule)
68
+ def format_numbered_list_with_parens
69
+ replace_parens_in_numbered_list
70
+ add_line_breaks_for_numbered_list_with_parens
71
+ @text.apply(ListMarkerRule)
70
72
  end
71
73
 
72
- def format_numbered_list_with_periods(txt)
73
- new_txt = replace_periods_in_numbered_list(txt)
74
- new_txt = add_line_breaks_for_numbered_list_with_periods(new_txt)
75
- new_txt.apply(SubstituteListPeriodRule)
74
+ def format_numbered_list_with_periods
75
+ replace_periods_in_numbered_list
76
+ add_line_breaks_for_numbered_list_with_periods
77
+ @text.apply(SubstituteListPeriodRule)
76
78
  end
77
79
 
78
- def format_alphabetical_lists(txt)
79
- new_txt = add_line_breaks_for_alphabetical_list_with_periods(txt, false)
80
- add_line_breaks_for_alphabetical_list_with_parens(new_txt, false)
80
+ def format_alphabetical_lists
81
+ add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: false)
82
+ add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: false)
81
83
  end
82
84
 
83
- def format_roman_numeral_lists(txt)
84
- new_txt = add_line_breaks_for_alphabetical_list_with_periods(txt, true)
85
- add_line_breaks_for_alphabetical_list_with_parens(new_txt, true)
85
+ def format_roman_numeral_lists
86
+ add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: true)
87
+ add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: true)
86
88
  end
87
89
 
88
- def replace_periods_in_numbered_list(txt)
89
- scan_lists(NUMBERED_LIST_REGEX_1, NUMBERED_LIST_REGEX_2, '♨', true, txt)
90
+ def replace_periods_in_numbered_list
91
+ scan_lists(NUMBERED_LIST_REGEX_1, NUMBERED_LIST_REGEX_2, '♨', strip: true)
90
92
  end
91
93
 
92
- def add_line_breaks_for_numbered_list_with_periods(txt)
93
- return txt unless txt.include?('♨') &&
94
- txt !~ /♨.+\n.+♨|♨.+\r.+♨/ &&
95
- txt !~ /for\s\d{1,2}♨\s[a-z]/
96
- txt.apply(SpaceBetweenListItemsFirstRule).
97
- apply(SpaceBetweenListItemsSecondRule)
94
+ def add_line_breaks_for_numbered_list_with_periods
95
+ if @text.include?('♨') && @text !~ /♨.+\n.+♨|♨.+\r.+♨/ && @text !~ /for\s\d{1,2}♨\s[a-z]/
96
+ @text.apply(SpaceBetweenListItemsFirstRule, SpaceBetweenListItemsSecondRule)
97
+ end
98
98
  end
99
99
 
100
- def replace_parens_in_numbered_list(txt)
100
+ def replace_parens_in_numbered_list
101
101
  scan_lists(
102
- NUMBERED_LIST_PARENS_REGEX, NUMBERED_LIST_PARENS_REGEX, '☝', false, txt)
102
+ NUMBERED_LIST_PARENS_REGEX, NUMBERED_LIST_PARENS_REGEX, '☝')
103
+ scan_lists(NUMBERED_LIST_PARENS_REGEX, NUMBERED_LIST_PARENS_REGEX, '☝')
103
104
  end
104
105
 
105
- def add_line_breaks_for_numbered_list_with_parens(txt)
106
- return txt unless txt.include?('☝') && txt !~ /☝.+\n.+☝|☝.+\r.+☝/
107
- txt.apply(SpaceBetweenListItemsThirdRule)
106
+ def add_line_breaks_for_numbered_list_with_parens
107
+ if @text.include?('☝') && @text !~ /☝.+\n.+☝|☝.+\r.+☝/
108
+ @text.apply(SpaceBetweenListItemsThirdRule)
109
+ end
108
110
  end
109
111
 
110
- def scan_lists(regex1, regex2, replacement, strip, txt)
111
- list_array = txt.scan(regex1).map(&:to_i)
112
+ def scan_lists(regex1, regex2, replacement, strip: false)
113
+ list_array = @text.scan(regex1).map(&:to_i)
112
114
  list_array.each_with_index do |a, i|
113
115
  next unless (a + 1).eql?(list_array[i + 1]) ||
114
116
  (a - 1).eql?(list_array[i - 1]) ||
115
117
  (a.eql?(0) && list_array[i - 1].eql?(9)) ||
116
118
  (a.eql?(9) && list_array[i + 1].eql?(0))
117
- substitute_found_list_items(txt, regex2, a, strip, replacement)
119
+ substitute_found_list_items(regex2, a, strip, replacement)
118
120
  end
119
- txt
120
121
  end
121
122
 
122
- def substitute_found_list_items(txt, regex, a, strip, replacement)
123
- txt.gsub!(regex).with_index do |m|
123
+ def substitute_found_list_items(regex, a, strip, replacement)
124
+ @text.gsub!(regex).with_index do |m|
124
125
  if a.to_s.eql?(strip ? m.strip.chop : m)
125
126
  "#{Regexp.escape(a.to_s)}" + replacement
126
127
  else
@@ -129,22 +130,24 @@ module PragmaticSegmenter
129
130
  end
130
131
  end
131
132
 
132
- def add_line_breaks_for_alphabetical_list_with_periods(txt, roman_numeral)
133
- iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PERIODS, false, txt, roman_numeral)
133
+ def add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: false)
134
+ iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PERIODS, roman_numeral: roman_numeral)
134
135
  end
135
136
 
136
- def add_line_breaks_for_alphabetical_list_with_parens(txt, roman_numeral)
137
- iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PARENS, true, txt, roman_numeral)
137
+ def add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: false)
138
+ iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PARENS,
139
+ parens: true,
140
+ roman_numeral: roman_numeral)
138
141
  end
139
142
 
140
- def replace_alphabet_list(a, txt)
141
- txt.gsub!(ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX).with_index do |m|
143
+ def replace_alphabet_list(a)
144
+ @text.gsub!(ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX).with_index do |m|
142
145
  a.eql?(m.chomp('.')) ? "\r#{Regexp.escape(a.to_s)}∯" : "#{m}"
143
146
  end
144
147
  end
145
148
 
146
- def replace_alphabet_list_parens(a, txt)
147
- txt.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX).with_index do |m|
149
+ def replace_alphabet_list_parens(a)
150
+ @text.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX).with_index do |m|
148
151
  if m.include?('(')
149
152
  a.eql?(m.dup.downcase.gsub!(/\(/, '')) ? "\r&✂&#{Regexp.escape(m.gsub!(/\(/, ''))}" : "#{m}"
150
153
  else
@@ -153,48 +156,47 @@ module PragmaticSegmenter
153
156
  end
154
157
  end
155
158
 
156
- def replace_correct_alphabet_list(a, txt, parens)
159
+ def replace_correct_alphabet_list(a, parens)
157
160
  if parens
158
- replace_alphabet_list_parens(a, txt)
161
+ replace_alphabet_list_parens(a)
159
162
  else
160
- replace_alphabet_list(a, txt)
163
+ replace_alphabet_list(a)
161
164
  end
162
165
  end
163
166
 
164
- def last_array_item_replacement(a, i, alphabet, list_array, txt, parens)
167
+ def last_array_item_replacement(a, i, alphabet, list_array, parens)
165
168
  return if alphabet & list_array == [] ||
166
169
  !alphabet.include?(list_array[i - 1]) ||
167
170
  !alphabet.include?(a)
168
171
  return if (alphabet.index(list_array[i - 1]) - alphabet.index(a)).abs != 1
169
- replace_correct_alphabet_list(a, txt, parens)
172
+ replace_correct_alphabet_list(a, parens)
170
173
  end
171
174
 
172
- def other_items_replacement(a, i, alphabet, list_array, txt, parens)
175
+ def other_items_replacement(a, i, alphabet, list_array, parens)
173
176
  return if alphabet & list_array == [] ||
174
177
  !alphabet.include?(list_array[i - 1]) ||
175
178
  !alphabet.include?(a) ||
176
179
  !alphabet.include?(list_array[i + 1])
177
180
  return if alphabet.index(list_array[i + 1]) - alphabet.index(a) != 1 &&
178
181
  (alphabet.index(list_array[i - 1]) - alphabet.index(a)).abs != 1
179
- replace_correct_alphabet_list(a, txt, parens)
182
+ replace_correct_alphabet_list(a, parens)
180
183
  end
181
184
 
182
- def iterate_alphabet_array(regex, parens, txt, roman_numeral)
183
- list_array = txt.scan(regex).map(&:downcase)
185
+ def iterate_alphabet_array(regex, parens: false, roman_numeral: false)
186
+ list_array = @text.scan(regex).map(&:downcase)
184
187
  if roman_numeral
185
188
  alphabet = ROMAN_NUMERALS
186
189
  else
187
- alphabet = ('a'..'z').to_a
190
+ alphabet = LATIN_NUMERALS
188
191
  end
189
192
  list_array.delete_if { |item| !alphabet.any? { |a| a.include?(item) } }
190
193
  list_array.each_with_index do |a, i|
191
194
  if i.eql?(list_array.length - 1)
192
- last_array_item_replacement(a, i, alphabet, list_array, txt, parens)
195
+ last_array_item_replacement(a, i, alphabet, list_array, parens)
193
196
  else
194
- other_items_replacement(a, i, alphabet, list_array, txt, parens)
197
+ other_items_replacement(a, i, alphabet, list_array, parens)
195
198
  end
196
199
  end
197
- txt
198
200
  end
199
201
  end
200
202
  end