pragmatic_segmenter 0.3.3 → 0.3.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +1 -0
  3. data/lib/pragmatic_segmenter/abbreviation_replacer.rb +6 -6
  4. data/lib/pragmatic_segmenter/between_punctuation.rb +6 -4
  5. data/lib/pragmatic_segmenter/cleaner.rb +51 -47
  6. data/lib/pragmatic_segmenter/cleaner/rules.rb +86 -0
  7. data/lib/pragmatic_segmenter/languages.rb +21 -30
  8. data/lib/pragmatic_segmenter/languages/arabic.rb +0 -13
  9. data/lib/pragmatic_segmenter/languages/common.rb +67 -44
  10. data/lib/pragmatic_segmenter/languages/common/ellipsis.rb +37 -0
  11. data/lib/pragmatic_segmenter/languages/common/numbers.rb +90 -0
  12. data/lib/pragmatic_segmenter/languages/deutsch.rb +25 -48
  13. data/lib/pragmatic_segmenter/languages/english.rb +3 -3
  14. data/lib/pragmatic_segmenter/languages/japanese.rb +5 -13
  15. data/lib/pragmatic_segmenter/languages/persian.rb +0 -14
  16. data/lib/pragmatic_segmenter/languages/russian.rb +0 -25
  17. data/lib/pragmatic_segmenter/languages/spanish.rb +0 -9
  18. data/lib/pragmatic_segmenter/list.rb +60 -58
  19. data/lib/pragmatic_segmenter/{process.rb → processor.rb} +47 -26
  20. data/lib/pragmatic_segmenter/punctuation_replacer.rb +41 -20
  21. data/lib/pragmatic_segmenter/segmenter.rb +19 -5
  22. data/lib/pragmatic_segmenter/version.rb +1 -1
  23. data/pragmatic_segmenter.gemspec +1 -0
  24. data/spec/pragmatic_segmenter/languages/amharic_spec.rb +18 -0
  25. data/spec/pragmatic_segmenter/languages/arabic_spec.rb +59 -0
  26. data/spec/pragmatic_segmenter/languages/armenian_spec.rb +160 -0
  27. data/spec/pragmatic_segmenter/languages/burmese_spec.rb +18 -0
  28. data/spec/pragmatic_segmenter/languages/chinese_spec.rb +11 -0
  29. data/spec/pragmatic_segmenter/languages/deutsch_spec.rb +189 -0
  30. data/spec/pragmatic_segmenter/languages/dutch_spec.rb +23 -0
  31. data/spec/pragmatic_segmenter/languages/english_spec.rb +1348 -0
  32. data/spec/pragmatic_segmenter/languages/french_spec.rb +31 -0
  33. data/spec/pragmatic_segmenter/languages/greek_spec.rb +18 -0
  34. data/spec/pragmatic_segmenter/languages/hindi_spec.rb +18 -0
  35. data/spec/pragmatic_segmenter/languages/italian_spec.rb +190 -0
  36. data/spec/pragmatic_segmenter/languages/japanese_spec.rb +53 -0
  37. data/spec/pragmatic_segmenter/languages/persian_spec.rb +18 -0
  38. data/spec/pragmatic_segmenter/languages/polish_spec.rb +11 -0
  39. data/spec/pragmatic_segmenter/languages/russian_spec.rb +219 -0
  40. data/spec/pragmatic_segmenter/languages/spanish_spec.rb +189 -0
  41. data/spec/pragmatic_segmenter/languages/urdu_spec.rb +18 -0
  42. data/spec/pragmatic_segmenter/languages_spec.rb +31 -0
  43. data/spec/pragmatic_segmenter_spec.rb +24 -2583
  44. metadata +59 -8
  45. data/lib/pragmatic_segmenter/number.rb +0 -35
  46. data/lib/pragmatic_segmenter/rules.rb +0 -168
  47. data/lib/pragmatic_segmenter/rules/ellipsis.rb +0 -35
  48. data/lib/pragmatic_segmenter/rules/html.rb +0 -13
@@ -0,0 +1,37 @@
1
+ # -*- encoding : utf-8 -*-
2
+
3
+ module PragmaticSegmenter
4
+ module Languages
5
+ module Common
6
+ # This class searches for ellipses within a string and
7
+ # replaces the periods.
8
+
9
+ # http://www.dailywritingtips.com/in-search-of-a-4-dot-ellipsis/
10
+ # http://www.thepunctuationguide.com/ellipses.html
11
+
12
+ module EllipsisRules
13
+ # Rubular: http://rubular.com/r/i60hCK81fz
14
+ ThreeConsecutiveRule = Rule.new(/\.\.\.(?=\s+[A-Z])/, '☏.')
15
+
16
+ # Rubular: http://rubular.com/r/Hdqpd90owl
17
+ FourConsecutiveRule = Rule.new(/(?<=\S)\.{3}(?=\.\s[A-Z])/, 'ƪ')
18
+
19
+ # Rubular: http://rubular.com/r/YBG1dIHTRu
20
+ ThreeSpaceRule = Rule.new(/(\s\.){3}\s/, '♟')
21
+
22
+ # Rubular: http://rubular.com/r/2VvZ8wRbd8
23
+ FourSpaceRule = Rule.new(/(?<=[a-z])(\.\s){3}\.(\z|$|\n)/, '♝')
24
+
25
+ OtherThreePeriodRule = Rule.new(/\.\.\./, 'ƪ')
26
+
27
+ All = [
28
+ ThreeSpaceRule,
29
+ FourSpaceRule,
30
+ FourConsecutiveRule,
31
+ ThreeConsecutiveRule,
32
+ OtherThreePeriodRule
33
+ ]
34
+ end
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,90 @@
1
+ # -*- encoding : utf-8 -*-
2
+
3
+ module PragmaticSegmenter
4
+ module Languages
5
+ module Common
6
+ module Numbers
7
+ # Rubular: http://rubular.com/r/oNyxBOqbyy
8
+ PeriodBeforeNumberRule = Rule.new(/\.(?=\d)/, '∯')
9
+
10
+ # Rubular: http://rubular.com/r/EMk5MpiUzt
11
+ NumberAfterPeriodBeforeLetterRule = Rule.new(/(?<=\d)\.(?=\S)/, '∯')
12
+
13
+ # Rubular: http://rubular.com/r/rf4l1HjtjG
14
+ NewLineNumberPeriodSpaceLetterRule = Rule.new(/(?<=\r\d)\.(?=(\s\S)|\))/, '∯')
15
+
16
+ # Rubular: http://rubular.com/r/HPa4sdc6b9
17
+ StartLineNumberPeriodRule = Rule.new(/(?<=^\d)\.(?=(\s\S)|\))/, '∯')
18
+
19
+ # Rubular: http://rubular.com/r/NuvWnKleFl
20
+ StartLineTwoDigitNumberPeriodRule = Rule.new(/(?<=^\d\d)\.(?=(\s\S)|\))/, '∯')
21
+
22
+ All = [
23
+ PeriodBeforeNumberRule,
24
+ NumberAfterPeriodBeforeLetterRule,
25
+ NewLineNumberPeriodSpaceLetterRule,
26
+ StartLineNumberPeriodRule,
27
+ StartLineTwoDigitNumberPeriodRule
28
+ ]
29
+ end
30
+
31
+
32
+ SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*[^,]'(?=\s[A-Z])|"(?:[^"])*[^,]"(?=\s[A-Z])|“(?:[^”])*[^,]”(?=\s[A-Z])|\S.*?[。..!!??ȸȹ☉☈☇☄]/
33
+
34
+ # Rubular: http://rubular.com/r/NqCqv372Ix
35
+ QUOTATION_AT_END_OF_SENTENCE_REGEX = /[!?\.-][\"\'\u{201d}\u{201c}]\s{1}[A-Z]/
36
+
37
+ # Rubular: http://rubular.com/r/6flGnUMEVl
38
+ PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = /["”]\s\(.*\)\s["“]/
39
+
40
+ # Rubular: http://rubular.com/r/TYzr4qOW1Q
41
+ BETWEEN_DOUBLE_QUOTES_REGEX = /"(?:[^"])*[^,]"|“(?:[^”])*[^,]”/
42
+
43
+ # Rubular: http://rubular.com/r/JMjlZHAT4g
44
+ SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = /(?<=[!?\.-][\"\'\u{201d}\u{201c}])\s{1}(?=[A-Z])/
45
+
46
+ # Rubular: http://rubular.com/r/mQ8Es9bxtk
47
+ CONTINUOUS_PUNCTUATION_REGEX = /(?<=\S)(!|\?){3,}(?=(\s|\z|$))/
48
+
49
+ # Rubular: http://rubular.com/r/yqa4Rit8EY
50
+ PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')
51
+
52
+ # Rubular: http://rubular.com/r/NEv265G2X2
53
+ KommanditgesellschaftRule = Rule.new(/(?<=Co)\.(?=\sKG)/, '∯')
54
+
55
+ # Rubular: http://rubular.com/r/xDkpFZ0EgH
56
+ MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i
57
+
58
+ module AmPmRules
59
+ # Rubular: http://rubular.com/r/Vnx3m4Spc8
60
+ UpperCasePmRule = Rule.new(/(?<=P∯M)∯(?=\s[A-Z])/, '.')
61
+
62
+ # Rubular: http://rubular.com/r/AJMCotJVbW
63
+ UpperCaseAmRule = Rule.new(/(?<=A∯M)∯(?=\s[A-Z])/, '.')
64
+
65
+ # Rubular: http://rubular.com/r/13q7SnOhgA
66
+ LowerCasePmRule = Rule.new(/(?<=p∯m)∯(?=\s[A-Z])/, '.')
67
+
68
+ # Rubular: http://rubular.com/r/DgUDq4mLz5
69
+ LowerCaseAmRule = Rule.new(/(?<=a∯m)∯(?=\s[A-Z])/, '.')
70
+
71
+ All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]
72
+ end
73
+
74
+ # This class searches for periods within an abbreviation and
75
+ # replaces the periods.
76
+ module SingleLetterAbbreviationRules
77
+ # Rubular: http://rubular.com/r/e3H6kwnr6H
78
+ SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=\s)/, '∯')
79
+
80
+ # Rubular: http://rubular.com/r/gitvf0YWH4
81
+ SingleUpperCaseLetterRule = Rule.new(/(?<=\s[A-Z])\.(?=\s)/, '∯')
82
+
83
+ All = [
84
+ SingleUpperCaseLetterAtStartOfLineRule,
85
+ SingleUpperCaseLetterRule
86
+ ]
87
+ end
88
+ end
89
+ end
90
+ end
@@ -18,11 +18,20 @@ module PragmaticSegmenter
18
18
  # Rubular: http://rubular.com/r/TkZomF9tTM
19
19
  BETWEEN_DOUBLE_QUOTES_DE_REGEX = /„(?>[^“\\]+|\\{2}|\\.)*“/
20
20
 
21
- # Rubular: http://rubular.com/r/hZxoyQwKT1
22
- NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯')
23
21
 
24
- # Rubular: http://rubular.com/r/ityNMwdghj
25
- NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯')
22
+ module Numbers
23
+ # Rubular: http://rubular.com/r/hZxoyQwKT1
24
+ NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯')
25
+
26
+ # Rubular: http://rubular.com/r/ityNMwdghj
27
+ NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯')
28
+
29
+ All = [
30
+ Common::Numbers::All,
31
+ NumberPeriodSpaceRule,
32
+ NegativeNumberPeriodSpaceRule
33
+ ]
34
+ end
26
35
 
27
36
  MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember']
28
37
 
@@ -32,59 +41,35 @@ module PragmaticSegmenter
32
41
  # Rubular: http://rubular.com/r/iUNSkCuso0
33
42
  SingleLowerCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[a-z])\.(?=\s)/, '∯')
34
43
 
35
-
36
- class Process < PragmaticSegmenter::Process
37
- private
38
-
39
- def between_punctuation(txt)
40
- BetweenPunctuation.new(text: txt).replace
41
- end
42
-
43
- def replace_numbers(txt)
44
- Number.new(text: txt).replace
45
- end
46
-
47
- def replace_abbreviations(txt)
48
- AbbreviationReplacer.new(text: txt, language: Deutsch).replace
49
- end
50
- end
51
-
52
- class Cleaner < PragmaticSegmenter::Cleaner
44
+ class Processor < PragmaticSegmenter::Processor
53
45
  private
54
46
 
55
- def abbreviations
56
- Abbreviation::ABBREVIATIONS
57
- end
58
- end
47
+ def replace_numbers
48
+ @text.apply Numbers::All
59
49
 
60
- class Number < PragmaticSegmenter::Number
61
- def replace
62
- super
63
- @text.apply(NumberPeriodSpaceRule, NegativeNumberPeriodSpaceRule)
64
- replace_period_in_deutsch_dates(@text)
50
+ replace_period_in_deutsch_dates
65
51
  end
66
52
 
67
- def replace_period_in_deutsch_dates(txt)
53
+ def replace_period_in_deutsch_dates
68
54
  MONTHS.each do |month|
69
55
  # Rubular: http://rubular.com/r/zlqgj7G5dA
70
- txt.gsub!(/(?<=\d)\.(?=\s*#{Regexp.escape(month)})/, '∯')
56
+ @text.gsub!(/(?<=\d)\.(?=\s*#{Regexp.escape(month)})/, '∯')
71
57
  end
72
- txt
73
58
  end
74
59
  end
75
60
 
76
61
  class AbbreviationReplacer < AbbreviationReplacer
77
62
  def replace
78
- @reformatted_text = text.apply(
63
+ @text = text.apply(
79
64
  @language::PossessiveAbbreviationRule,
80
65
  @language::SingleLetterAbbreviationRules::All,
81
66
  SingleLowerCaseLetterRule,
82
67
  SingleLowerCaseLetterAtStartOfLineRule)
83
68
 
84
- @reformatted_text = search_for_abbreviations_in_string(@reformatted_text)
85
- @reformatted_text = replace_multi_period_abbreviations(@reformatted_text)
86
- @reformatted_text = @reformatted_text.apply(Languages::Common::AmPmRules::All)
87
- replace_abbreviation_as_sentence_boundary(@reformatted_text)
69
+ @text = search_for_abbreviations_in_string(@text)
70
+ @text = replace_multi_period_abbreviations(@text)
71
+ @text.apply(Languages::Common::AmPmRules::All)
72
+ replace_abbreviation_as_sentence_boundary(@text)
88
73
  end
89
74
 
90
75
  private
@@ -97,15 +82,7 @@ module PragmaticSegmenter
97
82
  class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
98
83
  private
99
84
 
100
- def sub_punctuation_between_double_quotes(txt)
101
- btwn_dbl_quote = sub_punctuation_between_double_quotes_de(txt)
102
- PragmaticSegmenter::PunctuationReplacer.new(
103
- matches_array: btwn_dbl_quote,
104
- text: txt
105
- ).replace
106
- end
107
-
108
- def sub_punctuation_between_double_quotes_de(txt)
85
+ def btwn_dbl_quote(txt)
109
86
  if txt.include?('„')
110
87
  btwn_dbl_quote = txt.scan(BETWEEN_DOUBLE_QUOTES_DE_REGEX)
111
88
  txt.scan(SPLIT_DOUBLE_QUOTES_DE_REGEX).each do |q|
@@ -6,13 +6,13 @@ module PragmaticSegmenter
6
6
  class Cleaner < Cleaner
7
7
  def clean
8
8
  super
9
- clean_quotations(@clean_text)
9
+ clean_quotations
10
10
  end
11
11
 
12
12
  private
13
13
 
14
- def clean_quotations(txt)
15
- txt.gsub(/`/, "'")
14
+ def clean_quotations
15
+ @text.gsub(/`/, "'")
16
16
  end
17
17
 
18
18
  def abbreviations
@@ -3,27 +3,19 @@ module PragmaticSegmenter
3
3
  module Japanese
4
4
  include Languages::Common
5
5
 
6
- class Process < Process
7
- private
8
-
9
- def between_punctuation(txt)
10
- BetweenPunctuation.new(text: txt).replace
11
- end
12
- end
13
-
14
6
  class Cleaner < PragmaticSegmenter::Cleaner
15
7
  # Rubular: http://rubular.com/r/N4kPuJgle7
16
8
  NewLineInMiddleOfWordRule = Rule.new(/(?<=の)\n(?=\S)/, '')
17
9
 
18
10
  def clean
19
11
  super
20
- @clean_text = remove_newline_in_middle_of_word(@clean_text)
12
+ remove_newline_in_middle_of_word
21
13
  end
22
14
 
23
15
  private
24
16
 
25
- def remove_newline_in_middle_of_word(txt)
26
- txt.apply(NewLineInMiddleOfWordRule)
17
+ def remove_newline_in_middle_of_word
18
+ @text.apply NewLineInMiddleOfWordRule
27
19
  end
28
20
  end
29
21
 
@@ -42,14 +34,14 @@ module PragmaticSegmenter
42
34
  end
43
35
 
44
36
  def sub_punctuation_between_quotes_ja(txt)
45
- PragmaticSegmenter::PunctuationReplacer.new(
37
+ PunctuationReplacer.new(
46
38
  matches_array: txt.scan(BETWEEN_QUOTE_JA_REGEX),
47
39
  text: txt
48
40
  ).replace
49
41
  end
50
42
 
51
43
  def sub_punctuation_between_parens_ja(txt)
52
- PragmaticSegmenter::PunctuationReplacer.new(
44
+ PunctuationReplacer.new(
53
45
  matches_array: txt.scan(BETWEEN_PARENS_JA_REGEX),
54
46
  text: txt
55
47
  ).replace
@@ -9,20 +9,6 @@ module PragmaticSegmenter
9
9
  ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')
10
10
  ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
11
11
 
12
- class Process < Process
13
- private
14
-
15
- def sentence_boundary_punctuation(txt)
16
- txt = txt.apply ReplaceColonBetweenNumbersRule,
17
- ReplaceNonSentenceBoundaryCommaRule
18
- txt.scan(SENTENCE_BOUNDARY_REGEX)
19
- end
20
-
21
- def replace_abbreviations(txt)
22
- AbbreviationReplacer.new(text: txt).replace
23
- end
24
- end
25
-
26
12
  class AbbreviationReplacer < AbbreviationReplacer
27
13
  private
28
14
 
@@ -9,34 +9,9 @@ module PragmaticSegmenter
9
9
  NUMBER_ABBREVIATIONS = []
10
10
  end
11
11
 
12
- class Process < Process
13
- private
14
-
15
- def replace_abbreviations(txt)
16
- AbbreviationReplacer.new(text: txt, language: Russian).replace
17
- end
18
- end
19
-
20
12
  class AbbreviationReplacer < AbbreviationReplacer
21
13
  private
22
14
 
23
- def scan_for_replacements(txt, am, index, character_array)
24
- character = character_array[index]
25
- prepositive = @language::Abbreviation::PREPOSITIVE_ABBREVIATIONS
26
- number_abbr = @language::Abbreviation::NUMBER_ABBREVIATIONS
27
- upper = /[[:upper:]]/.match(character.to_s)
28
- if upper.nil? || prepositive.include?(am.downcase.strip)
29
- if prepositive.include?(am.downcase.strip)
30
- txt = replace_prepositive_abbr(txt, am)
31
- elsif number_abbr.include?(am.downcase.strip)
32
- txt = replace_pre_number_abbr(txt, am)
33
- else
34
- txt = replace_period_of_abbr(txt, am)
35
- end
36
- end
37
- txt
38
- end
39
-
40
15
  def replace_period_of_abbr(txt, abbr)
41
16
  txt.gsub(/(?<=\s#{abbr.strip})\./, '∯')
42
17
  .gsub(/(?<=\A#{abbr.strip})\./, '∯')
@@ -8,15 +8,6 @@ module PragmaticSegmenter
8
8
  PREPOSITIVE_ABBREVIATIONS = ['a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'ee', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'mt', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'prof', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'sra', 'srta', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']
9
9
  NUMBER_ABBREVIATIONS = ['cra', 'ext', 'no', 'nos', 'p', 'pp', 'tel']
10
10
  end
11
-
12
- class Cleaner < Cleaner
13
- private
14
-
15
- def abbreviations
16
- Abbreviation::ABBREVIATIONS
17
- end
18
- end
19
-
20
11
  end
21
12
  end
22
13
  end
@@ -5,6 +5,8 @@ module PragmaticSegmenter
5
5
  # newlines before each list item.
6
6
  class List
7
7
  ROMAN_NUMERALS = %w(i ii iii iv v vi vii viii ix x xi xii xiii xiv x xi xii xiii xv xvi xvii xviii xix xx)
8
+ LATIN_NUMERALS = ('a'..'z').to_a
9
+
8
10
  # Rubular: http://rubular.com/r/XcpaJKH0sz
9
11
  ALPHABETICAL_LIST_WITH_PERIODS =
10
12
  /(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)/
@@ -45,10 +47,10 @@ module PragmaticSegmenter
45
47
  end
46
48
 
47
49
  def add_line_break
48
- formatted_text = format_alphabetical_lists(text)
49
- formatted_text = format_roman_numeral_lists(formatted_text)
50
- formatted_text = format_numbered_list_with_periods(formatted_text)
51
- format_numbered_list_with_parens(formatted_text)
50
+ format_alphabetical_lists
51
+ format_roman_numeral_lists
52
+ format_numbered_list_with_periods
53
+ format_numbered_list_with_parens
52
54
  end
53
55
 
54
56
  def replace_parens
@@ -63,64 +65,63 @@ module PragmaticSegmenter
63
65
 
64
66
  private
65
67
 
66
- def format_numbered_list_with_parens(txt)
67
- new_txt = replace_parens_in_numbered_list(txt)
68
- new_txt = add_line_breaks_for_numbered_list_with_parens(new_txt)
69
- new_txt.apply(ListMarkerRule)
68
+ def format_numbered_list_with_parens
69
+ replace_parens_in_numbered_list
70
+ add_line_breaks_for_numbered_list_with_parens
71
+ @text.apply(ListMarkerRule)
70
72
  end
71
73
 
72
- def format_numbered_list_with_periods(txt)
73
- new_txt = replace_periods_in_numbered_list(txt)
74
- new_txt = add_line_breaks_for_numbered_list_with_periods(new_txt)
75
- new_txt.apply(SubstituteListPeriodRule)
74
+ def format_numbered_list_with_periods
75
+ replace_periods_in_numbered_list
76
+ add_line_breaks_for_numbered_list_with_periods
77
+ @text.apply(SubstituteListPeriodRule)
76
78
  end
77
79
 
78
- def format_alphabetical_lists(txt)
79
- new_txt = add_line_breaks_for_alphabetical_list_with_periods(txt, false)
80
- add_line_breaks_for_alphabetical_list_with_parens(new_txt, false)
80
+ def format_alphabetical_lists
81
+ add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: false)
82
+ add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: false)
81
83
  end
82
84
 
83
- def format_roman_numeral_lists(txt)
84
- new_txt = add_line_breaks_for_alphabetical_list_with_periods(txt, true)
85
- add_line_breaks_for_alphabetical_list_with_parens(new_txt, true)
85
+ def format_roman_numeral_lists
86
+ add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: true)
87
+ add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: true)
86
88
  end
87
89
 
88
- def replace_periods_in_numbered_list(txt)
89
- scan_lists(NUMBERED_LIST_REGEX_1, NUMBERED_LIST_REGEX_2, '♨', true, txt)
90
+ def replace_periods_in_numbered_list
91
+ scan_lists(NUMBERED_LIST_REGEX_1, NUMBERED_LIST_REGEX_2, '♨', strip: true)
90
92
  end
91
93
 
92
- def add_line_breaks_for_numbered_list_with_periods(txt)
93
- return txt unless txt.include?('♨') &&
94
- txt !~ /♨.+\n.+♨|♨.+\r.+♨/ &&
95
- txt !~ /for\s\d{1,2}♨\s[a-z]/
96
- txt.apply(SpaceBetweenListItemsFirstRule).
97
- apply(SpaceBetweenListItemsSecondRule)
94
+ def add_line_breaks_for_numbered_list_with_periods
95
+ if @text.include?('♨') && @text !~ /♨.+\n.+♨|♨.+\r.+♨/ && @text !~ /for\s\d{1,2}♨\s[a-z]/
96
+ @text.apply(SpaceBetweenListItemsFirstRule, SpaceBetweenListItemsSecondRule)
97
+ end
98
98
  end
99
99
 
100
- def replace_parens_in_numbered_list(txt)
100
+ def replace_parens_in_numbered_list
101
101
  scan_lists(
102
- NUMBERED_LIST_PARENS_REGEX, NUMBERED_LIST_PARENS_REGEX, '☝', false, txt)
102
+ NUMBERED_LIST_PARENS_REGEX, NUMBERED_LIST_PARENS_REGEX, '☝')
103
+ scan_lists(NUMBERED_LIST_PARENS_REGEX, NUMBERED_LIST_PARENS_REGEX, '☝')
103
104
  end
104
105
 
105
- def add_line_breaks_for_numbered_list_with_parens(txt)
106
- return txt unless txt.include?('☝') && txt !~ /☝.+\n.+☝|☝.+\r.+☝/
107
- txt.apply(SpaceBetweenListItemsThirdRule)
106
+ def add_line_breaks_for_numbered_list_with_parens
107
+ if @text.include?('☝') && @text !~ /☝.+\n.+☝|☝.+\r.+☝/
108
+ @text.apply(SpaceBetweenListItemsThirdRule)
109
+ end
108
110
  end
109
111
 
110
- def scan_lists(regex1, regex2, replacement, strip, txt)
111
- list_array = txt.scan(regex1).map(&:to_i)
112
+ def scan_lists(regex1, regex2, replacement, strip: false)
113
+ list_array = @text.scan(regex1).map(&:to_i)
112
114
  list_array.each_with_index do |a, i|
113
115
  next unless (a + 1).eql?(list_array[i + 1]) ||
114
116
  (a - 1).eql?(list_array[i - 1]) ||
115
117
  (a.eql?(0) && list_array[i - 1].eql?(9)) ||
116
118
  (a.eql?(9) && list_array[i + 1].eql?(0))
117
- substitute_found_list_items(txt, regex2, a, strip, replacement)
119
+ substitute_found_list_items(regex2, a, strip, replacement)
118
120
  end
119
- txt
120
121
  end
121
122
 
122
- def substitute_found_list_items(txt, regex, a, strip, replacement)
123
- txt.gsub!(regex).with_index do |m|
123
+ def substitute_found_list_items(regex, a, strip, replacement)
124
+ @text.gsub!(regex).with_index do |m|
124
125
  if a.to_s.eql?(strip ? m.strip.chop : m)
125
126
  "#{Regexp.escape(a.to_s)}" + replacement
126
127
  else
@@ -129,22 +130,24 @@ module PragmaticSegmenter
129
130
  end
130
131
  end
131
132
 
132
- def add_line_breaks_for_alphabetical_list_with_periods(txt, roman_numeral)
133
- iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PERIODS, false, txt, roman_numeral)
133
+ def add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: false)
134
+ iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PERIODS, roman_numeral: roman_numeral)
134
135
  end
135
136
 
136
- def add_line_breaks_for_alphabetical_list_with_parens(txt, roman_numeral)
137
- iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PARENS, true, txt, roman_numeral)
137
+ def add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: false)
138
+ iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PARENS,
139
+ parens: true,
140
+ roman_numeral: roman_numeral)
138
141
  end
139
142
 
140
- def replace_alphabet_list(a, txt)
141
- txt.gsub!(ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX).with_index do |m|
143
+ def replace_alphabet_list(a)
144
+ @text.gsub!(ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX).with_index do |m|
142
145
  a.eql?(m.chomp('.')) ? "\r#{Regexp.escape(a.to_s)}∯" : "#{m}"
143
146
  end
144
147
  end
145
148
 
146
- def replace_alphabet_list_parens(a, txt)
147
- txt.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX).with_index do |m|
149
+ def replace_alphabet_list_parens(a)
150
+ @text.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX).with_index do |m|
148
151
  if m.include?('(')
149
152
  a.eql?(m.dup.downcase.gsub!(/\(/, '')) ? "\r&✂&#{Regexp.escape(m.gsub!(/\(/, ''))}" : "#{m}"
150
153
  else
@@ -153,48 +156,47 @@ module PragmaticSegmenter
153
156
  end
154
157
  end
155
158
 
156
- def replace_correct_alphabet_list(a, txt, parens)
159
+ def replace_correct_alphabet_list(a, parens)
157
160
  if parens
158
- replace_alphabet_list_parens(a, txt)
161
+ replace_alphabet_list_parens(a)
159
162
  else
160
- replace_alphabet_list(a, txt)
163
+ replace_alphabet_list(a)
161
164
  end
162
165
  end
163
166
 
164
- def last_array_item_replacement(a, i, alphabet, list_array, txt, parens)
167
+ def last_array_item_replacement(a, i, alphabet, list_array, parens)
165
168
  return if alphabet & list_array == [] ||
166
169
  !alphabet.include?(list_array[i - 1]) ||
167
170
  !alphabet.include?(a)
168
171
  return if (alphabet.index(list_array[i - 1]) - alphabet.index(a)).abs != 1
169
- replace_correct_alphabet_list(a, txt, parens)
172
+ replace_correct_alphabet_list(a, parens)
170
173
  end
171
174
 
172
- def other_items_replacement(a, i, alphabet, list_array, txt, parens)
175
+ def other_items_replacement(a, i, alphabet, list_array, parens)
173
176
  return if alphabet & list_array == [] ||
174
177
  !alphabet.include?(list_array[i - 1]) ||
175
178
  !alphabet.include?(a) ||
176
179
  !alphabet.include?(list_array[i + 1])
177
180
  return if alphabet.index(list_array[i + 1]) - alphabet.index(a) != 1 &&
178
181
  (alphabet.index(list_array[i - 1]) - alphabet.index(a)).abs != 1
179
- replace_correct_alphabet_list(a, txt, parens)
182
+ replace_correct_alphabet_list(a, parens)
180
183
  end
181
184
 
182
- def iterate_alphabet_array(regex, parens, txt, roman_numeral)
183
- list_array = txt.scan(regex).map(&:downcase)
185
+ def iterate_alphabet_array(regex, parens: false, roman_numeral: false)
186
+ list_array = @text.scan(regex).map(&:downcase)
184
187
  if roman_numeral
185
188
  alphabet = ROMAN_NUMERALS
186
189
  else
187
- alphabet = ('a'..'z').to_a
190
+ alphabet = LATIN_NUMERALS
188
191
  end
189
192
  list_array.delete_if { |item| !alphabet.any? { |a| a.include?(item) } }
190
193
  list_array.each_with_index do |a, i|
191
194
  if i.eql?(list_array.length - 1)
192
- last_array_item_replacement(a, i, alphabet, list_array, txt, parens)
195
+ last_array_item_replacement(a, i, alphabet, list_array, parens)
193
196
  else
194
- other_items_replacement(a, i, alphabet, list_array, txt, parens)
197
+ other_items_replacement(a, i, alphabet, list_array, parens)
195
198
  end
196
199
  end
197
- txt
198
200
  end
199
201
  end
200
202
  end