pragmatic_segmenter 0.3.3 → 0.3.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +1 -0
  3. data/lib/pragmatic_segmenter/abbreviation_replacer.rb +6 -6
  4. data/lib/pragmatic_segmenter/between_punctuation.rb +6 -4
  5. data/lib/pragmatic_segmenter/cleaner.rb +51 -47
  6. data/lib/pragmatic_segmenter/cleaner/rules.rb +86 -0
  7. data/lib/pragmatic_segmenter/languages.rb +21 -30
  8. data/lib/pragmatic_segmenter/languages/arabic.rb +0 -13
  9. data/lib/pragmatic_segmenter/languages/common.rb +67 -44
  10. data/lib/pragmatic_segmenter/languages/common/ellipsis.rb +37 -0
  11. data/lib/pragmatic_segmenter/languages/common/numbers.rb +90 -0
  12. data/lib/pragmatic_segmenter/languages/deutsch.rb +25 -48
  13. data/lib/pragmatic_segmenter/languages/english.rb +3 -3
  14. data/lib/pragmatic_segmenter/languages/japanese.rb +5 -13
  15. data/lib/pragmatic_segmenter/languages/persian.rb +0 -14
  16. data/lib/pragmatic_segmenter/languages/russian.rb +0 -25
  17. data/lib/pragmatic_segmenter/languages/spanish.rb +0 -9
  18. data/lib/pragmatic_segmenter/list.rb +60 -58
  19. data/lib/pragmatic_segmenter/{process.rb → processor.rb} +47 -26
  20. data/lib/pragmatic_segmenter/punctuation_replacer.rb +41 -20
  21. data/lib/pragmatic_segmenter/segmenter.rb +19 -5
  22. data/lib/pragmatic_segmenter/version.rb +1 -1
  23. data/pragmatic_segmenter.gemspec +1 -0
  24. data/spec/pragmatic_segmenter/languages/amharic_spec.rb +18 -0
  25. data/spec/pragmatic_segmenter/languages/arabic_spec.rb +59 -0
  26. data/spec/pragmatic_segmenter/languages/armenian_spec.rb +160 -0
  27. data/spec/pragmatic_segmenter/languages/burmese_spec.rb +18 -0
  28. data/spec/pragmatic_segmenter/languages/chinese_spec.rb +11 -0
  29. data/spec/pragmatic_segmenter/languages/deutsch_spec.rb +189 -0
  30. data/spec/pragmatic_segmenter/languages/dutch_spec.rb +23 -0
  31. data/spec/pragmatic_segmenter/languages/english_spec.rb +1348 -0
  32. data/spec/pragmatic_segmenter/languages/french_spec.rb +31 -0
  33. data/spec/pragmatic_segmenter/languages/greek_spec.rb +18 -0
  34. data/spec/pragmatic_segmenter/languages/hindi_spec.rb +18 -0
  35. data/spec/pragmatic_segmenter/languages/italian_spec.rb +190 -0
  36. data/spec/pragmatic_segmenter/languages/japanese_spec.rb +53 -0
  37. data/spec/pragmatic_segmenter/languages/persian_spec.rb +18 -0
  38. data/spec/pragmatic_segmenter/languages/polish_spec.rb +11 -0
  39. data/spec/pragmatic_segmenter/languages/russian_spec.rb +219 -0
  40. data/spec/pragmatic_segmenter/languages/spanish_spec.rb +189 -0
  41. data/spec/pragmatic_segmenter/languages/urdu_spec.rb +18 -0
  42. data/spec/pragmatic_segmenter/languages_spec.rb +31 -0
  43. data/spec/pragmatic_segmenter_spec.rb +24 -2583
  44. metadata +59 -8
  45. data/lib/pragmatic_segmenter/number.rb +0 -35
  46. data/lib/pragmatic_segmenter/rules.rb +0 -168
  47. data/lib/pragmatic_segmenter/rules/ellipsis.rb +0 -35
  48. data/lib/pragmatic_segmenter/rules/html.rb +0 -13
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9e26400933e02aca93c63db3221fb897f49cdf78
4
- data.tar.gz: a0199ca43d0f8558ba3fe867c6e32c69787210c9
3
+ metadata.gz: a9cb6133aca84f8c6ff233ec6fb34b276cf47964
4
+ data.tar.gz: 00c1f664707e86e5c2ae5740c53acde5c814ece8
5
5
  SHA512:
6
- metadata.gz: 1dca5e20b2b062070b0cd319da6549594add065b670506b7e7dd9cbfe9eb8c83985ef197bf6427ee496d700edab5a526c8165fc1a27567e553f7a5e625edb6fe
7
- data.tar.gz: a53a4059b4ba41e7d40bf2a21ca1981ae5ed134a8017771f8ecd337bdc96762e2e78ec7c53c00c3ff2e3ac6179cf1e29f24fee703bc6b19fbbf31bc7ff572894
6
+ metadata.gz: d5726605fa78ec4067c79ed592a7983f2638b26a81fb88cf23bdffeb26d842c0eaed39a531181ecef6456218208f6d297e4316b36fc7f4a15f4deb2ebb7cb800
7
+ data.tar.gz: a1c99c7f3c73c1624a2b1d4792c8937dd27d3dfd4667dcb05844376d5ecad13ebe065f18fe3f40a61a4aa23baa3e8ab9b4dc3bc547821c6d9fc5700cd5a16f20
data/.travis.yml CHANGED
@@ -2,6 +2,7 @@ language: ruby
2
2
  rvm:
3
3
  - "2.1.5"
4
4
  - "2.2.0"
5
+ - "2.2.4"
5
6
  # uncomment this line if your project needs to run something other than `rake`:
6
7
  # script: bundle exec rspec spec
7
8
  addons:
data/lib/pragmatic_segmenter/abbreviation_replacer.rb CHANGED
@@ -8,20 +8,20 @@ module PragmaticSegmenter
8
8
  SENTENCE_STARTERS = %w(A Being Did For He How However I In It Millions More She That The There They We What When Where Who Why)
9
9
 
10
10
  attr_reader :text
11
- def initialize(text:, language: Languages::Common)
11
+ def initialize(text:, language: )
12
12
  @text = Text.new(text)
13
13
  @language = language
14
14
  end
15
15
 
16
16
  def replace
17
- @reformatted_text = text.apply(@language::PossessiveAbbreviationRule,
17
+ @text.apply(@language::PossessiveAbbreviationRule,
18
18
  @language::KommanditgesellschaftRule,
19
19
  @language::SingleLetterAbbreviationRules::All)
20
20
 
21
- @reformatted_text = search_for_abbreviations_in_string(@reformatted_text)
22
- @reformatted_text = replace_multi_period_abbreviations(@reformatted_text)
23
- @reformatted_text = @reformatted_text.apply(@language::AmPmRules::All)
24
- replace_abbreviation_as_sentence_boundary(@reformatted_text)
21
+ @text = search_for_abbreviations_in_string(@text)
22
+ @text = replace_multi_period_abbreviations(@text)
23
+ @text.apply(@language::AmPmRules::All)
24
+ replace_abbreviation_as_sentence_boundary(@text)
25
25
  end
26
26
 
27
27
  private
data/lib/pragmatic_segmenter/between_punctuation.rb CHANGED
@@ -1,5 +1,4 @@
1
1
  # -*- encoding : utf-8 -*-
2
- require 'pragmatic_segmenter/punctuation_replacer'
3
2
 
4
3
  module PragmaticSegmenter
5
4
  # This class searches for punctuation between quotes or parentheses
@@ -66,13 +65,16 @@ module PragmaticSegmenter
66
65
  end
67
66
 
68
67
  def sub_punctuation_between_double_quotes(txt)
69
- btwn_dbl_quote = txt.scan(BETWEEN_DOUBLE_QUOTES_REGEX)
70
68
  PragmaticSegmenter::PunctuationReplacer.new(
71
- matches_array: btwn_dbl_quote,
69
+ matches_array: btwn_dbl_quote(txt),
72
70
  text: txt
73
71
  ).replace
74
72
  end
75
73
 
74
+ def btwn_dbl_quote(txt)
75
+ txt.scan(BETWEEN_DOUBLE_QUOTES_REGEX)
76
+ end
77
+
76
78
  def sub_punctuation_between_quotes_arrow(txt)
77
79
  PragmaticSegmenter::PunctuationReplacer.new(
78
80
  matches_array: txt.scan(BETWEEN_QUOTE_ARROW_REGEX),
@@ -87,4 +89,4 @@ module PragmaticSegmenter
87
89
  ).replace
88
90
  end
89
91
  end
90
- end
92
+ end
data/lib/pragmatic_segmenter/cleaner.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # -*- encoding : utf-8 -*-
2
+ require_relative 'cleaner/rules'
2
3
 
3
4
  module PragmaticSegmenter
4
5
  # This is an opinionated class that removes errant newlines,
@@ -7,8 +8,8 @@ module PragmaticSegmenter
7
8
  include Rules
8
9
 
9
10
  attr_reader :text, :doc_type
10
- def initialize(text:, doc_type: nil, language: Languages::Common, **args)
11
- @text = Text.new(text.dup)
11
+ def initialize(text:, doc_type: nil, language: Languages::Common)
12
+ @text = Text.new(text)
12
13
  @doc_type = doc_type
13
14
  @language = language
14
15
  end
@@ -29,17 +30,19 @@ module PragmaticSegmenter
29
30
 
30
31
  def clean
31
32
  return unless text
32
- @clean_text = remove_all_newlines(text)
33
- replace_double_newlines(@clean_text)
34
- replace_newlines(@clean_text)
35
- replace_escaped_newlines(@clean_text)
36
- @clean_text.apply(HTMLRules::All)
37
- replace_punctuation_in_brackets(@clean_text)
38
- @clean_text.apply(InlineFormattingRule)
39
- clean_quotations(@clean_text)
40
- clean_table_of_contents(@clean_text)
41
- check_for_no_space_in_between_sentences(@clean_text)
42
- clean_consecutive_characters(@clean_text)
33
+ remove_all_newlines
34
+ replace_double_newlines
35
+ replace_newlines
36
+ replace_escaped_newlines
37
+
38
+ @text.apply(HTML::All)
39
+
40
+ replace_punctuation_in_brackets
41
+ @text.apply(InlineFormattingRule)
42
+ clean_quotations
43
+ clean_table_of_contents
44
+ check_for_no_space_in_between_sentences
45
+ clean_consecutive_characters
43
46
  end
44
47
 
45
48
  private
@@ -48,18 +51,18 @@ module PragmaticSegmenter
48
51
  @language::Abbreviation::ABBREVIATIONS
49
52
  end
50
53
 
51
- def check_for_no_space_in_between_sentences(txt)
52
- words = txt.split(' ')
54
+ def check_for_no_space_in_between_sentences
55
+ words = @text.split(' ')
53
56
  words.each do |word|
54
- search_for_connected_sentences(word, txt, NO_SPACE_BETWEEN_SENTENCES_REGEX, NoSpaceBetweenSentencesRule)
55
- search_for_connected_sentences(word, txt, NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, NoSpaceBetweenSentencesDigitRule)
57
+ search_for_connected_sentences(word, @text, NO_SPACE_BETWEEN_SENTENCES_REGEX, NoSpaceBetweenSentencesRule)
58
+ search_for_connected_sentences(word, @text, NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, NoSpaceBetweenSentencesDigitRule)
56
59
  end
57
- txt
60
+ @text
58
61
  end
59
62
 
60
- def replace_punctuation_in_brackets(txt)
61
- txt.dup.gsub!(/\[(?:[^\]])*\]/) do |match|
62
- txt.gsub!(/#{Regexp.escape(match)}/, "#{match.dup.gsub!(/\?/, '&ᓷ&')}") if match.include?('?')
63
+ def replace_punctuation_in_brackets
64
+ @text.dup.gsub!(/\[(?:[^\]])*\]/) do |match|
65
+ @text.gsub!(/#{Regexp.escape(match)}/, "#{match.dup.gsub!(/\?/, '&ᓷ&')}") if match.include?('?')
63
66
  end
64
67
  end
65
68
 
@@ -74,60 +77,61 @@ module PragmaticSegmenter
74
77
  end
75
78
  end
76
79
 
77
- def remove_all_newlines(txt)
78
- clean_text = remove_newline_in_middle_of_sentence(txt)
79
- remove_newline_in_middle_of_word(clean_text)
80
+ def remove_all_newlines
81
+ remove_newline_in_middle_of_sentence
82
+ remove_newline_in_middle_of_word
80
83
  end
81
84
 
82
- def remove_newline_in_middle_of_sentence(txt)
83
- txt.dup.gsub!(/(?:[^\.])*/) do |match|
85
+ def remove_newline_in_middle_of_sentence
86
+ @text.dup.gsub!(/(?:[^\.])*/) do |match|
84
87
  next unless match.include?("\n")
85
88
  orig = match.dup
86
89
  match.gsub!(NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX, '')
87
- txt.gsub!(/#{Regexp.escape(orig)}/, "#{match}")
90
+ @text.gsub!(/#{Regexp.escape(orig)}/, "#{match}")
88
91
  end
89
- txt
92
+ @text
90
93
  end
91
94
 
92
- def remove_newline_in_middle_of_word(txt)
93
- txt.apply NewLineInMiddleOfWordRule
95
+ def remove_newline_in_middle_of_word
96
+ @text.apply NewLineInMiddleOfWordRule
94
97
  end
95
98
 
96
- def replace_escaped_newlines(txt)
97
- txt.apply EscapedNewLineRule, EscapedCarriageReturnRule,
99
+ def replace_escaped_newlines
100
+ @text.apply EscapedNewLineRule, EscapedCarriageReturnRule,
98
101
  TypoEscapedNewLineRule, TypoEscapedCarriageReturnRule
99
102
  end
100
103
 
101
- def replace_double_newlines(txt)
102
- txt.apply DoubleNewLineWithSpaceRule, DoubleNewLineRule
104
+ def replace_double_newlines
105
+ @text.apply DoubleNewLineWithSpaceRule, DoubleNewLineRule
103
106
  end
104
107
 
105
- def replace_newlines(txt)
108
+ def replace_newlines
106
109
  if doc_type.eql?('pdf')
107
- remove_pdf_line_breaks(txt)
110
+ remove_pdf_line_breaks
108
111
  else
109
- txt.apply NewLineFollowedByPeriodRule,
112
+ @text.apply NewLineFollowedByPeriodRule,
110
113
  ReplaceNewlineWithCarriageReturnRule
111
114
  end
112
115
  end
113
116
 
114
- def remove_pdf_line_breaks(txt)
115
- txt.apply NewLineFollowedByBulletRule,
116
- PDF_NewLineInMiddleOfSentenceRule,
117
- PDF_NewLineInMiddleOfSentenceNoSpacesRule
117
+ def remove_pdf_line_breaks
118
+ @text.apply NewLineFollowedByBulletRule,
119
+
120
+ PDF::NewLineInMiddleOfSentenceRule,
121
+ PDF::NewLineInMiddleOfSentenceNoSpacesRule
118
122
  end
119
123
 
120
- def clean_quotations(txt)
121
- txt.apply QuotationsFirstRule, QuotationsSecondRule
124
+ def clean_quotations
125
+ @text.apply QuotationsFirstRule, QuotationsSecondRule
122
126
  end
123
127
 
124
- def clean_table_of_contents(txt)
125
- txt.apply TableOfContentsRule, ConsecutivePeriodsRule,
128
+ def clean_table_of_contents
129
+ @text.apply TableOfContentsRule, ConsecutivePeriodsRule,
126
130
  ConsecutiveForwardSlashRule
127
131
  end
128
132
 
129
- def clean_consecutive_characters(txt)
130
- txt.apply ConsecutivePeriodsRule, ConsecutiveForwardSlashRule
133
+ def clean_consecutive_characters
134
+ @text.apply ConsecutivePeriodsRule, ConsecutiveForwardSlashRule
131
135
  end
132
136
  end
133
137
  end
data/lib/pragmatic_segmenter/cleaner/rules.rb ADDED
@@ -0,0 +1,86 @@
1
+ module PragmaticSegmenter
2
+ # This is an opinionated class that removes errant newlines,
3
+ # xhtml, inline formatting, etc.
4
+ class Cleaner
5
+ module Rules
6
+ # Rubular: http://rubular.com/r/V57WnM9Zut
7
+ NewLineInMiddleOfWordRule = Rule.new(/\n(?=[a-zA-Z]{1,2}\n)/, '')
8
+
9
+ # Rubular: http://rubular.com/r/dMxp5MixFS
10
+ DoubleNewLineWithSpaceRule = Rule.new(/\n \n/, "\r")
11
+
12
+ # Rubular: http://rubular.com/r/H6HOJeA8bq
13
+ DoubleNewLineRule = Rule.new(/\n\n/, "\r")
14
+
15
+ # Rubular: http://rubular.com/r/FseyMiiYFT
16
+ NewLineFollowedByPeriodRule = Rule.new(/\n(?=\.(\s|\n))/, '')
17
+
18
+
19
+ ReplaceNewlineWithCarriageReturnRule = Rule.new(/\n/, "\r")
20
+
21
+ EscapedNewLineRule = Rule.new(/\\n/, "\n")
22
+ EscapedCarriageReturnRule = Rule.new(/\\r/, "\r")
23
+
24
+ TypoEscapedNewLineRule = Rule.new(/\\\ n/, "\n")
25
+
26
+ TypoEscapedCarriageReturnRule = Rule.new(/\\\ r/, "\r")
27
+
28
+
29
+
30
+
31
+ # Rubular: http://rubular.com/r/bAJrhyLNeZ
32
+ InlineFormattingRule = Rule.new(/\{b\^&gt;\d*&lt;b\^\}|\{b\^>\d*<b\^\}/, '')
33
+
34
+ # Rubular: http://rubular.com/r/8mc1ArOIGy
35
+ TableOfContentsRule = Rule.new(/\.{5,}\s*\d+-*\d*/, "\r")
36
+
37
+ # Rubular: http://rubular.com/r/DwNSuZrNtk
38
+ ConsecutivePeriodsRule = Rule.new(/\.{5,}/, ' ')
39
+
40
+ # Rubular: http://rubular.com/r/IQ4TPfsbd8
41
+ ConsecutiveForwardSlashRule = Rule.new(/\/{3}/, '')
42
+
43
+
44
+ # Rubular: http://rubular.com/r/6dt98uI76u
45
+ NO_SPACE_BETWEEN_SENTENCES_REGEX = /(?<=[a-z])\.(?=[A-Z])/
46
+ NoSpaceBetweenSentencesRule = Rule.new(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')
47
+
48
+ # Rubular: http://rubular.com/r/l6KN6rH5XE
49
+ NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX = /(?<=\d)\.(?=[A-Z])/
50
+ NoSpaceBetweenSentencesDigitRule = Rule.new(NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')
51
+
52
+
53
+ URL_EMAIL_KEYWORDS = ['@', 'http', '.com', 'net', 'www', '//']
54
+
55
+ # Rubular: http://rubular.com/r/3GiRiP2IbD
56
+ NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = /(?<=\s)\n(?=([a-z]|\())/
57
+
58
+
59
+ # Rubular: http://rubular.com/r/Gn18aAnLdZ
60
+ NewLineFollowedByBulletRule = Rule.new(/\n(?=•)/, "\r")
61
+
62
+ QuotationsFirstRule = Rule.new(/''/, '"')
63
+ QuotationsSecondRule = Rule.new(/``/, '"')
64
+
65
+
66
+ module HTML
67
+ # Rubular: http://rubular.com/r/ENrVFMdJ8v
68
+ HTMLTagRule = Rule.new(/<\/?[^>]*>/, '')
69
+
70
+ # Rubular: http://rubular.com/r/XZVqMPJhea
71
+ EscapedHTMLTagRule = Rule.new(/&lt;\/?[^gt;]*gt;/, '')
72
+
73
+ All = [HTMLTagRule, EscapedHTMLTagRule]
74
+ end
75
+
76
+ module PDF
77
+ # Rubular: http://rubular.com/r/UZAVcwqck8
78
+ NewLineInMiddleOfSentenceRule = Rule.new(/(?<=[^\n]\s)\n(?=\S)/, '')
79
+
80
+ # Rubular: http://rubular.com/r/eaNwGavmdo
81
+ NewLineInMiddleOfSentenceNoSpacesRule = Rule.new(/\n(?=[a-z])/, ' ')
82
+ end
83
+
84
+ end
85
+ end
86
+ end
data/lib/pragmatic_segmenter/languages.rb CHANGED
@@ -1,7 +1,6 @@
1
1
  require 'pragmatic_segmenter/types'
2
- require 'pragmatic_segmenter/process'
2
+ require 'pragmatic_segmenter/processor'
3
3
  require 'pragmatic_segmenter/cleaner'
4
- require 'pragmatic_segmenter/rules'
5
4
 
6
5
  require 'pragmatic_segmenter/languages/common'
7
6
 
@@ -27,36 +26,28 @@ require 'pragmatic_segmenter/languages/chinese'
27
26
  module PragmaticSegmenter
28
27
  module Languages
29
28
  LANGUAGE_CODES = {
30
- 'en' => 'English',
31
- 'de' => 'Deutsch',
32
- 'es' => 'Spanish',
33
- 'fr' => 'French',
34
- 'it' => 'Italian',
35
- 'ja' => 'Japanese',
36
- 'el' => 'Greek',
37
- 'ru' => 'Russian',
38
- 'ar' => 'Arabic',
39
- 'am' => 'Amharic',
40
- 'hi' => 'Hindi',
41
- 'hy' => 'Armenian',
42
- 'fa' => 'Persian',
43
- 'my' => 'Burmese',
44
- 'ur' => 'Urdu',
45
- 'nl' => 'Dutch',
46
- 'pl' => 'Polish',
47
- 'zh' => 'Chinese',
29
+ 'en' => English,
30
+ 'de' => Deutsch,
31
+ 'es' => Spanish,
32
+ 'fr' => French,
33
+ 'it' => Italian,
34
+ 'ja' => Japanese,
35
+ 'el' => Greek,
36
+ 'ru' => Russian,
37
+ 'ar' => Arabic,
38
+ 'am' => Amharic,
39
+ 'hi' => Hindi,
40
+ 'hy' => Armenian,
41
+ 'fa' => Persian,
42
+ 'my' => Burmese,
43
+ 'ur' => Urdu,
44
+ 'nl' => Dutch,
45
+ 'pl' => Polish,
46
+ 'zh' => Chinese,
48
47
  }
49
48
 
50
- def process_class
51
- language_module::Process
52
- end
53
-
54
- def cleaner_class
55
- language_module::Cleaner
56
- end
57
-
58
- def language_module
59
- Object.const_get("PragmaticSegmenter::Languages::#{LANGUAGE_CODES[language] || 'Common'}")
49
+ def self.get_language_by_code(code)
50
+ LANGUAGE_CODES[code] || Common
60
51
  end
61
52
  end
62
53
  end
data/lib/pragmatic_segmenter/languages/arabic.rb CHANGED
@@ -18,19 +18,6 @@ module PragmaticSegmenter
18
18
  # Rubular: http://rubular.com/r/kPRgApNHUg
19
19
  ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
20
20
 
21
- class Process < Process
22
- private
23
-
24
- def sentence_boundary_punctuation(txt)
25
- txt = txt.apply(ReplaceColonBetweenNumbersRule, ReplaceNonSentenceBoundaryCommaRule)
26
- txt.scan(SENTENCE_BOUNDARY_REGEX)
27
- end
28
-
29
- def replace_abbreviations(txt)
30
- AbbreviationReplacer.new(text: txt, language: Arabic).replace
31
- end
32
- end
33
-
34
21
  class AbbreviationReplacer < AbbreviationReplacer
35
22
  private
36
23
 
data/lib/pragmatic_segmenter/languages/common.rb CHANGED
@@ -1,3 +1,6 @@
1
+ require_relative 'common/numbers'
2
+ require_relative 'common/ellipsis'
3
+
1
4
  module PragmaticSegmenter
2
5
  module Languages
3
6
  module Common
@@ -11,69 +14,89 @@ module PragmaticSegmenter
11
14
  NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']
12
15
  end
13
16
 
14
- SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*[^,]'(?=\s[A-Z])|"(?:[^"])*[^,]"(?=\s[A-Z])|“(?:[^”])*[^,]”(?=\s[A-Z])|\S.*?[。..!!??ȸȹ☉☈☇☄]/
15
-
16
- include Rules
17
- # Rubular: http://rubular.com/r/NqCqv372Ix
18
- QUOTATION_AT_END_OF_SENTENCE_REGEX = /[!?\.-][\"\'\u{201d}\u{201c}]\s{1}[A-Z]/
17
+ module Abbreviations
18
+ # Rubular: http://rubular.com/r/EUbZCNfgei
19
+ WithMultiplePeriodsAndEmailRule = Rule.new(/(\w)(\.)(\w)/, '\1∮\3')
20
+ end
19
21
 
20
- # Rubular: http://rubular.com/r/6flGnUMEVl
21
- PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = /["”]\s\(.*\)\s["“]/
22
+ # Rubular: http://rubular.com/r/G2opjedIm9
23
+ GeoLocationRule = Rule.new(/(?<=[a-zA-z]°)\.(?=\s*\d+)/, '∯')
22
24
 
23
- # Rubular: http://rubular.com/r/TYzr4qOW1Q
24
- BETWEEN_DOUBLE_QUOTES_REGEX = /"(?:[^"])*[^,]"|“(?:[^”])*[^,]”/
25
+ SingleNewLineRule = Rule.new(/\n/, 'ȹ')
25
26
 
26
- # Rubular: http://rubular.com/r/JMjlZHAT4g
27
- SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = /(?<=[!?\.-][\"\'\u{201d}\u{201c}])\s{1}(?=[A-Z])/
27
+ module DoublePunctuationRules
28
+ FirstRule = Rule.new(/\?!/, '☉')
29
+ SecondRule = Rule.new(/!\?/, '☈')
30
+ ThirdRule = Rule.new(/\?\?/, '☇')
31
+ ForthRule = Rule.new(/!!/, '☄')
28
32
 
29
- # Rubular: http://rubular.com/r/mQ8Es9bxtk
30
- CONTINUOUS_PUNCTUATION_REGEX = /(?<=\S)(!|\?){3,}(?=(\s|\z|$))/
33
+ All = [ FirstRule, SecondRule, ThirdRule, ForthRule ]
34
+ end
31
35
 
32
- # Rubular: http://rubular.com/r/yqa4Rit8EY
33
- PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')
34
36
 
35
- # Rubular: http://rubular.com/r/NEv265G2X2
36
- KommanditgesellschaftRule = Rule.new(/(?<=Co)\.(?=\sKG)/, '')
37
+ # Rubular: http://rubular.com/r/aXPUGm6fQh
38
+ QuestionMarkInQuotationRule = Rule.new(/\?(?=(\'|\"))/, '&ᓷ&')
37
39
 
38
- # Rubular: http://rubular.com/r/xDkpFZ0EgH
39
- MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i
40
40
 
41
- module AmPmRules
42
- # Rubular: http://rubular.com/r/Vnx3m4Spc8
43
- UpperCasePmRule = Rule.new(/(?<=P∯M)∯(?=\s[A-Z])/, '.')
41
+ module ExclamationPointRules
42
+ # Rubular: http://rubular.com/r/XS1XXFRfM2
43
+ InQuotationRule = Rule.new(/\!(?=(\'|\"))/, '&ᓴ&')
44
44
 
45
- # Rubular: http://rubular.com/r/AJMCotJVbW
46
- UpperCaseAmRule = Rule.new(/(?<=A∯M)∯(?=\s[A-Z])/, '.')
45
+ # Rubular: http://rubular.com/r/sl57YI8LkA
46
+ BeforeCommaMidSentenceRule = Rule.new(/\!(?=\,\s[a-z])/, '&ᓴ&')
47
47
 
48
- # Rubular: http://rubular.com/r/13q7SnOhgA
49
- LowerCasePmRule = Rule.new(/(?<=p∯m)∯(?=\s[A-Z])/, '.')
48
+ # Rubular: http://rubular.com/r/f9zTjmkIPb
49
+ MidSentenceRule = Rule.new(/\!(?=\s[a-z])/, '&ᓴ&')
50
50
 
51
- # Rubular: http://rubular.com/r/DgUDq4mLz5
52
- LowerCaseAmRule = Rule.new(/(?<=a∯m)∯(?=\s[A-Z])/, '.')
51
+ All = [ InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule ]
52
+ end
53
53
 
54
- All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]
54
+ module SubSymbolsRules
55
+ Period = Rule.new(/∯/, '.')
56
+ ArabicComma = Rule.new(/♬/, '،')
57
+ SemiColon = Rule.new(/♭/, ':')
58
+ FullWidthPeriod = Rule.new(/&ᓰ&/, '。')
59
+ SpecialPeriod = Rule.new(/&ᓱ&/, '.')
60
+ FullWidthExclamation = Rule.new(/&ᓳ&/, '!')
61
+ ExclamationPoint = Rule.new(/&ᓴ&/, '!')
62
+ QuestionMark = Rule.new(/&ᓷ&/, '?')
63
+ FullWidthQuestionMark = Rule.new(/&ᓸ&/, '?')
64
+ MixedDoubleQE = Rule.new(/☉/, '?!')
65
+ MixedDoubleQQ = Rule.new(/☇/, '??')
66
+ MixedDoubleEQ = Rule.new(/☈/, '!?')
67
+ MixedDoubleEE = Rule.new(/☄/, '!!')
68
+ LeftParens = Rule.new(/&✂&/, '(')
69
+ RightParens = Rule.new(/&⌬&/, ')')
70
+ TemporaryEndingPunctutation = Rule.new('ȸ', '')
71
+ Newline = Rule.new(/ȹ/, "\n")
72
+
73
+ All = [ Period, ArabicComma,
74
+ SemiColon, FullWidthPeriod,
75
+ SpecialPeriod, FullWidthExclamation,
76
+ ExclamationPoint, QuestionMark,
77
+ FullWidthQuestionMark, MixedDoubleQE,
78
+ MixedDoubleQQ, MixedDoubleEQ,
79
+ MixedDoubleEE, LeftParens,
80
+ RightParens, TemporaryEndingPunctutation,
81
+ Newline ]
55
82
  end
56
83
 
57
- # This class searches for periods within an abbreviation and
58
- # replaces the periods.
59
- module SingleLetterAbbreviationRules
60
- # Rubular: http://rubular.com/r/e3H6kwnr6H
61
- SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=\s)/, '∯')
62
84
 
63
- # Rubular: http://rubular.com/r/gitvf0YWH4
64
- SingleUpperCaseLetterRule = Rule.new(/(?<=\s[A-Z])\.(?=\s)/, '')
85
+ module ReinsertEllipsisRules
86
+ SubThreeConsecutivePeriod = Rule.new(/ƪ/, '...')
87
+ SubThreeSpacePeriod = Rule.new(/♟/, ' . . . ')
88
+ SubFourSpacePeriod = Rule.new(/♝/, '. . . .')
89
+ SubTwoConsecutivePeriod = Rule.new(/☏/, '..')
90
+ SubOnePeriod = Rule.new(/∮/, '.')
65
91
 
66
- All = [
67
- SingleUpperCaseLetterAtStartOfLineRule,
68
- SingleUpperCaseLetterRule
69
- ]
92
+ All = [ SubThreeConsecutivePeriod, SubThreeSpacePeriod,
93
+ SubFourSpacePeriod, SubTwoConsecutivePeriod,
94
+ SubOnePeriod ]
70
95
  end
71
96
 
97
+ ExtraWhiteSpaceRule = Rule.new(/\s{3,}/, ' ')
72
98
 
73
- class Process < PragmaticSegmenter::Process
74
- end
75
- class Cleaner < PragmaticSegmenter::Cleaner
76
- end
99
+ SubSingleQuoteRule = Rule.new(/&⎋&/, "'")
77
100
  end
78
101
  end
79
102
  end