pragmatic_segmenter 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/.travis.yml +1 -0
  3. data/lib/pragmatic_segmenter/abbreviation_replacer.rb +6 -6
  4. data/lib/pragmatic_segmenter/between_punctuation.rb +6 -4
  5. data/lib/pragmatic_segmenter/cleaner.rb +51 -47
  6. data/lib/pragmatic_segmenter/cleaner/rules.rb +86 -0
  7. data/lib/pragmatic_segmenter/languages.rb +21 -30
  8. data/lib/pragmatic_segmenter/languages/arabic.rb +0 -13
  9. data/lib/pragmatic_segmenter/languages/common.rb +67 -44
  10. data/lib/pragmatic_segmenter/languages/common/ellipsis.rb +37 -0
  11. data/lib/pragmatic_segmenter/languages/common/numbers.rb +90 -0
  12. data/lib/pragmatic_segmenter/languages/deutsch.rb +25 -48
  13. data/lib/pragmatic_segmenter/languages/english.rb +3 -3
  14. data/lib/pragmatic_segmenter/languages/japanese.rb +5 -13
  15. data/lib/pragmatic_segmenter/languages/persian.rb +0 -14
  16. data/lib/pragmatic_segmenter/languages/russian.rb +0 -25
  17. data/lib/pragmatic_segmenter/languages/spanish.rb +0 -9
  18. data/lib/pragmatic_segmenter/list.rb +60 -58
  19. data/lib/pragmatic_segmenter/{process.rb → processor.rb} +47 -26
  20. data/lib/pragmatic_segmenter/punctuation_replacer.rb +41 -20
  21. data/lib/pragmatic_segmenter/segmenter.rb +19 -5
  22. data/lib/pragmatic_segmenter/version.rb +1 -1
  23. data/pragmatic_segmenter.gemspec +1 -0
  24. data/spec/pragmatic_segmenter/languages/amharic_spec.rb +18 -0
  25. data/spec/pragmatic_segmenter/languages/arabic_spec.rb +59 -0
  26. data/spec/pragmatic_segmenter/languages/armenian_spec.rb +160 -0
  27. data/spec/pragmatic_segmenter/languages/burmese_spec.rb +18 -0
  28. data/spec/pragmatic_segmenter/languages/chinese_spec.rb +11 -0
  29. data/spec/pragmatic_segmenter/languages/deutsch_spec.rb +189 -0
  30. data/spec/pragmatic_segmenter/languages/dutch_spec.rb +23 -0
  31. data/spec/pragmatic_segmenter/languages/english_spec.rb +1348 -0
  32. data/spec/pragmatic_segmenter/languages/french_spec.rb +31 -0
  33. data/spec/pragmatic_segmenter/languages/greek_spec.rb +18 -0
  34. data/spec/pragmatic_segmenter/languages/hindi_spec.rb +18 -0
  35. data/spec/pragmatic_segmenter/languages/italian_spec.rb +190 -0
  36. data/spec/pragmatic_segmenter/languages/japanese_spec.rb +53 -0
  37. data/spec/pragmatic_segmenter/languages/persian_spec.rb +18 -0
  38. data/spec/pragmatic_segmenter/languages/polish_spec.rb +11 -0
  39. data/spec/pragmatic_segmenter/languages/russian_spec.rb +219 -0
  40. data/spec/pragmatic_segmenter/languages/spanish_spec.rb +189 -0
  41. data/spec/pragmatic_segmenter/languages/urdu_spec.rb +18 -0
  42. data/spec/pragmatic_segmenter/languages_spec.rb +31 -0
  43. data/spec/pragmatic_segmenter_spec.rb +24 -2583
  44. metadata +59 -8
  45. data/lib/pragmatic_segmenter/number.rb +0 -35
  46. data/lib/pragmatic_segmenter/rules.rb +0 -168
  47. data/lib/pragmatic_segmenter/rules/ellipsis.rb +0 -35
  48. data/lib/pragmatic_segmenter/rules/html.rb +0 -13
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9e26400933e02aca93c63db3221fb897f49cdf78
4
- data.tar.gz: a0199ca43d0f8558ba3fe867c6e32c69787210c9
3
+ metadata.gz: a9cb6133aca84f8c6ff233ec6fb34b276cf47964
4
+ data.tar.gz: 00c1f664707e86e5c2ae5740c53acde5c814ece8
5
5
  SHA512:
6
- metadata.gz: 1dca5e20b2b062070b0cd319da6549594add065b670506b7e7dd9cbfe9eb8c83985ef197bf6427ee496d700edab5a526c8165fc1a27567e553f7a5e625edb6fe
7
- data.tar.gz: a53a4059b4ba41e7d40bf2a21ca1981ae5ed134a8017771f8ecd337bdc96762e2e78ec7c53c00c3ff2e3ac6179cf1e29f24fee703bc6b19fbbf31bc7ff572894
6
+ metadata.gz: d5726605fa78ec4067c79ed592a7983f2638b26a81fb88cf23bdffeb26d842c0eaed39a531181ecef6456218208f6d297e4316b36fc7f4a15f4deb2ebb7cb800
7
+ data.tar.gz: a1c99c7f3c73c1624a2b1d4792c8937dd27d3dfd4667dcb05844376d5ecad13ebe065f18fe3f40a61a4aa23baa3e8ab9b4dc3bc547821c6d9fc5700cd5a16f20
data/.travis.yml CHANGED
@@ -2,6 +2,7 @@ language: ruby
2
2
  rvm:
3
3
  - "2.1.5"
4
4
  - "2.2.0"
5
+ - "2.2.4"
5
6
  # uncomment this line if your project needs to run something other than `rake`:
6
7
  # script: bundle exec rspec spec
7
8
  addons:
data/lib/pragmatic_segmenter/abbreviation_replacer.rb CHANGED
@@ -8,20 +8,20 @@ module PragmaticSegmenter
8
8
  SENTENCE_STARTERS = %w(A Being Did For He How However I In It Millions More She That The There They We What When Where Who Why)
9
9
 
10
10
  attr_reader :text
11
- def initialize(text:, language: Languages::Common)
11
+ def initialize(text:, language: )
12
12
  @text = Text.new(text)
13
13
  @language = language
14
14
  end
15
15
 
16
16
  def replace
17
- @reformatted_text = text.apply(@language::PossessiveAbbreviationRule,
17
+ @text.apply(@language::PossessiveAbbreviationRule,
18
18
  @language::KommanditgesellschaftRule,
19
19
  @language::SingleLetterAbbreviationRules::All)
20
20
 
21
- @reformatted_text = search_for_abbreviations_in_string(@reformatted_text)
22
- @reformatted_text = replace_multi_period_abbreviations(@reformatted_text)
23
- @reformatted_text = @reformatted_text.apply(@language::AmPmRules::All)
24
- replace_abbreviation_as_sentence_boundary(@reformatted_text)
21
+ @text = search_for_abbreviations_in_string(@text)
22
+ @text = replace_multi_period_abbreviations(@text)
23
+ @text.apply(@language::AmPmRules::All)
24
+ replace_abbreviation_as_sentence_boundary(@text)
25
25
  end
26
26
 
27
27
  private
data/lib/pragmatic_segmenter/between_punctuation.rb CHANGED
@@ -1,5 +1,4 @@
1
1
  # -*- encoding : utf-8 -*-
2
- require 'pragmatic_segmenter/punctuation_replacer'
3
2
 
4
3
  module PragmaticSegmenter
5
4
  # This class searches for punctuation between quotes or parenthesis
@@ -66,13 +65,16 @@ module PragmaticSegmenter
66
65
  end
67
66
 
68
67
  def sub_punctuation_between_double_quotes(txt)
69
- btwn_dbl_quote = txt.scan(BETWEEN_DOUBLE_QUOTES_REGEX)
70
68
  PragmaticSegmenter::PunctuationReplacer.new(
71
- matches_array: btwn_dbl_quote,
69
+ matches_array: btwn_dbl_quote(txt),
72
70
  text: txt
73
71
  ).replace
74
72
  end
75
73
 
74
+ def btwn_dbl_quote(txt)
75
+ txt.scan(BETWEEN_DOUBLE_QUOTES_REGEX)
76
+ end
77
+
76
78
  def sub_punctuation_between_quotes_arrow(txt)
77
79
  PragmaticSegmenter::PunctuationReplacer.new(
78
80
  matches_array: txt.scan(BETWEEN_QUOTE_ARROW_REGEX),
@@ -87,4 +89,4 @@ module PragmaticSegmenter
87
89
  ).replace
88
90
  end
89
91
  end
90
- end
92
+ end
data/lib/pragmatic_segmenter/cleaner.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # -*- encoding : utf-8 -*-
2
+ require_relative 'cleaner/rules'
2
3
 
3
4
  module PragmaticSegmenter
4
5
  # This is an opinionated class that removes errant newlines,
@@ -7,8 +8,8 @@ module PragmaticSegmenter
7
8
  include Rules
8
9
 
9
10
  attr_reader :text, :doc_type
10
- def initialize(text:, doc_type: nil, language: Languages::Common, **args)
11
- @text = Text.new(text.dup)
11
+ def initialize(text:, doc_type: nil, language: Languages::Common)
12
+ @text = Text.new(text)
12
13
  @doc_type = doc_type
13
14
  @language = language
14
15
  end
@@ -29,17 +30,19 @@ module PragmaticSegmenter
29
30
 
30
31
  def clean
31
32
  return unless text
32
- @clean_text = remove_all_newlines(text)
33
- replace_double_newlines(@clean_text)
34
- replace_newlines(@clean_text)
35
- replace_escaped_newlines(@clean_text)
36
- @clean_text.apply(HTMLRules::All)
37
- replace_punctuation_in_brackets(@clean_text)
38
- @clean_text.apply(InlineFormattingRule)
39
- clean_quotations(@clean_text)
40
- clean_table_of_contents(@clean_text)
41
- check_for_no_space_in_between_sentences(@clean_text)
42
- clean_consecutive_characters(@clean_text)
33
+ remove_all_newlines
34
+ replace_double_newlines
35
+ replace_newlines
36
+ replace_escaped_newlines
37
+
38
+ @text.apply(HTML::All)
39
+
40
+ replace_punctuation_in_brackets
41
+ @text.apply(InlineFormattingRule)
42
+ clean_quotations
43
+ clean_table_of_contents
44
+ check_for_no_space_in_between_sentences
45
+ clean_consecutive_characters
43
46
  end
44
47
 
45
48
  private
@@ -48,18 +51,18 @@ module PragmaticSegmenter
48
51
  @language::Abbreviation::ABBREVIATIONS
49
52
  end
50
53
 
51
- def check_for_no_space_in_between_sentences(txt)
52
- words = txt.split(' ')
54
+ def check_for_no_space_in_between_sentences
55
+ words = @text.split(' ')
53
56
  words.each do |word|
54
- search_for_connected_sentences(word, txt, NO_SPACE_BETWEEN_SENTENCES_REGEX, NoSpaceBetweenSentencesRule)
55
- search_for_connected_sentences(word, txt, NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, NoSpaceBetweenSentencesDigitRule)
57
+ search_for_connected_sentences(word, @text, NO_SPACE_BETWEEN_SENTENCES_REGEX, NoSpaceBetweenSentencesRule)
58
+ search_for_connected_sentences(word, @text, NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, NoSpaceBetweenSentencesDigitRule)
56
59
  end
57
- txt
60
+ @text
58
61
  end
59
62
 
60
- def replace_punctuation_in_brackets(txt)
61
- txt.dup.gsub!(/\[(?:[^\]])*\]/) do |match|
62
- txt.gsub!(/#{Regexp.escape(match)}/, "#{match.dup.gsub!(/\?/, '&ᓷ&')}") if match.include?('?')
63
+ def replace_punctuation_in_brackets
64
+ @text.dup.gsub!(/\[(?:[^\]])*\]/) do |match|
65
+ @text.gsub!(/#{Regexp.escape(match)}/, "#{match.dup.gsub!(/\?/, '&ᓷ&')}") if match.include?('?')
63
66
  end
64
67
  end
65
68
 
@@ -74,60 +77,61 @@ module PragmaticSegmenter
74
77
  end
75
78
  end
76
79
 
77
- def remove_all_newlines(txt)
78
- clean_text = remove_newline_in_middle_of_sentence(txt)
79
- remove_newline_in_middle_of_word(clean_text)
80
+ def remove_all_newlines
81
+ remove_newline_in_middle_of_sentence
82
+ remove_newline_in_middle_of_word
80
83
  end
81
84
 
82
- def remove_newline_in_middle_of_sentence(txt)
83
- txt.dup.gsub!(/(?:[^\.])*/) do |match|
85
+ def remove_newline_in_middle_of_sentence
86
+ @text.dup.gsub!(/(?:[^\.])*/) do |match|
84
87
  next unless match.include?("\n")
85
88
  orig = match.dup
86
89
  match.gsub!(NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX, '')
87
- txt.gsub!(/#{Regexp.escape(orig)}/, "#{match}")
90
+ @text.gsub!(/#{Regexp.escape(orig)}/, "#{match}")
88
91
  end
89
- txt
92
+ @text
90
93
  end
91
94
 
92
- def remove_newline_in_middle_of_word(txt)
93
- txt.apply NewLineInMiddleOfWordRule
95
+ def remove_newline_in_middle_of_word
96
+ @text.apply NewLineInMiddleOfWordRule
94
97
  end
95
98
 
96
- def replace_escaped_newlines(txt)
97
- txt.apply EscapedNewLineRule, EscapedCarriageReturnRule,
99
+ def replace_escaped_newlines
100
+ @text.apply EscapedNewLineRule, EscapedCarriageReturnRule,
98
101
  TypoEscapedNewLineRule, TypoEscapedCarriageReturnRule
99
102
  end
100
103
 
101
- def replace_double_newlines(txt)
102
- txt.apply DoubleNewLineWithSpaceRule, DoubleNewLineRule
104
+ def replace_double_newlines
105
+ @text.apply DoubleNewLineWithSpaceRule, DoubleNewLineRule
103
106
  end
104
107
 
105
- def replace_newlines(txt)
108
+ def replace_newlines
106
109
  if doc_type.eql?('pdf')
107
- remove_pdf_line_breaks(txt)
110
+ remove_pdf_line_breaks
108
111
  else
109
- txt.apply NewLineFollowedByPeriodRule,
112
+ @text.apply NewLineFollowedByPeriodRule,
110
113
  ReplaceNewlineWithCarriageReturnRule
111
114
  end
112
115
  end
113
116
 
114
- def remove_pdf_line_breaks(txt)
115
- txt.apply NewLineFollowedByBulletRule,
116
- PDF_NewLineInMiddleOfSentenceRule,
117
- PDF_NewLineInMiddleOfSentenceNoSpacesRule
117
+ def remove_pdf_line_breaks
118
+ @text.apply NewLineFollowedByBulletRule,
119
+
120
+ PDF::NewLineInMiddleOfSentenceRule,
121
+ PDF::NewLineInMiddleOfSentenceNoSpacesRule
118
122
  end
119
123
 
120
- def clean_quotations(txt)
121
- txt.apply QuotationsFirstRule, QuotationsSecondRule
124
+ def clean_quotations
125
+ @text.apply QuotationsFirstRule, QuotationsSecondRule
122
126
  end
123
127
 
124
- def clean_table_of_contents(txt)
125
- txt.apply TableOfContentsRule, ConsecutivePeriodsRule,
128
+ def clean_table_of_contents
129
+ @text.apply TableOfContentsRule, ConsecutivePeriodsRule,
126
130
  ConsecutiveForwardSlashRule
127
131
  end
128
132
 
129
- def clean_consecutive_characters(txt)
130
- txt.apply ConsecutivePeriodsRule, ConsecutiveForwardSlashRule
133
+ def clean_consecutive_characters
134
+ @text.apply ConsecutivePeriodsRule, ConsecutiveForwardSlashRule
131
135
  end
132
136
  end
133
137
  end
data/lib/pragmatic_segmenter/cleaner/rules.rb ADDED
@@ -0,0 +1,86 @@
1
+ module PragmaticSegmenter
2
+ # This is an opinionated class that removes errant newlines,
3
+ # xhtml, inline formatting, etc.
4
+ class Cleaner
5
+ module Rules
6
+ # Rubular: http://rubular.com/r/V57WnM9Zut
7
+ NewLineInMiddleOfWordRule = Rule.new(/\n(?=[a-zA-Z]{1,2}\n)/, '')
8
+
9
+ # Rubular: http://rubular.com/r/dMxp5MixFS
10
+ DoubleNewLineWithSpaceRule = Rule.new(/\n \n/, "\r")
11
+
12
+ # Rubular: http://rubular.com/r/H6HOJeA8bq
13
+ DoubleNewLineRule = Rule.new(/\n\n/, "\r")
14
+
15
+ # Rubular: http://rubular.com/r/FseyMiiYFT
16
+ NewLineFollowedByPeriodRule = Rule.new(/\n(?=\.(\s|\n))/, '')
17
+
18
+
19
+ ReplaceNewlineWithCarriageReturnRule = Rule.new(/\n/, "\r")
20
+
21
+ EscapedNewLineRule = Rule.new(/\\n/, "\n")
22
+ EscapedCarriageReturnRule = Rule.new(/\\r/, "\r")
23
+
24
+ TypoEscapedNewLineRule = Rule.new(/\\\ n/, "\n")
25
+
26
+ TypoEscapedCarriageReturnRule = Rule.new(/\\\ r/, "\r")
27
+
28
+
29
+
30
+
31
+ # Rubular: http://rubular.com/r/bAJrhyLNeZ
32
+ InlineFormattingRule = Rule.new(/\{b\^&gt;\d*&lt;b\^\}|\{b\^>\d*<b\^\}/, '')
33
+
34
+ # Rubular: http://rubular.com/r/8mc1ArOIGy
35
+ TableOfContentsRule = Rule.new(/\.{5,}\s*\d+-*\d*/, "\r")
36
+
37
+ # Rubular: http://rubular.com/r/DwNSuZrNtk
38
+ ConsecutivePeriodsRule = Rule.new(/\.{5,}/, ' ')
39
+
40
+ # Rubular: http://rubular.com/r/IQ4TPfsbd8
41
+ ConsecutiveForwardSlashRule = Rule.new(/\/{3}/, '')
42
+
43
+
44
+ # Rubular: http://rubular.com/r/6dt98uI76u
45
+ NO_SPACE_BETWEEN_SENTENCES_REGEX = /(?<=[a-z])\.(?=[A-Z])/
46
+ NoSpaceBetweenSentencesRule = Rule.new(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')
47
+
48
+ # Rubular: http://rubular.com/r/l6KN6rH5XE
49
+ NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX = /(?<=\d)\.(?=[A-Z])/
50
+ NoSpaceBetweenSentencesDigitRule = Rule.new(NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')
51
+
52
+
53
+ URL_EMAIL_KEYWORDS = ['@', 'http', '.com', 'net', 'www', '//']
54
+
55
+ # Rubular: http://rubular.com/r/3GiRiP2IbD
56
+ NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = /(?<=\s)\n(?=([a-z]|\())/
57
+
58
+
59
+ # Rubular: http://rubular.com/r/Gn18aAnLdZ
60
+ NewLineFollowedByBulletRule = Rule.new(/\n(?=•)/, "\r")
61
+
62
+ QuotationsFirstRule = Rule.new(/''/, '"')
63
+ QuotationsSecondRule = Rule.new(/``/, '"')
64
+
65
+
66
+ module HTML
67
+ # Rubular: http://rubular.com/r/ENrVFMdJ8v
68
+ HTMLTagRule = Rule.new(/<\/?[^>]*>/, '')
69
+
70
+ # Rubular: http://rubular.com/r/XZVqMPJhea
71
+ EscapedHTMLTagRule = Rule.new(/&lt;\/?[^gt;]*gt;/, '')
72
+
73
+ All = [HTMLTagRule, EscapedHTMLTagRule]
74
+ end
75
+
76
+ module PDF
77
+ # Rubular: http://rubular.com/r/UZAVcwqck8
78
+ NewLineInMiddleOfSentenceRule = Rule.new(/(?<=[^\n]\s)\n(?=\S)/, '')
79
+
80
+ # Rubular: http://rubular.com/r/eaNwGavmdo
81
+ NewLineInMiddleOfSentenceNoSpacesRule = Rule.new(/\n(?=[a-z])/, ' ')
82
+ end
83
+
84
+ end
85
+ end
86
+ end
data/lib/pragmatic_segmenter/languages.rb CHANGED
@@ -1,7 +1,6 @@
1
1
  require 'pragmatic_segmenter/types'
2
- require 'pragmatic_segmenter/process'
2
+ require 'pragmatic_segmenter/processor'
3
3
  require 'pragmatic_segmenter/cleaner'
4
- require 'pragmatic_segmenter/rules'
5
4
 
6
5
  require 'pragmatic_segmenter/languages/common'
7
6
 
@@ -27,36 +26,28 @@ require 'pragmatic_segmenter/languages/chinese'
27
26
  module PragmaticSegmenter
28
27
  module Languages
29
28
  LANGUAGE_CODES = {
30
- 'en' => 'English',
31
- 'de' => 'Deutsch',
32
- 'es' => 'Spanish',
33
- 'fr' => 'French',
34
- 'it' => 'Italian',
35
- 'ja' => 'Japanese',
36
- 'el' => 'Greek',
37
- 'ru' => 'Russian',
38
- 'ar' => 'Arabic',
39
- 'am' => 'Amharic',
40
- 'hi' => 'Hindi',
41
- 'hy' => 'Armenian',
42
- 'fa' => 'Persian',
43
- 'my' => 'Burmese',
44
- 'ur' => 'Urdu',
45
- 'nl' => 'Dutch',
46
- 'pl' => 'Polish',
47
- 'zh' => 'Chinese',
29
+ 'en' => English,
30
+ 'de' => Deutsch,
31
+ 'es' => Spanish,
32
+ 'fr' => French,
33
+ 'it' => Italian,
34
+ 'ja' => Japanese,
35
+ 'el' => Greek,
36
+ 'ru' => Russian,
37
+ 'ar' => Arabic,
38
+ 'am' => Amharic,
39
+ 'hi' => Hindi,
40
+ 'hy' => Armenian,
41
+ 'fa' => Persian,
42
+ 'my' => Burmese,
43
+ 'ur' => Urdu,
44
+ 'nl' => Dutch,
45
+ 'pl' => Polish,
46
+ 'zh' => Chinese,
48
47
  }
49
48
 
50
- def process_class
51
- language_module::Process
52
- end
53
-
54
- def cleaner_class
55
- language_module::Cleaner
56
- end
57
-
58
- def language_module
59
- Object.const_get("PragmaticSegmenter::Languages::#{LANGUAGE_CODES[language] || 'Common'}")
49
+ def self.get_language_by_code(code)
50
+ LANGUAGE_CODES[code] || Common
60
51
  end
61
52
  end
62
53
  end
data/lib/pragmatic_segmenter/languages/arabic.rb CHANGED
@@ -18,19 +18,6 @@ module PragmaticSegmenter
18
18
  # Rubular: http://rubular.com/r/kPRgApNHUg
19
19
  ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
20
20
 
21
- class Process < Process
22
- private
23
-
24
- def sentence_boundary_punctuation(txt)
25
- txt = txt.apply(ReplaceColonBetweenNumbersRule, ReplaceNonSentenceBoundaryCommaRule)
26
- txt.scan(SENTENCE_BOUNDARY_REGEX)
27
- end
28
-
29
- def replace_abbreviations(txt)
30
- AbbreviationReplacer.new(text: txt, language: Arabic).replace
31
- end
32
- end
33
-
34
21
  class AbbreviationReplacer < AbbreviationReplacer
35
22
  private
36
23
 
data/lib/pragmatic_segmenter/languages/common.rb CHANGED
@@ -1,3 +1,6 @@
1
+ require_relative 'common/numbers'
2
+ require_relative 'common/ellipsis'
3
+
1
4
  module PragmaticSegmenter
2
5
  module Languages
3
6
  module Common
@@ -11,69 +14,89 @@ module PragmaticSegmenter
11
14
  NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']
12
15
  end
13
16
 
14
- SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*[^,]'(?=\s[A-Z])|"(?:[^"])*[^,]"(?=\s[A-Z])|“(?:[^”])*[^,]”(?=\s[A-Z])|\S.*?[。..!!??ȸȹ☉☈☇☄]/
15
-
16
- include Rules
17
- # Rubular: http://rubular.com/r/NqCqv372Ix
18
- QUOTATION_AT_END_OF_SENTENCE_REGEX = /[!?\.-][\"\'\u{201d}\u{201c}]\s{1}[A-Z]/
17
+ module Abbreviations
18
+ # Rubular: http://rubular.com/r/EUbZCNfgei
19
+ WithMultiplePeriodsAndEmailRule = Rule.new(/(\w)(\.)(\w)/, '\1∮\3')
20
+ end
19
21
 
20
- # Rubular: http://rubular.com/r/6flGnUMEVl
21
- PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = /["”]\s\(.*\)\s["“]/
22
+ # Rubular: http://rubular.com/r/G2opjedIm9
23
+ GeoLocationRule = Rule.new(/(?<=[a-zA-z]°)\.(?=\s*\d+)/, '∯')
22
24
 
23
- # Rubular: http://rubular.com/r/TYzr4qOW1Q
24
- BETWEEN_DOUBLE_QUOTES_REGEX = /"(?:[^"])*[^,]"|“(?:[^”])*[^,]”/
25
+ SingleNewLineRule = Rule.new(/\n/, 'ȹ')
25
26
 
26
- # Rubular: http://rubular.com/r/JMjlZHAT4g
27
- SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = /(?<=[!?\.-][\"\'\u{201d}\u{201c}])\s{1}(?=[A-Z])/
27
+ module DoublePunctuationRules
28
+ FirstRule = Rule.new(/\?!/, '☉')
29
+ SecondRule = Rule.new(/!\?/, '☈')
30
+ ThirdRule = Rule.new(/\?\?/, '☇')
31
+ ForthRule = Rule.new(/!!/, '☄')
28
32
 
29
- # Rubular: http://rubular.com/r/mQ8Es9bxtk
30
- CONTINUOUS_PUNCTUATION_REGEX = /(?<=\S)(!|\?){3,}(?=(\s|\z|$))/
33
+ All = [ FirstRule, SecondRule, ThirdRule, ForthRule ]
34
+ end
31
35
 
32
- # Rubular: http://rubular.com/r/yqa4Rit8EY
33
- PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')
34
36
 
35
- # Rubular: http://rubular.com/r/NEv265G2X2
36
- KommanditgesellschaftRule = Rule.new(/(?<=Co)\.(?=\sKG)/, '')
37
+ # Rubular: http://rubular.com/r/aXPUGm6fQh
38
+ QuestionMarkInQuotationRule = Rule.new(/\?(?=(\'|\"))/, '&ᓷ&')
37
39
 
38
- # Rubular: http://rubular.com/r/xDkpFZ0EgH
39
- MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i
40
40
 
41
- module AmPmRules
42
- # Rubular: http://rubular.com/r/Vnx3m4Spc8
43
- UpperCasePmRule = Rule.new(/(?<=P∯M)∯(?=\s[A-Z])/, '.')
41
+ module ExclamationPointRules
42
+ # Rubular: http://rubular.com/r/XS1XXFRfM2
43
+ InQuotationRule = Rule.new(/\!(?=(\'|\"))/, '&ᓴ&')
44
44
 
45
- # Rubular: http://rubular.com/r/AJMCotJVbW
46
- UpperCaseAmRule = Rule.new(/(?<=A∯M)∯(?=\s[A-Z])/, '.')
45
+ # Rubular: http://rubular.com/r/sl57YI8LkA
46
+ BeforeCommaMidSentenceRule = Rule.new(/\!(?=\,\s[a-z])/, '&ᓴ&')
47
47
 
48
- # Rubular: http://rubular.com/r/13q7SnOhgA
49
- LowerCasePmRule = Rule.new(/(?<=p∯m)∯(?=\s[A-Z])/, '.')
48
+ # Rubular: http://rubular.com/r/f9zTjmkIPb
49
+ MidSentenceRule = Rule.new(/\!(?=\s[a-z])/, '&ᓴ&')
50
50
 
51
- # Rubular: http://rubular.com/r/DgUDq4mLz5
52
- LowerCaseAmRule = Rule.new(/(?<=a∯m)∯(?=\s[A-Z])/, '.')
51
+ All = [ InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule ]
52
+ end
53
53
 
54
- All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]
54
+ module SubSymbolsRules
55
+ Period = Rule.new(/∯/, '.')
56
+ ArabicComma = Rule.new(/♬/, '،')
57
+ SemiColon = Rule.new(/♭/, ':')
58
+ FullWidthPeriod = Rule.new(/&ᓰ&/, '。')
59
+ SpecialPeriod = Rule.new(/&ᓱ&/, '.')
60
+ FullWidthExclamation = Rule.new(/&ᓳ&/, '!')
61
+ ExclamationPoint = Rule.new(/&ᓴ&/, '!')
62
+ QuestionMark = Rule.new(/&ᓷ&/, '?')
63
+ FullWidthQuestionMark = Rule.new(/&ᓸ&/, '?')
64
+ MixedDoubleQE = Rule.new(/☉/, '?!')
65
+ MixedDoubleQQ = Rule.new(/☇/, '??')
66
+ MixedDoubleEQ = Rule.new(/☈/, '!?')
67
+ MixedDoubleEE = Rule.new(/☄/, '!!')
68
+ LeftParens = Rule.new(/&✂&/, '(')
69
+ RightParens = Rule.new(/&⌬&/, ')')
70
+ TemporaryEndingPunctutation = Rule.new('ȸ', '')
71
+ Newline = Rule.new(/ȹ/, "\n")
72
+
73
+ All = [ Period, ArabicComma,
74
+ SemiColon, FullWidthPeriod,
75
+ SpecialPeriod, FullWidthExclamation,
76
+ ExclamationPoint, QuestionMark,
77
+ FullWidthQuestionMark, MixedDoubleQE,
78
+ MixedDoubleQQ, MixedDoubleEQ,
79
+ MixedDoubleEE, LeftParens,
80
+ RightParens, TemporaryEndingPunctutation,
81
+ Newline ]
55
82
  end
56
83
 
57
- # This class searches for periods within an abbreviation and
58
- # replaces the periods.
59
- module SingleLetterAbbreviationRules
60
- # Rubular: http://rubular.com/r/e3H6kwnr6H
61
- SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=\s)/, '∯')
62
84
 
63
- # Rubular: http://rubular.com/r/gitvf0YWH4
64
- SingleUpperCaseLetterRule = Rule.new(/(?<=\s[A-Z])\.(?=\s)/, '')
85
+ module ReinsertEllipsisRules
86
+ SubThreeConsecutivePeriod = Rule.new(/ƪ/, '...')
87
+ SubThreeSpacePeriod = Rule.new(/♟/, ' . . . ')
88
+ SubFourSpacePeriod = Rule.new(/♝/, '. . . .')
89
+ SubTwoConsecutivePeriod = Rule.new(/☏/, '..')
90
+ SubOnePeriod = Rule.new(/∮/, '.')
65
91
 
66
- All = [
67
- SingleUpperCaseLetterAtStartOfLineRule,
68
- SingleUpperCaseLetterRule
69
- ]
92
+ All = [ SubThreeConsecutivePeriod, SubThreeSpacePeriod,
93
+ SubFourSpacePeriod, SubTwoConsecutivePeriod,
94
+ SubOnePeriod ]
70
95
  end
71
96
 
97
+ ExtraWhiteSpaceRule = Rule.new(/\s{3,}/, ' ')
72
98
 
73
- class Process < PragmaticSegmenter::Process
74
- end
75
- class Cleaner < PragmaticSegmenter::Cleaner
76
- end
99
+ SubSingleQuoteRule = Rule.new(/&⎋&/, "'")
77
100
  end
78
101
  end
79
102
  end