pragmatic_segmenter 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +6 -6
- data/lib/pragmatic_segmenter/between_punctuation.rb +6 -4
- data/lib/pragmatic_segmenter/cleaner.rb +51 -47
- data/lib/pragmatic_segmenter/cleaner/rules.rb +86 -0
- data/lib/pragmatic_segmenter/languages.rb +21 -30
- data/lib/pragmatic_segmenter/languages/arabic.rb +0 -13
- data/lib/pragmatic_segmenter/languages/common.rb +67 -44
- data/lib/pragmatic_segmenter/languages/common/ellipsis.rb +37 -0
- data/lib/pragmatic_segmenter/languages/common/numbers.rb +90 -0
- data/lib/pragmatic_segmenter/languages/deutsch.rb +25 -48
- data/lib/pragmatic_segmenter/languages/english.rb +3 -3
- data/lib/pragmatic_segmenter/languages/japanese.rb +5 -13
- data/lib/pragmatic_segmenter/languages/persian.rb +0 -14
- data/lib/pragmatic_segmenter/languages/russian.rb +0 -25
- data/lib/pragmatic_segmenter/languages/spanish.rb +0 -9
- data/lib/pragmatic_segmenter/list.rb +60 -58
- data/lib/pragmatic_segmenter/{process.rb → processor.rb} +47 -26
- data/lib/pragmatic_segmenter/punctuation_replacer.rb +41 -20
- data/lib/pragmatic_segmenter/segmenter.rb +19 -5
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/pragmatic_segmenter.gemspec +1 -0
- data/spec/pragmatic_segmenter/languages/amharic_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/arabic_spec.rb +59 -0
- data/spec/pragmatic_segmenter/languages/armenian_spec.rb +160 -0
- data/spec/pragmatic_segmenter/languages/burmese_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/chinese_spec.rb +11 -0
- data/spec/pragmatic_segmenter/languages/deutsch_spec.rb +189 -0
- data/spec/pragmatic_segmenter/languages/dutch_spec.rb +23 -0
- data/spec/pragmatic_segmenter/languages/english_spec.rb +1348 -0
- data/spec/pragmatic_segmenter/languages/french_spec.rb +31 -0
- data/spec/pragmatic_segmenter/languages/greek_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/hindi_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/italian_spec.rb +190 -0
- data/spec/pragmatic_segmenter/languages/japanese_spec.rb +53 -0
- data/spec/pragmatic_segmenter/languages/persian_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/polish_spec.rb +11 -0
- data/spec/pragmatic_segmenter/languages/russian_spec.rb +219 -0
- data/spec/pragmatic_segmenter/languages/spanish_spec.rb +189 -0
- data/spec/pragmatic_segmenter/languages/urdu_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages_spec.rb +31 -0
- data/spec/pragmatic_segmenter_spec.rb +24 -2583
- metadata +59 -8
- data/lib/pragmatic_segmenter/number.rb +0 -35
- data/lib/pragmatic_segmenter/rules.rb +0 -168
- data/lib/pragmatic_segmenter/rules/ellipsis.rb +0 -35
- data/lib/pragmatic_segmenter/rules/html.rb +0 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a9cb6133aca84f8c6ff233ec6fb34b276cf47964
|
4
|
+
data.tar.gz: 00c1f664707e86e5c2ae5740c53acde5c814ece8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d5726605fa78ec4067c79ed592a7983f2638b26a81fb88cf23bdffeb26d842c0eaed39a531181ecef6456218208f6d297e4316b36fc7f4a15f4deb2ebb7cb800
|
7
|
+
data.tar.gz: a1c99c7f3c73c1624a2b1d4792c8937dd27d3dfd4667dcb05844376d5ecad13ebe065f18fe3f40a61a4aa23baa3e8ab9b4dc3bc547821c6d9fc5700cd5a16f20
|
data/.travis.yml
CHANGED
@@ -8,20 +8,20 @@ module PragmaticSegmenter
|
|
8
8
|
SENTENCE_STARTERS = %w(A Being Did For He How However I In It Millions More She That The There They We What When Where Who Why)
|
9
9
|
|
10
10
|
attr_reader :text
|
11
|
-
def initialize(text:, language:
|
11
|
+
def initialize(text:, language: )
|
12
12
|
@text = Text.new(text)
|
13
13
|
@language = language
|
14
14
|
end
|
15
15
|
|
16
16
|
def replace
|
17
|
-
@
|
17
|
+
@text.apply(@language::PossessiveAbbreviationRule,
|
18
18
|
@language::KommanditgesellschaftRule,
|
19
19
|
@language::SingleLetterAbbreviationRules::All)
|
20
20
|
|
21
|
-
@
|
22
|
-
@
|
23
|
-
@
|
24
|
-
replace_abbreviation_as_sentence_boundary(@
|
21
|
+
@text = search_for_abbreviations_in_string(@text)
|
22
|
+
@text = replace_multi_period_abbreviations(@text)
|
23
|
+
@text.apply(@language::AmPmRules::All)
|
24
|
+
replace_abbreviation_as_sentence_boundary(@text)
|
25
25
|
end
|
26
26
|
|
27
27
|
private
|
@@ -1,5 +1,4 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
|
-
require 'pragmatic_segmenter/punctuation_replacer'
|
3
2
|
|
4
3
|
module PragmaticSegmenter
|
5
4
|
# This class searches for punctuation between quotes or parenthesis
|
@@ -66,13 +65,16 @@ module PragmaticSegmenter
|
|
66
65
|
end
|
67
66
|
|
68
67
|
def sub_punctuation_between_double_quotes(txt)
|
69
|
-
btwn_dbl_quote = txt.scan(BETWEEN_DOUBLE_QUOTES_REGEX)
|
70
68
|
PragmaticSegmenter::PunctuationReplacer.new(
|
71
|
-
matches_array: btwn_dbl_quote,
|
69
|
+
matches_array: btwn_dbl_quote(txt),
|
72
70
|
text: txt
|
73
71
|
).replace
|
74
72
|
end
|
75
73
|
|
74
|
+
def btwn_dbl_quote(txt)
|
75
|
+
txt.scan(BETWEEN_DOUBLE_QUOTES_REGEX)
|
76
|
+
end
|
77
|
+
|
76
78
|
def sub_punctuation_between_quotes_arrow(txt)
|
77
79
|
PragmaticSegmenter::PunctuationReplacer.new(
|
78
80
|
matches_array: txt.scan(BETWEEN_QUOTE_ARROW_REGEX),
|
@@ -87,4 +89,4 @@ module PragmaticSegmenter
|
|
87
89
|
).replace
|
88
90
|
end
|
89
91
|
end
|
90
|
-
end
|
92
|
+
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
|
+
require_relative 'cleaner/rules'
|
2
3
|
|
3
4
|
module PragmaticSegmenter
|
4
5
|
# This is an opinionated class that removes errant newlines,
|
@@ -7,8 +8,8 @@ module PragmaticSegmenter
|
|
7
8
|
include Rules
|
8
9
|
|
9
10
|
attr_reader :text, :doc_type
|
10
|
-
def initialize(text:, doc_type: nil, language: Languages::Common
|
11
|
-
@text = Text.new(text
|
11
|
+
def initialize(text:, doc_type: nil, language: Languages::Common)
|
12
|
+
@text = Text.new(text)
|
12
13
|
@doc_type = doc_type
|
13
14
|
@language = language
|
14
15
|
end
|
@@ -29,17 +30,19 @@ module PragmaticSegmenter
|
|
29
30
|
|
30
31
|
def clean
|
31
32
|
return unless text
|
32
|
-
|
33
|
-
replace_double_newlines
|
34
|
-
replace_newlines
|
35
|
-
replace_escaped_newlines
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
33
|
+
remove_all_newlines
|
34
|
+
replace_double_newlines
|
35
|
+
replace_newlines
|
36
|
+
replace_escaped_newlines
|
37
|
+
|
38
|
+
@text.apply(HTML::All)
|
39
|
+
|
40
|
+
replace_punctuation_in_brackets
|
41
|
+
@text.apply(InlineFormattingRule)
|
42
|
+
clean_quotations
|
43
|
+
clean_table_of_contents
|
44
|
+
check_for_no_space_in_between_sentences
|
45
|
+
clean_consecutive_characters
|
43
46
|
end
|
44
47
|
|
45
48
|
private
|
@@ -48,18 +51,18 @@ module PragmaticSegmenter
|
|
48
51
|
@language::Abbreviation::ABBREVIATIONS
|
49
52
|
end
|
50
53
|
|
51
|
-
def check_for_no_space_in_between_sentences
|
52
|
-
words =
|
54
|
+
def check_for_no_space_in_between_sentences
|
55
|
+
words = @text.split(' ')
|
53
56
|
words.each do |word|
|
54
|
-
search_for_connected_sentences(word,
|
55
|
-
search_for_connected_sentences(word,
|
57
|
+
search_for_connected_sentences(word, @text, NO_SPACE_BETWEEN_SENTENCES_REGEX, NoSpaceBetweenSentencesRule)
|
58
|
+
search_for_connected_sentences(word, @text, NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, NoSpaceBetweenSentencesDigitRule)
|
56
59
|
end
|
57
|
-
|
60
|
+
@text
|
58
61
|
end
|
59
62
|
|
60
|
-
def replace_punctuation_in_brackets
|
61
|
-
|
62
|
-
|
63
|
+
def replace_punctuation_in_brackets
|
64
|
+
@text.dup.gsub!(/\[(?:[^\]])*\]/) do |match|
|
65
|
+
@text.gsub!(/#{Regexp.escape(match)}/, "#{match.dup.gsub!(/\?/, '&ᓷ&')}") if match.include?('?')
|
63
66
|
end
|
64
67
|
end
|
65
68
|
|
@@ -74,60 +77,61 @@ module PragmaticSegmenter
|
|
74
77
|
end
|
75
78
|
end
|
76
79
|
|
77
|
-
def remove_all_newlines
|
78
|
-
|
79
|
-
remove_newline_in_middle_of_word
|
80
|
+
def remove_all_newlines
|
81
|
+
remove_newline_in_middle_of_sentence
|
82
|
+
remove_newline_in_middle_of_word
|
80
83
|
end
|
81
84
|
|
82
|
-
def remove_newline_in_middle_of_sentence
|
83
|
-
|
85
|
+
def remove_newline_in_middle_of_sentence
|
86
|
+
@text.dup.gsub!(/(?:[^\.])*/) do |match|
|
84
87
|
next unless match.include?("\n")
|
85
88
|
orig = match.dup
|
86
89
|
match.gsub!(NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX, '')
|
87
|
-
|
90
|
+
@text.gsub!(/#{Regexp.escape(orig)}/, "#{match}")
|
88
91
|
end
|
89
|
-
|
92
|
+
@text
|
90
93
|
end
|
91
94
|
|
92
|
-
def remove_newline_in_middle_of_word
|
93
|
-
|
95
|
+
def remove_newline_in_middle_of_word
|
96
|
+
@text.apply NewLineInMiddleOfWordRule
|
94
97
|
end
|
95
98
|
|
96
|
-
def replace_escaped_newlines
|
97
|
-
|
99
|
+
def replace_escaped_newlines
|
100
|
+
@text.apply EscapedNewLineRule, EscapedCarriageReturnRule,
|
98
101
|
TypoEscapedNewLineRule, TypoEscapedCarriageReturnRule
|
99
102
|
end
|
100
103
|
|
101
|
-
def replace_double_newlines
|
102
|
-
|
104
|
+
def replace_double_newlines
|
105
|
+
@text.apply DoubleNewLineWithSpaceRule, DoubleNewLineRule
|
103
106
|
end
|
104
107
|
|
105
|
-
def replace_newlines
|
108
|
+
def replace_newlines
|
106
109
|
if doc_type.eql?('pdf')
|
107
|
-
remove_pdf_line_breaks
|
110
|
+
remove_pdf_line_breaks
|
108
111
|
else
|
109
|
-
|
112
|
+
@text.apply NewLineFollowedByPeriodRule,
|
110
113
|
ReplaceNewlineWithCarriageReturnRule
|
111
114
|
end
|
112
115
|
end
|
113
116
|
|
114
|
-
def remove_pdf_line_breaks
|
115
|
-
|
116
|
-
|
117
|
-
|
117
|
+
def remove_pdf_line_breaks
|
118
|
+
@text.apply NewLineFollowedByBulletRule,
|
119
|
+
|
120
|
+
PDF::NewLineInMiddleOfSentenceRule,
|
121
|
+
PDF::NewLineInMiddleOfSentenceNoSpacesRule
|
118
122
|
end
|
119
123
|
|
120
|
-
def clean_quotations
|
121
|
-
|
124
|
+
def clean_quotations
|
125
|
+
@text.apply QuotationsFirstRule, QuotationsSecondRule
|
122
126
|
end
|
123
127
|
|
124
|
-
def clean_table_of_contents
|
125
|
-
|
128
|
+
def clean_table_of_contents
|
129
|
+
@text.apply TableOfContentsRule, ConsecutivePeriodsRule,
|
126
130
|
ConsecutiveForwardSlashRule
|
127
131
|
end
|
128
132
|
|
129
|
-
def clean_consecutive_characters
|
130
|
-
|
133
|
+
def clean_consecutive_characters
|
134
|
+
@text.apply ConsecutivePeriodsRule, ConsecutiveForwardSlashRule
|
131
135
|
end
|
132
136
|
end
|
133
137
|
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
module PragmaticSegmenter
|
2
|
+
# This is an opinionated class that removes errant newlines,
|
3
|
+
# xhtml, inline formatting, etc.
|
4
|
+
class Cleaner
|
5
|
+
module Rules
|
6
|
+
# Rubular: http://rubular.com/r/V57WnM9Zut
|
7
|
+
NewLineInMiddleOfWordRule = Rule.new(/\n(?=[a-zA-Z]{1,2}\n)/, '')
|
8
|
+
|
9
|
+
# Rubular: http://rubular.com/r/dMxp5MixFS
|
10
|
+
DoubleNewLineWithSpaceRule = Rule.new(/\n \n/, "\r")
|
11
|
+
|
12
|
+
# Rubular: http://rubular.com/r/H6HOJeA8bq
|
13
|
+
DoubleNewLineRule = Rule.new(/\n\n/, "\r")
|
14
|
+
|
15
|
+
# Rubular: http://rubular.com/r/FseyMiiYFT
|
16
|
+
NewLineFollowedByPeriodRule = Rule.new(/\n(?=\.(\s|\n))/, '')
|
17
|
+
|
18
|
+
|
19
|
+
ReplaceNewlineWithCarriageReturnRule = Rule.new(/\n/, "\r")
|
20
|
+
|
21
|
+
EscapedNewLineRule = Rule.new(/\\n/, "\n")
|
22
|
+
EscapedCarriageReturnRule = Rule.new(/\\r/, "\r")
|
23
|
+
|
24
|
+
TypoEscapedNewLineRule = Rule.new(/\\\ n/, "\n")
|
25
|
+
|
26
|
+
TypoEscapedCarriageReturnRule = Rule.new(/\\\ r/, "\r")
|
27
|
+
|
28
|
+
|
29
|
+
|
30
|
+
|
31
|
+
# Rubular: http://rubular.com/r/bAJrhyLNeZ
|
32
|
+
InlineFormattingRule = Rule.new(/\{b\^>\d*<b\^\}|\{b\^>\d*<b\^\}/, '')
|
33
|
+
|
34
|
+
# Rubular: http://rubular.com/r/8mc1ArOIGy
|
35
|
+
TableOfContentsRule = Rule.new(/\.{5,}\s*\d+-*\d*/, "\r")
|
36
|
+
|
37
|
+
# Rubular: http://rubular.com/r/DwNSuZrNtk
|
38
|
+
ConsecutivePeriodsRule = Rule.new(/\.{5,}/, ' ')
|
39
|
+
|
40
|
+
# Rubular: http://rubular.com/r/IQ4TPfsbd8
|
41
|
+
ConsecutiveForwardSlashRule = Rule.new(/\/{3}/, '')
|
42
|
+
|
43
|
+
|
44
|
+
# Rubular: http://rubular.com/r/6dt98uI76u
|
45
|
+
NO_SPACE_BETWEEN_SENTENCES_REGEX = /(?<=[a-z])\.(?=[A-Z])/
|
46
|
+
NoSpaceBetweenSentencesRule = Rule.new(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')
|
47
|
+
|
48
|
+
# Rubular: http://rubular.com/r/l6KN6rH5XE
|
49
|
+
NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX = /(?<=\d)\.(?=[A-Z])/
|
50
|
+
NoSpaceBetweenSentencesDigitRule = Rule.new(NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')
|
51
|
+
|
52
|
+
|
53
|
+
URL_EMAIL_KEYWORDS = ['@', 'http', '.com', 'net', 'www', '//']
|
54
|
+
|
55
|
+
# Rubular: http://rubular.com/r/3GiRiP2IbD
|
56
|
+
NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = /(?<=\s)\n(?=([a-z]|\())/
|
57
|
+
|
58
|
+
|
59
|
+
# Rubular: http://rubular.com/r/Gn18aAnLdZ
|
60
|
+
NewLineFollowedByBulletRule = Rule.new(/\n(?=•)/, "\r")
|
61
|
+
|
62
|
+
QuotationsFirstRule = Rule.new(/''/, '"')
|
63
|
+
QuotationsSecondRule = Rule.new(/``/, '"')
|
64
|
+
|
65
|
+
|
66
|
+
module HTML
|
67
|
+
# Rubular: http://rubular.com/r/ENrVFMdJ8v
|
68
|
+
HTMLTagRule = Rule.new(/<\/?[^>]*>/, '')
|
69
|
+
|
70
|
+
# Rubular: http://rubular.com/r/XZVqMPJhea
|
71
|
+
EscapedHTMLTagRule = Rule.new(/<\/?[^gt;]*gt;/, '')
|
72
|
+
|
73
|
+
All = [HTMLTagRule, EscapedHTMLTagRule]
|
74
|
+
end
|
75
|
+
|
76
|
+
module PDF
|
77
|
+
# Rubular: http://rubular.com/r/UZAVcwqck8
|
78
|
+
NewLineInMiddleOfSentenceRule = Rule.new(/(?<=[^\n]\s)\n(?=\S)/, '')
|
79
|
+
|
80
|
+
# Rubular: http://rubular.com/r/eaNwGavmdo
|
81
|
+
NewLineInMiddleOfSentenceNoSpacesRule = Rule.new(/\n(?=[a-z])/, ' ')
|
82
|
+
end
|
83
|
+
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
@@ -1,7 +1,6 @@
|
|
1
1
|
require 'pragmatic_segmenter/types'
|
2
|
-
require 'pragmatic_segmenter/
|
2
|
+
require 'pragmatic_segmenter/processor'
|
3
3
|
require 'pragmatic_segmenter/cleaner'
|
4
|
-
require 'pragmatic_segmenter/rules'
|
5
4
|
|
6
5
|
require 'pragmatic_segmenter/languages/common'
|
7
6
|
|
@@ -27,36 +26,28 @@ require 'pragmatic_segmenter/languages/chinese'
|
|
27
26
|
module PragmaticSegmenter
|
28
27
|
module Languages
|
29
28
|
LANGUAGE_CODES = {
|
30
|
-
'en' =>
|
31
|
-
'de' =>
|
32
|
-
'es' =>
|
33
|
-
'fr' =>
|
34
|
-
'it' =>
|
35
|
-
'ja' =>
|
36
|
-
'el' =>
|
37
|
-
'ru' =>
|
38
|
-
'ar' =>
|
39
|
-
'am' =>
|
40
|
-
'hi' =>
|
41
|
-
'hy' =>
|
42
|
-
'fa' =>
|
43
|
-
'my' =>
|
44
|
-
'ur' =>
|
45
|
-
'nl' =>
|
46
|
-
'pl' =>
|
47
|
-
'zh' =>
|
29
|
+
'en' => English,
|
30
|
+
'de' => Deutsch,
|
31
|
+
'es' => Spanish,
|
32
|
+
'fr' => French,
|
33
|
+
'it' => Italian,
|
34
|
+
'ja' => Japanese,
|
35
|
+
'el' => Greek,
|
36
|
+
'ru' => Russian,
|
37
|
+
'ar' => Arabic,
|
38
|
+
'am' => Amharic,
|
39
|
+
'hi' => Hindi,
|
40
|
+
'hy' => Armenian,
|
41
|
+
'fa' => Persian,
|
42
|
+
'my' => Burmese,
|
43
|
+
'ur' => Urdu,
|
44
|
+
'nl' => Dutch,
|
45
|
+
'pl' => Polish,
|
46
|
+
'zh' => Chinese,
|
48
47
|
}
|
49
48
|
|
50
|
-
def
|
51
|
-
|
52
|
-
end
|
53
|
-
|
54
|
-
def cleaner_class
|
55
|
-
language_module::Cleaner
|
56
|
-
end
|
57
|
-
|
58
|
-
def language_module
|
59
|
-
Object.const_get("PragmaticSegmenter::Languages::#{LANGUAGE_CODES[language] || 'Common'}")
|
49
|
+
def self.get_language_by_code(code)
|
50
|
+
LANGUAGE_CODES[code] || Common
|
60
51
|
end
|
61
52
|
end
|
62
53
|
end
|
@@ -18,19 +18,6 @@ module PragmaticSegmenter
|
|
18
18
|
# Rubular: http://rubular.com/r/kPRgApNHUg
|
19
19
|
ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
|
20
20
|
|
21
|
-
class Process < Process
|
22
|
-
private
|
23
|
-
|
24
|
-
def sentence_boundary_punctuation(txt)
|
25
|
-
txt = txt.apply(ReplaceColonBetweenNumbersRule, ReplaceNonSentenceBoundaryCommaRule)
|
26
|
-
txt.scan(SENTENCE_BOUNDARY_REGEX)
|
27
|
-
end
|
28
|
-
|
29
|
-
def replace_abbreviations(txt)
|
30
|
-
AbbreviationReplacer.new(text: txt, language: Arabic).replace
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
34
21
|
class AbbreviationReplacer < AbbreviationReplacer
|
35
22
|
private
|
36
23
|
|
@@ -1,3 +1,6 @@
|
|
1
|
+
require_relative 'common/numbers'
|
2
|
+
require_relative 'common/ellipsis'
|
3
|
+
|
1
4
|
module PragmaticSegmenter
|
2
5
|
module Languages
|
3
6
|
module Common
|
@@ -11,69 +14,89 @@ module PragmaticSegmenter
|
|
11
14
|
NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']
|
12
15
|
end
|
13
16
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
QUOTATION_AT_END_OF_SENTENCE_REGEX = /[!?\.-][\"\'\u{201d}\u{201c}]\s{1}[A-Z]/
|
17
|
+
module Abbreviations
|
18
|
+
# Rubular: http://rubular.com/r/EUbZCNfgei
|
19
|
+
WithMultiplePeriodsAndEmailRule = Rule.new(/(\w)(\.)(\w)/, '\1∮\3')
|
20
|
+
end
|
19
21
|
|
20
|
-
# Rubular: http://rubular.com/r/
|
21
|
-
|
22
|
+
# Rubular: http://rubular.com/r/G2opjedIm9
|
23
|
+
GeoLocationRule = Rule.new(/(?<=[a-zA-z]°)\.(?=\s*\d+)/, '∯')
|
22
24
|
|
23
|
-
|
24
|
-
BETWEEN_DOUBLE_QUOTES_REGEX = /"(?:[^"])*[^,]"|“(?:[^”])*[^,]”/
|
25
|
+
SingleNewLineRule = Rule.new(/\n/, 'ȹ')
|
25
26
|
|
26
|
-
|
27
|
-
|
27
|
+
module DoublePunctuationRules
|
28
|
+
FirstRule = Rule.new(/\?!/, '☉')
|
29
|
+
SecondRule = Rule.new(/!\?/, '☈')
|
30
|
+
ThirdRule = Rule.new(/\?\?/, '☇')
|
31
|
+
ForthRule = Rule.new(/!!/, '☄')
|
28
32
|
|
29
|
-
|
30
|
-
|
33
|
+
All = [ FirstRule, SecondRule, ThirdRule, ForthRule ]
|
34
|
+
end
|
31
35
|
|
32
|
-
# Rubular: http://rubular.com/r/yqa4Rit8EY
|
33
|
-
PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')
|
34
36
|
|
35
|
-
# Rubular: http://rubular.com/r/
|
36
|
-
|
37
|
+
# Rubular: http://rubular.com/r/aXPUGm6fQh
|
38
|
+
QuestionMarkInQuotationRule = Rule.new(/\?(?=(\'|\"))/, '&ᓷ&')
|
37
39
|
|
38
|
-
# Rubular: http://rubular.com/r/xDkpFZ0EgH
|
39
|
-
MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i
|
40
40
|
|
41
|
-
module
|
42
|
-
# Rubular: http://rubular.com/r/
|
43
|
-
|
41
|
+
module ExclamationPointRules
|
42
|
+
# Rubular: http://rubular.com/r/XS1XXFRfM2
|
43
|
+
InQuotationRule = Rule.new(/\!(?=(\'|\"))/, '&ᓴ&')
|
44
44
|
|
45
|
-
# Rubular: http://rubular.com/r/
|
46
|
-
|
45
|
+
# Rubular: http://rubular.com/r/sl57YI8LkA
|
46
|
+
BeforeCommaMidSentenceRule = Rule.new(/\!(?=\,\s[a-z])/, '&ᓴ&')
|
47
47
|
|
48
|
-
# Rubular: http://rubular.com/r/
|
49
|
-
|
48
|
+
# Rubular: http://rubular.com/r/f9zTjmkIPb
|
49
|
+
MidSentenceRule = Rule.new(/\!(?=\s[a-z])/, '&ᓴ&')
|
50
50
|
|
51
|
-
|
52
|
-
|
51
|
+
All = [ InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule ]
|
52
|
+
end
|
53
53
|
|
54
|
-
|
54
|
+
module SubSymbolsRules
|
55
|
+
Period = Rule.new(/∯/, '.')
|
56
|
+
ArabicComma = Rule.new(/♬/, '،')
|
57
|
+
SemiColon = Rule.new(/♭/, ':')
|
58
|
+
FullWidthPeriod = Rule.new(/&ᓰ&/, '。')
|
59
|
+
SpecialPeriod = Rule.new(/&ᓱ&/, '.')
|
60
|
+
FullWidthExclamation = Rule.new(/&ᓳ&/, '!')
|
61
|
+
ExclamationPoint = Rule.new(/&ᓴ&/, '!')
|
62
|
+
QuestionMark = Rule.new(/&ᓷ&/, '?')
|
63
|
+
FullWidthQuestionMark = Rule.new(/&ᓸ&/, '?')
|
64
|
+
MixedDoubleQE = Rule.new(/☉/, '?!')
|
65
|
+
MixedDoubleQQ = Rule.new(/☇/, '??')
|
66
|
+
MixedDoubleEQ = Rule.new(/☈/, '!?')
|
67
|
+
MixedDoubleEE = Rule.new(/☄/, '!!')
|
68
|
+
LeftParens = Rule.new(/&✂&/, '(')
|
69
|
+
RightParens = Rule.new(/&⌬&/, ')')
|
70
|
+
TemporaryEndingPunctutation = Rule.new('ȸ', '')
|
71
|
+
Newline = Rule.new(/ȹ/, "\n")
|
72
|
+
|
73
|
+
All = [ Period, ArabicComma,
|
74
|
+
SemiColon, FullWidthPeriod,
|
75
|
+
SpecialPeriod, FullWidthExclamation,
|
76
|
+
ExclamationPoint, QuestionMark,
|
77
|
+
FullWidthQuestionMark, MixedDoubleQE,
|
78
|
+
MixedDoubleQQ, MixedDoubleEQ,
|
79
|
+
MixedDoubleEE, LeftParens,
|
80
|
+
RightParens, TemporaryEndingPunctutation,
|
81
|
+
Newline ]
|
55
82
|
end
|
56
83
|
|
57
|
-
# This class searches for periods within an abbreviation and
|
58
|
-
# replaces the periods.
|
59
|
-
module SingleLetterAbbreviationRules
|
60
|
-
# Rubular: http://rubular.com/r/e3H6kwnr6H
|
61
|
-
SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=\s)/, '∯')
|
62
84
|
|
63
|
-
|
64
|
-
|
85
|
+
module ReinsertEllipsisRules
|
86
|
+
SubThreeConsecutivePeriod = Rule.new(/ƪ/, '...')
|
87
|
+
SubThreeSpacePeriod = Rule.new(/♟/, ' . . . ')
|
88
|
+
SubFourSpacePeriod = Rule.new(/♝/, '. . . .')
|
89
|
+
SubTwoConsecutivePeriod = Rule.new(/☏/, '..')
|
90
|
+
SubOnePeriod = Rule.new(/∮/, '.')
|
65
91
|
|
66
|
-
All = [
|
67
|
-
|
68
|
-
|
69
|
-
]
|
92
|
+
All = [ SubThreeConsecutivePeriod, SubThreeSpacePeriod,
|
93
|
+
SubFourSpacePeriod, SubTwoConsecutivePeriod,
|
94
|
+
SubOnePeriod ]
|
70
95
|
end
|
71
96
|
|
97
|
+
ExtraWhiteSpaceRule = Rule.new(/\s{3,}/, ' ')
|
72
98
|
|
73
|
-
|
74
|
-
end
|
75
|
-
class Cleaner < PragmaticSegmenter::Cleaner
|
76
|
-
end
|
99
|
+
SubSingleQuoteRule = Rule.new(/&⎋&/, "'")
|
77
100
|
end
|
78
101
|
end
|
79
102
|
end
|