pragmatic_segmenter 0.3.3 → 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +6 -6
- data/lib/pragmatic_segmenter/between_punctuation.rb +6 -4
- data/lib/pragmatic_segmenter/cleaner.rb +51 -47
- data/lib/pragmatic_segmenter/cleaner/rules.rb +86 -0
- data/lib/pragmatic_segmenter/languages.rb +21 -30
- data/lib/pragmatic_segmenter/languages/arabic.rb +0 -13
- data/lib/pragmatic_segmenter/languages/common.rb +67 -44
- data/lib/pragmatic_segmenter/languages/common/ellipsis.rb +37 -0
- data/lib/pragmatic_segmenter/languages/common/numbers.rb +90 -0
- data/lib/pragmatic_segmenter/languages/deutsch.rb +25 -48
- data/lib/pragmatic_segmenter/languages/english.rb +3 -3
- data/lib/pragmatic_segmenter/languages/japanese.rb +5 -13
- data/lib/pragmatic_segmenter/languages/persian.rb +0 -14
- data/lib/pragmatic_segmenter/languages/russian.rb +0 -25
- data/lib/pragmatic_segmenter/languages/spanish.rb +0 -9
- data/lib/pragmatic_segmenter/list.rb +60 -58
- data/lib/pragmatic_segmenter/{process.rb → processor.rb} +47 -26
- data/lib/pragmatic_segmenter/punctuation_replacer.rb +41 -20
- data/lib/pragmatic_segmenter/segmenter.rb +19 -5
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/pragmatic_segmenter.gemspec +1 -0
- data/spec/pragmatic_segmenter/languages/amharic_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/arabic_spec.rb +59 -0
- data/spec/pragmatic_segmenter/languages/armenian_spec.rb +160 -0
- data/spec/pragmatic_segmenter/languages/burmese_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/chinese_spec.rb +11 -0
- data/spec/pragmatic_segmenter/languages/deutsch_spec.rb +189 -0
- data/spec/pragmatic_segmenter/languages/dutch_spec.rb +23 -0
- data/spec/pragmatic_segmenter/languages/english_spec.rb +1348 -0
- data/spec/pragmatic_segmenter/languages/french_spec.rb +31 -0
- data/spec/pragmatic_segmenter/languages/greek_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/hindi_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/italian_spec.rb +190 -0
- data/spec/pragmatic_segmenter/languages/japanese_spec.rb +53 -0
- data/spec/pragmatic_segmenter/languages/persian_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/polish_spec.rb +11 -0
- data/spec/pragmatic_segmenter/languages/russian_spec.rb +219 -0
- data/spec/pragmatic_segmenter/languages/spanish_spec.rb +189 -0
- data/spec/pragmatic_segmenter/languages/urdu_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages_spec.rb +31 -0
- data/spec/pragmatic_segmenter_spec.rb +24 -2583
- metadata +59 -8
- data/lib/pragmatic_segmenter/number.rb +0 -35
- data/lib/pragmatic_segmenter/rules.rb +0 -168
- data/lib/pragmatic_segmenter/rules/ellipsis.rb +0 -35
- data/lib/pragmatic_segmenter/rules/html.rb +0 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a9cb6133aca84f8c6ff233ec6fb34b276cf47964
|
4
|
+
data.tar.gz: 00c1f664707e86e5c2ae5740c53acde5c814ece8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d5726605fa78ec4067c79ed592a7983f2638b26a81fb88cf23bdffeb26d842c0eaed39a531181ecef6456218208f6d297e4316b36fc7f4a15f4deb2ebb7cb800
|
7
|
+
data.tar.gz: a1c99c7f3c73c1624a2b1d4792c8937dd27d3dfd4667dcb05844376d5ecad13ebe065f18fe3f40a61a4aa23baa3e8ab9b4dc3bc547821c6d9fc5700cd5a16f20
|
data/.travis.yml
CHANGED
@@ -8,20 +8,20 @@ module PragmaticSegmenter
|
|
8
8
|
SENTENCE_STARTERS = %w(A Being Did For He How However I In It Millions More She That The There They We What When Where Who Why)
|
9
9
|
|
10
10
|
attr_reader :text
|
11
|
-
def initialize(text:, language:
|
11
|
+
def initialize(text:, language: )
|
12
12
|
@text = Text.new(text)
|
13
13
|
@language = language
|
14
14
|
end
|
15
15
|
|
16
16
|
def replace
|
17
|
-
@
|
17
|
+
@text.apply(@language::PossessiveAbbreviationRule,
|
18
18
|
@language::KommanditgesellschaftRule,
|
19
19
|
@language::SingleLetterAbbreviationRules::All)
|
20
20
|
|
21
|
-
@
|
22
|
-
@
|
23
|
-
@
|
24
|
-
replace_abbreviation_as_sentence_boundary(@
|
21
|
+
@text = search_for_abbreviations_in_string(@text)
|
22
|
+
@text = replace_multi_period_abbreviations(@text)
|
23
|
+
@text.apply(@language::AmPmRules::All)
|
24
|
+
replace_abbreviation_as_sentence_boundary(@text)
|
25
25
|
end
|
26
26
|
|
27
27
|
private
|
@@ -1,5 +1,4 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
|
-
require 'pragmatic_segmenter/punctuation_replacer'
|
3
2
|
|
4
3
|
module PragmaticSegmenter
|
5
4
|
# This class searches for punctuation between quotes or parentheses
|
@@ -66,13 +65,16 @@ module PragmaticSegmenter
|
|
66
65
|
end
|
67
66
|
|
68
67
|
def sub_punctuation_between_double_quotes(txt)
|
69
|
-
btwn_dbl_quote = txt.scan(BETWEEN_DOUBLE_QUOTES_REGEX)
|
70
68
|
PragmaticSegmenter::PunctuationReplacer.new(
|
71
|
-
matches_array: btwn_dbl_quote,
|
69
|
+
matches_array: btwn_dbl_quote(txt),
|
72
70
|
text: txt
|
73
71
|
).replace
|
74
72
|
end
|
75
73
|
|
74
|
+
def btwn_dbl_quote(txt)
|
75
|
+
txt.scan(BETWEEN_DOUBLE_QUOTES_REGEX)
|
76
|
+
end
|
77
|
+
|
76
78
|
def sub_punctuation_between_quotes_arrow(txt)
|
77
79
|
PragmaticSegmenter::PunctuationReplacer.new(
|
78
80
|
matches_array: txt.scan(BETWEEN_QUOTE_ARROW_REGEX),
|
@@ -87,4 +89,4 @@ module PragmaticSegmenter
|
|
87
89
|
).replace
|
88
90
|
end
|
89
91
|
end
|
90
|
-
end
|
92
|
+
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
|
+
require_relative 'cleaner/rules'
|
2
3
|
|
3
4
|
module PragmaticSegmenter
|
4
5
|
# This is an opinionated class that removes errant newlines,
|
@@ -7,8 +8,8 @@ module PragmaticSegmenter
|
|
7
8
|
include Rules
|
8
9
|
|
9
10
|
attr_reader :text, :doc_type
|
10
|
-
def initialize(text:, doc_type: nil, language: Languages::Common
|
11
|
-
@text = Text.new(text
|
11
|
+
def initialize(text:, doc_type: nil, language: Languages::Common)
|
12
|
+
@text = Text.new(text)
|
12
13
|
@doc_type = doc_type
|
13
14
|
@language = language
|
14
15
|
end
|
@@ -29,17 +30,19 @@ module PragmaticSegmenter
|
|
29
30
|
|
30
31
|
def clean
|
31
32
|
return unless text
|
32
|
-
|
33
|
-
replace_double_newlines
|
34
|
-
replace_newlines
|
35
|
-
replace_escaped_newlines
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
33
|
+
remove_all_newlines
|
34
|
+
replace_double_newlines
|
35
|
+
replace_newlines
|
36
|
+
replace_escaped_newlines
|
37
|
+
|
38
|
+
@text.apply(HTML::All)
|
39
|
+
|
40
|
+
replace_punctuation_in_brackets
|
41
|
+
@text.apply(InlineFormattingRule)
|
42
|
+
clean_quotations
|
43
|
+
clean_table_of_contents
|
44
|
+
check_for_no_space_in_between_sentences
|
45
|
+
clean_consecutive_characters
|
43
46
|
end
|
44
47
|
|
45
48
|
private
|
@@ -48,18 +51,18 @@ module PragmaticSegmenter
|
|
48
51
|
@language::Abbreviation::ABBREVIATIONS
|
49
52
|
end
|
50
53
|
|
51
|
-
def check_for_no_space_in_between_sentences
|
52
|
-
words =
|
54
|
+
def check_for_no_space_in_between_sentences
|
55
|
+
words = @text.split(' ')
|
53
56
|
words.each do |word|
|
54
|
-
search_for_connected_sentences(word,
|
55
|
-
search_for_connected_sentences(word,
|
57
|
+
search_for_connected_sentences(word, @text, NO_SPACE_BETWEEN_SENTENCES_REGEX, NoSpaceBetweenSentencesRule)
|
58
|
+
search_for_connected_sentences(word, @text, NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, NoSpaceBetweenSentencesDigitRule)
|
56
59
|
end
|
57
|
-
|
60
|
+
@text
|
58
61
|
end
|
59
62
|
|
60
|
-
def replace_punctuation_in_brackets
|
61
|
-
|
62
|
-
|
63
|
+
def replace_punctuation_in_brackets
|
64
|
+
@text.dup.gsub!(/\[(?:[^\]])*\]/) do |match|
|
65
|
+
@text.gsub!(/#{Regexp.escape(match)}/, "#{match.dup.gsub!(/\?/, '&ᓷ&')}") if match.include?('?')
|
63
66
|
end
|
64
67
|
end
|
65
68
|
|
@@ -74,60 +77,61 @@ module PragmaticSegmenter
|
|
74
77
|
end
|
75
78
|
end
|
76
79
|
|
77
|
-
def remove_all_newlines
|
78
|
-
|
79
|
-
remove_newline_in_middle_of_word
|
80
|
+
def remove_all_newlines
|
81
|
+
remove_newline_in_middle_of_sentence
|
82
|
+
remove_newline_in_middle_of_word
|
80
83
|
end
|
81
84
|
|
82
|
-
def remove_newline_in_middle_of_sentence
|
83
|
-
|
85
|
+
def remove_newline_in_middle_of_sentence
|
86
|
+
@text.dup.gsub!(/(?:[^\.])*/) do |match|
|
84
87
|
next unless match.include?("\n")
|
85
88
|
orig = match.dup
|
86
89
|
match.gsub!(NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX, '')
|
87
|
-
|
90
|
+
@text.gsub!(/#{Regexp.escape(orig)}/, "#{match}")
|
88
91
|
end
|
89
|
-
|
92
|
+
@text
|
90
93
|
end
|
91
94
|
|
92
|
-
def remove_newline_in_middle_of_word
|
93
|
-
|
95
|
+
def remove_newline_in_middle_of_word
|
96
|
+
@text.apply NewLineInMiddleOfWordRule
|
94
97
|
end
|
95
98
|
|
96
|
-
def replace_escaped_newlines
|
97
|
-
|
99
|
+
def replace_escaped_newlines
|
100
|
+
@text.apply EscapedNewLineRule, EscapedCarriageReturnRule,
|
98
101
|
TypoEscapedNewLineRule, TypoEscapedCarriageReturnRule
|
99
102
|
end
|
100
103
|
|
101
|
-
def replace_double_newlines
|
102
|
-
|
104
|
+
def replace_double_newlines
|
105
|
+
@text.apply DoubleNewLineWithSpaceRule, DoubleNewLineRule
|
103
106
|
end
|
104
107
|
|
105
|
-
def replace_newlines
|
108
|
+
def replace_newlines
|
106
109
|
if doc_type.eql?('pdf')
|
107
|
-
remove_pdf_line_breaks
|
110
|
+
remove_pdf_line_breaks
|
108
111
|
else
|
109
|
-
|
112
|
+
@text.apply NewLineFollowedByPeriodRule,
|
110
113
|
ReplaceNewlineWithCarriageReturnRule
|
111
114
|
end
|
112
115
|
end
|
113
116
|
|
114
|
-
def remove_pdf_line_breaks
|
115
|
-
|
116
|
-
|
117
|
-
|
117
|
+
def remove_pdf_line_breaks
|
118
|
+
@text.apply NewLineFollowedByBulletRule,
|
119
|
+
|
120
|
+
PDF::NewLineInMiddleOfSentenceRule,
|
121
|
+
PDF::NewLineInMiddleOfSentenceNoSpacesRule
|
118
122
|
end
|
119
123
|
|
120
|
-
def clean_quotations
|
121
|
-
|
124
|
+
def clean_quotations
|
125
|
+
@text.apply QuotationsFirstRule, QuotationsSecondRule
|
122
126
|
end
|
123
127
|
|
124
|
-
def clean_table_of_contents
|
125
|
-
|
128
|
+
def clean_table_of_contents
|
129
|
+
@text.apply TableOfContentsRule, ConsecutivePeriodsRule,
|
126
130
|
ConsecutiveForwardSlashRule
|
127
131
|
end
|
128
132
|
|
129
|
-
def clean_consecutive_characters
|
130
|
-
|
133
|
+
def clean_consecutive_characters
|
134
|
+
@text.apply ConsecutivePeriodsRule, ConsecutiveForwardSlashRule
|
131
135
|
end
|
132
136
|
end
|
133
137
|
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
module PragmaticSegmenter
|
2
|
+
# This is an opinionated class that removes errant newlines,
|
3
|
+
# xhtml, inline formatting, etc.
|
4
|
+
class Cleaner
|
5
|
+
module Rules
|
6
|
+
# Rubular: http://rubular.com/r/V57WnM9Zut
|
7
|
+
NewLineInMiddleOfWordRule = Rule.new(/\n(?=[a-zA-Z]{1,2}\n)/, '')
|
8
|
+
|
9
|
+
# Rubular: http://rubular.com/r/dMxp5MixFS
|
10
|
+
DoubleNewLineWithSpaceRule = Rule.new(/\n \n/, "\r")
|
11
|
+
|
12
|
+
# Rubular: http://rubular.com/r/H6HOJeA8bq
|
13
|
+
DoubleNewLineRule = Rule.new(/\n\n/, "\r")
|
14
|
+
|
15
|
+
# Rubular: http://rubular.com/r/FseyMiiYFT
|
16
|
+
NewLineFollowedByPeriodRule = Rule.new(/\n(?=\.(\s|\n))/, '')
|
17
|
+
|
18
|
+
|
19
|
+
ReplaceNewlineWithCarriageReturnRule = Rule.new(/\n/, "\r")
|
20
|
+
|
21
|
+
EscapedNewLineRule = Rule.new(/\\n/, "\n")
|
22
|
+
EscapedCarriageReturnRule = Rule.new(/\\r/, "\r")
|
23
|
+
|
24
|
+
TypoEscapedNewLineRule = Rule.new(/\\\ n/, "\n")
|
25
|
+
|
26
|
+
TypoEscapedCarriageReturnRule = Rule.new(/\\\ r/, "\r")
|
27
|
+
|
28
|
+
|
29
|
+
|
30
|
+
|
31
|
+
# Rubular: http://rubular.com/r/bAJrhyLNeZ
|
32
|
+
# Strips inline formatting markers of the form {b^>N<b^} (N = optional digits),
# in both the HTML-entity-escaped spelling (&gt; / &lt;) and the raw spelling (> / <).
# The two alternatives must differ; identical alternatives make the second one dead.
InlineFormattingRule = Rule.new(/\{b\^&gt;\d*&lt;b\^\}|\{b\^>\d*<b\^\}/, '')
|
33
|
+
|
34
|
+
# Rubular: http://rubular.com/r/8mc1ArOIGy
|
35
|
+
TableOfContentsRule = Rule.new(/\.{5,}\s*\d+-*\d*/, "\r")
|
36
|
+
|
37
|
+
# Rubular: http://rubular.com/r/DwNSuZrNtk
|
38
|
+
ConsecutivePeriodsRule = Rule.new(/\.{5,}/, ' ')
|
39
|
+
|
40
|
+
# Rubular: http://rubular.com/r/IQ4TPfsbd8
|
41
|
+
ConsecutiveForwardSlashRule = Rule.new(/\/{3}/, '')
|
42
|
+
|
43
|
+
|
44
|
+
# Rubular: http://rubular.com/r/6dt98uI76u
|
45
|
+
NO_SPACE_BETWEEN_SENTENCES_REGEX = /(?<=[a-z])\.(?=[A-Z])/
|
46
|
+
NoSpaceBetweenSentencesRule = Rule.new(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')
|
47
|
+
|
48
|
+
# Rubular: http://rubular.com/r/l6KN6rH5XE
|
49
|
+
NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX = /(?<=\d)\.(?=[A-Z])/
|
50
|
+
NoSpaceBetweenSentencesDigitRule = Rule.new(NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')
|
51
|
+
|
52
|
+
|
53
|
+
URL_EMAIL_KEYWORDS = ['@', 'http', '.com', 'net', 'www', '//']
|
54
|
+
|
55
|
+
# Rubular: http://rubular.com/r/3GiRiP2IbD
|
56
|
+
NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = /(?<=\s)\n(?=([a-z]|\())/
|
57
|
+
|
58
|
+
|
59
|
+
# Rubular: http://rubular.com/r/Gn18aAnLdZ
|
60
|
+
NewLineFollowedByBulletRule = Rule.new(/\n(?=•)/, "\r")
|
61
|
+
|
62
|
+
QuotationsFirstRule = Rule.new(/''/, '"')
|
63
|
+
QuotationsSecondRule = Rule.new(/``/, '"')
|
64
|
+
|
65
|
+
|
66
|
+
module HTML
|
67
|
+
# Rubular: http://rubular.com/r/ENrVFMdJ8v
|
68
|
+
HTMLTagRule = Rule.new(/<\/?[^>]*>/, '')
|
69
|
+
|
70
|
+
# Rubular: http://rubular.com/r/XZVqMPJhea
|
71
|
+
# Removes entity-escaped HTML tags, i.e. text running from "&lt;" (optionally
# followed by "/") up to the next "gt;". The opener must be the escaped "&lt;":
# raw "<...>" tags are already handled by HTMLTagRule.
# NOTE(review): [^gt;] excludes the letters g and t, so escaped tags whose name
# contains either letter (e.g. &lt;strong&gt;) are not matched — confirm whether
# that is intended.
EscapedHTMLTagRule = Rule.new(/&lt;\/?[^gt;]*gt;/, '')
|
72
|
+
|
73
|
+
All = [HTMLTagRule, EscapedHTMLTagRule]
|
74
|
+
end
|
75
|
+
|
76
|
+
module PDF
|
77
|
+
# Rubular: http://rubular.com/r/UZAVcwqck8
|
78
|
+
NewLineInMiddleOfSentenceRule = Rule.new(/(?<=[^\n]\s)\n(?=\S)/, '')
|
79
|
+
|
80
|
+
# Rubular: http://rubular.com/r/eaNwGavmdo
|
81
|
+
NewLineInMiddleOfSentenceNoSpacesRule = Rule.new(/\n(?=[a-z])/, ' ')
|
82
|
+
end
|
83
|
+
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
@@ -1,7 +1,6 @@
|
|
1
1
|
require 'pragmatic_segmenter/types'
|
2
|
-
require 'pragmatic_segmenter/
|
2
|
+
require 'pragmatic_segmenter/processor'
|
3
3
|
require 'pragmatic_segmenter/cleaner'
|
4
|
-
require 'pragmatic_segmenter/rules'
|
5
4
|
|
6
5
|
require 'pragmatic_segmenter/languages/common'
|
7
6
|
|
@@ -27,36 +26,28 @@ require 'pragmatic_segmenter/languages/chinese'
|
|
27
26
|
module PragmaticSegmenter
|
28
27
|
module Languages
|
29
28
|
LANGUAGE_CODES = {
|
30
|
-
'en' =>
|
31
|
-
'de' =>
|
32
|
-
'es' =>
|
33
|
-
'fr' =>
|
34
|
-
'it' =>
|
35
|
-
'ja' =>
|
36
|
-
'el' =>
|
37
|
-
'ru' =>
|
38
|
-
'ar' =>
|
39
|
-
'am' =>
|
40
|
-
'hi' =>
|
41
|
-
'hy' =>
|
42
|
-
'fa' =>
|
43
|
-
'my' =>
|
44
|
-
'ur' =>
|
45
|
-
'nl' =>
|
46
|
-
'pl' =>
|
47
|
-
'zh' =>
|
29
|
+
'en' => English,
|
30
|
+
'de' => Deutsch,
|
31
|
+
'es' => Spanish,
|
32
|
+
'fr' => French,
|
33
|
+
'it' => Italian,
|
34
|
+
'ja' => Japanese,
|
35
|
+
'el' => Greek,
|
36
|
+
'ru' => Russian,
|
37
|
+
'ar' => Arabic,
|
38
|
+
'am' => Amharic,
|
39
|
+
'hi' => Hindi,
|
40
|
+
'hy' => Armenian,
|
41
|
+
'fa' => Persian,
|
42
|
+
'my' => Burmese,
|
43
|
+
'ur' => Urdu,
|
44
|
+
'nl' => Dutch,
|
45
|
+
'pl' => Polish,
|
46
|
+
'zh' => Chinese,
|
48
47
|
}
|
49
48
|
|
50
|
-
def
|
51
|
-
|
52
|
-
end
|
53
|
-
|
54
|
-
def cleaner_class
|
55
|
-
language_module::Cleaner
|
56
|
-
end
|
57
|
-
|
58
|
-
def language_module
|
59
|
-
Object.const_get("PragmaticSegmenter::Languages::#{LANGUAGE_CODES[language] || 'Common'}")
|
49
|
+
def self.get_language_by_code(code)
|
50
|
+
LANGUAGE_CODES[code] || Common
|
60
51
|
end
|
61
52
|
end
|
62
53
|
end
|
@@ -18,19 +18,6 @@ module PragmaticSegmenter
|
|
18
18
|
# Rubular: http://rubular.com/r/kPRgApNHUg
|
19
19
|
ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
|
20
20
|
|
21
|
-
class Process < Process
|
22
|
-
private
|
23
|
-
|
24
|
-
def sentence_boundary_punctuation(txt)
|
25
|
-
txt = txt.apply(ReplaceColonBetweenNumbersRule, ReplaceNonSentenceBoundaryCommaRule)
|
26
|
-
txt.scan(SENTENCE_BOUNDARY_REGEX)
|
27
|
-
end
|
28
|
-
|
29
|
-
def replace_abbreviations(txt)
|
30
|
-
AbbreviationReplacer.new(text: txt, language: Arabic).replace
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
34
21
|
class AbbreviationReplacer < AbbreviationReplacer
|
35
22
|
private
|
36
23
|
|
@@ -1,3 +1,6 @@
|
|
1
|
+
require_relative 'common/numbers'
|
2
|
+
require_relative 'common/ellipsis'
|
3
|
+
|
1
4
|
module PragmaticSegmenter
|
2
5
|
module Languages
|
3
6
|
module Common
|
@@ -11,69 +14,89 @@ module PragmaticSegmenter
|
|
11
14
|
NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']
|
12
15
|
end
|
13
16
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
QUOTATION_AT_END_OF_SENTENCE_REGEX = /[!?\.-][\"\'\u{201d}\u{201c}]\s{1}[A-Z]/
|
17
|
+
module Abbreviations
|
18
|
+
# Rubular: http://rubular.com/r/EUbZCNfgei
|
19
|
+
WithMultiplePeriodsAndEmailRule = Rule.new(/(\w)(\.)(\w)/, '\1∮\3')
|
20
|
+
end
|
19
21
|
|
20
|
-
# Rubular: http://rubular.com/r/
|
21
|
-
|
22
|
+
# Rubular: http://rubular.com/r/G2opjedIm9
|
23
|
+
# Protects the period that follows "<letter>°" when digits come next
# (e.g. "N°. 1026"), replacing it with the placeholder ∯.
# The class is [a-zA-Z]; the range "A-z" would also admit [ \ ] ^ _ and `.
GeoLocationRule = Rule.new(/(?<=[a-zA-Z]°)\.(?=\s*\d+)/, '∯')
|
22
24
|
|
23
|
-
|
24
|
-
BETWEEN_DOUBLE_QUOTES_REGEX = /"(?:[^"])*[^,]"|“(?:[^”])*[^,]”/
|
25
|
+
SingleNewLineRule = Rule.new(/\n/, 'ȹ')
|
25
26
|
|
26
|
-
|
27
|
-
|
27
|
+
module DoublePunctuationRules
|
28
|
+
FirstRule = Rule.new(/\?!/, '☉')
|
29
|
+
SecondRule = Rule.new(/!\?/, '☈')
|
30
|
+
ThirdRule = Rule.new(/\?\?/, '☇')
|
31
|
+
ForthRule = Rule.new(/!!/, '☄')
|
28
32
|
|
29
|
-
|
30
|
-
|
33
|
+
All = [ FirstRule, SecondRule, ThirdRule, ForthRule ]
|
34
|
+
end
|
31
35
|
|
32
|
-
# Rubular: http://rubular.com/r/yqa4Rit8EY
|
33
|
-
PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')
|
34
36
|
|
35
|
-
# Rubular: http://rubular.com/r/
|
36
|
-
|
37
|
+
# Rubular: http://rubular.com/r/aXPUGm6fQh
|
38
|
+
QuestionMarkInQuotationRule = Rule.new(/\?(?=(\'|\"))/, '&ᓷ&')
|
37
39
|
|
38
|
-
# Rubular: http://rubular.com/r/xDkpFZ0EgH
|
39
|
-
MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i
|
40
40
|
|
41
|
-
module
|
42
|
-
# Rubular: http://rubular.com/r/
|
43
|
-
|
41
|
+
module ExclamationPointRules
|
42
|
+
# Rubular: http://rubular.com/r/XS1XXFRfM2
|
43
|
+
InQuotationRule = Rule.new(/\!(?=(\'|\"))/, '&ᓴ&')
|
44
44
|
|
45
|
-
# Rubular: http://rubular.com/r/
|
46
|
-
|
45
|
+
# Rubular: http://rubular.com/r/sl57YI8LkA
|
46
|
+
BeforeCommaMidSentenceRule = Rule.new(/\!(?=\,\s[a-z])/, '&ᓴ&')
|
47
47
|
|
48
|
-
# Rubular: http://rubular.com/r/
|
49
|
-
|
48
|
+
# Rubular: http://rubular.com/r/f9zTjmkIPb
|
49
|
+
MidSentenceRule = Rule.new(/\!(?=\s[a-z])/, '&ᓴ&')
|
50
50
|
|
51
|
-
|
52
|
-
|
51
|
+
All = [ InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule ]
|
52
|
+
end
|
53
53
|
|
54
|
-
|
54
|
+
module SubSymbolsRules
|
55
|
+
Period = Rule.new(/∯/, '.')
|
56
|
+
ArabicComma = Rule.new(/♬/, '،')
|
57
|
+
SemiColon = Rule.new(/♭/, ':')
|
58
|
+
FullWidthPeriod = Rule.new(/&ᓰ&/, '。')
|
59
|
+
SpecialPeriod = Rule.new(/&ᓱ&/, '.')
|
60
|
+
FullWidthExclamation = Rule.new(/&ᓳ&/, '!')
|
61
|
+
ExclamationPoint = Rule.new(/&ᓴ&/, '!')
|
62
|
+
QuestionMark = Rule.new(/&ᓷ&/, '?')
|
63
|
+
FullWidthQuestionMark = Rule.new(/&ᓸ&/, '?')
|
64
|
+
MixedDoubleQE = Rule.new(/☉/, '?!')
|
65
|
+
MixedDoubleQQ = Rule.new(/☇/, '??')
|
66
|
+
MixedDoubleEQ = Rule.new(/☈/, '!?')
|
67
|
+
MixedDoubleEE = Rule.new(/☄/, '!!')
|
68
|
+
LeftParens = Rule.new(/&✂&/, '(')
|
69
|
+
RightParens = Rule.new(/&⌬&/, ')')
|
70
|
+
TemporaryEndingPunctutation = Rule.new('ȸ', '')
|
71
|
+
Newline = Rule.new(/ȹ/, "\n")
|
72
|
+
|
73
|
+
All = [ Period, ArabicComma,
|
74
|
+
SemiColon, FullWidthPeriod,
|
75
|
+
SpecialPeriod, FullWidthExclamation,
|
76
|
+
ExclamationPoint, QuestionMark,
|
77
|
+
FullWidthQuestionMark, MixedDoubleQE,
|
78
|
+
MixedDoubleQQ, MixedDoubleEQ,
|
79
|
+
MixedDoubleEE, LeftParens,
|
80
|
+
RightParens, TemporaryEndingPunctutation,
|
81
|
+
Newline ]
|
55
82
|
end
|
56
83
|
|
57
|
-
# This class searches for periods within an abbreviation and
|
58
|
-
# replaces the periods.
|
59
|
-
module SingleLetterAbbreviationRules
|
60
|
-
# Rubular: http://rubular.com/r/e3H6kwnr6H
|
61
|
-
SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=\s)/, '∯')
|
62
84
|
|
63
|
-
|
64
|
-
|
85
|
+
module ReinsertEllipsisRules
|
86
|
+
SubThreeConsecutivePeriod = Rule.new(/ƪ/, '...')
|
87
|
+
SubThreeSpacePeriod = Rule.new(/♟/, ' . . . ')
|
88
|
+
SubFourSpacePeriod = Rule.new(/♝/, '. . . .')
|
89
|
+
SubTwoConsecutivePeriod = Rule.new(/☏/, '..')
|
90
|
+
SubOnePeriod = Rule.new(/∮/, '.')
|
65
91
|
|
66
|
-
All = [
|
67
|
-
|
68
|
-
|
69
|
-
]
|
92
|
+
All = [ SubThreeConsecutivePeriod, SubThreeSpacePeriod,
|
93
|
+
SubFourSpacePeriod, SubTwoConsecutivePeriod,
|
94
|
+
SubOnePeriod ]
|
70
95
|
end
|
71
96
|
|
97
|
+
ExtraWhiteSpaceRule = Rule.new(/\s{3,}/, ' ')
|
72
98
|
|
73
|
-
|
74
|
-
end
|
75
|
-
class Cleaner < PragmaticSegmenter::Cleaner
|
76
|
-
end
|
99
|
+
SubSingleQuoteRule = Rule.new(/&⎋&/, "'")
|
77
100
|
end
|
78
101
|
end
|
79
102
|
end
|