pragmatic_segmenter 0.3.3 → 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +6 -6
- data/lib/pragmatic_segmenter/between_punctuation.rb +6 -4
- data/lib/pragmatic_segmenter/cleaner.rb +51 -47
- data/lib/pragmatic_segmenter/cleaner/rules.rb +86 -0
- data/lib/pragmatic_segmenter/languages.rb +21 -30
- data/lib/pragmatic_segmenter/languages/arabic.rb +0 -13
- data/lib/pragmatic_segmenter/languages/common.rb +67 -44
- data/lib/pragmatic_segmenter/languages/common/ellipsis.rb +37 -0
- data/lib/pragmatic_segmenter/languages/common/numbers.rb +90 -0
- data/lib/pragmatic_segmenter/languages/deutsch.rb +25 -48
- data/lib/pragmatic_segmenter/languages/english.rb +3 -3
- data/lib/pragmatic_segmenter/languages/japanese.rb +5 -13
- data/lib/pragmatic_segmenter/languages/persian.rb +0 -14
- data/lib/pragmatic_segmenter/languages/russian.rb +0 -25
- data/lib/pragmatic_segmenter/languages/spanish.rb +0 -9
- data/lib/pragmatic_segmenter/list.rb +60 -58
- data/lib/pragmatic_segmenter/{process.rb → processor.rb} +47 -26
- data/lib/pragmatic_segmenter/punctuation_replacer.rb +41 -20
- data/lib/pragmatic_segmenter/segmenter.rb +19 -5
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/pragmatic_segmenter.gemspec +1 -0
- data/spec/pragmatic_segmenter/languages/amharic_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/arabic_spec.rb +59 -0
- data/spec/pragmatic_segmenter/languages/armenian_spec.rb +160 -0
- data/spec/pragmatic_segmenter/languages/burmese_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/chinese_spec.rb +11 -0
- data/spec/pragmatic_segmenter/languages/deutsch_spec.rb +189 -0
- data/spec/pragmatic_segmenter/languages/dutch_spec.rb +23 -0
- data/spec/pragmatic_segmenter/languages/english_spec.rb +1348 -0
- data/spec/pragmatic_segmenter/languages/french_spec.rb +31 -0
- data/spec/pragmatic_segmenter/languages/greek_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/hindi_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/italian_spec.rb +190 -0
- data/spec/pragmatic_segmenter/languages/japanese_spec.rb +53 -0
- data/spec/pragmatic_segmenter/languages/persian_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/polish_spec.rb +11 -0
- data/spec/pragmatic_segmenter/languages/russian_spec.rb +219 -0
- data/spec/pragmatic_segmenter/languages/spanish_spec.rb +189 -0
- data/spec/pragmatic_segmenter/languages/urdu_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages_spec.rb +31 -0
- data/spec/pragmatic_segmenter_spec.rb +24 -2583
- metadata +59 -8
- data/lib/pragmatic_segmenter/number.rb +0 -35
- data/lib/pragmatic_segmenter/rules.rb +0 -168
- data/lib/pragmatic_segmenter/rules/ellipsis.rb +0 -35
- data/lib/pragmatic_segmenter/rules/html.rb +0 -13
@@ -0,0 +1,37 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
|
3
|
+
module PragmaticSegmenter
|
4
|
+
module Languages
|
5
|
+
module Common
|
6
|
+
# This class searches for ellipses within a string and
|
7
|
+
# replaces the periods.
|
8
|
+
|
9
|
+
# http://www.dailywritingtips.com/in-search-of-a-4-dot-ellipsis/
|
10
|
+
# http://www.thepunctuationguide.com/ellipses.html
|
11
|
+
|
12
|
+
module EllipsisRules
|
13
|
+
# Rubular: http://rubular.com/r/i60hCK81fz
|
14
|
+
ThreeConsecutiveRule = Rule.new(/\.\.\.(?=\s+[A-Z])/, '☏.')
|
15
|
+
|
16
|
+
# Rubular: http://rubular.com/r/Hdqpd90owl
|
17
|
+
FourConsecutiveRule = Rule.new(/(?<=\S)\.{3}(?=\.\s[A-Z])/, 'ƪ')
|
18
|
+
|
19
|
+
# Rubular: http://rubular.com/r/YBG1dIHTRu
|
20
|
+
ThreeSpaceRule = Rule.new(/(\s\.){3}\s/, '♟')
|
21
|
+
|
22
|
+
# Rubular: http://rubular.com/r/2VvZ8wRbd8
|
23
|
+
FourSpaceRule = Rule.new(/(?<=[a-z])(\.\s){3}\.(\z|$|\n)/, '♝')
|
24
|
+
|
25
|
+
OtherThreePeriodRule = Rule.new(/\.\.\./, 'ƪ')
|
26
|
+
|
27
|
+
All = [
|
28
|
+
ThreeSpaceRule,
|
29
|
+
FourSpaceRule,
|
30
|
+
FourConsecutiveRule,
|
31
|
+
ThreeConsecutiveRule,
|
32
|
+
OtherThreePeriodRule
|
33
|
+
]
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
|
3
|
+
module PragmaticSegmenter
|
4
|
+
module Languages
|
5
|
+
module Common
|
6
|
+
module Numbers
|
7
|
+
# Rubular: http://rubular.com/r/oNyxBOqbyy
|
8
|
+
PeriodBeforeNumberRule = Rule.new(/\.(?=\d)/, '∯')
|
9
|
+
|
10
|
+
# Rubular: http://rubular.com/r/EMk5MpiUzt
|
11
|
+
NumberAfterPeriodBeforeLetterRule = Rule.new(/(?<=\d)\.(?=\S)/, '∯')
|
12
|
+
|
13
|
+
# Rubular: http://rubular.com/r/rf4l1HjtjG
|
14
|
+
NewLineNumberPeriodSpaceLetterRule = Rule.new(/(?<=\r\d)\.(?=(\s\S)|\))/, '∯')
|
15
|
+
|
16
|
+
# Rubular: http://rubular.com/r/HPa4sdc6b9
|
17
|
+
StartLineNumberPeriodRule = Rule.new(/(?<=^\d)\.(?=(\s\S)|\))/, '∯')
|
18
|
+
|
19
|
+
# Rubular: http://rubular.com/r/NuvWnKleFl
|
20
|
+
StartLineTwoDigitNumberPeriodRule = Rule.new(/(?<=^\d\d)\.(?=(\s\S)|\))/, '∯')
|
21
|
+
|
22
|
+
All = [
|
23
|
+
PeriodBeforeNumberRule,
|
24
|
+
NumberAfterPeriodBeforeLetterRule,
|
25
|
+
NewLineNumberPeriodSpaceLetterRule,
|
26
|
+
StartLineNumberPeriodRule,
|
27
|
+
StartLineTwoDigitNumberPeriodRule
|
28
|
+
]
|
29
|
+
end
|
30
|
+
|
31
|
+
|
32
|
+
SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*[^,]'(?=\s[A-Z])|"(?:[^"])*[^,]"(?=\s[A-Z])|“(?:[^”])*[^,]”(?=\s[A-Z])|\S.*?[。..!!??ȸȹ☉☈☇☄]/
|
33
|
+
|
34
|
+
# Rubular: http://rubular.com/r/NqCqv372Ix
|
35
|
+
QUOTATION_AT_END_OF_SENTENCE_REGEX = /[!?\.-][\"\'\u{201d}\u{201c}]\s{1}[A-Z]/
|
36
|
+
|
37
|
+
# Rubular: http://rubular.com/r/6flGnUMEVl
|
38
|
+
PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = /["”]\s\(.*\)\s["“]/
|
39
|
+
|
40
|
+
# Rubular: http://rubular.com/r/TYzr4qOW1Q
|
41
|
+
BETWEEN_DOUBLE_QUOTES_REGEX = /"(?:[^"])*[^,]"|“(?:[^”])*[^,]”/
|
42
|
+
|
43
|
+
# Rubular: http://rubular.com/r/JMjlZHAT4g
|
44
|
+
SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = /(?<=[!?\.-][\"\'\u{201d}\u{201c}])\s{1}(?=[A-Z])/
|
45
|
+
|
46
|
+
# Rubular: http://rubular.com/r/mQ8Es9bxtk
|
47
|
+
CONTINUOUS_PUNCTUATION_REGEX = /(?<=\S)(!|\?){3,}(?=(\s|\z|$))/
|
48
|
+
|
49
|
+
# Rubular: http://rubular.com/r/yqa4Rit8EY
|
50
|
+
PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')
|
51
|
+
|
52
|
+
# Rubular: http://rubular.com/r/NEv265G2X2
|
53
|
+
KommanditgesellschaftRule = Rule.new(/(?<=Co)\.(?=\sKG)/, '∯')
|
54
|
+
|
55
|
+
# Rubular: http://rubular.com/r/xDkpFZ0EgH
|
56
|
+
MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i
|
57
|
+
|
58
|
+
module AmPmRules
|
59
|
+
# Rubular: http://rubular.com/r/Vnx3m4Spc8
|
60
|
+
UpperCasePmRule = Rule.new(/(?<=P∯M)∯(?=\s[A-Z])/, '.')
|
61
|
+
|
62
|
+
# Rubular: http://rubular.com/r/AJMCotJVbW
|
63
|
+
UpperCaseAmRule = Rule.new(/(?<=A∯M)∯(?=\s[A-Z])/, '.')
|
64
|
+
|
65
|
+
# Rubular: http://rubular.com/r/13q7SnOhgA
|
66
|
+
LowerCasePmRule = Rule.new(/(?<=p∯m)∯(?=\s[A-Z])/, '.')
|
67
|
+
|
68
|
+
# Rubular: http://rubular.com/r/DgUDq4mLz5
|
69
|
+
LowerCaseAmRule = Rule.new(/(?<=a∯m)∯(?=\s[A-Z])/, '.')
|
70
|
+
|
71
|
+
All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]
|
72
|
+
end
|
73
|
+
|
74
|
+
# This class searches for periods within an abbreviation and
|
75
|
+
# replaces the periods.
|
76
|
+
module SingleLetterAbbreviationRules
|
77
|
+
# Rubular: http://rubular.com/r/e3H6kwnr6H
|
78
|
+
SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=\s)/, '∯')
|
79
|
+
|
80
|
+
# Rubular: http://rubular.com/r/gitvf0YWH4
|
81
|
+
SingleUpperCaseLetterRule = Rule.new(/(?<=\s[A-Z])\.(?=\s)/, '∯')
|
82
|
+
|
83
|
+
All = [
|
84
|
+
SingleUpperCaseLetterAtStartOfLineRule,
|
85
|
+
SingleUpperCaseLetterRule
|
86
|
+
]
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
@@ -18,11 +18,20 @@ module PragmaticSegmenter
|
|
18
18
|
# Rubular: http://rubular.com/r/TkZomF9tTM
|
19
19
|
BETWEEN_DOUBLE_QUOTES_DE_REGEX = /„(?>[^“\\]+|\\{2}|\\.)*“/
|
20
20
|
|
21
|
-
# Rubular: http://rubular.com/r/hZxoyQwKT1
|
22
|
-
NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯')
|
23
21
|
|
24
|
-
|
25
|
-
|
22
|
+
module Numbers
|
23
|
+
# Rubular: http://rubular.com/r/hZxoyQwKT1
|
24
|
+
NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯')
|
25
|
+
|
26
|
+
# Rubular: http://rubular.com/r/ityNMwdghj
|
27
|
+
NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯')
|
28
|
+
|
29
|
+
All = [
|
30
|
+
Common::Numbers::All,
|
31
|
+
NumberPeriodSpaceRule,
|
32
|
+
NegativeNumberPeriodSpaceRule
|
33
|
+
]
|
34
|
+
end
|
26
35
|
|
27
36
|
MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember']
|
28
37
|
|
@@ -32,59 +41,35 @@ module PragmaticSegmenter
|
|
32
41
|
# Rubular: http://rubular.com/r/iUNSkCuso0
|
33
42
|
SingleLowerCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[a-z])\.(?=\s)/, '∯')
|
34
43
|
|
35
|
-
|
36
|
-
class Process < PragmaticSegmenter::Process
|
37
|
-
private
|
38
|
-
|
39
|
-
def between_punctuation(txt)
|
40
|
-
BetweenPunctuation.new(text: txt).replace
|
41
|
-
end
|
42
|
-
|
43
|
-
def replace_numbers(txt)
|
44
|
-
Number.new(text: txt).replace
|
45
|
-
end
|
46
|
-
|
47
|
-
def replace_abbreviations(txt)
|
48
|
-
AbbreviationReplacer.new(text: txt, language: Deutsch).replace
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
class Cleaner < PragmaticSegmenter::Cleaner
|
44
|
+
class Processor < PragmaticSegmenter::Processor
|
53
45
|
private
|
54
46
|
|
55
|
-
def
|
56
|
-
|
57
|
-
end
|
58
|
-
end
|
47
|
+
def replace_numbers
|
48
|
+
@text.apply Numbers::All
|
59
49
|
|
60
|
-
|
61
|
-
def replace
|
62
|
-
super
|
63
|
-
@text.apply(NumberPeriodSpaceRule, NegativeNumberPeriodSpaceRule)
|
64
|
-
replace_period_in_deutsch_dates(@text)
|
50
|
+
replace_period_in_deutsch_dates
|
65
51
|
end
|
66
52
|
|
67
|
-
def replace_period_in_deutsch_dates
|
53
|
+
def replace_period_in_deutsch_dates
|
68
54
|
MONTHS.each do |month|
|
69
55
|
# Rubular: http://rubular.com/r/zlqgj7G5dA
|
70
|
-
|
56
|
+
@text.gsub!(/(?<=\d)\.(?=\s*#{Regexp.escape(month)})/, '∯')
|
71
57
|
end
|
72
|
-
txt
|
73
58
|
end
|
74
59
|
end
|
75
60
|
|
76
61
|
class AbbreviationReplacer < AbbreviationReplacer
|
77
62
|
def replace
|
78
|
-
@
|
63
|
+
@text = text.apply(
|
79
64
|
@language::PossessiveAbbreviationRule,
|
80
65
|
@language::SingleLetterAbbreviationRules::All,
|
81
66
|
SingleLowerCaseLetterRule,
|
82
67
|
SingleLowerCaseLetterAtStartOfLineRule)
|
83
68
|
|
84
|
-
@
|
85
|
-
@
|
86
|
-
@
|
87
|
-
replace_abbreviation_as_sentence_boundary(@
|
69
|
+
@text = search_for_abbreviations_in_string(@text)
|
70
|
+
@text = replace_multi_period_abbreviations(@text)
|
71
|
+
@text.apply(Languages::Common::AmPmRules::All)
|
72
|
+
replace_abbreviation_as_sentence_boundary(@text)
|
88
73
|
end
|
89
74
|
|
90
75
|
private
|
@@ -97,15 +82,7 @@ module PragmaticSegmenter
|
|
97
82
|
class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
|
98
83
|
private
|
99
84
|
|
100
|
-
def
|
101
|
-
btwn_dbl_quote = sub_punctuation_between_double_quotes_de(txt)
|
102
|
-
PragmaticSegmenter::PunctuationReplacer.new(
|
103
|
-
matches_array: btwn_dbl_quote,
|
104
|
-
text: txt
|
105
|
-
).replace
|
106
|
-
end
|
107
|
-
|
108
|
-
def sub_punctuation_between_double_quotes_de(txt)
|
85
|
+
def btwn_dbl_quote(txt)
|
109
86
|
if txt.include?('„')
|
110
87
|
btwn_dbl_quote = txt.scan(BETWEEN_DOUBLE_QUOTES_DE_REGEX)
|
111
88
|
txt.scan(SPLIT_DOUBLE_QUOTES_DE_REGEX).each do |q|
|
@@ -6,13 +6,13 @@ module PragmaticSegmenter
|
|
6
6
|
class Cleaner < Cleaner
|
7
7
|
def clean
|
8
8
|
super
|
9
|
-
clean_quotations
|
9
|
+
clean_quotations
|
10
10
|
end
|
11
11
|
|
12
12
|
private
|
13
13
|
|
14
|
-
def clean_quotations
|
15
|
-
|
14
|
+
def clean_quotations
|
15
|
+
@text.gsub(/`/, "'")
|
16
16
|
end
|
17
17
|
|
18
18
|
def abbreviations
|
@@ -3,27 +3,19 @@ module PragmaticSegmenter
|
|
3
3
|
module Japanese
|
4
4
|
include Languages::Common
|
5
5
|
|
6
|
-
class Process < Process
|
7
|
-
private
|
8
|
-
|
9
|
-
def between_punctuation(txt)
|
10
|
-
BetweenPunctuation.new(text: txt).replace
|
11
|
-
end
|
12
|
-
end
|
13
|
-
|
14
6
|
class Cleaner < PragmaticSegmenter::Cleaner
|
15
7
|
# Rubular: http://rubular.com/r/N4kPuJgle7
|
16
8
|
NewLineInMiddleOfWordRule = Rule.new(/(?<=の)\n(?=\S)/, '')
|
17
9
|
|
18
10
|
def clean
|
19
11
|
super
|
20
|
-
|
12
|
+
remove_newline_in_middle_of_word
|
21
13
|
end
|
22
14
|
|
23
15
|
private
|
24
16
|
|
25
|
-
def remove_newline_in_middle_of_word
|
26
|
-
|
17
|
+
def remove_newline_in_middle_of_word
|
18
|
+
@text.apply NewLineInMiddleOfWordRule
|
27
19
|
end
|
28
20
|
end
|
29
21
|
|
@@ -42,14 +34,14 @@ module PragmaticSegmenter
|
|
42
34
|
end
|
43
35
|
|
44
36
|
def sub_punctuation_between_quotes_ja(txt)
|
45
|
-
|
37
|
+
PunctuationReplacer.new(
|
46
38
|
matches_array: txt.scan(BETWEEN_QUOTE_JA_REGEX),
|
47
39
|
text: txt
|
48
40
|
).replace
|
49
41
|
end
|
50
42
|
|
51
43
|
def sub_punctuation_between_parens_ja(txt)
|
52
|
-
|
44
|
+
PunctuationReplacer.new(
|
53
45
|
matches_array: txt.scan(BETWEEN_PARENS_JA_REGEX),
|
54
46
|
text: txt
|
55
47
|
).replace
|
@@ -9,20 +9,6 @@ module PragmaticSegmenter
|
|
9
9
|
ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')
|
10
10
|
ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
|
11
11
|
|
12
|
-
class Process < Process
|
13
|
-
private
|
14
|
-
|
15
|
-
def sentence_boundary_punctuation(txt)
|
16
|
-
txt = txt.apply ReplaceColonBetweenNumbersRule,
|
17
|
-
ReplaceNonSentenceBoundaryCommaRule
|
18
|
-
txt.scan(SENTENCE_BOUNDARY_REGEX)
|
19
|
-
end
|
20
|
-
|
21
|
-
def replace_abbreviations(txt)
|
22
|
-
AbbreviationReplacer.new(text: txt).replace
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
12
|
class AbbreviationReplacer < AbbreviationReplacer
|
27
13
|
private
|
28
14
|
|
@@ -9,34 +9,9 @@ module PragmaticSegmenter
|
|
9
9
|
NUMBER_ABBREVIATIONS = []
|
10
10
|
end
|
11
11
|
|
12
|
-
class Process < Process
|
13
|
-
private
|
14
|
-
|
15
|
-
def replace_abbreviations(txt)
|
16
|
-
AbbreviationReplacer.new(text: txt, language: Russian).replace
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
12
|
class AbbreviationReplacer < AbbreviationReplacer
|
21
13
|
private
|
22
14
|
|
23
|
-
def scan_for_replacements(txt, am, index, character_array)
|
24
|
-
character = character_array[index]
|
25
|
-
prepositive = @language::Abbreviation::PREPOSITIVE_ABBREVIATIONS
|
26
|
-
number_abbr = @language::Abbreviation::NUMBER_ABBREVIATIONS
|
27
|
-
upper = /[[:upper:]]/.match(character.to_s)
|
28
|
-
if upper.nil? || prepositive.include?(am.downcase.strip)
|
29
|
-
if prepositive.include?(am.downcase.strip)
|
30
|
-
txt = replace_prepositive_abbr(txt, am)
|
31
|
-
elsif number_abbr.include?(am.downcase.strip)
|
32
|
-
txt = replace_pre_number_abbr(txt, am)
|
33
|
-
else
|
34
|
-
txt = replace_period_of_abbr(txt, am)
|
35
|
-
end
|
36
|
-
end
|
37
|
-
txt
|
38
|
-
end
|
39
|
-
|
40
15
|
def replace_period_of_abbr(txt, abbr)
|
41
16
|
txt.gsub(/(?<=\s#{abbr.strip})\./, '∯')
|
42
17
|
.gsub(/(?<=\A#{abbr.strip})\./, '∯')
|
@@ -8,15 +8,6 @@ module PragmaticSegmenter
|
|
8
8
|
PREPOSITIVE_ABBREVIATIONS = ['a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'ee', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'mt', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'prof', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'sra', 'srta', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']
|
9
9
|
NUMBER_ABBREVIATIONS = ['cra', 'ext', 'no', 'nos', 'p', 'pp', 'tel']
|
10
10
|
end
|
11
|
-
|
12
|
-
class Cleaner < Cleaner
|
13
|
-
private
|
14
|
-
|
15
|
-
def abbreviations
|
16
|
-
Abbreviation::ABBREVIATIONS
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
11
|
end
|
21
12
|
end
|
22
13
|
end
|
@@ -5,6 +5,8 @@ module PragmaticSegmenter
|
|
5
5
|
# newlines before each list item.
|
6
6
|
class List
|
7
7
|
ROMAN_NUMERALS = %w(i ii iii iv v vi vii viii ix x xi xii xiii xiv x xi xii xiii xv xvi xvii xviii xix xx)
|
8
|
+
LATIN_NUMERALS = ('a'..'z').to_a
|
9
|
+
|
8
10
|
# Rubular: http://rubular.com/r/XcpaJKH0sz
|
9
11
|
ALPHABETICAL_LIST_WITH_PERIODS =
|
10
12
|
/(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)/
|
@@ -45,10 +47,10 @@ module PragmaticSegmenter
|
|
45
47
|
end
|
46
48
|
|
47
49
|
def add_line_break
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
format_numbered_list_with_parens
|
50
|
+
format_alphabetical_lists
|
51
|
+
format_roman_numeral_lists
|
52
|
+
format_numbered_list_with_periods
|
53
|
+
format_numbered_list_with_parens
|
52
54
|
end
|
53
55
|
|
54
56
|
def replace_parens
|
@@ -63,64 +65,63 @@ module PragmaticSegmenter
|
|
63
65
|
|
64
66
|
private
|
65
67
|
|
66
|
-
def format_numbered_list_with_parens
|
67
|
-
|
68
|
-
|
69
|
-
|
68
|
+
def format_numbered_list_with_parens
|
69
|
+
replace_parens_in_numbered_list
|
70
|
+
add_line_breaks_for_numbered_list_with_parens
|
71
|
+
@text.apply(ListMarkerRule)
|
70
72
|
end
|
71
73
|
|
72
|
-
def format_numbered_list_with_periods
|
73
|
-
|
74
|
-
|
75
|
-
|
74
|
+
def format_numbered_list_with_periods
|
75
|
+
replace_periods_in_numbered_list
|
76
|
+
add_line_breaks_for_numbered_list_with_periods
|
77
|
+
@text.apply(SubstituteListPeriodRule)
|
76
78
|
end
|
77
79
|
|
78
|
-
def format_alphabetical_lists
|
79
|
-
|
80
|
-
add_line_breaks_for_alphabetical_list_with_parens(
|
80
|
+
def format_alphabetical_lists
|
81
|
+
add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: false)
|
82
|
+
add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: false)
|
81
83
|
end
|
82
84
|
|
83
|
-
def format_roman_numeral_lists
|
84
|
-
|
85
|
-
add_line_breaks_for_alphabetical_list_with_parens(
|
85
|
+
def format_roman_numeral_lists
|
86
|
+
add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: true)
|
87
|
+
add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: true)
|
86
88
|
end
|
87
89
|
|
88
|
-
def replace_periods_in_numbered_list
|
89
|
-
scan_lists(NUMBERED_LIST_REGEX_1, NUMBERED_LIST_REGEX_2, '♨', true
|
90
|
+
def replace_periods_in_numbered_list
|
91
|
+
scan_lists(NUMBERED_LIST_REGEX_1, NUMBERED_LIST_REGEX_2, '♨', strip: true)
|
90
92
|
end
|
91
93
|
|
92
|
-
def add_line_breaks_for_numbered_list_with_periods
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
txt.apply(SpaceBetweenListItemsFirstRule).
|
97
|
-
apply(SpaceBetweenListItemsSecondRule)
|
94
|
+
def add_line_breaks_for_numbered_list_with_periods
|
95
|
+
if @text.include?('♨') && @text !~ /♨.+\n.+♨|♨.+\r.+♨/ && @text !~ /for\s\d{1,2}♨\s[a-z]/
|
96
|
+
@text.apply(SpaceBetweenListItemsFirstRule, SpaceBetweenListItemsSecondRule)
|
97
|
+
end
|
98
98
|
end
|
99
99
|
|
100
|
-
def replace_parens_in_numbered_list
|
100
|
+
def replace_parens_in_numbered_list
|
101
101
|
scan_lists(
|
102
|
-
NUMBERED_LIST_PARENS_REGEX, NUMBERED_LIST_PARENS_REGEX, '☝'
|
102
|
+
NUMBERED_LIST_PARENS_REGEX, NUMBERED_LIST_PARENS_REGEX, '☝')
|
103
|
+
scan_lists(NUMBERED_LIST_PARENS_REGEX, NUMBERED_LIST_PARENS_REGEX, '☝')
|
103
104
|
end
|
104
105
|
|
105
|
-
def add_line_breaks_for_numbered_list_with_parens
|
106
|
-
|
107
|
-
|
106
|
+
def add_line_breaks_for_numbered_list_with_parens
|
107
|
+
if @text.include?('☝') && @text !~ /☝.+\n.+☝|☝.+\r.+☝/
|
108
|
+
@text.apply(SpaceBetweenListItemsThirdRule)
|
109
|
+
end
|
108
110
|
end
|
109
111
|
|
110
|
-
def scan_lists(regex1, regex2, replacement, strip
|
111
|
-
list_array =
|
112
|
+
def scan_lists(regex1, regex2, replacement, strip: false)
|
113
|
+
list_array = @text.scan(regex1).map(&:to_i)
|
112
114
|
list_array.each_with_index do |a, i|
|
113
115
|
next unless (a + 1).eql?(list_array[i + 1]) ||
|
114
116
|
(a - 1).eql?(list_array[i - 1]) ||
|
115
117
|
(a.eql?(0) && list_array[i - 1].eql?(9)) ||
|
116
118
|
(a.eql?(9) && list_array[i + 1].eql?(0))
|
117
|
-
substitute_found_list_items(
|
119
|
+
substitute_found_list_items(regex2, a, strip, replacement)
|
118
120
|
end
|
119
|
-
txt
|
120
121
|
end
|
121
122
|
|
122
|
-
def substitute_found_list_items(
|
123
|
-
|
123
|
+
def substitute_found_list_items(regex, a, strip, replacement)
|
124
|
+
@text.gsub!(regex).with_index do |m|
|
124
125
|
if a.to_s.eql?(strip ? m.strip.chop : m)
|
125
126
|
"#{Regexp.escape(a.to_s)}" + replacement
|
126
127
|
else
|
@@ -129,22 +130,24 @@ module PragmaticSegmenter
|
|
129
130
|
end
|
130
131
|
end
|
131
132
|
|
132
|
-
def add_line_breaks_for_alphabetical_list_with_periods(
|
133
|
-
iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PERIODS,
|
133
|
+
def add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: false)
|
134
|
+
iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PERIODS, roman_numeral: roman_numeral)
|
134
135
|
end
|
135
136
|
|
136
|
-
def add_line_breaks_for_alphabetical_list_with_parens(
|
137
|
-
iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PARENS,
|
137
|
+
def add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: false)
|
138
|
+
iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PARENS,
|
139
|
+
parens: true,
|
140
|
+
roman_numeral: roman_numeral)
|
138
141
|
end
|
139
142
|
|
140
|
-
def replace_alphabet_list(a
|
141
|
-
|
143
|
+
def replace_alphabet_list(a)
|
144
|
+
@text.gsub!(ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX).with_index do |m|
|
142
145
|
a.eql?(m.chomp('.')) ? "\r#{Regexp.escape(a.to_s)}∯" : "#{m}"
|
143
146
|
end
|
144
147
|
end
|
145
148
|
|
146
|
-
def replace_alphabet_list_parens(a
|
147
|
-
|
149
|
+
def replace_alphabet_list_parens(a)
|
150
|
+
@text.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX).with_index do |m|
|
148
151
|
if m.include?('(')
|
149
152
|
a.eql?(m.dup.downcase.gsub!(/\(/, '')) ? "\r&✂&#{Regexp.escape(m.gsub!(/\(/, ''))}" : "#{m}"
|
150
153
|
else
|
@@ -153,48 +156,47 @@ module PragmaticSegmenter
|
|
153
156
|
end
|
154
157
|
end
|
155
158
|
|
156
|
-
def replace_correct_alphabet_list(a,
|
159
|
+
def replace_correct_alphabet_list(a, parens)
|
157
160
|
if parens
|
158
|
-
replace_alphabet_list_parens(a
|
161
|
+
replace_alphabet_list_parens(a)
|
159
162
|
else
|
160
|
-
replace_alphabet_list(a
|
163
|
+
replace_alphabet_list(a)
|
161
164
|
end
|
162
165
|
end
|
163
166
|
|
164
|
-
def last_array_item_replacement(a, i, alphabet, list_array,
|
167
|
+
def last_array_item_replacement(a, i, alphabet, list_array, parens)
|
165
168
|
return if alphabet & list_array == [] ||
|
166
169
|
!alphabet.include?(list_array[i - 1]) ||
|
167
170
|
!alphabet.include?(a)
|
168
171
|
return if (alphabet.index(list_array[i - 1]) - alphabet.index(a)).abs != 1
|
169
|
-
replace_correct_alphabet_list(a,
|
172
|
+
replace_correct_alphabet_list(a, parens)
|
170
173
|
end
|
171
174
|
|
172
|
-
def other_items_replacement(a, i, alphabet, list_array,
|
175
|
+
def other_items_replacement(a, i, alphabet, list_array, parens)
|
173
176
|
return if alphabet & list_array == [] ||
|
174
177
|
!alphabet.include?(list_array[i - 1]) ||
|
175
178
|
!alphabet.include?(a) ||
|
176
179
|
!alphabet.include?(list_array[i + 1])
|
177
180
|
return if alphabet.index(list_array[i + 1]) - alphabet.index(a) != 1 &&
|
178
181
|
(alphabet.index(list_array[i - 1]) - alphabet.index(a)).abs != 1
|
179
|
-
replace_correct_alphabet_list(a,
|
182
|
+
replace_correct_alphabet_list(a, parens)
|
180
183
|
end
|
181
184
|
|
182
|
-
def iterate_alphabet_array(regex, parens
|
183
|
-
list_array =
|
185
|
+
def iterate_alphabet_array(regex, parens: false, roman_numeral: false)
|
186
|
+
list_array = @text.scan(regex).map(&:downcase)
|
184
187
|
if roman_numeral
|
185
188
|
alphabet = ROMAN_NUMERALS
|
186
189
|
else
|
187
|
-
alphabet =
|
190
|
+
alphabet = LATIN_NUMERALS
|
188
191
|
end
|
189
192
|
list_array.delete_if { |item| !alphabet.any? { |a| a.include?(item) } }
|
190
193
|
list_array.each_with_index do |a, i|
|
191
194
|
if i.eql?(list_array.length - 1)
|
192
|
-
last_array_item_replacement(a, i, alphabet, list_array,
|
195
|
+
last_array_item_replacement(a, i, alphabet, list_array, parens)
|
193
196
|
else
|
194
|
-
other_items_replacement(a, i, alphabet, list_array,
|
197
|
+
other_items_replacement(a, i, alphabet, list_array, parens)
|
195
198
|
end
|
196
199
|
end
|
197
|
-
txt
|
198
200
|
end
|
199
201
|
end
|
200
202
|
end
|