pragmatic_segmenter 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +6 -6
- data/lib/pragmatic_segmenter/between_punctuation.rb +6 -4
- data/lib/pragmatic_segmenter/cleaner.rb +51 -47
- data/lib/pragmatic_segmenter/cleaner/rules.rb +86 -0
- data/lib/pragmatic_segmenter/languages.rb +21 -30
- data/lib/pragmatic_segmenter/languages/arabic.rb +0 -13
- data/lib/pragmatic_segmenter/languages/common.rb +67 -44
- data/lib/pragmatic_segmenter/languages/common/ellipsis.rb +37 -0
- data/lib/pragmatic_segmenter/languages/common/numbers.rb +90 -0
- data/lib/pragmatic_segmenter/languages/deutsch.rb +25 -48
- data/lib/pragmatic_segmenter/languages/english.rb +3 -3
- data/lib/pragmatic_segmenter/languages/japanese.rb +5 -13
- data/lib/pragmatic_segmenter/languages/persian.rb +0 -14
- data/lib/pragmatic_segmenter/languages/russian.rb +0 -25
- data/lib/pragmatic_segmenter/languages/spanish.rb +0 -9
- data/lib/pragmatic_segmenter/list.rb +60 -58
- data/lib/pragmatic_segmenter/{process.rb → processor.rb} +47 -26
- data/lib/pragmatic_segmenter/punctuation_replacer.rb +41 -20
- data/lib/pragmatic_segmenter/segmenter.rb +19 -5
- data/lib/pragmatic_segmenter/version.rb +1 -1
- data/pragmatic_segmenter.gemspec +1 -0
- data/spec/pragmatic_segmenter/languages/amharic_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/arabic_spec.rb +59 -0
- data/spec/pragmatic_segmenter/languages/armenian_spec.rb +160 -0
- data/spec/pragmatic_segmenter/languages/burmese_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/chinese_spec.rb +11 -0
- data/spec/pragmatic_segmenter/languages/deutsch_spec.rb +189 -0
- data/spec/pragmatic_segmenter/languages/dutch_spec.rb +23 -0
- data/spec/pragmatic_segmenter/languages/english_spec.rb +1348 -0
- data/spec/pragmatic_segmenter/languages/french_spec.rb +31 -0
- data/spec/pragmatic_segmenter/languages/greek_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/hindi_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/italian_spec.rb +190 -0
- data/spec/pragmatic_segmenter/languages/japanese_spec.rb +53 -0
- data/spec/pragmatic_segmenter/languages/persian_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages/polish_spec.rb +11 -0
- data/spec/pragmatic_segmenter/languages/russian_spec.rb +219 -0
- data/spec/pragmatic_segmenter/languages/spanish_spec.rb +189 -0
- data/spec/pragmatic_segmenter/languages/urdu_spec.rb +18 -0
- data/spec/pragmatic_segmenter/languages_spec.rb +31 -0
- data/spec/pragmatic_segmenter_spec.rb +24 -2583
- metadata +59 -8
- data/lib/pragmatic_segmenter/number.rb +0 -35
- data/lib/pragmatic_segmenter/rules.rb +0 -168
- data/lib/pragmatic_segmenter/rules/ellipsis.rb +0 -35
- data/lib/pragmatic_segmenter/rules/html.rb +0 -13
@@ -0,0 +1,37 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
|
3
|
+
module PragmaticSegmenter
|
4
|
+
module Languages
|
5
|
+
module Common
|
6
|
+
# This class searches for ellipses within a string and
|
7
|
+
# replaces the periods.
|
8
|
+
|
9
|
+
# http://www.dailywritingtips.com/in-search-of-a-4-dot-ellipsis/
|
10
|
+
# http://www.thepunctuationguide.com/ellipses.html
|
11
|
+
|
12
|
+
module EllipsisRules
|
13
|
+
# Rubular: http://rubular.com/r/i60hCK81fz
|
14
|
+
ThreeConsecutiveRule = Rule.new(/\.\.\.(?=\s+[A-Z])/, '☏.')
|
15
|
+
|
16
|
+
# Rubular: http://rubular.com/r/Hdqpd90owl
|
17
|
+
FourConsecutiveRule = Rule.new(/(?<=\S)\.{3}(?=\.\s[A-Z])/, 'ƪ')
|
18
|
+
|
19
|
+
# Rubular: http://rubular.com/r/YBG1dIHTRu
|
20
|
+
ThreeSpaceRule = Rule.new(/(\s\.){3}\s/, '♟')
|
21
|
+
|
22
|
+
# Rubular: http://rubular.com/r/2VvZ8wRbd8
|
23
|
+
FourSpaceRule = Rule.new(/(?<=[a-z])(\.\s){3}\.(\z|$|\n)/, '♝')
|
24
|
+
|
25
|
+
OtherThreePeriodRule = Rule.new(/\.\.\./, 'ƪ')
|
26
|
+
|
27
|
+
All = [
|
28
|
+
ThreeSpaceRule,
|
29
|
+
FourSpaceRule,
|
30
|
+
FourConsecutiveRule,
|
31
|
+
ThreeConsecutiveRule,
|
32
|
+
OtherThreePeriodRule
|
33
|
+
]
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
|
3
|
+
module PragmaticSegmenter
|
4
|
+
module Languages
|
5
|
+
module Common
|
6
|
+
module Numbers
|
7
|
+
# Rubular: http://rubular.com/r/oNyxBOqbyy
|
8
|
+
PeriodBeforeNumberRule = Rule.new(/\.(?=\d)/, '∯')
|
9
|
+
|
10
|
+
# Rubular: http://rubular.com/r/EMk5MpiUzt
|
11
|
+
NumberAfterPeriodBeforeLetterRule = Rule.new(/(?<=\d)\.(?=\S)/, '∯')
|
12
|
+
|
13
|
+
# Rubular: http://rubular.com/r/rf4l1HjtjG
|
14
|
+
NewLineNumberPeriodSpaceLetterRule = Rule.new(/(?<=\r\d)\.(?=(\s\S)|\))/, '∯')
|
15
|
+
|
16
|
+
# Rubular: http://rubular.com/r/HPa4sdc6b9
|
17
|
+
StartLineNumberPeriodRule = Rule.new(/(?<=^\d)\.(?=(\s\S)|\))/, '∯')
|
18
|
+
|
19
|
+
# Rubular: http://rubular.com/r/NuvWnKleFl
|
20
|
+
StartLineTwoDigitNumberPeriodRule = Rule.new(/(?<=^\d\d)\.(?=(\s\S)|\))/, '∯')
|
21
|
+
|
22
|
+
All = [
|
23
|
+
PeriodBeforeNumberRule,
|
24
|
+
NumberAfterPeriodBeforeLetterRule,
|
25
|
+
NewLineNumberPeriodSpaceLetterRule,
|
26
|
+
StartLineNumberPeriodRule,
|
27
|
+
StartLineTwoDigitNumberPeriodRule
|
28
|
+
]
|
29
|
+
end
|
30
|
+
|
31
|
+
|
32
|
+
SENTENCE_BOUNDARY_REGEX = /\u{ff08}(?:[^\u{ff09}])*\u{ff09}(?=\s?[A-Z])|\u{300c}(?:[^\u{300d}])*\u{300d}(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|'(?:[^'])*[^,]'(?=\s[A-Z])|"(?:[^"])*[^,]"(?=\s[A-Z])|“(?:[^”])*[^,]”(?=\s[A-Z])|\S.*?[。..!!??ȸȹ☉☈☇☄]/
|
33
|
+
|
34
|
+
# Rubular: http://rubular.com/r/NqCqv372Ix
|
35
|
+
QUOTATION_AT_END_OF_SENTENCE_REGEX = /[!?\.-][\"\'\u{201d}\u{201c}]\s{1}[A-Z]/
|
36
|
+
|
37
|
+
# Rubular: http://rubular.com/r/6flGnUMEVl
|
38
|
+
PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = /["”]\s\(.*\)\s["“]/
|
39
|
+
|
40
|
+
# Rubular: http://rubular.com/r/TYzr4qOW1Q
|
41
|
+
BETWEEN_DOUBLE_QUOTES_REGEX = /"(?:[^"])*[^,]"|“(?:[^”])*[^,]”/
|
42
|
+
|
43
|
+
# Rubular: http://rubular.com/r/JMjlZHAT4g
|
44
|
+
SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = /(?<=[!?\.-][\"\'\u{201d}\u{201c}])\s{1}(?=[A-Z])/
|
45
|
+
|
46
|
+
# Rubular: http://rubular.com/r/mQ8Es9bxtk
|
47
|
+
CONTINUOUS_PUNCTUATION_REGEX = /(?<=\S)(!|\?){3,}(?=(\s|\z|$))/
|
48
|
+
|
49
|
+
# Rubular: http://rubular.com/r/yqa4Rit8EY
|
50
|
+
PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')
|
51
|
+
|
52
|
+
# Rubular: http://rubular.com/r/NEv265G2X2
|
53
|
+
KommanditgesellschaftRule = Rule.new(/(?<=Co)\.(?=\sKG)/, '∯')
|
54
|
+
|
55
|
+
# Rubular: http://rubular.com/r/xDkpFZ0EgH
|
56
|
+
MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i
|
57
|
+
|
58
|
+
module AmPmRules
|
59
|
+
# Rubular: http://rubular.com/r/Vnx3m4Spc8
|
60
|
+
UpperCasePmRule = Rule.new(/(?<=P∯M)∯(?=\s[A-Z])/, '.')
|
61
|
+
|
62
|
+
# Rubular: http://rubular.com/r/AJMCotJVbW
|
63
|
+
UpperCaseAmRule = Rule.new(/(?<=A∯M)∯(?=\s[A-Z])/, '.')
|
64
|
+
|
65
|
+
# Rubular: http://rubular.com/r/13q7SnOhgA
|
66
|
+
LowerCasePmRule = Rule.new(/(?<=p∯m)∯(?=\s[A-Z])/, '.')
|
67
|
+
|
68
|
+
# Rubular: http://rubular.com/r/DgUDq4mLz5
|
69
|
+
LowerCaseAmRule = Rule.new(/(?<=a∯m)∯(?=\s[A-Z])/, '.')
|
70
|
+
|
71
|
+
All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]
|
72
|
+
end
|
73
|
+
|
74
|
+
# This class searches for periods within an abbreviation and
|
75
|
+
# replaces the periods.
|
76
|
+
module SingleLetterAbbreviationRules
|
77
|
+
# Rubular: http://rubular.com/r/e3H6kwnr6H
|
78
|
+
SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=\s)/, '∯')
|
79
|
+
|
80
|
+
# Rubular: http://rubular.com/r/gitvf0YWH4
|
81
|
+
SingleUpperCaseLetterRule = Rule.new(/(?<=\s[A-Z])\.(?=\s)/, '∯')
|
82
|
+
|
83
|
+
All = [
|
84
|
+
SingleUpperCaseLetterAtStartOfLineRule,
|
85
|
+
SingleUpperCaseLetterRule
|
86
|
+
]
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
@@ -18,11 +18,20 @@ module PragmaticSegmenter
|
|
18
18
|
# Rubular: http://rubular.com/r/TkZomF9tTM
|
19
19
|
BETWEEN_DOUBLE_QUOTES_DE_REGEX = /„(?>[^“\\]+|\\{2}|\\.)*“/
|
20
20
|
|
21
|
-
# Rubular: http://rubular.com/r/hZxoyQwKT1
|
22
|
-
NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯')
|
23
21
|
|
24
|
-
|
25
|
-
|
22
|
+
module Numbers
|
23
|
+
# Rubular: http://rubular.com/r/hZxoyQwKT1
|
24
|
+
NumberPeriodSpaceRule = Rule.new(/(?<=\s[0-9]|\s([1-9][0-9]))\.(?=\s)/, '∯')
|
25
|
+
|
26
|
+
# Rubular: http://rubular.com/r/ityNMwdghj
|
27
|
+
NegativeNumberPeriodSpaceRule = Rule.new(/(?<=-[0-9]|-([1-9][0-9]))\.(?=\s)/, '∯')
|
28
|
+
|
29
|
+
All = [
|
30
|
+
Common::Numbers::All,
|
31
|
+
NumberPeriodSpaceRule,
|
32
|
+
NegativeNumberPeriodSpaceRule
|
33
|
+
]
|
34
|
+
end
|
26
35
|
|
27
36
|
MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'Dezember']
|
28
37
|
|
@@ -32,59 +41,35 @@ module PragmaticSegmenter
|
|
32
41
|
# Rubular: http://rubular.com/r/iUNSkCuso0
|
33
42
|
SingleLowerCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[a-z])\.(?=\s)/, '∯')
|
34
43
|
|
35
|
-
|
36
|
-
class Process < PragmaticSegmenter::Process
|
37
|
-
private
|
38
|
-
|
39
|
-
def between_punctuation(txt)
|
40
|
-
BetweenPunctuation.new(text: txt).replace
|
41
|
-
end
|
42
|
-
|
43
|
-
def replace_numbers(txt)
|
44
|
-
Number.new(text: txt).replace
|
45
|
-
end
|
46
|
-
|
47
|
-
def replace_abbreviations(txt)
|
48
|
-
AbbreviationReplacer.new(text: txt, language: Deutsch).replace
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
class Cleaner < PragmaticSegmenter::Cleaner
|
44
|
+
class Processor < PragmaticSegmenter::Processor
|
53
45
|
private
|
54
46
|
|
55
|
-
def
|
56
|
-
|
57
|
-
end
|
58
|
-
end
|
47
|
+
def replace_numbers
|
48
|
+
@text.apply Numbers::All
|
59
49
|
|
60
|
-
|
61
|
-
def replace
|
62
|
-
super
|
63
|
-
@text.apply(NumberPeriodSpaceRule, NegativeNumberPeriodSpaceRule)
|
64
|
-
replace_period_in_deutsch_dates(@text)
|
50
|
+
replace_period_in_deutsch_dates
|
65
51
|
end
|
66
52
|
|
67
|
-
def replace_period_in_deutsch_dates
|
53
|
+
def replace_period_in_deutsch_dates
|
68
54
|
MONTHS.each do |month|
|
69
55
|
# Rubular: http://rubular.com/r/zlqgj7G5dA
|
70
|
-
|
56
|
+
@text.gsub!(/(?<=\d)\.(?=\s*#{Regexp.escape(month)})/, '∯')
|
71
57
|
end
|
72
|
-
txt
|
73
58
|
end
|
74
59
|
end
|
75
60
|
|
76
61
|
class AbbreviationReplacer < AbbreviationReplacer
|
77
62
|
def replace
|
78
|
-
@
|
63
|
+
@text = text.apply(
|
79
64
|
@language::PossessiveAbbreviationRule,
|
80
65
|
@language::SingleLetterAbbreviationRules::All,
|
81
66
|
SingleLowerCaseLetterRule,
|
82
67
|
SingleLowerCaseLetterAtStartOfLineRule)
|
83
68
|
|
84
|
-
@
|
85
|
-
@
|
86
|
-
@
|
87
|
-
replace_abbreviation_as_sentence_boundary(@
|
69
|
+
@text = search_for_abbreviations_in_string(@text)
|
70
|
+
@text = replace_multi_period_abbreviations(@text)
|
71
|
+
@text.apply(Languages::Common::AmPmRules::All)
|
72
|
+
replace_abbreviation_as_sentence_boundary(@text)
|
88
73
|
end
|
89
74
|
|
90
75
|
private
|
@@ -97,15 +82,7 @@ module PragmaticSegmenter
|
|
97
82
|
class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
|
98
83
|
private
|
99
84
|
|
100
|
-
def
|
101
|
-
btwn_dbl_quote = sub_punctuation_between_double_quotes_de(txt)
|
102
|
-
PragmaticSegmenter::PunctuationReplacer.new(
|
103
|
-
matches_array: btwn_dbl_quote,
|
104
|
-
text: txt
|
105
|
-
).replace
|
106
|
-
end
|
107
|
-
|
108
|
-
def sub_punctuation_between_double_quotes_de(txt)
|
85
|
+
def btwn_dbl_quote(txt)
|
109
86
|
if txt.include?('„')
|
110
87
|
btwn_dbl_quote = txt.scan(BETWEEN_DOUBLE_QUOTES_DE_REGEX)
|
111
88
|
txt.scan(SPLIT_DOUBLE_QUOTES_DE_REGEX).each do |q|
|
@@ -6,13 +6,13 @@ module PragmaticSegmenter
|
|
6
6
|
class Cleaner < Cleaner
|
7
7
|
def clean
|
8
8
|
super
|
9
|
-
clean_quotations
|
9
|
+
clean_quotations
|
10
10
|
end
|
11
11
|
|
12
12
|
private
|
13
13
|
|
14
|
-
def clean_quotations
|
15
|
-
|
14
|
+
def clean_quotations
|
15
|
+
@text.gsub(/`/, "'")
|
16
16
|
end
|
17
17
|
|
18
18
|
def abbreviations
|
@@ -3,27 +3,19 @@ module PragmaticSegmenter
|
|
3
3
|
module Japanese
|
4
4
|
include Languages::Common
|
5
5
|
|
6
|
-
class Process < Process
|
7
|
-
private
|
8
|
-
|
9
|
-
def between_punctuation(txt)
|
10
|
-
BetweenPunctuation.new(text: txt).replace
|
11
|
-
end
|
12
|
-
end
|
13
|
-
|
14
6
|
class Cleaner < PragmaticSegmenter::Cleaner
|
15
7
|
# Rubular: http://rubular.com/r/N4kPuJgle7
|
16
8
|
NewLineInMiddleOfWordRule = Rule.new(/(?<=の)\n(?=\S)/, '')
|
17
9
|
|
18
10
|
def clean
|
19
11
|
super
|
20
|
-
|
12
|
+
remove_newline_in_middle_of_word
|
21
13
|
end
|
22
14
|
|
23
15
|
private
|
24
16
|
|
25
|
-
def remove_newline_in_middle_of_word
|
26
|
-
|
17
|
+
def remove_newline_in_middle_of_word
|
18
|
+
@text.apply NewLineInMiddleOfWordRule
|
27
19
|
end
|
28
20
|
end
|
29
21
|
|
@@ -42,14 +34,14 @@ module PragmaticSegmenter
|
|
42
34
|
end
|
43
35
|
|
44
36
|
def sub_punctuation_between_quotes_ja(txt)
|
45
|
-
|
37
|
+
PunctuationReplacer.new(
|
46
38
|
matches_array: txt.scan(BETWEEN_QUOTE_JA_REGEX),
|
47
39
|
text: txt
|
48
40
|
).replace
|
49
41
|
end
|
50
42
|
|
51
43
|
def sub_punctuation_between_parens_ja(txt)
|
52
|
-
|
44
|
+
PunctuationReplacer.new(
|
53
45
|
matches_array: txt.scan(BETWEEN_PARENS_JA_REGEX),
|
54
46
|
text: txt
|
55
47
|
).replace
|
@@ -9,20 +9,6 @@ module PragmaticSegmenter
|
|
9
9
|
ReplaceColonBetweenNumbersRule = Rule.new(/(?<=\d):(?=\d)/, '♭')
|
10
10
|
ReplaceNonSentenceBoundaryCommaRule = Rule.new(/،(?=\s\S+،)/, '♬')
|
11
11
|
|
12
|
-
class Process < Process
|
13
|
-
private
|
14
|
-
|
15
|
-
def sentence_boundary_punctuation(txt)
|
16
|
-
txt = txt.apply ReplaceColonBetweenNumbersRule,
|
17
|
-
ReplaceNonSentenceBoundaryCommaRule
|
18
|
-
txt.scan(SENTENCE_BOUNDARY_REGEX)
|
19
|
-
end
|
20
|
-
|
21
|
-
def replace_abbreviations(txt)
|
22
|
-
AbbreviationReplacer.new(text: txt).replace
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
12
|
class AbbreviationReplacer < AbbreviationReplacer
|
27
13
|
private
|
28
14
|
|
@@ -9,34 +9,9 @@ module PragmaticSegmenter
|
|
9
9
|
NUMBER_ABBREVIATIONS = []
|
10
10
|
end
|
11
11
|
|
12
|
-
class Process < Process
|
13
|
-
private
|
14
|
-
|
15
|
-
def replace_abbreviations(txt)
|
16
|
-
AbbreviationReplacer.new(text: txt, language: Russian).replace
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
12
|
class AbbreviationReplacer < AbbreviationReplacer
|
21
13
|
private
|
22
14
|
|
23
|
-
def scan_for_replacements(txt, am, index, character_array)
|
24
|
-
character = character_array[index]
|
25
|
-
prepositive = @language::Abbreviation::PREPOSITIVE_ABBREVIATIONS
|
26
|
-
number_abbr = @language::Abbreviation::NUMBER_ABBREVIATIONS
|
27
|
-
upper = /[[:upper:]]/.match(character.to_s)
|
28
|
-
if upper.nil? || prepositive.include?(am.downcase.strip)
|
29
|
-
if prepositive.include?(am.downcase.strip)
|
30
|
-
txt = replace_prepositive_abbr(txt, am)
|
31
|
-
elsif number_abbr.include?(am.downcase.strip)
|
32
|
-
txt = replace_pre_number_abbr(txt, am)
|
33
|
-
else
|
34
|
-
txt = replace_period_of_abbr(txt, am)
|
35
|
-
end
|
36
|
-
end
|
37
|
-
txt
|
38
|
-
end
|
39
|
-
|
40
15
|
def replace_period_of_abbr(txt, abbr)
|
41
16
|
txt.gsub(/(?<=\s#{abbr.strip})\./, '∯')
|
42
17
|
.gsub(/(?<=\A#{abbr.strip})\./, '∯')
|
@@ -8,15 +8,6 @@ module PragmaticSegmenter
|
|
8
8
|
PREPOSITIVE_ABBREVIATIONS = ['a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'ee', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'mt', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'prof', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'sra', 'srta', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta']
|
9
9
|
NUMBER_ABBREVIATIONS = ['cra', 'ext', 'no', 'nos', 'p', 'pp', 'tel']
|
10
10
|
end
|
11
|
-
|
12
|
-
class Cleaner < Cleaner
|
13
|
-
private
|
14
|
-
|
15
|
-
def abbreviations
|
16
|
-
Abbreviation::ABBREVIATIONS
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
11
|
end
|
21
12
|
end
|
22
13
|
end
|
@@ -5,6 +5,8 @@ module PragmaticSegmenter
|
|
5
5
|
# newlines before each list item.
|
6
6
|
class List
|
7
7
|
ROMAN_NUMERALS = %w(i ii iii iv v vi vii viii ix x xi xii xiii xiv x xi xii xiii xv xvi xvii xviii xix xx)
|
8
|
+
LATIN_NUMERALS = ('a'..'z').to_a
|
9
|
+
|
8
10
|
# Rubular: http://rubular.com/r/XcpaJKH0sz
|
9
11
|
ALPHABETICAL_LIST_WITH_PERIODS =
|
10
12
|
/(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)/
|
@@ -45,10 +47,10 @@ module PragmaticSegmenter
|
|
45
47
|
end
|
46
48
|
|
47
49
|
def add_line_break
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
format_numbered_list_with_parens
|
50
|
+
format_alphabetical_lists
|
51
|
+
format_roman_numeral_lists
|
52
|
+
format_numbered_list_with_periods
|
53
|
+
format_numbered_list_with_parens
|
52
54
|
end
|
53
55
|
|
54
56
|
def replace_parens
|
@@ -63,64 +65,63 @@ module PragmaticSegmenter
|
|
63
65
|
|
64
66
|
private
|
65
67
|
|
66
|
-
def format_numbered_list_with_parens
|
67
|
-
|
68
|
-
|
69
|
-
|
68
|
+
def format_numbered_list_with_parens
|
69
|
+
replace_parens_in_numbered_list
|
70
|
+
add_line_breaks_for_numbered_list_with_parens
|
71
|
+
@text.apply(ListMarkerRule)
|
70
72
|
end
|
71
73
|
|
72
|
-
def format_numbered_list_with_periods
|
73
|
-
|
74
|
-
|
75
|
-
|
74
|
+
def format_numbered_list_with_periods
|
75
|
+
replace_periods_in_numbered_list
|
76
|
+
add_line_breaks_for_numbered_list_with_periods
|
77
|
+
@text.apply(SubstituteListPeriodRule)
|
76
78
|
end
|
77
79
|
|
78
|
-
def format_alphabetical_lists
|
79
|
-
|
80
|
-
add_line_breaks_for_alphabetical_list_with_parens(
|
80
|
+
def format_alphabetical_lists
|
81
|
+
add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: false)
|
82
|
+
add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: false)
|
81
83
|
end
|
82
84
|
|
83
|
-
def format_roman_numeral_lists
|
84
|
-
|
85
|
-
add_line_breaks_for_alphabetical_list_with_parens(
|
85
|
+
def format_roman_numeral_lists
|
86
|
+
add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: true)
|
87
|
+
add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: true)
|
86
88
|
end
|
87
89
|
|
88
|
-
def replace_periods_in_numbered_list
|
89
|
-
scan_lists(NUMBERED_LIST_REGEX_1, NUMBERED_LIST_REGEX_2, '♨', true
|
90
|
+
def replace_periods_in_numbered_list
|
91
|
+
scan_lists(NUMBERED_LIST_REGEX_1, NUMBERED_LIST_REGEX_2, '♨', strip: true)
|
90
92
|
end
|
91
93
|
|
92
|
-
def add_line_breaks_for_numbered_list_with_periods
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
txt.apply(SpaceBetweenListItemsFirstRule).
|
97
|
-
apply(SpaceBetweenListItemsSecondRule)
|
94
|
+
def add_line_breaks_for_numbered_list_with_periods
|
95
|
+
if @text.include?('♨') && @text !~ /♨.+\n.+♨|♨.+\r.+♨/ && @text !~ /for\s\d{1,2}♨\s[a-z]/
|
96
|
+
@text.apply(SpaceBetweenListItemsFirstRule, SpaceBetweenListItemsSecondRule)
|
97
|
+
end
|
98
98
|
end
|
99
99
|
|
100
|
-
def replace_parens_in_numbered_list
|
100
|
+
def replace_parens_in_numbered_list
|
101
101
|
scan_lists(
|
102
|
-
NUMBERED_LIST_PARENS_REGEX, NUMBERED_LIST_PARENS_REGEX, '☝'
|
102
|
+
NUMBERED_LIST_PARENS_REGEX, NUMBERED_LIST_PARENS_REGEX, '☝')
|
103
|
+
scan_lists(NUMBERED_LIST_PARENS_REGEX, NUMBERED_LIST_PARENS_REGEX, '☝')
|
103
104
|
end
|
104
105
|
|
105
|
-
def add_line_breaks_for_numbered_list_with_parens
|
106
|
-
|
107
|
-
|
106
|
+
def add_line_breaks_for_numbered_list_with_parens
|
107
|
+
if @text.include?('☝') && @text !~ /☝.+\n.+☝|☝.+\r.+☝/
|
108
|
+
@text.apply(SpaceBetweenListItemsThirdRule)
|
109
|
+
end
|
108
110
|
end
|
109
111
|
|
110
|
-
def scan_lists(regex1, regex2, replacement, strip
|
111
|
-
list_array =
|
112
|
+
def scan_lists(regex1, regex2, replacement, strip: false)
|
113
|
+
list_array = @text.scan(regex1).map(&:to_i)
|
112
114
|
list_array.each_with_index do |a, i|
|
113
115
|
next unless (a + 1).eql?(list_array[i + 1]) ||
|
114
116
|
(a - 1).eql?(list_array[i - 1]) ||
|
115
117
|
(a.eql?(0) && list_array[i - 1].eql?(9)) ||
|
116
118
|
(a.eql?(9) && list_array[i + 1].eql?(0))
|
117
|
-
substitute_found_list_items(
|
119
|
+
substitute_found_list_items(regex2, a, strip, replacement)
|
118
120
|
end
|
119
|
-
txt
|
120
121
|
end
|
121
122
|
|
122
|
-
def substitute_found_list_items(
|
123
|
-
|
123
|
+
def substitute_found_list_items(regex, a, strip, replacement)
|
124
|
+
@text.gsub!(regex).with_index do |m|
|
124
125
|
if a.to_s.eql?(strip ? m.strip.chop : m)
|
125
126
|
"#{Regexp.escape(a.to_s)}" + replacement
|
126
127
|
else
|
@@ -129,22 +130,24 @@ module PragmaticSegmenter
|
|
129
130
|
end
|
130
131
|
end
|
131
132
|
|
132
|
-
def add_line_breaks_for_alphabetical_list_with_periods(
|
133
|
-
iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PERIODS,
|
133
|
+
def add_line_breaks_for_alphabetical_list_with_periods(roman_numeral: false)
|
134
|
+
iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PERIODS, roman_numeral: roman_numeral)
|
134
135
|
end
|
135
136
|
|
136
|
-
def add_line_breaks_for_alphabetical_list_with_parens(
|
137
|
-
iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PARENS,
|
137
|
+
def add_line_breaks_for_alphabetical_list_with_parens(roman_numeral: false)
|
138
|
+
iterate_alphabet_array(ALPHABETICAL_LIST_WITH_PARENS,
|
139
|
+
parens: true,
|
140
|
+
roman_numeral: roman_numeral)
|
138
141
|
end
|
139
142
|
|
140
|
-
def replace_alphabet_list(a
|
141
|
-
|
143
|
+
def replace_alphabet_list(a)
|
144
|
+
@text.gsub!(ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX).with_index do |m|
|
142
145
|
a.eql?(m.chomp('.')) ? "\r#{Regexp.escape(a.to_s)}∯" : "#{m}"
|
143
146
|
end
|
144
147
|
end
|
145
148
|
|
146
|
-
def replace_alphabet_list_parens(a
|
147
|
-
|
149
|
+
def replace_alphabet_list_parens(a)
|
150
|
+
@text.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX).with_index do |m|
|
148
151
|
if m.include?('(')
|
149
152
|
a.eql?(m.dup.downcase.gsub!(/\(/, '')) ? "\r&✂&#{Regexp.escape(m.gsub!(/\(/, ''))}" : "#{m}"
|
150
153
|
else
|
@@ -153,48 +156,47 @@ module PragmaticSegmenter
|
|
153
156
|
end
|
154
157
|
end
|
155
158
|
|
156
|
-
def replace_correct_alphabet_list(a,
|
159
|
+
def replace_correct_alphabet_list(a, parens)
|
157
160
|
if parens
|
158
|
-
replace_alphabet_list_parens(a
|
161
|
+
replace_alphabet_list_parens(a)
|
159
162
|
else
|
160
|
-
replace_alphabet_list(a
|
163
|
+
replace_alphabet_list(a)
|
161
164
|
end
|
162
165
|
end
|
163
166
|
|
164
|
-
def last_array_item_replacement(a, i, alphabet, list_array,
|
167
|
+
def last_array_item_replacement(a, i, alphabet, list_array, parens)
|
165
168
|
return if alphabet & list_array == [] ||
|
166
169
|
!alphabet.include?(list_array[i - 1]) ||
|
167
170
|
!alphabet.include?(a)
|
168
171
|
return if (alphabet.index(list_array[i - 1]) - alphabet.index(a)).abs != 1
|
169
|
-
replace_correct_alphabet_list(a,
|
172
|
+
replace_correct_alphabet_list(a, parens)
|
170
173
|
end
|
171
174
|
|
172
|
-
def other_items_replacement(a, i, alphabet, list_array,
|
175
|
+
def other_items_replacement(a, i, alphabet, list_array, parens)
|
173
176
|
return if alphabet & list_array == [] ||
|
174
177
|
!alphabet.include?(list_array[i - 1]) ||
|
175
178
|
!alphabet.include?(a) ||
|
176
179
|
!alphabet.include?(list_array[i + 1])
|
177
180
|
return if alphabet.index(list_array[i + 1]) - alphabet.index(a) != 1 &&
|
178
181
|
(alphabet.index(list_array[i - 1]) - alphabet.index(a)).abs != 1
|
179
|
-
replace_correct_alphabet_list(a,
|
182
|
+
replace_correct_alphabet_list(a, parens)
|
180
183
|
end
|
181
184
|
|
182
|
-
def iterate_alphabet_array(regex, parens
|
183
|
-
list_array =
|
185
|
+
def iterate_alphabet_array(regex, parens: false, roman_numeral: false)
|
186
|
+
list_array = @text.scan(regex).map(&:downcase)
|
184
187
|
if roman_numeral
|
185
188
|
alphabet = ROMAN_NUMERALS
|
186
189
|
else
|
187
|
-
alphabet =
|
190
|
+
alphabet = LATIN_NUMERALS
|
188
191
|
end
|
189
192
|
list_array.delete_if { |item| !alphabet.any? { |a| a.include?(item) } }
|
190
193
|
list_array.each_with_index do |a, i|
|
191
194
|
if i.eql?(list_array.length - 1)
|
192
|
-
last_array_item_replacement(a, i, alphabet, list_array,
|
195
|
+
last_array_item_replacement(a, i, alphabet, list_array, parens)
|
193
196
|
else
|
194
|
-
other_items_replacement(a, i, alphabet, list_array,
|
197
|
+
other_items_replacement(a, i, alphabet, list_array, parens)
|
195
198
|
end
|
196
199
|
end
|
197
|
-
txt
|
198
200
|
end
|
199
201
|
end
|
200
202
|
end
|