pragmatic_segmenter 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +1 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +730 -0
- data/Rakefile +4 -0
- data/lib/pragmatic_segmenter.rb +2 -0
- data/lib/pragmatic_segmenter/abbreviation.rb +22 -0
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +149 -0
- data/lib/pragmatic_segmenter/between_punctuation.rb +78 -0
- data/lib/pragmatic_segmenter/cleaner.rb +141 -0
- data/lib/pragmatic_segmenter/ellipsis.rb +36 -0
- data/lib/pragmatic_segmenter/exclamation_words.rb +19 -0
- data/lib/pragmatic_segmenter/languages/amharic.rb +33 -0
- data/lib/pragmatic_segmenter/languages/arabic.rb +83 -0
- data/lib/pragmatic_segmenter/languages/armenian.rb +33 -0
- data/lib/pragmatic_segmenter/languages/burmese.rb +33 -0
- data/lib/pragmatic_segmenter/languages/deutsch.rb +132 -0
- data/lib/pragmatic_segmenter/languages/english.rb +44 -0
- data/lib/pragmatic_segmenter/languages/french.rb +29 -0
- data/lib/pragmatic_segmenter/languages/greek.rb +29 -0
- data/lib/pragmatic_segmenter/languages/hindi.rb +33 -0
- data/lib/pragmatic_segmenter/languages/italian.rb +39 -0
- data/lib/pragmatic_segmenter/languages/japanese.rb +58 -0
- data/lib/pragmatic_segmenter/languages/persian.rb +56 -0
- data/lib/pragmatic_segmenter/languages/russian.rb +60 -0
- data/lib/pragmatic_segmenter/languages/spanish.rb +39 -0
- data/lib/pragmatic_segmenter/languages/urdu.rb +33 -0
- data/lib/pragmatic_segmenter/list.rb +169 -0
- data/lib/pragmatic_segmenter/number.rb +35 -0
- data/lib/pragmatic_segmenter/process.rb +126 -0
- data/lib/pragmatic_segmenter/punctuation.rb +12 -0
- data/lib/pragmatic_segmenter/punctuation_replacer.rb +62 -0
- data/lib/pragmatic_segmenter/rules.rb +38 -0
- data/lib/pragmatic_segmenter/segmenter.rb +81 -0
- data/lib/pragmatic_segmenter/sentence_boundary_punctuation.rb +17 -0
- data/lib/pragmatic_segmenter/single_letter_abbreviation.rb +37 -0
- data/lib/pragmatic_segmenter/types.rb +12 -0
- data/lib/pragmatic_segmenter/version.rb +3 -0
- data/pragmatic_segmenter.gemspec +25 -0
- data/spec/performance_spec.rb +24 -0
- data/spec/pragmatic_segmenter_spec.rb +1906 -0
- data/spec/spec_helper.rb +1 -0
- metadata +150 -0
data/Rakefile
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
|
3
|
+
module PragmaticSegmenter
  # Defines the abbreviations for each language (if available).
  #
  # Exposes three lists consumed by AbbreviationReplacer:
  # - #all: every known abbreviation (periods stripped, lowercase)
  # - #prepositive: abbreviations that precede a name/title (e.g. 'mr', 'dr')
  # - #number: abbreviations that typically precede a number (e.g. 'no', 'pp')
  class Abbreviation
    # Frozen so the shared constant cannot be mutated by callers.
    ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk'].freeze

    # Abbreviations that come before a proper noun (titles, ranks, etc.).
    PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs'].freeze

    # Abbreviations that are usually followed by a number (e.g. "No. 4").
    NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp'].freeze

    # @return [Array<String>] every known abbreviation
    def all
      ABBREVIATIONS
    end

    # @return [Array<String>] abbreviations that precede a name/title
    def prepositive
      PREPOSITIVE_ABBREVIATIONS
    end

    # @return [Array<String>] abbreviations that precede a number
    def number
      NUMBER_ABBREVIATIONS
    end
  end
end
|
@@ -0,0 +1,149 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
require 'pragmatic_segmenter/abbreviation'
|
3
|
+
require 'pragmatic_segmenter/single_letter_abbreviation'
|
4
|
+
|
5
|
+
module PragmaticSegmenter
  # This class searches for periods within an abbreviation and
  # replaces the periods.
  #
  # Periods that belong to an abbreviation (rather than ending a
  # sentence) are swapped for the placeholder glyph '∯' so that the
  # downstream segmenter does not split on them. Depends on the
  # project-local Rule and Text types and on SingleLetterAbbreviation.
  class AbbreviationReplacer
    # Swaps the period before a possessive ('s) for the placeholder.
    # Rubular: http://rubular.com/r/yqa4Rit8EY
    PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')

    # Matches abbreviations with several internal periods, e.g. "u.s.a."
    # Rubular: http://rubular.com/r/xDkpFZ0EgH
    MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i

    # Restores a real period after A.M./P.M. (upper or lower case) when
    # the next word is capitalized, i.e. a genuine sentence boundary.
    module AmPmRules
      # Rubular: http://rubular.com/r/Vnx3m4Spc8
      UpperCasePmRule = Rule.new(/(?<=P∯M)∯(?=\s[A-Z])/, '.')

      # Rubular: http://rubular.com/r/AJMCotJVbW
      UpperCaseAmRule = Rule.new(/(?<=A∯M)∯(?=\s[A-Z])/, '.')

      # Rubular: http://rubular.com/r/13q7SnOhgA
      LowerCasePmRule = Rule.new(/(?<=p∯m)∯(?=\s[A-Z])/, '.')

      # Rubular: http://rubular.com/r/DgUDq4mLz5
      LowerCaseAmRule = Rule.new(/(?<=a∯m)∯(?=\s[A-Z])/, '.')

      All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]
    end

    # Words that commonly start a sentence but (by assumption) never
    # directly follow abbreviations like "U.S." — used to restore real
    # sentence boundaries after those abbreviations.
    SENTENCE_STARTERS = %w(A Being Did For He How However I In Millions More She That The There They We What When Where Who Why)

    attr_reader :text
    # @param text [String] the text to process; wrapped in the
    #   project-local Text type (which provides #apply).
    def initialize(text:)
      @text = Text.new(text)
    end

    # Runs the full abbreviation pipeline and returns the reformatted
    # text with abbreviation periods replaced by '∯'.
    def replace
      @reformatted_text = text.apply(PossessiveAbbreviationRule)
      @reformatted_text = PragmaticSegmenter::SingleLetterAbbreviation.new(text: @reformatted_text).replace
      @reformatted_text = search_for_abbreviations_in_string(@reformatted_text, abbreviations)
      @reformatted_text = replace_multi_period_abbreviations(@reformatted_text)
      @reformatted_text = @reformatted_text.apply(AmPmRules::All)
      replace_abbreviation_as_sentence_boundary(@reformatted_text)
    end

    private

    # For each known abbreviation present in +txt+, finds its occurrences
    # and replaces the trailing period where appropriate.
    # NOTE(review): occurrences are scanned in +original+ (a snapshot of
    # txt) while the following-character lookup uses @text (the pristine
    # input) — presumably so earlier replacements don't shift matches;
    # confirm the two stay index-aligned for repeated abbreviations.
    def search_for_abbreviations_in_string(txt, abbr)
      original = txt.dup
      downcased = txt.downcase
      abbr.all.each do |a|
        # Cheap containment check before the more expensive scan.
        next unless downcased.include?(a.strip)
        abbrev_match = original.scan(/(?:^|\s|\r|\n)#{Regexp.escape(a.strip)}/i)
        next if abbrev_match.empty?
        # The single character that follows "<abbrev> " — used to decide
        # whether the abbreviation ends a sentence (next char uppercase).
        next_word_start = /(?<=#{Regexp.escape(a.strip)} ).{1}/
        character_array = @text.scan(next_word_start)
        abbrev_match.each_with_index do |am, index|
          txt = scan_for_replacements(txt, am, index, character_array, abbr)
        end
      end
      txt
    end

    # Decides which replacement applies to one matched abbreviation +am+:
    # prepositive (title), pre-number, or generic. Replacement only
    # happens when the following character is not uppercase (i.e. the
    # period is unlikely to be a sentence boundary) or the abbreviation
    # is prepositive (titles never end a sentence).
    def scan_for_replacements(txt, am, index, character_array, abbr)
      character = character_array[index]
      prepositive = abbr.prepositive
      number_abbr = abbr.number
      upper = /[[:upper:]]/.match(character.to_s)
      if upper.nil? || prepositive.include?(am.downcase.strip)
        if prepositive.include?(am.downcase.strip)
          txt = replace_prepositive_abbr(txt, am)
        elsif number_abbr.include?(am.downcase.strip)
          txt = replace_pre_number_abbr(txt, am)
        else
          txt = replace_period_of_abbr(txt, am)
        end
      end
      txt
    end

    # Language-agnostic abbreviation list; language subclasses presumably
    # override this to supply their own Abbreviation class.
    def abbreviations
      PragmaticSegmenter::Abbreviation.new
    end

    def replace_abbreviation_as_sentence_boundary(txt)
      # As we are being conservative and keeping ambiguous
      # sentence boundaries as one sentence instead of
      # splitting into two, we can split at words that
      # we know for certain never follow these abbreviations.
      # Some might say that the set of words that follow an
      # abbreviation such as U.S. (i.e. U.S. Government) is smaller than
      # the set of words that could start a sentence and
      # never follow U.S. However, we are being conservative
      # and not splitting by default, so we need to look for places
      # where we definitely can split. Obviously SENTENCE_STARTERS
      # will never cover all cases, but as the gem is named
      # 'Pragmatic Segmenter' we need to be pragmatic
      # and try to cover the words that most often start a
      # sentence but could never follow one of the abbreviations below.

      # NOTE(review): Regexp.escape is also used inside the replacement
      # strings; harmless for the current plain-word SENTENCE_STARTERS,
      # but it would inject literal backslashes if a starter ever
      # contained a regex metacharacter. ("\s" in a double-quoted Ruby
      # string is a literal space.)
      SENTENCE_STARTERS.each do |word|
        txt = txt.gsub(/U∯S∯\s#{Regexp.escape(word)}\s/, "U∯S\.\s#{Regexp.escape(word)}\s")
        .gsub(/U\.S∯\s#{Regexp.escape(word)}\s/, "U\.S\.\s#{Regexp.escape(word)}\s")
        .gsub(/U∯K∯\s#{Regexp.escape(word)}\s/, "U∯K\.\s#{Regexp.escape(word)}\s")
        .gsub(/U\.K∯\s#{Regexp.escape(word)}\s/, "U\.K\.\s#{Regexp.escape(word)}\s")
        .gsub(/E∯U∯\s#{Regexp.escape(word)}\s/, "E∯U\.\s#{Regexp.escape(word)}\s")
        .gsub(/E\.U∯\s#{Regexp.escape(word)}\s/, "E\.U\.\s#{Regexp.escape(word)}\s")
        .gsub(/U∯S∯A∯\s#{Regexp.escape(word)}\s/, "U∯S∯A\.\s#{Regexp.escape(word)}\s")
        .gsub(/U\.S\.A∯\s#{Regexp.escape(word)}\s/, "U\.S\.A\.\s#{Regexp.escape(word)}\s")
        .gsub(/I∯\s#{Regexp.escape(word)}\s/, "I\.\s#{Regexp.escape(word)}\s")
      end
      txt
    end

    # Replaces every period inside multi-period abbreviations such as
    # "u.s.a." with the placeholder.
    # NOTE(review): relies on Ruby's left-to-right argument evaluation —
    # the search regex is built from +r+ BEFORE the replacement argument
    # mutates +r+ in place via gsub!. Fragile; confirm before refactoring.
    def replace_multi_period_abbreviations(txt)
      mpa = txt.scan(MULTI_PERIOD_ABBREVIATION_REGEX)
      return txt if mpa.empty?
      mpa.each do |r|
        txt = txt.gsub(/#{Regexp.escape(r)}/, "#{r.gsub!('.', '∯')}")
      end
      txt
    end

    # NOTE(review): dead code — UPPERCASE_PM_REGEX etc. are not defined
    # anywhere in this file (superseded by AmPmRules, applied in #replace).
    # Calling this method would raise NameError.
    def replace_period_in_am_pm(txt)
      txt.gsub(UPPERCASE_PM_REGEX, '.')
      .gsub(UPPERCASE_AM_REGEX, '.')
      .gsub(LOWERCASE_PM_REGEX, '.')
      .gsub(LOWERCASE_AM_REGEX, '.')
    end

    # Replaces the period of a number abbreviation (e.g. "No. 4",
    # "pp. (12-14)") when followed by a digit or an opening paren.
    def replace_pre_number_abbr(txt, abbr)
      txt.gsub(/(?<=#{abbr.strip})\.(?=\s\d)/, '∯').gsub(/(?<=#{abbr.strip})\.(?=\s+\()/, '∯')
    end

    # Replaces the period of a prepositive abbreviation (title) whenever
    # it is followed by whitespace — titles never end a sentence.
    def replace_prepositive_abbr(txt, abbr)
      txt.gsub(/(?<=#{abbr.strip})\.(?=\s)/, '∯')
    end

    # Generic case: replace the abbreviation's period when the following
    # context (lowercase word, "I", digit, comma, or further punctuation)
    # suggests the sentence continues.
    def replace_period_of_abbr(txt, abbr)
      txt.gsub(/(?<=#{abbr.strip})\.(?=((\.|:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
      .gsub(/(?<=#{abbr.strip})\.(?=,)/, '∯')
    end

    # NOTE(review): dead code — POSSESSIVE_ABBREVIATION_REGEX is not
    # defined (superseded by PossessiveAbbreviationRule, applied in
    # #replace). Calling this method would raise NameError.
    def replace_possessive_abbreviations(txt)
      txt.gsub(POSSESSIVE_ABBREVIATION_REGEX, '∯')
    end
  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
require 'pragmatic_segmenter/punctuation_replacer'
|
3
|
+
|
4
|
+
module PragmaticSegmenter
  # This class searches for punctuation between quotes or parenthesis
  # and replaces it.
  #
  # Each pass scans the text for one kind of delimited span (quotes of
  # various styles, parentheses) and hands every matched span to
  # PunctuationReplacer.
  class BetweenPunctuation
    # Rubular: http://rubular.com/r/2YFrKWQUYi
    BETWEEN_SINGLE_QUOTES_REGEX = /(?<=\s)'(?:[^']|'[a-zA-Z])*'/

    # Rubular: http://rubular.com/r/3Pw1QlXOjd
    BETWEEN_DOUBLE_QUOTES_REGEX = /"(?>[^"\\]+|\\{2}|\\.)*"/

    # Rubular: http://rubular.com/r/x6s4PZK8jc
    BETWEEN_QUOTE_ARROW_REGEX = /«(?>[^»\\]+|\\{2}|\\.)*»/

    # Rubular: http://rubular.com/r/JbAIpKdlSq
    BETWEEN_QUOTE_SLANTED_REGEX = /“(?>[^”\\]+|\\{2}|\\.)*”/

    # Rubular: http://rubular.com/r/6tTityPflI
    BETWEEN_PARENS_REGEX = /\((?>[^\(\)\\]+|\\{2}|\\.)*\)/

    attr_reader :text

    # @param text [String] the text to process
    def initialize(text:)
      @text = text
    end

    # Runs every punctuation pass over the text, preserving the original
    # pass order (single quotes, double quotes, parens, arrow quotes,
    # slanted quotes), and returns the result of the final pass.
    def replace
      outcome = nil
      [BETWEEN_SINGLE_QUOTES_REGEX,
       BETWEEN_DOUBLE_QUOTES_REGEX,
       BETWEEN_PARENS_REGEX,
       BETWEEN_QUOTE_ARROW_REGEX,
       BETWEEN_QUOTE_SLANTED_REGEX].each do |pattern|
        outcome = substitute_punctuation(text, pattern)
      end
      outcome
    end

    private

    # One pass: collect the spans matching +pattern+ in +txt+ and let
    # PunctuationReplacer substitute the punctuation inside them.
    def substitute_punctuation(txt, pattern)
      PragmaticSegmenter::PunctuationReplacer.new(
        matches_array: txt.scan(pattern),
        text: txt
      ).replace
    end
  end
end
|
@@ -0,0 +1,141 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
|
3
|
+
module PragmaticSegmenter
|
4
|
+
module Rules
|
5
|
+
    # Rules that strip HTML markup from the input before segmentation.
    module HtmlRules
      # Removes literal HTML tags, e.g. "<p>" or "</div>".
      # Rubular: http://rubular.com/r/ENrVFMdJ8v
      HTMLTagRule = Rule.new(/<\/?[^>]*>/, '')

      # Removes entity-escaped tag remnants ending in "gt;".
      # NOTE(review): [^gt;] is a character class excluding 'g', 't' and
      # ';' — possibly intended to match "&lt;...&gt;" style escapes;
      # confirm intent before changing.
      # Rubular: http://rubular.com/r/XZVqMPJhea
      EscapedHTMLTagRule = Rule.new(/<\/?[^gt;]*gt;/, '')

      All = [HTMLTagRule, EscapedHTMLTagRule]
    end
|
14
|
+
end
|
15
|
+
|
16
|
+
# This is an opinionated class that removes errant newlines,
|
17
|
+
# xhtml, inline formatting, etc.
|
18
|
+
  # This is an opinionated class that removes errant newlines,
  # xhtml, inline formatting, etc.
  #
  # Works on the project-local Text type (a String that responds to
  # #apply, which applies Rule substitutions).
  class Cleaner
    include Rules
    # Joins a word broken across a newline (1-2 trailing letters).
    # Rubular: http://rubular.com/r/V57WnM9Zut
    NewLineInMiddleOfWordRule = Rule.new(/\n(?=[a-zA-Z]{1,2}\n)/, '')

    # A newline followed by a lowercase letter or '(' — i.e. mid-sentence.
    # Rubular: http://rubular.com/r/3GiRiP2IbD
    NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = /(?<=\s)\n(?=([a-z]|\())/

    # PDF extraction: drop a newline sandwiched between non-blank text.
    # Rubular: http://rubular.com/r/UZAVcwqck8
    PDF_NewLineInMiddleOfSentenceRule = Rule.new(/(?<=[^\n]\s)\n(?=\S)/, '')

    # PDF extraction: newline directly before a lowercase letter (no
    # preceding space) becomes a space.
    # Rubular: http://rubular.com/r/eaNwGavmdo
    PDF_NewLineInMiddleOfSentenceNoSpacesRule = Rule.new(/\n(?=[a-z])/, ' ')

    # Strips inline markup of the form {b^>N<b^}.
    # Rubular: http://rubular.com/r/bAJrhyLNeZ
    InlineFormattingRule = Rule.new(/\{b\^>\d*<b\^\}|\{b\^>\d*<b\^\}/, '')

    # Paragraph breaks (double newline, with or without a space between)
    # are normalized to a carriage return — "\r" marks a hard break.
    # Rubular: http://rubular.com/r/dMxp5MixFS
    DoubleNewLineWithSpaceRule = Rule.new(/\n \n/, "\r")

    # Rubular: http://rubular.com/r/H6HOJeA8bq
    DoubleNewLineRule = Rule.new(/\n\n/, "\r")

    # A newline before a bullet point starts a new segment.
    # Rubular: http://rubular.com/r/Gn18aAnLdZ
    NewLineFollowedByBulletRule = Rule.new(/\n(?=•)/, "\r")

    # Drop a newline that sits directly before a stray period.
    # Rubular: http://rubular.com/r/FseyMiiYFT
    NewLineFollowedByPeriodRule = Rule.new(/\n(?=\.(\s|\n))/, '')

    # Table-of-contents leader dots plus page numbers become a break.
    # Rubular: http://rubular.com/r/8mc1ArOIGy
    TableOfContentsRule = Rule.new(/\.{5,}\s*\d+-*\d*/, "\r")

    # Collapse any remaining run of 5+ periods to a space.
    # Rubular: http://rubular.com/r/DwNSuZrNtk
    ConsecutivePeriodsRule = Rule.new(/\.{5,}/, ' ')

    ReplaceNewlineWithCarriageReturnRule = Rule.new(/\n/, "\r")

    # Normalize doubled quote characters ('' and ``) to a double quote.
    QuotationsFirstRule = Rule.new(/''/, '"')
    QuotationsSecondRule = Rule.new(/``/, '"')

    attr_reader :text, :doc_type
    # @param text [String] *required — the text to clean
    # @param args [Hash] optional; :doc_type (e.g. 'pdf') selects
    #   PDF-specific newline handling
    def initialize(text:, **args)
      @text = Text.new(text.dup)
      @doc_type = args[:doc_type]
    end

    # Clean text of unwanted formatting
    #
    # Example:
    # >> text = "This is a sentence\ncut off in the middle because pdf."
    # >> PragmaticSegmenter::Cleaner(text: text).clean
    # => "This is a sentence cut off in the middle because pdf."
    #
    # Arguments:
    # text: (String) *required
    # language: (String) *optional
    # (two-digit ISO 639-1 code e.g. 'en')
    # doc_type: (String) *optional
    # (e.g. 'pdf')

    # Returns the cleaned text, or nil when no text was supplied.
    def clean
      return unless text
      @clean_text = remove_all_newlines(text)
      @clean_text = replace_double_newlines(@clean_text)
      @clean_text = replace_newlines(@clean_text)
      @clean_text = @clean_text.apply(HtmlRules::All)
      @clean_text = @clean_text.apply(InlineFormattingRule)
      @clean_text = clean_quotations(@clean_text)
      @clean_text = clean_table_of_contents(@clean_text)
    end

    private

    # Removes mid-sentence newlines, then newlines splitting a word.
    def remove_all_newlines(txt)
      clean_text = remove_newline_in_middle_of_sentence(txt)
      remove_newline_in_middle_of_word(clean_text)
    end

    # NOTE(review): unusual construction — gsub! iterates over a dup of
    # txt (its return value is discarded) while the block mutates txt
    # itself in place; presumably the dup exists so iteration is not
    # disturbed by the in-place edits. Confirm before refactoring.
    def remove_newline_in_middle_of_sentence(txt)
      txt.dup.gsub!(/(?:[^\.])*/) do |match|
        next unless match.include?("\n")
        orig = match.dup
        match.gsub!(NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX, '')
        txt.gsub!(/#{Regexp.escape(orig)}/, "#{match}")
      end
      txt
    end

    def remove_newline_in_middle_of_word(txt)
      txt.apply(NewLineInMiddleOfWordRule)
    end

    # Double newlines (paragraph breaks) become carriage returns.
    def replace_double_newlines(txt)
      txt.apply(DoubleNewLineWithSpaceRule).
        apply(DoubleNewLineRule)
    end

    # PDF documents get dedicated line-break handling; everything else
    # maps remaining newlines to carriage returns.
    def replace_newlines(txt)
      if doc_type.eql?('pdf')
        txt = remove_pdf_line_breaks(txt)
      else
        txt =
          txt.apply(NewLineFollowedByPeriodRule).
            apply(ReplaceNewlineWithCarriageReturnRule)
      end
      txt
    end

    def remove_pdf_line_breaks(txt)
      txt.apply(NewLineFollowedByBulletRule).
        apply(PDF_NewLineInMiddleOfSentenceRule).
        apply(PDF_NewLineInMiddleOfSentenceNoSpacesRule)
    end

    def clean_quotations(txt)
      txt.apply(QuotationsFirstRule).
        apply(QuotationsSecondRule)
    end

    def clean_table_of_contents(txt)
      txt.apply(TableOfContentsRule).apply(ConsecutivePeriodsRule)
    end
  end
|
141
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
|
3
|
+
module PragmaticSegmenter
  module Rules
    # This class searches for ellipses within a string and
    # replaces the periods.

    # http://www.dailywritingtips.com/in-search-of-a-4-dot-ellipsis/
    # http://www.thepunctuationguide.com/ellipses.html

    # Each rule swaps an ellipsis variant for a distinct placeholder
    # glyph ('☏', 'ƪ', '♟', '♝') so the segmenter does not treat its
    # periods as sentence boundaries.
    # NOTE(review): the placeholders are presumably restored later in
    # the processing pipeline — confirm in Process/Segmenter.
    module EllipsisRules
      # "..." followed by a capitalized word: keep one boundary period.
      # Rubular: http://rubular.com/r/i60hCK81fz
      ThreeConsecutiveRule = Rule.new(/\.\.\.(?=\s+[A-Z])/, '☏.')

      # Four-dot ellipsis (sentence-ending ellipsis): mask the first
      # three dots, leaving the final period as the boundary.
      # Rubular: http://rubular.com/r/Hdqpd90owl
      FourConsecutiveRule = Rule.new(/(?<=\S)\.{3}(?=\.\s[A-Z])/, 'ƪ')

      # Spaced three-dot ellipsis " . . . ".
      # Rubular: http://rubular.com/r/YBG1dIHTRu
      ThreeSpaceRule = Rule.new(/(\s\.){3}\s/, '♟')

      # Spaced four-dot ellipsis at end of line/string.
      # Rubular: http://rubular.com/r/2VvZ8wRbd8
      FourSpaceRule = Rule.new(/(?<=[a-z])(\.\s){3}\.(\z|$|\n)/, '♝')

      # Any remaining "..." (catch-all; must run after the rules above).
      OtherThreePeriodRule = Rule.new(/\.\.\./, 'ƪ')

      # Application order matters: specific patterns first, catch-all last.
      All = [
        ThreeSpaceRule,
        FourSpaceRule,
        FourConsecutiveRule,
        ThreeConsecutiveRule,
        OtherThreePeriodRule
      ]

    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# -*- encoding : utf-8 -*-
|
2
|
+
require 'pragmatic_segmenter/punctuation_replacer'
|
3
|
+
|
4
|
+
module PragmaticSegmenter
  # This class searches for exclamation points that
  # are part of words and not ending punctuation and replaces them.
  #
  # Examples: the language name "!Kung" or the brand "Yahoo!" — the '!'
  # belongs to the word and must not be treated as a sentence boundary.
  module ExclamationWords
    WORDS_WITH_EXCLAMATIONS = ['!Xũ', '!Kung', 'ǃʼOǃKung', '!Xuun', '!Kung-Ekoka', 'ǃHu', 'ǃKhung', 'ǃKu', 'ǃung', 'ǃXo', 'ǃXû', 'ǃXung', 'ǃXũ', '!Xun', 'Yahoo!', 'Y!J', 'Yum!']

    # For every known exclamation-word, find its occurrences in +text+
    # and hand them to PunctuationReplacer for substitution.
    def self.apply_rules(text)
      WORDS_WITH_EXCLAMATIONS.each do |word|
        occurrences = text.scan(/#{Regexp.escape(word)}/)
        PragmaticSegmenter::PunctuationReplacer.new(
          matches_array: occurrences,
          text: text
        ).replace
      end
    end
  end
end