pragmatic_segmenter 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +1 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +730 -0
- data/Rakefile +4 -0
- data/lib/pragmatic_segmenter.rb +2 -0
- data/lib/pragmatic_segmenter/abbreviation.rb +22 -0
- data/lib/pragmatic_segmenter/abbreviation_replacer.rb +149 -0
- data/lib/pragmatic_segmenter/between_punctuation.rb +78 -0
- data/lib/pragmatic_segmenter/cleaner.rb +141 -0
- data/lib/pragmatic_segmenter/ellipsis.rb +36 -0
- data/lib/pragmatic_segmenter/exclamation_words.rb +19 -0
- data/lib/pragmatic_segmenter/languages/amharic.rb +33 -0
- data/lib/pragmatic_segmenter/languages/arabic.rb +83 -0
- data/lib/pragmatic_segmenter/languages/armenian.rb +33 -0
- data/lib/pragmatic_segmenter/languages/burmese.rb +33 -0
- data/lib/pragmatic_segmenter/languages/deutsch.rb +132 -0
- data/lib/pragmatic_segmenter/languages/english.rb +44 -0
- data/lib/pragmatic_segmenter/languages/french.rb +29 -0
- data/lib/pragmatic_segmenter/languages/greek.rb +29 -0
- data/lib/pragmatic_segmenter/languages/hindi.rb +33 -0
- data/lib/pragmatic_segmenter/languages/italian.rb +39 -0
- data/lib/pragmatic_segmenter/languages/japanese.rb +58 -0
- data/lib/pragmatic_segmenter/languages/persian.rb +56 -0
- data/lib/pragmatic_segmenter/languages/russian.rb +60 -0
- data/lib/pragmatic_segmenter/languages/spanish.rb +39 -0
- data/lib/pragmatic_segmenter/languages/urdu.rb +33 -0
- data/lib/pragmatic_segmenter/list.rb +169 -0
- data/lib/pragmatic_segmenter/number.rb +35 -0
- data/lib/pragmatic_segmenter/process.rb +126 -0
- data/lib/pragmatic_segmenter/punctuation.rb +12 -0
- data/lib/pragmatic_segmenter/punctuation_replacer.rb +62 -0
- data/lib/pragmatic_segmenter/rules.rb +38 -0
- data/lib/pragmatic_segmenter/segmenter.rb +81 -0
- data/lib/pragmatic_segmenter/sentence_boundary_punctuation.rb +17 -0
- data/lib/pragmatic_segmenter/single_letter_abbreviation.rb +37 -0
- data/lib/pragmatic_segmenter/types.rb +12 -0
- data/lib/pragmatic_segmenter/version.rb +3 -0
- data/pragmatic_segmenter.gemspec +25 -0
- data/spec/performance_spec.rb +24 -0
- data/spec/pragmatic_segmenter_spec.rb +1906 -0
- data/spec/spec_helper.rb +1 -0
- metadata +150 -0
data/lib/pragmatic_segmenter/abbreviation.rb
ADDED
@@ -0,0 +1,22 @@
+# -*- encoding : utf-8 -*-
+
+module PragmaticSegmenter
+  # Defines the abbreviations for each language (if available)
+  class Abbreviation
+    ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']
+    PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs']
+    NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']
+
+    def all
+      ABBREVIATIONS
+    end
+
+    def prepositive
+      PREPOSITIVE_ABBREVIATIONS
+    end
+
+    def number
+      NUMBER_ABBREVIATIONS
+    end
+  end
+end
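
The class above is a plain lookup object, so it can be exercised on its own. A minimal sketch, assuming the gem is installed and this file is on the load path (the example values are picked from the constants shown above):

    require 'pragmatic_segmenter/abbreviation'

    abbr = PragmaticSegmenter::Abbreviation.new
    abbr.all.include?('dr')          # => true  (full abbreviation list)
    abbr.prepositive.include?('mr')  # => true  (titles that precede a name)
    abbr.number.include?('pp')       # => true  (abbreviations usually followed by a number)
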
data/lib/pragmatic_segmenter/abbreviation_replacer.rb
ADDED
@@ -0,0 +1,149 @@
+# -*- encoding : utf-8 -*-
+require 'pragmatic_segmenter/abbreviation'
+require 'pragmatic_segmenter/single_letter_abbreviation'
+
+module PragmaticSegmenter
+  # This class searches for periods within an abbreviation and
+  # replaces the periods.
+  class AbbreviationReplacer
+    # Rubular: http://rubular.com/r/yqa4Rit8EY
+    PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')
+
+    # Rubular: http://rubular.com/r/xDkpFZ0EgH
+    MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i
+
+    module AmPmRules
+      # Rubular: http://rubular.com/r/Vnx3m4Spc8
+      UpperCasePmRule = Rule.new(/(?<=P∯M)∯(?=\s[A-Z])/, '.')
+
+      # Rubular: http://rubular.com/r/AJMCotJVbW
+      UpperCaseAmRule = Rule.new(/(?<=A∯M)∯(?=\s[A-Z])/, '.')
+
+      # Rubular: http://rubular.com/r/13q7SnOhgA
+      LowerCasePmRule = Rule.new(/(?<=p∯m)∯(?=\s[A-Z])/, '.')
+
+      # Rubular: http://rubular.com/r/DgUDq4mLz5
+      LowerCaseAmRule = Rule.new(/(?<=a∯m)∯(?=\s[A-Z])/, '.')
+
+      All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]
+    end
+
+    SENTENCE_STARTERS = %w(A Being Did For He How However I In Millions More She That The There They We What When Where Who Why)
+
+    attr_reader :text
+    def initialize(text:)
+      @text = Text.new(text)
+    end
+
+    def replace
+      @reformatted_text = text.apply(PossessiveAbbreviationRule)
+      @reformatted_text = PragmaticSegmenter::SingleLetterAbbreviation.new(text: @reformatted_text).replace
+      @reformatted_text = search_for_abbreviations_in_string(@reformatted_text, abbreviations)
+      @reformatted_text = replace_multi_period_abbreviations(@reformatted_text)
+      @reformatted_text = @reformatted_text.apply(AmPmRules::All)
+      replace_abbreviation_as_sentence_boundary(@reformatted_text)
+    end
+
+    private
+
+    def search_for_abbreviations_in_string(txt, abbr)
+      original = txt.dup
+      downcased = txt.downcase
+      abbr.all.each do |a|
+        next unless downcased.include?(a.strip)
+        abbrev_match = original.scan(/(?:^|\s|\r|\n)#{Regexp.escape(a.strip)}/i)
+        next if abbrev_match.empty?
+        next_word_start = /(?<=#{Regexp.escape(a.strip)} ).{1}/
+        character_array = @text.scan(next_word_start)
+        abbrev_match.each_with_index do |am, index|
+          txt = scan_for_replacements(txt, am, index, character_array, abbr)
+        end
+      end
+      txt
+    end
+
+    def scan_for_replacements(txt, am, index, character_array, abbr)
+      character = character_array[index]
+      prepositive = abbr.prepositive
+      number_abbr = abbr.number
+      upper = /[[:upper:]]/.match(character.to_s)
+      if upper.nil? || prepositive.include?(am.downcase.strip)
+        if prepositive.include?(am.downcase.strip)
+          txt = replace_prepositive_abbr(txt, am)
+        elsif number_abbr.include?(am.downcase.strip)
+          txt = replace_pre_number_abbr(txt, am)
+        else
+          txt = replace_period_of_abbr(txt, am)
+        end
+      end
+      txt
+    end
+
+    def abbreviations
+      PragmaticSegmenter::Abbreviation.new
+    end
+
+    def replace_abbreviation_as_sentence_boundary(txt)
+      # As we are being conservative and keeping ambiguous
+      # sentence boundaries as one sentence instead of
+      # splitting into two, we can split at words that
+      # we know for certain never follow these abbreviations.
+      # Some might say that the set of words that follow an
+      # abbreviation such as U.S. (i.e. U.S. Government) is smaller than
+      # the set of words that could start a sentence and
+      # never follow U.S. However, we are being conservative
+      # and not splitting by default, so we need to look for places
+      # where we definitely can split. Obviously SENTENCE_STARTERS
+      # will never cover all cases, but as the gem is named
+      # 'Pragmatic Segmenter' we need to be pragmatic
+      # and try to cover the words that most often start a
+      # sentence but could never follow one of the abbreviations below.
+
+      SENTENCE_STARTERS.each do |word|
+        txt = txt.gsub(/U∯S∯\s#{Regexp.escape(word)}\s/, "U∯S\.\s#{Regexp.escape(word)}\s")
+                 .gsub(/U\.S∯\s#{Regexp.escape(word)}\s/, "U\.S\.\s#{Regexp.escape(word)}\s")
+                 .gsub(/U∯K∯\s#{Regexp.escape(word)}\s/, "U∯K\.\s#{Regexp.escape(word)}\s")
+                 .gsub(/U\.K∯\s#{Regexp.escape(word)}\s/, "U\.K\.\s#{Regexp.escape(word)}\s")
+                 .gsub(/E∯U∯\s#{Regexp.escape(word)}\s/, "E∯U\.\s#{Regexp.escape(word)}\s")
+                 .gsub(/E\.U∯\s#{Regexp.escape(word)}\s/, "E\.U\.\s#{Regexp.escape(word)}\s")
+                 .gsub(/U∯S∯A∯\s#{Regexp.escape(word)}\s/, "U∯S∯A\.\s#{Regexp.escape(word)}\s")
+                 .gsub(/U\.S\.A∯\s#{Regexp.escape(word)}\s/, "U\.S\.A\.\s#{Regexp.escape(word)}\s")
+                 .gsub(/I∯\s#{Regexp.escape(word)}\s/, "I\.\s#{Regexp.escape(word)}\s")
+      end
+      txt
+    end
+
+    def replace_multi_period_abbreviations(txt)
+      mpa = txt.scan(MULTI_PERIOD_ABBREVIATION_REGEX)
+      return txt if mpa.empty?
+      mpa.each do |r|
+        txt = txt.gsub(/#{Regexp.escape(r)}/, "#{r.gsub!('.', '∯')}")
+      end
+      txt
+    end
+
+    def replace_period_in_am_pm(txt)
+      txt.gsub(UPPERCASE_PM_REGEX, '.')
+         .gsub(UPPERCASE_AM_REGEX, '.')
+         .gsub(LOWERCASE_PM_REGEX, '.')
+         .gsub(LOWERCASE_AM_REGEX, '.')
+    end
+
+    def replace_pre_number_abbr(txt, abbr)
+      txt.gsub(/(?<=#{abbr.strip})\.(?=\s\d)/, '∯').gsub(/(?<=#{abbr.strip})\.(?=\s+\()/, '∯')
+    end
+
+    def replace_prepositive_abbr(txt, abbr)
+      txt.gsub(/(?<=#{abbr.strip})\.(?=\s)/, '∯')
+    end
+
+    def replace_period_of_abbr(txt, abbr)
+      txt.gsub(/(?<=#{abbr.strip})\.(?=((\.|:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
+         .gsub(/(?<=#{abbr.strip})\.(?=,)/, '∯')
+    end
+
+    def replace_possessive_abbreviations(txt)
+      txt.gsub(POSSESSIVE_ABBREVIATION_REGEX, '∯')
+    end
+  end
+end
data/lib/pragmatic_segmenter/between_punctuation.rb
ADDED
@@ -0,0 +1,78 @@
+# -*- encoding : utf-8 -*-
+require 'pragmatic_segmenter/punctuation_replacer'
+
+module PragmaticSegmenter
+  # This class searches for punctuation between quotes or parenthesis
+  # and replaces it
+  class BetweenPunctuation
+    # Rubular: http://rubular.com/r/2YFrKWQUYi
+    BETWEEN_SINGLE_QUOTES_REGEX = /(?<=\s)'(?:[^']|'[a-zA-Z])*'/
+
+    # Rubular: http://rubular.com/r/3Pw1QlXOjd
+    BETWEEN_DOUBLE_QUOTES_REGEX = /"(?>[^"\\]+|\\{2}|\\.)*"/
+
+    # Rubular: http://rubular.com/r/x6s4PZK8jc
+    BETWEEN_QUOTE_ARROW_REGEX = /«(?>[^»\\]+|\\{2}|\\.)*»/
+
+    # Rubular: http://rubular.com/r/JbAIpKdlSq
+    BETWEEN_QUOTE_SLANTED_REGEX = /“(?>[^”\\]+|\\{2}|\\.)*”/
+
+    # Rubular: http://rubular.com/r/6tTityPflI
+    BETWEEN_PARENS_REGEX = /\((?>[^\(\)\\]+|\\{2}|\\.)*\)/
+
+    attr_reader :text
+    def initialize(text:)
+      @text = text
+    end
+
+    def replace
+      sub_punctuation_between_quotes_and_parens(text)
+    end
+
+    private
+
+    def sub_punctuation_between_quotes_and_parens(txt)
+      sub_punctuation_between_single_quotes(txt)
+      sub_punctuation_between_double_quotes(txt)
+      sub_punctuation_between_parens(txt)
+      sub_punctuation_between_quotes_arrow(txt)
+      sub_punctuation_between_quotes_slanted(txt)
+    end
+
+    def sub_punctuation_between_parens(txt)
+      PragmaticSegmenter::PunctuationReplacer.new(
+        matches_array: txt.scan(BETWEEN_PARENS_REGEX),
+        text: txt
+      ).replace
+    end
+
+    def sub_punctuation_between_single_quotes(txt)
+      PragmaticSegmenter::PunctuationReplacer.new(
+        matches_array: txt.scan(BETWEEN_SINGLE_QUOTES_REGEX),
+        text: txt
+      ).replace
+    end
+
+    def sub_punctuation_between_double_quotes(txt)
+      btwn_dbl_quote = txt.scan(BETWEEN_DOUBLE_QUOTES_REGEX)
+      PragmaticSegmenter::PunctuationReplacer.new(
+        matches_array: btwn_dbl_quote,
+        text: txt
+      ).replace
+    end
+
+    def sub_punctuation_between_quotes_arrow(txt)
+      PragmaticSegmenter::PunctuationReplacer.new(
+        matches_array: txt.scan(BETWEEN_QUOTE_ARROW_REGEX),
+        text: txt
+      ).replace
+    end
+
+    def sub_punctuation_between_quotes_slanted(txt)
+      PragmaticSegmenter::PunctuationReplacer.new(
+        matches_array: txt.scan(BETWEEN_QUOTE_SLANTED_REGEX),
+        text: txt
+      ).replace
+    end
+  end
+end
data/lib/pragmatic_segmenter/cleaner.rb
ADDED
@@ -0,0 +1,141 @@
+# -*- encoding : utf-8 -*-
+
+module PragmaticSegmenter
+  module Rules
+    module HtmlRules
+      # Rubular: http://rubular.com/r/ENrVFMdJ8v
+      HTMLTagRule = Rule.new(/<\/?[^>]*>/, '')
+
+      # Rubular: http://rubular.com/r/XZVqMPJhea
+      EscapedHTMLTagRule = Rule.new(/&lt;\/?[^gt;]*gt;/, '')
+
+      All = [HTMLTagRule, EscapedHTMLTagRule]
+    end
+  end
+
+  # This is an opinionated class that removes errant newlines,
+  # xhtml, inline formatting, etc.
+  class Cleaner
+    include Rules
+    # Rubular: http://rubular.com/r/V57WnM9Zut
+    NewLineInMiddleOfWordRule = Rule.new(/\n(?=[a-zA-Z]{1,2}\n)/, '')
+
+    # Rubular: http://rubular.com/r/3GiRiP2IbD
+    NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = /(?<=\s)\n(?=([a-z]|\())/
+
+    # Rubular: http://rubular.com/r/UZAVcwqck8
+    PDF_NewLineInMiddleOfSentenceRule = Rule.new(/(?<=[^\n]\s)\n(?=\S)/, '')
+
+    # Rubular: http://rubular.com/r/eaNwGavmdo
+    PDF_NewLineInMiddleOfSentenceNoSpacesRule = Rule.new(/\n(?=[a-z])/, ' ')
+
+    # Rubular: http://rubular.com/r/bAJrhyLNeZ
+    InlineFormattingRule = Rule.new(/\{b\^&gt;\d*&lt;b\^\}|\{b\^>\d*<b\^\}/, '')
+
+    # Rubular: http://rubular.com/r/dMxp5MixFS
+    DoubleNewLineWithSpaceRule = Rule.new(/\n \n/, "\r")
+
+    # Rubular: http://rubular.com/r/H6HOJeA8bq
+    DoubleNewLineRule = Rule.new(/\n\n/, "\r")
+
+    # Rubular: http://rubular.com/r/Gn18aAnLdZ
+    NewLineFollowedByBulletRule = Rule.new(/\n(?=•)/, "\r")
+
+    # Rubular: http://rubular.com/r/FseyMiiYFT
+    NewLineFollowedByPeriodRule = Rule.new(/\n(?=\.(\s|\n))/, '')
+
+    # Rubular: http://rubular.com/r/8mc1ArOIGy
+    TableOfContentsRule = Rule.new(/\.{5,}\s*\d+-*\d*/, "\r")
+
+    # Rubular: http://rubular.com/r/DwNSuZrNtk
+    ConsecutivePeriodsRule = Rule.new(/\.{5,}/, ' ')
+
+    ReplaceNewlineWithCarriageReturnRule = Rule.new(/\n/, "\r")
+
+    QuotationsFirstRule = Rule.new(/''/, '"')
+    QuotationsSecondRule = Rule.new(/``/, '"')
+
+    attr_reader :text, :doc_type
+    def initialize(text:, **args)
+      @text = Text.new(text.dup)
+      @doc_type = args[:doc_type]
+    end
+
+    # Clean text of unwanted formatting
+    #
+    # Example:
+    #   >> text = "This is a sentence\ncut off in the middle because pdf."
+    #   >> PragmaticSegmenter::Cleaner(text: text).clean
+    #   => "This is a sentence cut off in the middle because pdf."
+    #
+    # Arguments:
+    #   text:     (String) *required
+    #   language: (String) *optional
+    #             (two-digit ISO 639-1 code e.g. 'en')
+    #   doc_type: (String) *optional
+    #             (e.g. 'pdf')
+
+    def clean
+      return unless text
+      @clean_text = remove_all_newlines(text)
+      @clean_text = replace_double_newlines(@clean_text)
+      @clean_text = replace_newlines(@clean_text)
+      @clean_text = @clean_text.apply(HtmlRules::All)
+      @clean_text = @clean_text.apply(InlineFormattingRule)
+      @clean_text = clean_quotations(@clean_text)
+      @clean_text = clean_table_of_contents(@clean_text)
+    end
+
+    private
+
+    def remove_all_newlines(txt)
+      clean_text = remove_newline_in_middle_of_sentence(txt)
+      remove_newline_in_middle_of_word(clean_text)
+    end
+
+    def remove_newline_in_middle_of_sentence(txt)
+      txt.dup.gsub!(/(?:[^\.])*/) do |match|
+        next unless match.include?("\n")
+        orig = match.dup
+        match.gsub!(NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX, '')
+        txt.gsub!(/#{Regexp.escape(orig)}/, "#{match}")
+      end
+      txt
+    end
+
+    def remove_newline_in_middle_of_word(txt)
+      txt.apply(NewLineInMiddleOfWordRule)
+    end
+
+    def replace_double_newlines(txt)
+      txt.apply(DoubleNewLineWithSpaceRule).
+        apply(DoubleNewLineRule)
+    end
+
+    def replace_newlines(txt)
+      if doc_type.eql?('pdf')
+        txt = remove_pdf_line_breaks(txt)
+      else
+        txt =
+          txt.apply(NewLineFollowedByPeriodRule).
+            apply(ReplaceNewlineWithCarriageReturnRule)
+      end
+      txt
+    end
+
+    def remove_pdf_line_breaks(txt)
+      txt.apply(NewLineFollowedByBulletRule).
+        apply(PDF_NewLineInMiddleOfSentenceRule).
+        apply(PDF_NewLineInMiddleOfSentenceNoSpacesRule)
+    end
+
+    def clean_quotations(txt)
+      txt.apply(QuotationsFirstRule).
+        apply(QuotationsSecondRule)
+    end
+
+    def clean_table_of_contents(txt)
+      txt.apply(TableOfContentsRule).apply(ConsecutivePeriodsRule)
+    end
+  end
+end
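
The docstring inside Cleaner sketches the intended call. A slightly fuller version, assuming the gem's Rule and Text types are loaded and passing doc_type: 'pdf' so the PDF line-break rules fire on the example input; the result shown is what the docstring above claims, not a verified spec output:

    require 'pragmatic_segmenter/cleaner'

    text = "This is a sentence\ncut off in the middle because pdf."
    PragmaticSegmenter::Cleaner.new(text: text, doc_type: 'pdf').clean
    # => "This is a sentence cut off in the middle because pdf."
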
data/lib/pragmatic_segmenter/ellipsis.rb
ADDED
@@ -0,0 +1,36 @@
+# -*- encoding : utf-8 -*-
+
+module PragmaticSegmenter
+  module Rules
+    # This class searches for ellipses within a string and
+    # replaces the periods.
+
+    # http://www.dailywritingtips.com/in-search-of-a-4-dot-ellipsis/
+    # http://www.thepunctuationguide.com/ellipses.html
+
+    module EllipsisRules
+      # Rubular: http://rubular.com/r/i60hCK81fz
+      ThreeConsecutiveRule = Rule.new(/\.\.\.(?=\s+[A-Z])/, '☏.')
+
+      # Rubular: http://rubular.com/r/Hdqpd90owl
+      FourConsecutiveRule = Rule.new(/(?<=\S)\.{3}(?=\.\s[A-Z])/, 'ƪ')
+
+      # Rubular: http://rubular.com/r/YBG1dIHTRu
+      ThreeSpaceRule = Rule.new(/(\s\.){3}\s/, '♟')
+
+      # Rubular: http://rubular.com/r/2VvZ8wRbd8
+      FourSpaceRule = Rule.new(/(?<=[a-z])(\.\s){3}\.(\z|$|\n)/, '♝')
+
+      OtherThreePeriodRule = Rule.new(/\.\.\./, 'ƪ')
+
+      All = [
+        ThreeSpaceRule,
+        FourSpaceRule,
+        FourConsecutiveRule,
+        ThreeConsecutiveRule,
+        OtherThreePeriodRule
+      ]
+
+    end
+  end
+end
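
These are plain Rule pairs, so they can be run through the gem's Text#apply helper. A sketch, assuming Text (types.rb) lives under the PragmaticSegmenter namespace, as its bare use inside AbbreviationReplacer above suggests:

    text = PragmaticSegmenter::Text.new('He paused ... then spoke. She waited . . . Nothing.')
    text.apply(PragmaticSegmenter::Rules::EllipsisRules::All)
    # Each ellipsis is meant to collapse to a single placeholder character
    # (ƪ, ♟, ♝ or ☏) so its periods no longer look like sentence boundaries.
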
data/lib/pragmatic_segmenter/exclamation_words.rb
ADDED
@@ -0,0 +1,19 @@
+# -*- encoding : utf-8 -*-
+require 'pragmatic_segmenter/punctuation_replacer'
+
+module PragmaticSegmenter
+  # This class searches for exclamation points that
+  # are part of words and not ending punctuation and replaces them.
+  module ExclamationWords
+    WORDS_WITH_EXCLAMATIONS = ['!Xũ', '!Kung', 'ǃʼOǃKung', '!Xuun', '!Kung-Ekoka', 'ǃHu', 'ǃKhung', 'ǃKu', 'ǃung', 'ǃXo', 'ǃXû', 'ǃXung', 'ǃXũ', '!Xun', 'Yahoo!', 'Y!J', 'Yum!']
+
+    def self.apply_rules(text)
+      WORDS_WITH_EXCLAMATIONS.each do |exclamation|
+        PragmaticSegmenter::PunctuationReplacer.new(
+          matches_array: text.scan(/#{Regexp.escape(exclamation)}/),
+          text: text
+        ).replace
+      end
+    end
+  end
+end
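
A final sketch for the module above, again assuming the Text type and PunctuationReplacer from the manifest are loaded; the sentence is illustrative and uses one of the listed words:

    require 'pragmatic_segmenter/exclamation_words'

    text = PragmaticSegmenter::Text.new('He works at Yahoo! in California.')
    PragmaticSegmenter::ExclamationWords.apply_rules(text)
    # The '!' in "Yahoo!" is intended to be masked so it is not mistaken for
    # terminal punctuation when the sentence is later segmented.
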