pragmatic_segmenter 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +1 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +730 -0
  7. data/Rakefile +4 -0
  8. data/lib/pragmatic_segmenter.rb +2 -0
  9. data/lib/pragmatic_segmenter/abbreviation.rb +22 -0
  10. data/lib/pragmatic_segmenter/abbreviation_replacer.rb +149 -0
  11. data/lib/pragmatic_segmenter/between_punctuation.rb +78 -0
  12. data/lib/pragmatic_segmenter/cleaner.rb +141 -0
  13. data/lib/pragmatic_segmenter/ellipsis.rb +36 -0
  14. data/lib/pragmatic_segmenter/exclamation_words.rb +19 -0
  15. data/lib/pragmatic_segmenter/languages/amharic.rb +33 -0
  16. data/lib/pragmatic_segmenter/languages/arabic.rb +83 -0
  17. data/lib/pragmatic_segmenter/languages/armenian.rb +33 -0
  18. data/lib/pragmatic_segmenter/languages/burmese.rb +33 -0
  19. data/lib/pragmatic_segmenter/languages/deutsch.rb +132 -0
  20. data/lib/pragmatic_segmenter/languages/english.rb +44 -0
  21. data/lib/pragmatic_segmenter/languages/french.rb +29 -0
  22. data/lib/pragmatic_segmenter/languages/greek.rb +29 -0
  23. data/lib/pragmatic_segmenter/languages/hindi.rb +33 -0
  24. data/lib/pragmatic_segmenter/languages/italian.rb +39 -0
  25. data/lib/pragmatic_segmenter/languages/japanese.rb +58 -0
  26. data/lib/pragmatic_segmenter/languages/persian.rb +56 -0
  27. data/lib/pragmatic_segmenter/languages/russian.rb +60 -0
  28. data/lib/pragmatic_segmenter/languages/spanish.rb +39 -0
  29. data/lib/pragmatic_segmenter/languages/urdu.rb +33 -0
  30. data/lib/pragmatic_segmenter/list.rb +169 -0
  31. data/lib/pragmatic_segmenter/number.rb +35 -0
  32. data/lib/pragmatic_segmenter/process.rb +126 -0
  33. data/lib/pragmatic_segmenter/punctuation.rb +12 -0
  34. data/lib/pragmatic_segmenter/punctuation_replacer.rb +62 -0
  35. data/lib/pragmatic_segmenter/rules.rb +38 -0
  36. data/lib/pragmatic_segmenter/segmenter.rb +81 -0
  37. data/lib/pragmatic_segmenter/sentence_boundary_punctuation.rb +17 -0
  38. data/lib/pragmatic_segmenter/single_letter_abbreviation.rb +37 -0
  39. data/lib/pragmatic_segmenter/types.rb +12 -0
  40. data/lib/pragmatic_segmenter/version.rb +3 -0
  41. data/pragmatic_segmenter.gemspec +25 -0
  42. data/spec/performance_spec.rb +24 -0
  43. data/spec/pragmatic_segmenter_spec.rb +1906 -0
  44. data/spec/spec_helper.rb +1 -0
  45. metadata +150 -0
data/Rakefile ADDED
@@ -0,0 +1,4 @@
1
require 'bundler/gem_tasks'
require 'rspec/core/rake_task'

# Defines the `spec` task, which runs the RSpec suite.
RSpec::Core::RakeTask.new(:spec)

# Running `rake` with no arguments runs the specs instead of aborting
# with "Don't know how to build task 'default'".
task default: :spec
@@ -0,0 +1,2 @@
1
+ require "pragmatic_segmenter/version"
2
+ require "pragmatic_segmenter/segmenter"
@@ -0,0 +1,22 @@
1
+ # -*- encoding : utf-8 -*-
2
+
3
module PragmaticSegmenter
  # Defines the abbreviations for each language (if available).
  #
  # All entries are lower case and written without their trailing period.
  # The lists are frozen because the accessor methods hand out the constant
  # arrays themselves; an accidental `<<` by a caller would otherwise change
  # the lists for every user of the class.
  class Abbreviation
    # Every known abbreviation, in the order in which it is searched for.
    ABBREVIATIONS = [
      'adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark',
      'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd',
      'brig', 'bros', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo',
      'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec',
      'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens',
      'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft',
      'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy',
      'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is',
      'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt',
      'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'messrs', 'mex',
      'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont',
      'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev',
      'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord',
      'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d',
      'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'ref', 'rep', 'reps',
      'res', 'rev', 'rt', 'sask', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt',
      'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s',
      'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy',
      'wyo', 'yuk'
    ].freeze

    # Abbreviations that are placed in front of the word they belong to
    # (titles, ranks, ...).
    PREPOSITIVE_ABBREVIATIONS = [
      'adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen',
      'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs',
      'prof', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v',
      'vs'
    ].freeze

    # Abbreviations that are typically followed by a number.
    NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp'].freeze

    # Returns the full (frozen) abbreviation list.
    def all
      ABBREVIATIONS
    end

    # Returns the (frozen) list of prepositive abbreviations.
    def prepositive
      PREPOSITIVE_ABBREVIATIONS
    end

    # Returns the (frozen) list of abbreviations that precede numbers.
    def number
      NUMBER_ABBREVIATIONS
    end
  end
end
@@ -0,0 +1,149 @@
1
+ # -*- encoding : utf-8 -*-
2
+ require 'pragmatic_segmenter/abbreviation'
3
+ require 'pragmatic_segmenter/single_letter_abbreviation'
4
+
5
module PragmaticSegmenter
  # This class searches for periods that belong to an abbreviation and
  # replaces them with the placeholder character '∯'.
  #
  # `Rule` and `Text` are defined elsewhere in the gem. From the usage in
  # this class, `Rule.new(pattern, replacement)` pairs a regex with its
  # replacement and `Text#apply` accepts a rule or an array of rules.
  class AbbreviationReplacer
    # Rubular: http://rubular.com/r/yqa4Rit8EY
    # A period that is directly followed by a possessive 's.
    POSSESSIVE_ABBREVIATION_REGEX = /\.(?='s\s)|\.(?='s$)|\.(?='s\z)/
    PossessiveAbbreviationRule = Rule.new(POSSESSIVE_ABBREVIATION_REGEX, '∯')

    # Rubular: http://rubular.com/r/xDkpFZ0EgH
    # Single letters separated by periods, ending in a period (e.g. "U.S.A.").
    MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i

    # The four patterns below match the placeholder after an already
    # protected "P∯M" / "A∯M" / "p∯m" / "a∯m" when whitespace and a capital
    # letter follow; the AmPmRules turn that placeholder back into '.'.

    # Rubular: http://rubular.com/r/Vnx3m4Spc8
    UPPERCASE_PM_REGEX = /(?<=P∯M)∯(?=\s[A-Z])/

    # Rubular: http://rubular.com/r/AJMCotJVbW
    UPPERCASE_AM_REGEX = /(?<=A∯M)∯(?=\s[A-Z])/

    # Rubular: http://rubular.com/r/13q7SnOhgA
    LOWERCASE_PM_REGEX = /(?<=p∯m)∯(?=\s[A-Z])/

    # Rubular: http://rubular.com/r/DgUDq4mLz5
    LOWERCASE_AM_REGEX = /(?<=a∯m)∯(?=\s[A-Z])/

    module AmPmRules
      UpperCasePmRule = Rule.new(UPPERCASE_PM_REGEX, '.')
      UpperCaseAmRule = Rule.new(UPPERCASE_AM_REGEX, '.')
      LowerCasePmRule = Rule.new(LOWERCASE_PM_REGEX, '.')
      LowerCaseAmRule = Rule.new(LOWERCASE_AM_REGEX, '.')

      All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]
    end

    # Words that frequently start a sentence and are assumed never to
    # directly follow the abbreviations handled in
    # #replace_abbreviation_as_sentence_boundary.
    SENTENCE_STARTERS = %w(A Being Did For He How However I In Millions More She That The There They We What When Where Who Why).freeze

    # Abbreviation stems whose final placeholder is turned back into a real
    # period when a sentence starter follows. Each stem is listed in its
    # protected ('∯') and its plain ('.') spelling.
    SENTENCE_BOUNDARY_STEMS = ['U∯S', 'U.S', 'U∯K', 'U.K', 'E∯U', 'E.U', 'U∯S∯A', 'U.S.A', 'I'].freeze

    attr_reader :text

    # text: the String to process; it is wrapped in a Text.
    def initialize(text:)
      @text = Text.new(text)
    end

    # Runs all abbreviation replacements in order and returns the
    # processed text.
    def replace
      @reformatted_text = text.apply(PossessiveAbbreviationRule)
      @reformatted_text = PragmaticSegmenter::SingleLetterAbbreviation.new(text: @reformatted_text).replace
      @reformatted_text = search_for_abbreviations_in_string(@reformatted_text, abbreviations)
      @reformatted_text = replace_multi_period_abbreviations(@reformatted_text)
      @reformatted_text = @reformatted_text.apply(AmPmRules::All)
      replace_abbreviation_as_sentence_boundary(@reformatted_text)
    end

    private

    # For every abbreviation of `abbr` that occurs in `txt`, decides per
    # occurrence whether the following period has to be protected.
    def search_for_abbreviations_in_string(txt, abbr)
      original = txt.dup
      downcased = txt.downcase
      abbr.all.each do |a|
        stem = a.strip
        next unless downcased.include?(stem)
        abbrev_match = original.scan(/(?:^|\s|\r|\n)#{Regexp.escape(stem)}/i)
        next if abbrev_match.empty?
        next_word_start = /(?<=#{Regexp.escape(stem)} ).{1}/
        # NOTE(review): this scans the untouched @text (not `txt`) and is
        # case sensitive, while abbrev_match is case insensitive, so the two
        # arrays are not guaranteed to line up index by index. Kept as is.
        character_array = @text.scan(next_word_start)
        abbrev_match.each_with_index do |am, index|
          txt = scan_for_replacements(txt, am, index, character_array, abbr)
        end
      end
      txt
    end

    # Picks the replacement strategy for one matched abbreviation `am`.
    # Nothing is replaced when the character recorded for this occurrence is
    # upper case, unless the abbreviation is prepositive.
    def scan_for_replacements(txt, am, index, character_array, abbr)
      character = character_array[index]
      stem = am.downcase.strip
      prepositive = abbr.prepositive.include?(stem)
      upper = /[[:upper:]]/.match(character.to_s)
      return txt if upper && !prepositive

      if prepositive
        replace_prepositive_abbr(txt, am)
      elsif abbr.number.include?(stem)
        replace_pre_number_abbr(txt, am)
      else
        replace_period_of_abbr(txt, am)
      end
    end

    # The abbreviation lists to use.
    def abbreviations
      PragmaticSegmenter::Abbreviation.new
    end

    # As we are being conservative and keeping ambiguous sentence boundaries
    # as one sentence instead of splitting into two, we can split at words
    # that we know for certain never follow these abbreviations.
    # Some might say that the set of words that follow an abbreviation such
    # as U.S. (i.e. U.S. Government) is smaller than the set of words that
    # could start a sentence and never follow U.S. However, we are being
    # conservative and not splitting by default, so we need to look for
    # places where we definitely can split. Obviously SENTENCE_STARTERS will
    # never cover all cases, but as the gem is named 'Pragmatic Segmenter'
    # we need to be pragmatic and try to cover the words that most often
    # start a sentence but could never follow one of the abbreviations below.
    def replace_abbreviation_as_sentence_boundary(txt)
      SENTENCE_STARTERS.each do |word|
        SENTENCE_BOUNDARY_STEMS.each do |stem|
          txt = txt.gsub(/#{Regexp.escape(stem)}∯\s#{Regexp.escape(word)}\s/, "#{stem}. #{word} ")
        end
      end
      txt
    end

    # Protects every period of abbreviations such as "U.S.A.".
    def replace_multi_period_abbreviations(txt)
      txt.scan(MULTI_PERIOD_ABBREVIATION_REGEX).each do |match|
        # A String pattern is matched literally, and `tr` leaves the scanned
        # match itself untouched.
        txt = txt.gsub(match, match.tr('.', '∯'))
      end
      txt
    end

    # Restores the period after a protected a.m. / p.m. when whitespace and
    # a capital letter follow.
    def replace_period_in_am_pm(txt)
      txt.gsub(UPPERCASE_PM_REGEX, '.')
         .gsub(UPPERCASE_AM_REGEX, '.')
         .gsub(LOWERCASE_PM_REGEX, '.')
         .gsub(LOWERCASE_AM_REGEX, '.')
    end

    # Protects the period after `abbr` when whitespace and a digit, or
    # whitespace and an opening parenthesis, follow.
    def replace_pre_number_abbr(txt, abbr)
      # Escaped so that a '.' inside the abbreviation only matches a period.
      stem = Regexp.escape(abbr.strip)
      txt.gsub(/(?<=#{stem})\.(?=\s\d)/, '∯').gsub(/(?<=#{stem})\.(?=\s+\()/, '∯')
    end

    # Protects the period after `abbr` whenever whitespace follows.
    def replace_prepositive_abbr(txt, abbr)
      stem = Regexp.escape(abbr.strip)
      txt.gsub(/(?<=#{stem})\.(?=\s)/, '∯')
    end

    # Protects the period after `abbr` when it is followed by '.', ':', '?',
    # ',' or by whitespace plus a lower-case letter, a digit or the word "I".
    def replace_period_of_abbr(txt, abbr)
      stem = Regexp.escape(abbr.strip)
      txt.gsub(/(?<=#{stem})\.(?=((\.|:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
         .gsub(/(?<=#{stem})\.(?=,)/, '∯')
    end

    # Protects a period that is directly followed by a possessive 's.
    def replace_possessive_abbreviations(txt)
      txt.gsub(POSSESSIVE_ABBREVIATION_REGEX, '∯')
    end
  end
end
@@ -0,0 +1,78 @@
1
+ # -*- encoding : utf-8 -*-
2
+ require 'pragmatic_segmenter/punctuation_replacer'
3
+
4
module PragmaticSegmenter
  # Collects every stretch of text that sits between quotes or parentheses
  # and passes those stretches to PunctuationReplacer.
  class BetweenPunctuation
    # Rubular: http://rubular.com/r/2YFrKWQUYi
    BETWEEN_SINGLE_QUOTES_REGEX = /(?<=\s)'(?:[^']|'[a-zA-Z])*'/

    # Rubular: http://rubular.com/r/3Pw1QlXOjd
    BETWEEN_DOUBLE_QUOTES_REGEX = /"(?>[^"\\]+|\\{2}|\\.)*"/

    # Rubular: http://rubular.com/r/x6s4PZK8jc
    BETWEEN_QUOTE_ARROW_REGEX = /«(?>[^»\\]+|\\{2}|\\.)*»/

    # Rubular: http://rubular.com/r/JbAIpKdlSq
    BETWEEN_QUOTE_SLANTED_REGEX = /“(?>[^”\\]+|\\{2}|\\.)*”/

    # Rubular: http://rubular.com/r/6tTityPflI
    BETWEEN_PARENS_REGEX = /\((?>[^\(\)\\]+|\\{2}|\\.)*\)/

    attr_reader :text

    # text: the String to process.
    def initialize(text:)
      @text = text
    end

    # Runs all replacements on #text and returns whatever the last one
    # returned.
    def replace
      sub_punctuation_between_quotes_and_parens(text)
    end

    private

    # Applies the individual replacements in a fixed order: single quotes,
    # double quotes, parentheses, arrow quotes, slanted quotes. Every step
    # receives the same `txt` object and only the last result is returned.
    # NOTE(review): the earlier results are discarded, so this presumably
    # relies on PunctuationReplacer changing `txt` in place - confirm.
    def sub_punctuation_between_quotes_and_parens(txt)
      steps = %i[
        sub_punctuation_between_single_quotes
        sub_punctuation_between_double_quotes
        sub_punctuation_between_parens
        sub_punctuation_between_quotes_arrow
        sub_punctuation_between_quotes_slanted
      ]
      steps.map { |step| send(step, txt) }.last
    end

    def sub_punctuation_between_parens(txt)
      replace_between(txt, BETWEEN_PARENS_REGEX)
    end

    def sub_punctuation_between_single_quotes(txt)
      replace_between(txt, BETWEEN_SINGLE_QUOTES_REGEX)
    end

    def sub_punctuation_between_double_quotes(txt)
      replace_between(txt, BETWEEN_DOUBLE_QUOTES_REGEX)
    end

    def sub_punctuation_between_quotes_arrow(txt)
      replace_between(txt, BETWEEN_QUOTE_ARROW_REGEX)
    end

    def sub_punctuation_between_quotes_slanted(txt)
      replace_between(txt, BETWEEN_QUOTE_SLANTED_REGEX)
    end

    # Scans `txt` for `pattern` and hands the matches to PunctuationReplacer.
    def replace_between(txt, pattern)
      found = txt.scan(pattern)
      PragmaticSegmenter::PunctuationReplacer.new(matches_array: found, text: txt).replace
    end
  end
end
@@ -0,0 +1,141 @@
1
+ # -*- encoding : utf-8 -*-
2
+
3
module PragmaticSegmenter
  module Rules
    module HtmlRules
      # Rubular: http://rubular.com/r/ENrVFMdJ8v
      HTMLTagRule = Rule.new(/<\/?[^>]*>/, '')

      # Matches an entity-escaped tag: "&lt;", then everything up to and
      # including the next "&gt;".
      # The previous pattern used the character class [^gt;], which excludes
      # the single characters 'g', 't' and ';', so an escaped tag whose name
      # contains a 'g' or 't' (e.g. "&lt;strong&gt;") was never removed.
      EscapedHTMLTagRule = Rule.new(/&lt;\/?(?:(?!&gt;).)*&gt;/m, '')

      All = [HTMLTagRule, EscapedHTMLTagRule]
    end
  end

  # This is an opinionated class that removes errant newlines,
  # xhtml, inline formatting, etc.
  class Cleaner
    include Rules
    # Rubular: http://rubular.com/r/V57WnM9Zut
    NewLineInMiddleOfWordRule = Rule.new(/\n(?=[a-zA-Z]{1,2}\n)/, '')

    # Rubular: http://rubular.com/r/3GiRiP2IbD
    NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = /(?<=\s)\n(?=([a-z]|\())/

    # Rubular: http://rubular.com/r/UZAVcwqck8
    PDF_NewLineInMiddleOfSentenceRule = Rule.new(/(?<=[^\n]\s)\n(?=\S)/, '')

    # Rubular: http://rubular.com/r/eaNwGavmdo
    PDF_NewLineInMiddleOfSentenceNoSpacesRule = Rule.new(/\n(?=[a-z])/, ' ')

    # Rubular: http://rubular.com/r/bAJrhyLNeZ
    InlineFormattingRule = Rule.new(/\{b\^&gt;\d*&lt;b\^\}|\{b\^>\d*<b\^\}/, '')

    # Rubular: http://rubular.com/r/dMxp5MixFS
    DoubleNewLineWithSpaceRule = Rule.new(/\n \n/, "\r")

    # Rubular: http://rubular.com/r/H6HOJeA8bq
    DoubleNewLineRule = Rule.new(/\n\n/, "\r")

    # Rubular: http://rubular.com/r/Gn18aAnLdZ
    NewLineFollowedByBulletRule = Rule.new(/\n(?=•)/, "\r")

    # Rubular: http://rubular.com/r/FseyMiiYFT
    NewLineFollowedByPeriodRule = Rule.new(/\n(?=\.(\s|\n))/, '')

    # Rubular: http://rubular.com/r/8mc1ArOIGy
    TableOfContentsRule = Rule.new(/\.{5,}\s*\d+-*\d*/, "\r")

    # Rubular: http://rubular.com/r/DwNSuZrNtk
    ConsecutivePeriodsRule = Rule.new(/\.{5,}/, ' ')

    ReplaceNewlineWithCarriageReturnRule = Rule.new(/\n/, "\r")

    QuotationsFirstRule = Rule.new(/''/, '"')
    QuotationsSecondRule = Rule.new(/``/, '"')

    attr_reader :text, :doc_type

    # Arguments:
    #    text:       (String)  *required (a copy is wrapped in a Text)
    #    doc_type:   (String)  *optional
    #                (e.g. 'pdf')
    # Any other keyword argument is accepted and ignored.
    def initialize(text:, **args)
      @text = Text.new(text.dup)
      @doc_type = args[:doc_type]
    end

    # Clean text of unwanted formatting
    #
    # Example:
    #   >> text = "This is a sentence\ncut off in the middle because pdf."
    #   >> PragmaticSegmenter::Cleaner.new(text: text).clean
    #   => "This is a sentence cut off in the middle because pdf."
    #
    # Returns the cleaned text, or nil when there is no text.
    def clean
      return unless text
      @clean_text = remove_all_newlines(text)
      @clean_text = replace_double_newlines(@clean_text)
      @clean_text = replace_newlines(@clean_text)
      @clean_text = @clean_text.apply(HtmlRules::All)
      @clean_text = @clean_text.apply(InlineFormattingRule)
      @clean_text = clean_quotations(@clean_text)
      @clean_text = clean_table_of_contents(@clean_text)
    end

    private

    def remove_all_newlines(txt)
      clean_text = remove_newline_in_middle_of_sentence(txt)
      remove_newline_in_middle_of_word(clean_text)
    end

    # Looks at every period-free stretch of `txt` that contains a newline
    # and removes the newlines matched by
    # NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX from it. `txt` is changed in place
    # and returned.
    def remove_newline_in_middle_of_sentence(txt)
      # `scan` collects all stretches before `txt` is modified below.
      txt.scan(/(?:[^\.])*/).each do |segment|
        next unless segment.include?("\n")
        cleaned = segment.gsub(NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX, '')
        # Literal pattern plus block form: the replacement is document text,
        # and as a replacement *string* any backslash sequence in it
        # ("\\1", "\\\\", ...) would be interpreted instead of copied.
        txt.gsub!(segment) { cleaned }
      end
      txt
    end

    def remove_newline_in_middle_of_word(txt)
      txt.apply(NewLineInMiddleOfWordRule)
    end

    def replace_double_newlines(txt)
      txt.apply(DoubleNewLineWithSpaceRule)
         .apply(DoubleNewLineRule)
    end

    # PDFs get their line breaks removed; for every other document type the
    # remaining newlines are turned into carriage returns.
    def replace_newlines(txt)
      return remove_pdf_line_breaks(txt) if doc_type.eql?('pdf')

      txt.apply(NewLineFollowedByPeriodRule)
         .apply(ReplaceNewlineWithCarriageReturnRule)
    end

    def remove_pdf_line_breaks(txt)
      txt.apply(NewLineFollowedByBulletRule)
         .apply(PDF_NewLineInMiddleOfSentenceRule)
         .apply(PDF_NewLineInMiddleOfSentenceNoSpacesRule)
    end

    # Replaces '' and `` with a double quote character.
    def clean_quotations(txt)
      txt.apply(QuotationsFirstRule)
         .apply(QuotationsSecondRule)
    end

    def clean_table_of_contents(txt)
      txt.apply(TableOfContentsRule).apply(ConsecutivePeriodsRule)
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # -*- encoding : utf-8 -*-
2
+
3
module PragmaticSegmenter
  module Rules
    # Rules that find ellipses within a string and replace their periods
    # with placeholder symbols.
    #
    # Background reading:
    #   http://www.dailywritingtips.com/in-search-of-a-4-dot-ellipsis/
    #   http://www.thepunctuationguide.com/ellipses.html
    module EllipsisRules
      # Whitespace + period, three times, followed by whitespace (" . . . ").
      # Rubular: http://rubular.com/r/YBG1dIHTRu
      ThreeSpaceRule = Rule.new(/(\s\.){3}\s/, '♟')

      # After a lower-case letter: period + whitespace three times and a
      # final period at the end of a line or of the string (". . . .").
      # Rubular: http://rubular.com/r/2VvZ8wRbd8
      FourSpaceRule = Rule.new(/(?<=[a-z])(\.\s){3}\.(\z|$|\n)/, '♝')

      # The first three of four consecutive periods that follow a non-space
      # character and are followed by whitespace and a capital letter; the
      # fourth period is left in place.
      # Rubular: http://rubular.com/r/Hdqpd90owl
      FourConsecutiveRule = Rule.new(/(?<=\S)\.{3}(?=\.\s[A-Z])/, 'ƪ')

      # Three consecutive periods followed by whitespace and a capital
      # letter; the replacement keeps one real period.
      # Rubular: http://rubular.com/r/i60hCK81fz
      ThreeConsecutiveRule = Rule.new(/\.\.\.(?=\s+[A-Z])/, '☏.')

      # Any three consecutive periods.
      OtherThreePeriodRule = Rule.new(/\.\.\./, 'ƪ')

      # All rules, most specific first, with the catch-all rule last.
      All = [ThreeSpaceRule, FourSpaceRule, FourConsecutiveRule, ThreeConsecutiveRule, OtherThreePeriodRule]
    end
  end
end
@@ -0,0 +1,19 @@
1
+ # -*- encoding : utf-8 -*-
2
+ require 'pragmatic_segmenter/punctuation_replacer'
3
+
4
module PragmaticSegmenter
  # Looks for words and names that contain an exclamation point as part of
  # their spelling (not as ending punctuation) and hands every occurrence
  # to PunctuationReplacer.
  module ExclamationWords
    WORDS_WITH_EXCLAMATIONS = ['!Xũ', '!Kung', 'ǃʼOǃKung', '!Xuun', '!Kung-Ekoka', 'ǃHu', 'ǃKhung', 'ǃKu', 'ǃung', 'ǃXo', 'ǃXû', 'ǃXung', 'ǃXũ', '!Xun', 'Yahoo!', 'Y!J', 'Yum!']

    class << self
      # Runs PunctuationReplacer once per known word, passing the literal
      # occurrences of that word found in `text`.
      #
      # Returns WORDS_WITH_EXCLAMATIONS (the receiver of `each`), not the
      # text.
      # NOTE(review): callers therefore presumably rely on
      # PunctuationReplacer changing `text` in place - confirm.
      def apply_rules(text)
        WORDS_WITH_EXCLAMATIONS.each do |word|
          # A String pattern makes `scan` match the word literally.
          occurrences = text.scan(word)
          PragmaticSegmenter::PunctuationReplacer.new(matches_array: occurrences, text: text).replace
        end
      end
    end
  end
end
+ end