pragmatic_segmenter 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +1 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +730 -0
  7. data/Rakefile +4 -0
  8. data/lib/pragmatic_segmenter.rb +2 -0
  9. data/lib/pragmatic_segmenter/abbreviation.rb +22 -0
  10. data/lib/pragmatic_segmenter/abbreviation_replacer.rb +149 -0
  11. data/lib/pragmatic_segmenter/between_punctuation.rb +78 -0
  12. data/lib/pragmatic_segmenter/cleaner.rb +141 -0
  13. data/lib/pragmatic_segmenter/ellipsis.rb +36 -0
  14. data/lib/pragmatic_segmenter/exclamation_words.rb +19 -0
  15. data/lib/pragmatic_segmenter/languages/amharic.rb +33 -0
  16. data/lib/pragmatic_segmenter/languages/arabic.rb +83 -0
  17. data/lib/pragmatic_segmenter/languages/armenian.rb +33 -0
  18. data/lib/pragmatic_segmenter/languages/burmese.rb +33 -0
  19. data/lib/pragmatic_segmenter/languages/deutsch.rb +132 -0
  20. data/lib/pragmatic_segmenter/languages/english.rb +44 -0
  21. data/lib/pragmatic_segmenter/languages/french.rb +29 -0
  22. data/lib/pragmatic_segmenter/languages/greek.rb +29 -0
  23. data/lib/pragmatic_segmenter/languages/hindi.rb +33 -0
  24. data/lib/pragmatic_segmenter/languages/italian.rb +39 -0
  25. data/lib/pragmatic_segmenter/languages/japanese.rb +58 -0
  26. data/lib/pragmatic_segmenter/languages/persian.rb +56 -0
  27. data/lib/pragmatic_segmenter/languages/russian.rb +60 -0
  28. data/lib/pragmatic_segmenter/languages/spanish.rb +39 -0
  29. data/lib/pragmatic_segmenter/languages/urdu.rb +33 -0
  30. data/lib/pragmatic_segmenter/list.rb +169 -0
  31. data/lib/pragmatic_segmenter/number.rb +35 -0
  32. data/lib/pragmatic_segmenter/process.rb +126 -0
  33. data/lib/pragmatic_segmenter/punctuation.rb +12 -0
  34. data/lib/pragmatic_segmenter/punctuation_replacer.rb +62 -0
  35. data/lib/pragmatic_segmenter/rules.rb +38 -0
  36. data/lib/pragmatic_segmenter/segmenter.rb +81 -0
  37. data/lib/pragmatic_segmenter/sentence_boundary_punctuation.rb +17 -0
  38. data/lib/pragmatic_segmenter/single_letter_abbreviation.rb +37 -0
  39. data/lib/pragmatic_segmenter/types.rb +12 -0
  40. data/lib/pragmatic_segmenter/version.rb +3 -0
  41. data/pragmatic_segmenter.gemspec +25 -0
  42. data/spec/performance_spec.rb +24 -0
  43. data/spec/pragmatic_segmenter_spec.rb +1906 -0
  44. data/spec/spec_helper.rb +1 -0
  45. metadata +150 -0
data/Rakefile ADDED
@@ -0,0 +1,4 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
@@ -0,0 +1,2 @@
1
+ require "pragmatic_segmenter/version"
2
+ require "pragmatic_segmenter/segmenter"
@@ -0,0 +1,22 @@
1
+ # -*- encoding : utf-8 -*-
2
+
3
module PragmaticSegmenter
  # Defines the abbreviations for each language (if available).
  #
  # Every entry is stored lowercase and without its trailing period.
  # The lists are frozen so that a caller receiving one of them from
  # #all, #prepositive or #number cannot accidentally modify the
  # shared constant.
  class Abbreviation
    ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk'].freeze
    PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs'].freeze
    NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp'].freeze

    # @return [Array<String>] the complete abbreviation list
    def all
      ABBREVIATIONS
    end

    # @return [Array<String>] the PREPOSITIVE_ABBREVIATIONS list
    def prepositive
      PREPOSITIVE_ABBREVIATIONS
    end

    # @return [Array<String>] the NUMBER_ABBREVIATIONS list
    def number
      NUMBER_ABBREVIATIONS
    end
  end
end
@@ -0,0 +1,149 @@
1
+ # -*- encoding : utf-8 -*-
2
+ require 'pragmatic_segmenter/abbreviation'
3
+ require 'pragmatic_segmenter/single_letter_abbreviation'
4
+
5
module PragmaticSegmenter
  # This class searches for periods within an abbreviation and
  # replaces the periods with the placeholder character '∯'.
  class AbbreviationReplacer
    # Rubular: http://rubular.com/r/yqa4Rit8EY
    PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')

    # Rubular: http://rubular.com/r/xDkpFZ0EgH
    MULTI_PERIOD_ABBREVIATION_REGEX = /\b[a-z](?:\.[a-z])+[.]/i

    # Rules that turn the placeholder following A∯M / P∯M / a∯m / p∯m back
    # into a real period when whitespace and an uppercase letter follow.
    module AmPmRules
      # Rubular: http://rubular.com/r/Vnx3m4Spc8
      UpperCasePmRule = Rule.new(/(?<=P∯M)∯(?=\s[A-Z])/, '.')

      # Rubular: http://rubular.com/r/AJMCotJVbW
      UpperCaseAmRule = Rule.new(/(?<=A∯M)∯(?=\s[A-Z])/, '.')

      # Rubular: http://rubular.com/r/13q7SnOhgA
      LowerCasePmRule = Rule.new(/(?<=p∯m)∯(?=\s[A-Z])/, '.')

      # Rubular: http://rubular.com/r/DgUDq4mLz5
      LowerCaseAmRule = Rule.new(/(?<=a∯m)∯(?=\s[A-Z])/, '.')

      All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule].freeze
    end

    SENTENCE_STARTERS = %w(A Being Did For He How However I In Millions More She That The There They We What When Where Who Why).freeze

    # Abbreviation stems (without the final placeholder) after which a
    # SENTENCE_STARTERS word is treated as the start of a new sentence.
    # The order is significant: the substitutions are applied in this order.
    SENTENCE_BOUNDARY_ABBREVIATIONS = ['U∯S', 'U.S', 'U∯K', 'U.K', 'E∯U', 'E.U', 'U∯S∯A', 'U.S.A', 'I'].freeze

    attr_reader :text

    # @param text [String] the text whose abbreviation periods are replaced;
    #   it is wrapped in a Text instance
    def initialize(text:)
      @text = Text.new(text)
    end

    # Runs every replacement step in order and returns the reformatted text.
    def replace
      @reformatted_text = text.apply(PossessiveAbbreviationRule)
      @reformatted_text = PragmaticSegmenter::SingleLetterAbbreviation.new(text: @reformatted_text).replace
      @reformatted_text = search_for_abbreviations_in_string(@reformatted_text, abbreviations)
      @reformatted_text = replace_multi_period_abbreviations(@reformatted_text)
      @reformatted_text = @reformatted_text.apply(AmPmRules::All)
      replace_abbreviation_as_sentence_boundary(@reformatted_text)
    end

    private

    # For every known abbreviation that occurs in +txt+ (at the start of a
    # line or after whitespace, case-insensitively), replaces the period that
    # follows it according to the abbreviation's category.
    def search_for_abbreviations_in_string(txt, abbr)
      original = txt.dup
      downcased = txt.downcase
      abbr.all.each do |a|
        stripped = a.strip
        # Cheap substring test first; skips the regex scan for most entries.
        next unless downcased.include?(stripped)
        abbrev_match = original.scan(/(?:^|\s|\r|\n)#{Regexp.escape(stripped)}/i)
        next if abbrev_match.empty?
        next_word_start = /(?<=#{Regexp.escape(stripped)} ).{1}/
        # NOTE(review): this scans @text (the constructor input) rather than
        # +txt+, and the lookbehind expects the abbreviation to be followed
        # directly by a space (no period) - confirm this is intended.
        character_array = @text.scan(next_word_start)
        abbrev_match.each_with_index do |am, index|
          txt = scan_for_replacements(txt, am, index, character_array, abbr)
        end
      end
      txt
    end

    # Chooses the replacement strategy for one matched abbreviation +am+.
    # Prepositive abbreviations are always handled; any other abbreviation is
    # left untouched when the recorded following character is uppercase.
    def scan_for_replacements(txt, am, index, character_array, abbr)
      token = am.downcase.strip
      return replace_prepositive_abbr(txt, am) if abbr.prepositive.include?(token)

      followed_by_upper = /[[:upper:]]/ =~ character_array[index].to_s
      return txt if followed_by_upper

      if abbr.number.include?(token)
        replace_pre_number_abbr(txt, am)
      else
        replace_period_of_abbr(txt, am)
      end
    end

    def abbreviations
      PragmaticSegmenter::Abbreviation.new
    end

    def replace_abbreviation_as_sentence_boundary(txt)
      # As we are being conservative and keeping ambiguous
      # sentence boundaries as one sentence instead of
      # splitting into two, we can split at words that
      # we know for certain never follow these abbreviations.
      # Some might say that the set of words that follow an
      # abbreviation such as U.S. (i.e. U.S. Government) is smaller than
      # the set of words that could start a sentence and
      # never follow U.S. However, we are being conservative
      # and not splitting by default, so we need to look for places
      # where we definitely can split. Obviously SENTENCE_STARTERS
      # will never cover all cases, but as the gem is named
      # 'Pragmatic Segmenter' we need to be pragmatic
      # and try to cover the words that most often start a
      # sentence but could never follow one of the abbreviations below.

      SENTENCE_STARTERS.each do |word|
        escaped_word = Regexp.escape(word)
        SENTENCE_BOUNDARY_ABBREVIATIONS.each do |stem|
          # Restore the final placeholder to a real period; the whitespace
          # on either side of the word is rewritten as a single space.
          txt = txt.gsub(/#{Regexp.escape(stem)}∯\s#{escaped_word}\s/, "#{stem}. #{escaped_word} ")
        end
      end
      txt
    end

    # Replaces every period inside abbreviations such as "a.b.c." with the
    # placeholder. Each distinct match is replaced wherever it literally
    # occurs in +txt+.
    def replace_multi_period_abbreviations(txt)
      mpa = txt.scan(MULTI_PERIOD_ABBREVIATION_REGEX)
      return txt if mpa.empty?
      mpa.uniq.each do |r|
        # Non-mutating gsub: the scanned match itself is left untouched.
        txt = txt.gsub(r, r.gsub('.', '∯'))
      end
      txt
    end

    # Applies the a.m./p.m. rules to +txt+. Previously this referenced
    # regex constants that are not defined in this class; it now reuses
    # AmPmRules. Assumes +txt+ responds to #apply, as the text handled by
    # #replace does.
    def replace_period_in_am_pm(txt)
      txt.apply(AmPmRules::All)
    end

    # The matched abbreviation is escaped before interpolation so that a
    # period inside it (e.g. "e.g") only matches a literal period.
    def replace_pre_number_abbr(txt, abbr)
      escaped = Regexp.escape(abbr.strip)
      txt.gsub(/(?<=#{escaped})\.(?=\s\d)/, '∯').gsub(/(?<=#{escaped})\.(?=\s+\()/, '∯')
    end

    def replace_prepositive_abbr(txt, abbr)
      escaped = Regexp.escape(abbr.strip)
      txt.gsub(/(?<=#{escaped})\.(?=\s)/, '∯')
    end

    def replace_period_of_abbr(txt, abbr)
      escaped = Regexp.escape(abbr.strip)
      txt.gsub(/(?<=#{escaped})\.(?=((\.|:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
         .gsub(/(?<=#{escaped})\.(?=,)/, '∯')
    end

    # Applies the possessive-abbreviation rule to +txt+. Previously this
    # referenced a regex constant that is not defined in this class; it now
    # reuses PossessiveAbbreviationRule. Assumes +txt+ responds to #apply.
    def replace_possessive_abbreviations(txt)
      txt.apply(PossessiveAbbreviationRule)
    end
  end
end
@@ -0,0 +1,78 @@
1
+ # -*- encoding : utf-8 -*-
2
+ require 'pragmatic_segmenter/punctuation_replacer'
3
+
4
module PragmaticSegmenter
  # Locates spans of text enclosed in quotes or parentheses and passes each
  # set of spans, together with the text, to PunctuationReplacer.
  class BetweenPunctuation
    # Rubular: http://rubular.com/r/2YFrKWQUYi
    BETWEEN_SINGLE_QUOTES_REGEX = /(?<=\s)'(?:[^']|'[a-zA-Z])*'/

    # Rubular: http://rubular.com/r/3Pw1QlXOjd
    BETWEEN_DOUBLE_QUOTES_REGEX = /"(?>[^"\\]+|\\{2}|\\.)*"/

    # Rubular: http://rubular.com/r/x6s4PZK8jc
    BETWEEN_QUOTE_ARROW_REGEX = /«(?>[^»\\]+|\\{2}|\\.)*»/

    # Rubular: http://rubular.com/r/JbAIpKdlSq
    BETWEEN_QUOTE_SLANTED_REGEX = /“(?>[^”\\]+|\\{2}|\\.)*”/

    # Rubular: http://rubular.com/r/6tTityPflI
    BETWEEN_PARENS_REGEX = /\((?>[^\(\)\\]+|\\{2}|\\.)*\)/

    attr_reader :text

    # @param text [String] the text to process
    def initialize(text:)
      @text = text
    end

    # Runs every quote/parenthesis pass over the text.
    # Returns whatever the final (slanted-quote) pass returns.
    def replace
      sub_punctuation_between_quotes_and_parens(text)
    end

    private

    # Order of the passes: single quotes, double quotes, parentheses,
    # arrow quotes, slanted quotes. Only the last result is returned;
    # the results of the earlier passes are discarded.
    # NOTE(review): this presumably relies on PunctuationReplacer#replace
    # modifying +txt+ in place - verify against that class.
    def sub_punctuation_between_quotes_and_parens(txt)
      sub_punctuation_between_single_quotes(txt)
      sub_punctuation_between_double_quotes(txt)
      sub_punctuation_between_parens(txt)
      sub_punctuation_between_quotes_arrow(txt)
      sub_punctuation_between_quotes_slanted(txt)
    end

    def sub_punctuation_between_parens(txt)
      replace_punctuation_in_matches(txt, BETWEEN_PARENS_REGEX)
    end

    def sub_punctuation_between_single_quotes(txt)
      replace_punctuation_in_matches(txt, BETWEEN_SINGLE_QUOTES_REGEX)
    end

    def sub_punctuation_between_double_quotes(txt)
      replace_punctuation_in_matches(txt, BETWEEN_DOUBLE_QUOTES_REGEX)
    end

    def sub_punctuation_between_quotes_arrow(txt)
      replace_punctuation_in_matches(txt, BETWEEN_QUOTE_ARROW_REGEX)
    end

    def sub_punctuation_between_quotes_slanted(txt)
      replace_punctuation_in_matches(txt, BETWEEN_QUOTE_SLANTED_REGEX)
    end

    # Scans +txt+ with +pattern+ and hands the matches plus the text itself
    # to a new PunctuationReplacer, returning the result of its #replace.
    def replace_punctuation_in_matches(txt, pattern)
      enclosed_spans = txt.scan(pattern)
      replacer = PragmaticSegmenter::PunctuationReplacer.new(
        matches_array: enclosed_spans,
        text: txt
      )
      replacer.replace
    end
  end
end
@@ -0,0 +1,141 @@
1
+ # -*- encoding : utf-8 -*-
2
+
3
module PragmaticSegmenter
  module Rules
    # Rules that strip markup tags from text.
    module HtmlRules
      # Rubular: http://rubular.com/r/ENrVFMdJ8v
      HTMLTagRule = Rule.new(/<\/?[^>]*>/, '')

      # Matches an entity-escaped tag: "&lt;", an optional "/", anything up
      # to the first "&gt;", then "&gt;". The earlier pattern used the
      # character class [^gt;], which excludes the single letters g and t
      # and therefore failed on tags such as "&lt;strong&gt;".
      # Rubular (earlier pattern): http://rubular.com/r/XZVqMPJhea
      EscapedHTMLTagRule = Rule.new(/&lt;\/?(?:(?!&gt;).)*&gt;/m, '')

      All = [HTMLTagRule, EscapedHTMLTagRule].freeze
    end
  end

  # This is an opinionated class that removes errant newlines,
  # xhtml, inline formatting, etc.
  class Cleaner
    include Rules
    # Rubular: http://rubular.com/r/V57WnM9Zut
    NewLineInMiddleOfWordRule = Rule.new(/\n(?=[a-zA-Z]{1,2}\n)/, '')

    # Rubular: http://rubular.com/r/3GiRiP2IbD
    NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = /(?<=\s)\n(?=([a-z]|\())/

    # Rubular: http://rubular.com/r/UZAVcwqck8
    PDF_NewLineInMiddleOfSentenceRule = Rule.new(/(?<=[^\n]\s)\n(?=\S)/, '')

    # Rubular: http://rubular.com/r/eaNwGavmdo
    PDF_NewLineInMiddleOfSentenceNoSpacesRule = Rule.new(/\n(?=[a-z])/, ' ')

    # Rubular: http://rubular.com/r/bAJrhyLNeZ
    InlineFormattingRule = Rule.new(/\{b\^&gt;\d*&lt;b\^\}|\{b\^>\d*<b\^\}/, '')

    # Rubular: http://rubular.com/r/dMxp5MixFS
    DoubleNewLineWithSpaceRule = Rule.new(/\n \n/, "\r")

    # Rubular: http://rubular.com/r/H6HOJeA8bq
    DoubleNewLineRule = Rule.new(/\n\n/, "\r")

    # Rubular: http://rubular.com/r/Gn18aAnLdZ
    NewLineFollowedByBulletRule = Rule.new(/\n(?=•)/, "\r")

    # Rubular: http://rubular.com/r/FseyMiiYFT
    NewLineFollowedByPeriodRule = Rule.new(/\n(?=\.(\s|\n))/, '')

    # Rubular: http://rubular.com/r/8mc1ArOIGy
    TableOfContentsRule = Rule.new(/\.{5,}\s*\d+-*\d*/, "\r")

    # Rubular: http://rubular.com/r/DwNSuZrNtk
    ConsecutivePeriodsRule = Rule.new(/\.{5,}/, ' ')

    ReplaceNewlineWithCarriageReturnRule = Rule.new(/\n/, "\r")

    QuotationsFirstRule = Rule.new(/''/, '"')
    QuotationsSecondRule = Rule.new(/``/, '"')

    attr_reader :text, :doc_type

    # @param text [String] the text to clean; a copy is wrapped in a Text
    #   instance, so the caller's string is not modified by this class
    # @param args [Hash] optional settings; only :doc_type is read here
    def initialize(text:, **args)
      @text = Text.new(text.dup)
      @doc_type = args[:doc_type]
    end

    # Clean text of unwanted formatting
    #
    # Example:
    #   >> text = "This is a sentence\ncut off in the middle because pdf."
    #   >> PragmaticSegmenter::Cleaner.new(text: text, doc_type: 'pdf').clean
    #   => "This is a sentence cut off in the middle because pdf."
    #
    # Arguments:
    #    text:       (String)  *required
    #    language:   (String)  *optional
    #                (two-digit ISO 639-1 code e.g. 'en')
    #    doc_type:   (String)  *optional
    #                (e.g. 'pdf')

    def clean
      return unless text
      @clean_text = remove_all_newlines(text)
      @clean_text = replace_double_newlines(@clean_text)
      @clean_text = replace_newlines(@clean_text)
      @clean_text = @clean_text.apply(HtmlRules::All)
      @clean_text = @clean_text.apply(InlineFormattingRule)
      @clean_text = clean_quotations(@clean_text)
      @clean_text = clean_table_of_contents(@clean_text)
    end

    private

    def remove_all_newlines(txt)
      clean_text = remove_newline_in_middle_of_sentence(txt)
      remove_newline_in_middle_of_word(clean_text)
    end

    # Removes, in place, every newline that is preceded by whitespace and
    # followed by a lowercase letter or an opening parenthesis.
    #
    # Neither the lookbehind nor the lookahead of the pattern can match a
    # period, so one pass over the whole text covers the same newlines as
    # processing each period-free segment separately. A single pass also
    # avoids using text as a gsub replacement string, where backslash
    # sequences in the text would be interpreted as backreferences.
    #
    # Returns +txt+ itself (gsub! returns nil when nothing changed, so its
    # result is deliberately not used).
    def remove_newline_in_middle_of_sentence(txt)
      txt.gsub!(NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX, '')
      txt
    end

    def remove_newline_in_middle_of_word(txt)
      txt.apply(NewLineInMiddleOfWordRule)
    end

    def replace_double_newlines(txt)
      txt.apply(DoubleNewLineWithSpaceRule).
        apply(DoubleNewLineRule)
    end

    # PDF documents get the PDF-specific line-break rules; every other
    # document type has its remaining newlines turned into carriage returns.
    def replace_newlines(txt)
      return remove_pdf_line_breaks(txt) if doc_type.eql?('pdf')

      txt.apply(NewLineFollowedByPeriodRule).
        apply(ReplaceNewlineWithCarriageReturnRule)
    end

    def remove_pdf_line_breaks(txt)
      txt.apply(NewLineFollowedByBulletRule).
        apply(PDF_NewLineInMiddleOfSentenceRule).
        apply(PDF_NewLineInMiddleOfSentenceNoSpacesRule)
    end

    def clean_quotations(txt)
      txt.apply(QuotationsFirstRule).
        apply(QuotationsSecondRule)
    end

    def clean_table_of_contents(txt)
      txt.apply(TableOfContentsRule).apply(ConsecutivePeriodsRule)
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # -*- encoding : utf-8 -*-
2
+
3
module PragmaticSegmenter
  module Rules
    # Rules that find ellipses within a string and replace their periods
    # with placeholder characters.
    #
    # http://www.dailywritingtips.com/in-search-of-a-4-dot-ellipsis/
    # http://www.thepunctuationguide.com/ellipses.html
    module EllipsisRules
      # Rubular: http://rubular.com/r/i60hCK81fz
      ThreeConsecutiveRule = Rule.new(/\.\.\.(?=\s+[A-Z])/, '☏.')

      # Rubular: http://rubular.com/r/Hdqpd90owl
      FourConsecutiveRule = Rule.new(/(?<=\S)\.{3}(?=\.\s[A-Z])/, 'ƪ')

      # Rubular: http://rubular.com/r/YBG1dIHTRu
      ThreeSpaceRule = Rule.new(/(\s\.){3}\s/, '♟')

      # Rubular: http://rubular.com/r/2VvZ8wRbd8
      FourSpaceRule = Rule.new(/(?<=[a-z])(\.\s){3}\.(\z|$|\n)/, '♝')

      # Catch-all for any remaining run of three periods.
      OtherThreePeriodRule = Rule.new(/\.\.\./, 'ƪ')

      # Listed with the spaced and more specific patterns ahead of the
      # catch-all three-period rule. Frozen so the shared list cannot be
      # modified by accident.
      All = [
        ThreeSpaceRule,
        FourSpaceRule,
        FourConsecutiveRule,
        ThreeConsecutiveRule,
        OtherThreePeriodRule
      ].freeze
    end
  end
end
@@ -0,0 +1,19 @@
1
+ # -*- encoding : utf-8 -*-
2
+ require 'pragmatic_segmenter/punctuation_replacer'
3
+
4
module PragmaticSegmenter
  # This module searches for exclamation points that
  # are part of words and not ending punctuation and replaces them.
  module ExclamationWords
    # Frozen so the shared list cannot be modified by accident.
    WORDS_WITH_EXCLAMATIONS = ['!Xũ', '!Kung', 'ǃʼOǃKung', '!Xuun', '!Kung-Ekoka', 'ǃHu', 'ǃKhung', 'ǃKu', 'ǃung', 'ǃXo', 'ǃXû', 'ǃXung', 'ǃXũ', '!Xun', 'Yahoo!', 'Y!J', 'Yum!'].freeze

    # For every listed word, collects its literal occurrences in +text+ and
    # hands them, together with +text+, to a PunctuationReplacer.
    #
    # The result of each PunctuationReplacer#replace call is discarded, so
    # any effect on +text+ must come from the replacer modifying it in place
    # (NOTE(review): verify against PunctuationReplacer).
    #
    # @param text [String] the text to scan
    # @return [Array<String>] WORDS_WITH_EXCLAMATIONS (the receiver of #each)
    def self.apply_rules(text)
      WORDS_WITH_EXCLAMATIONS.each do |exclamation|
        PragmaticSegmenter::PunctuationReplacer.new(
          # A String pattern is matched literally by String#scan, so no
          # escaped Regexp has to be built for every word on every call.
          matches_array: text.scan(exclamation),
          text: text
        ).replace
      end
    end
  end
end
+ end