pragmatic_segmenter 0.0.4 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9d58b0da895c9249efd632eca9c21ba386ab0ba3
4
- data.tar.gz: 3e9b46ad1c9bbc1704fe2a6b2c4ce899c73267fa
3
+ metadata.gz: e0545b8e2fe6446107740b5c458b96e76b6edc51
4
+ data.tar.gz: 746d97aba038d8f23a6701d7df08205ff48203a8
5
5
  SHA512:
6
- metadata.gz: 14b408a8e527d35d6da36082fa17240c1c9c6b50f66747fda05e52ed859ca0d1c3b6f5a8f57ef8fd0827b5134d960beed9536509738c94db31dbcd12b3a03fcd
7
- data.tar.gz: 6c67d78cf0777504c4c21e6cf9a168897b4c8c43bf3e3e84a6161f81bcbb8355a5794d35a9924196fafed4a0d682ff74063548a676d454e1a198ad3ea9959a83
6
+ metadata.gz: 5975faedda7f913678ea122317266722895376da90be3b4094e50d61d2eef1e531b3df890d93199393f1945a053e14646e8d2b7bc73287de9250751f332483aa
7
+ data.tar.gz: d48fcd09e289833f82a5e6aa5915b3faa5ecf4417874273a2b8bb7640f51454f1374736449478da8328006277b0727c16b3b8badd258791fb011dd23f351266e
@@ -50,9 +50,13 @@ module PragmaticSegmenter
50
50
  # Rubular: http://rubular.com/r/DwNSuZrNtk
51
51
  ConsecutivePeriodsRule = Rule.new(/\.{5,}/, ' ')
52
52
 
53
- # http://rubular.com/r/IQ4TPfsbd8
53
+ # Rubular: http://rubular.com/r/IQ4TPfsbd8
54
54
  ConsecutiveForwardSlashRule = Rule.new(/\/{3}/, '')
55
55
 
56
+ EscapedCarriageReturnRule = Rule.new(/\\r/, "\r")
57
+
58
+ EscapedNewLineRule = Rule.new(/\\n/, "\n")
59
+
56
60
  ReplaceNewlineWithCarriageReturnRule = Rule.new(/\n/, "\r")
57
61
 
58
62
  QuotationsFirstRule = Rule.new(/''/, '"')
@@ -81,12 +85,14 @@ module PragmaticSegmenter
81
85
  def clean
82
86
  return unless text
83
87
  @clean_text = remove_all_newlines(text)
84
- @clean_text = replace_double_newlines(@clean_text)
85
- @clean_text = replace_newlines(@clean_text)
86
- @clean_text = @clean_text.apply(HtmlRules::All)
87
- @clean_text = @clean_text.apply(InlineFormattingRule)
88
- @clean_text = clean_quotations(@clean_text)
89
- @clean_text = clean_table_of_contents(@clean_text)
88
+ replace_double_newlines(@clean_text)
89
+ replace_newlines(@clean_text)
90
+ replace_escaped_newlines(@clean_text)
91
+ @clean_text.apply(HtmlRules::All)
92
+ @clean_text.apply(InlineFormattingRule)
93
+ clean_quotations(@clean_text)
94
+ clean_table_of_contents(@clean_text)
95
+ clean_consecutive_characters(@clean_text)
90
96
  end
91
97
 
92
98
  private
@@ -110,6 +116,11 @@ module PragmaticSegmenter
110
116
  txt.apply(NewLineInMiddleOfWordRule)
111
117
  end
112
118
 
119
+ def replace_escaped_newlines(txt)
120
+ txt.apply(EscapedNewLineRule).
121
+ apply(EscapedCarriageReturnRule)
122
+ end
123
+
113
124
  def replace_double_newlines(txt)
114
125
  txt.apply(DoubleNewLineWithSpaceRule).
115
126
  apply(DoubleNewLineRule)
@@ -117,13 +128,11 @@ module PragmaticSegmenter
117
128
 
118
129
  def replace_newlines(txt)
119
130
  if doc_type.eql?('pdf')
120
- txt = remove_pdf_line_breaks(txt)
131
+ remove_pdf_line_breaks(txt)
121
132
  else
122
- txt =
123
- txt.apply(NewLineFollowedByPeriodRule).
124
- apply(ReplaceNewlineWithCarriageReturnRule)
133
+ txt.apply(NewLineFollowedByPeriodRule).
134
+ apply(ReplaceNewlineWithCarriageReturnRule)
125
135
  end
126
- txt
127
136
  end
128
137
 
129
138
  def remove_pdf_line_breaks(txt)
@@ -142,5 +151,10 @@ module PragmaticSegmenter
142
151
  apply(ConsecutivePeriodsRule).
143
152
  apply(ConsecutiveForwardSlashRule)
144
153
  end
154
+
155
+ def clean_consecutive_characters(txt)
156
+ txt.apply(ConsecutivePeriodsRule).
157
+ apply(ConsecutiveForwardSlashRule)
158
+ end
145
159
  end
146
160
  end
@@ -30,7 +30,6 @@ module PragmaticSegmenter
30
30
  ThreeConsecutiveRule,
31
31
  OtherThreePeriodRule
32
32
  ]
33
-
34
33
  end
35
34
  end
36
35
  end
@@ -25,6 +25,5 @@ module PragmaticSegmenter
25
25
  def cleaner_class
26
26
  Object.const_get("PragmaticSegmenter::Languages::#{LANGUAGE_CODES[language] || 'Common'}::Cleaner")
27
27
  end
28
-
29
28
  end
30
29
  end
@@ -29,7 +29,7 @@ module PragmaticSegmenter
29
29
 
30
30
  def replace
31
31
  super
32
- @formatted_text.apply(NumberPeriodSpaceRule).apply(NegativeNumberPeriodSpaceRule)
32
+ @text.apply(NumberPeriodSpaceRule).apply(NegativeNumberPeriodSpaceRule)
33
33
  end
34
34
  end
35
35
 
@@ -124,14 +124,12 @@ module PragmaticSegmenter
124
124
  txt.gsub!(ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX).with_index do |m|
125
125
  a.eql?(m.chomp('.')) ? "\r#{Regexp.escape(a.to_s)}∯" : "#{m}"
126
126
  end
127
- txt
128
127
  end
129
128
 
130
129
  def replace_alphabet_list_parens(a, txt)
131
130
  txt.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX).with_index do |m|
132
131
  a.eql?(m) ? "\r#{Regexp.escape(a.to_s)}" : "#{m}"
133
132
  end
134
- txt
135
133
  end
136
134
 
137
135
  def replace_correct_alphabet_list(a, txt, parens)
@@ -25,11 +25,11 @@ module PragmaticSegmenter
25
25
  end
26
26
 
27
27
  def replace
28
- @formatted_text = @text.apply(PeriodBeforeNumberRule).
29
- apply(NumberAfterPeriodBeforeLetterRule).
30
- apply(NewLineNumberPeriodSpaceLetterRule).
31
- apply(StartLineNumberPeriodRule).
32
- apply(StartLineTwoDigitNumberPeriodRule)
28
+ @text.apply(PeriodBeforeNumberRule).
29
+ apply(NumberAfterPeriodBeforeLetterRule).
30
+ apply(NewLineNumberPeriodSpaceLetterRule).
31
+ apply(StartLineNumberPeriodRule).
32
+ apply(StartLineTwoDigitNumberPeriodRule)
33
33
  end
34
34
  end
35
35
  end
@@ -30,57 +30,53 @@ module PragmaticSegmenter
30
30
  reformatted_text = replace_abbreviations(reformatted_text)
31
31
  reformatted_text = replace_numbers(reformatted_text)
32
32
  reformatted_text = reformatted_text.apply(GeoLocationRule)
33
- split_lines(reformatted_text)
33
+ split_into_segments(reformatted_text)
34
34
  end
35
35
 
36
36
  private
37
37
 
38
- def split_lines(txt)
39
- segments = txt.split("\r")
38
+ def split_into_segments(txt)
39
+ txt.split("\r")
40
+ .map! { |segment| segment.apply(SingleNewLineRule, EllipsisRules::All, EmailRule) }
41
+ .map { |segment| check_for_punctuation(segment) }.flatten
42
+ .map! { |segment| segment.apply(SubSymbolsRules::All) }
43
+ .map { |segment| post_process_segments(segment) }
44
+ .flatten.compact.delete_if(&:empty?)
45
+ end
40
46
 
41
- segments.map! do |line|
42
- line.apply(SingleNewLineRule, EllipsisRules::All, EmailRule)
47
+ def post_process_segments(txt)
48
+ return if consecutive_underscore?(txt) || txt.length < 2
49
+ txt.apply(ReinsertEllipsisRules::All).apply(ExtraWhiteSpaceRule)
50
+ if txt =~ QUOTATION_AT_END_OF_SENTENCE_REGEX
51
+ txt.split(SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX)
52
+ else
53
+ txt.tr("\n", '').strip
43
54
  end
55
+ end
44
56
 
45
- segments = segments.map { |line| analyze_lines(line) }.flatten
46
-
47
- segments.map! {|segment| sub_symbols(segment) }
48
-
49
- sentence_array = []
50
- segments.each_with_index do |line|
51
- next if line.gsub(/_{3,}/, '').length.eql?(0) || line.length < 2
52
- line = reinsert_ellipsis(line)
53
- line = line.apply(ExtraWhiteSpaceRule)
54
- if line =~ QUOTATION_AT_END_OF_SENTENCE_REGEX
55
- subline = line.split(SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX)
56
- subline.each do |s|
57
- sentence_array << s
58
- end
59
- else
60
- sentence_array << line.tr("\n", '').strip
61
- end
62
- end
63
- sentence_array.reject(&:empty?)
57
+ def consecutive_underscore?(txt)
58
+ # Rubular: http://rubular.com/r/fTF2Ff3WBL
59
+ txt.gsub(/_{3,}/, '').length.eql?(0)
64
60
  end
65
61
 
66
- def analyze_lines(line)
67
- if punctuation_array.any? { |p| line.include?(p) }
68
- process_text(line)
62
+ def check_for_punctuation(txt)
63
+ if punctuation_array.any? { |p| txt.include?(p) }
64
+ process_text(txt)
69
65
  else
70
- line
66
+ txt
71
67
  end
72
68
  end
73
69
 
74
- def process_text(line)
75
- line << 'ȸ' unless punctuation_array.any? { |p| line[-1].include?(p) }
76
- PragmaticSegmenter::ExclamationWords.apply_rules(line)
77
- between_punctutation(line)
78
- line = line.apply(
70
+ def process_text(txt)
71
+ txt << 'ȸ' unless punctuation_array.any? { |p| txt[-1].include?(p) }
72
+ PragmaticSegmenter::ExclamationWords.apply_rules(txt)
73
+ between_punctutation(txt)
74
+ txt = txt.apply(
79
75
  DoublePuctationRules::All,
80
76
  QuestionMarkInQuotationRule,
81
77
  ExclamationPointRules::All
82
78
  )
83
- sentence_boundary_punctuation(line)
79
+ sentence_boundary_punctuation(txt)
84
80
  end
85
81
 
86
82
  def replace_numbers(txt)
@@ -102,17 +98,5 @@ module PragmaticSegmenter
102
98
  def sentence_boundary_punctuation(txt)
103
99
  PragmaticSegmenter::SentenceBoundaryPunctuation.new(text: txt).split
104
100
  end
105
-
106
- def sub_symbols(txt)
107
- txt.gsub(/∯/, '.').gsub(/♬/, '،').gsub(/♭/, ':').gsub(/ᓰ/, '。').gsub(/ᓱ/, '.')
108
- .gsub(/ᓳ/, '!').gsub(/ᓴ/, '!').gsub(/ᓷ/, '?').gsub(/ᓸ/, '?').gsub(/☉/, '?!')
109
- .gsub(/☈/, '!?').gsub(/☇/, '??').gsub(/☄/, '!!').delete('ȸ').gsub(/ȹ/, "\n")
110
- end
111
-
112
- def reinsert_ellipsis(line)
113
- line.gsub(/ƪ/, '...').gsub(/♟/, ' . . . ')
114
- .gsub(/♝/, '. . . .').gsub(/☏/, '..')
115
- .gsub(/∮/, '.')
116
- end
117
101
  end
118
102
  end
@@ -4,6 +4,7 @@ module PragmaticSegmenter
4
4
  # This class replaces punctuation that is typically a sentence boundary
5
5
  # but in this case is not a sentence boundary.
6
6
  class PunctuationReplacer
7
+ include Rules
7
8
  attr_reader :matches_array, :text
8
9
  def initialize(text:, matches_array:)
9
10
  @text = text
@@ -18,45 +19,24 @@ module PragmaticSegmenter
18
19
 
19
20
  def replace_punctuation(array, txt)
20
21
  return if !array || array.empty?
21
- txt.gsub!('(', '\\(')
22
- txt.gsub!(')', '\\)')
23
- txt.gsub!(']', '\\]')
24
- txt.gsub!('[', '\\[')
25
- txt.gsub!('-', '\\-')
22
+ txt.apply(EscapeRegexReservedCharacters::All)
26
23
  array.each do |a|
27
- a.gsub!('(', '\\(')
28
- a.gsub!(')', '\\)')
29
- a.gsub!(']', '\\]')
30
- a.gsub!('[', '\\[')
31
- a.gsub!('-', '\\-')
32
-
33
- sub = a.gsub('.', '')
34
- txt.gsub!(/#{Regexp.escape(a)}/, "#{sub}")
35
-
36
- sub_1 = sub.gsub('。', 'ᓰ')
37
- txt.gsub!(/#{Regexp.escape(sub)}/, "#{sub_1}")
38
-
39
- sub_2 = sub_1.gsub('.', 'ᓱ')
40
- txt.gsub!(/#{Regexp.escape(sub_1)}/, "#{sub_2}")
41
-
42
- sub_3 = sub_2.gsub('!', 'ᓳ')
43
- txt.gsub!(/#{Regexp.escape(sub_2)}/, "#{sub_3}")
44
-
45
- sub_4 = sub_3.gsub('!', 'ᓴ')
46
- txt.gsub!(/#{Regexp.escape(sub_3)}/, "#{sub_4}")
47
-
48
- sub_5 = sub_4.gsub('?', 'ᓷ')
49
- txt.gsub!(/#{Regexp.escape(sub_4)}/, "#{sub_5}")
50
-
51
- sub_6 = sub_5.gsub('?', 'ᓸ')
52
- txt.gsub!(/#{Regexp.escape(sub_5)}/, "#{sub_6}")
24
+ a.apply(EscapeRegexReservedCharacters::All)
25
+ sub = sub_characters(txt, a, '.', '')
26
+ sub_1 = sub_characters(txt, sub, '', '&ᓰ&')
27
+ sub_2 = sub_characters(txt, sub_1, '', '&ᓱ&')
28
+ sub_3 = sub_characters(txt, sub_2, '', '&ᓳ&')
29
+ sub_4 = sub_characters(txt, sub_3, '!', '&ᓴ&')
30
+ sub_5 = sub_characters(txt, sub_4, '?', '&ᓷ&')
31
+ sub_6 = sub_characters(txt, sub_5, '?', '&ᓸ&')
53
32
  end
54
- txt.gsub!('\\(', '(')
55
- txt.gsub!('\\)', ')')
56
- txt.gsub!('\\[', '[')
57
- txt.gsub!('\\]', ']')
58
- txt.gsub!('\\-', '-')
59
- txt
33
+ txt.apply(SubEscapedRegexReservedCharacters::All)
34
+ end
35
+
36
+ def sub_characters(txt, string, char_a, char_b)
37
+ sub = string.gsub(char_a, char_b)
38
+ txt.gsub!(/#{Regexp.escape(string)}/, "#{sub}")
39
+ sub
60
40
  end
61
41
  end
62
42
  end
@@ -11,17 +11,17 @@ module PragmaticSegmenter
11
11
  ExtraWhiteSpaceRule = Rule.new(/\s{3,}/, ' ')
12
12
 
13
13
  # Rubular: http://rubular.com/r/aXPUGm6fQh
14
- QuestionMarkInQuotationRule = Rule.new(/\?(?=(\'|\"))/, '')
14
+ QuestionMarkInQuotationRule = Rule.new(/\?(?=(\'|\"))/, '&ᓷ&')
15
15
 
16
16
  module ExclamationPointRules
17
17
  # Rubular: http://rubular.com/r/XS1XXFRfM2
18
- InQuotationRule = Rule.new(/\!(?=(\'|\"))/, '')
18
+ InQuotationRule = Rule.new(/\!(?=(\'|\"))/, '&ᓴ&')
19
19
 
20
20
  # Rubular: http://rubular.com/r/sl57YI8LkA
21
- BeforeCommaMidSentenceRule = Rule.new(/\!(?=\,\s[a-z])/, '')
21
+ BeforeCommaMidSentenceRule = Rule.new(/\!(?=\,\s[a-z])/, '&ᓴ&')
22
22
 
23
23
  # Rubular: http://rubular.com/r/f9zTjmkIPb
24
- MidSentenceRule = Rule.new(/\!(?=\s[a-z])/, '')
24
+ MidSentenceRule = Rule.new(/\!(?=\s[a-z])/, '&ᓴ&')
25
25
 
26
26
  All = [ InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule ]
27
27
  end
@@ -34,5 +34,66 @@ module PragmaticSegmenter
34
34
 
35
35
  All = [ FirstRule, SecondRule, ThirdRule, ForthRule ]
36
36
  end
37
+
38
+ module ReinsertEllipsisRules
39
+ ThreeConsecutivePeriod = Rule.new(/ƪ/, '...')
40
+ ThreeSpacePeriod = Rule.new(/♟/, ' . . . ')
41
+ FourSpacePeriod = Rule.new(/♝/, '. . . .')
42
+ TwoConsecutivePeriod = Rule.new(/☏/, '..')
43
+ OnePeriod = Rule.new(/∮/, '.')
44
+
45
+ All = [ ThreeConsecutivePeriod, ThreeSpacePeriod,
46
+ FourSpacePeriod, TwoConsecutivePeriod,
47
+ OnePeriod ]
48
+ end
49
+
50
+ module SubSymbolsRules
51
+ Period = Rule.new(/∯/, '.')
52
+ ArabicComma = Rule.new(/♬/, '،')
53
+ SemiColon = Rule.new(/♭/, ':')
54
+ FullWidthPeriod = Rule.new(/&ᓰ&/, '。')
55
+ SpecialPeriod = Rule.new(/&ᓱ&/, '.')
56
+ FullWidthExclamation = Rule.new(/&ᓳ&/, '!')
57
+ ExclamationPoint = Rule.new(/&ᓴ&/, '!')
58
+ QuestionMark = Rule.new(/&ᓷ&/, '?')
59
+ FullWidthQuestionMark = Rule.new(/&ᓸ&/, '?')
60
+ MixedDoubleQE = Rule.new(/☉/, '?!')
61
+ MixedDoubleQQ = Rule.new(/☇/, '??')
62
+ MixedDoubleEQ = Rule.new(/☈/, '!?')
63
+ MixedDoubleEE = Rule.new(/☄/, '!!')
64
+ TemporaryEndingPunctutation = Rule.new('ȸ', '')
65
+ Newline = Rule.new(/ȹ/, "\n")
66
+
67
+ All = [ Period, ArabicComma,
68
+ SemiColon, FullWidthPeriod,
69
+ SpecialPeriod, FullWidthExclamation,
70
+ ExclamationPoint, QuestionMark,
71
+ FullWidthQuestionMark, MixedDoubleQE,
72
+ MixedDoubleQQ, MixedDoubleEQ,
73
+ MixedDoubleEE, TemporaryEndingPunctutation,
74
+ Newline ]
75
+ end
76
+
77
+ module EscapeRegexReservedCharacters
78
+ LeftParen = Rule.new('(', '\\(')
79
+ RightParen = Rule.new(')', '\\)')
80
+ LeftBracket = Rule.new('[', '\\[')
81
+ RightBracket = Rule.new(']', '\\]')
82
+ Dash = Rule.new('-', '\\-')
83
+
84
+ All = [ LeftParen, RightParen,
85
+ LeftBracket, RightBracket, Dash ]
86
+ end
87
+
88
+ module SubEscapedRegexReservedCharacters
89
+ LeftParen = Rule.new('\\(', '(')
90
+ RightParen = Rule.new('\\)', ')')
91
+ LeftBracket = Rule.new('\\[', '[')
92
+ RightBracket = Rule.new('\\]', ']')
93
+ Dash = Rule.new('\\-', '-')
94
+
95
+ All = [ LeftParen, RightParen,
96
+ LeftBracket, RightBracket, Dash ]
97
+ end
37
98
  end
38
99
  end
@@ -1,3 +1,3 @@
1
1
  module PragmaticSegmenter
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.5"
3
3
  end
@@ -868,6 +868,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
868
868
  ps = PragmaticSegmenter::Segmenter.new(text: "////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////Header starts here\r////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////", language: 'en')
869
869
  expect(ps.segment).to eq(["Header starts here"])
870
870
  end
871
+
872
+ it 'correctly segments text #082' do
873
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Hello World. \r\n Hello.', language: 'en')
874
+ expect(ps.segment).to eq(["Hello World.", "Hello."])
875
+ end
871
876
  end
872
877
  end
873
878
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias