pragmatic_segmenter 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9d58b0da895c9249efd632eca9c21ba386ab0ba3
4
- data.tar.gz: 3e9b46ad1c9bbc1704fe2a6b2c4ce899c73267fa
3
+ metadata.gz: e0545b8e2fe6446107740b5c458b96e76b6edc51
4
+ data.tar.gz: 746d97aba038d8f23a6701d7df08205ff48203a8
5
5
  SHA512:
6
- metadata.gz: 14b408a8e527d35d6da36082fa17240c1c9c6b50f66747fda05e52ed859ca0d1c3b6f5a8f57ef8fd0827b5134d960beed9536509738c94db31dbcd12b3a03fcd
7
- data.tar.gz: 6c67d78cf0777504c4c21e6cf9a168897b4c8c43bf3e3e84a6161f81bcbb8355a5794d35a9924196fafed4a0d682ff74063548a676d454e1a198ad3ea9959a83
6
+ metadata.gz: 5975faedda7f913678ea122317266722895376da90be3b4094e50d61d2eef1e531b3df890d93199393f1945a053e14646e8d2b7bc73287de9250751f332483aa
7
+ data.tar.gz: d48fcd09e289833f82a5e6aa5915b3faa5ecf4417874273a2b8bb7640f51454f1374736449478da8328006277b0727c16b3b8badd258791fb011dd23f351266e
@@ -50,9 +50,13 @@ module PragmaticSegmenter
50
50
  # Rubular: http://rubular.com/r/DwNSuZrNtk
51
51
  ConsecutivePeriodsRule = Rule.new(/\.{5,}/, ' ')
52
52
 
53
- # http://rubular.com/r/IQ4TPfsbd8
53
+ # Rubular: http://rubular.com/r/IQ4TPfsbd8
54
54
  ConsecutiveForwardSlashRule = Rule.new(/\/{3}/, '')
55
55
 
56
+ EscapedCarriageReturnRule = Rule.new(/\\r/, "\r")
57
+
58
+ EscapedNewLineRule = Rule.new(/\\n/, "\n")
59
+
56
60
  ReplaceNewlineWithCarriageReturnRule = Rule.new(/\n/, "\r")
57
61
 
58
62
  QuotationsFirstRule = Rule.new(/''/, '"')
@@ -81,12 +85,14 @@ module PragmaticSegmenter
81
85
  def clean
82
86
  return unless text
83
87
  @clean_text = remove_all_newlines(text)
84
- @clean_text = replace_double_newlines(@clean_text)
85
- @clean_text = replace_newlines(@clean_text)
86
- @clean_text = @clean_text.apply(HtmlRules::All)
87
- @clean_text = @clean_text.apply(InlineFormattingRule)
88
- @clean_text = clean_quotations(@clean_text)
89
- @clean_text = clean_table_of_contents(@clean_text)
88
+ replace_double_newlines(@clean_text)
89
+ replace_newlines(@clean_text)
90
+ replace_escaped_newlines(@clean_text)
91
+ @clean_text.apply(HtmlRules::All)
92
+ @clean_text.apply(InlineFormattingRule)
93
+ clean_quotations(@clean_text)
94
+ clean_table_of_contents(@clean_text)
95
+ clean_consecutive_characters(@clean_text)
90
96
  end
91
97
 
92
98
  private
@@ -110,6 +116,11 @@ module PragmaticSegmenter
110
116
  txt.apply(NewLineInMiddleOfWordRule)
111
117
  end
112
118
 
119
+ def replace_escaped_newlines(txt)
120
+ txt.apply(EscapedNewLineRule).
121
+ apply(EscapedCarriageReturnRule)
122
+ end
123
+
113
124
  def replace_double_newlines(txt)
114
125
  txt.apply(DoubleNewLineWithSpaceRule).
115
126
  apply(DoubleNewLineRule)
@@ -117,13 +128,11 @@ module PragmaticSegmenter
117
128
 
118
129
  def replace_newlines(txt)
119
130
  if doc_type.eql?('pdf')
120
- txt = remove_pdf_line_breaks(txt)
131
+ remove_pdf_line_breaks(txt)
121
132
  else
122
- txt =
123
- txt.apply(NewLineFollowedByPeriodRule).
124
- apply(ReplaceNewlineWithCarriageReturnRule)
133
+ txt.apply(NewLineFollowedByPeriodRule).
134
+ apply(ReplaceNewlineWithCarriageReturnRule)
125
135
  end
126
- txt
127
136
  end
128
137
 
129
138
  def remove_pdf_line_breaks(txt)
@@ -142,5 +151,10 @@ module PragmaticSegmenter
142
151
  apply(ConsecutivePeriodsRule).
143
152
  apply(ConsecutiveForwardSlashRule)
144
153
  end
154
+
155
+ def clean_consecutive_characters(txt)
156
+ txt.apply(ConsecutivePeriodsRule).
157
+ apply(ConsecutiveForwardSlashRule)
158
+ end
145
159
  end
146
160
  end
@@ -30,7 +30,6 @@ module PragmaticSegmenter
30
30
  ThreeConsecutiveRule,
31
31
  OtherThreePeriodRule
32
32
  ]
33
-
34
33
  end
35
34
  end
36
35
  end
@@ -25,6 +25,5 @@ module PragmaticSegmenter
25
25
  def cleaner_class
26
26
  Object.const_get("PragmaticSegmenter::Languages::#{LANGUAGE_CODES[language] || 'Common'}::Cleaner")
27
27
  end
28
-
29
28
  end
30
29
  end
@@ -29,7 +29,7 @@ module PragmaticSegmenter
29
29
 
30
30
  def replace
31
31
  super
32
- @formatted_text.apply(NumberPeriodSpaceRule).apply(NegativeNumberPeriodSpaceRule)
32
+ @text.apply(NumberPeriodSpaceRule).apply(NegativeNumberPeriodSpaceRule)
33
33
  end
34
34
  end
35
35
 
@@ -124,14 +124,12 @@ module PragmaticSegmenter
124
124
  txt.gsub!(ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX).with_index do |m|
125
125
  a.eql?(m.chomp('.')) ? "\r#{Regexp.escape(a.to_s)}∯" : "#{m}"
126
126
  end
127
- txt
128
127
  end
129
128
 
130
129
  def replace_alphabet_list_parens(a, txt)
131
130
  txt.gsub!(EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX).with_index do |m|
132
131
  a.eql?(m) ? "\r#{Regexp.escape(a.to_s)}" : "#{m}"
133
132
  end
134
- txt
135
133
  end
136
134
 
137
135
  def replace_correct_alphabet_list(a, txt, parens)
@@ -25,11 +25,11 @@ module PragmaticSegmenter
25
25
  end
26
26
 
27
27
  def replace
28
- @formatted_text = @text.apply(PeriodBeforeNumberRule).
29
- apply(NumberAfterPeriodBeforeLetterRule).
30
- apply(NewLineNumberPeriodSpaceLetterRule).
31
- apply(StartLineNumberPeriodRule).
32
- apply(StartLineTwoDigitNumberPeriodRule)
28
+ @text.apply(PeriodBeforeNumberRule).
29
+ apply(NumberAfterPeriodBeforeLetterRule).
30
+ apply(NewLineNumberPeriodSpaceLetterRule).
31
+ apply(StartLineNumberPeriodRule).
32
+ apply(StartLineTwoDigitNumberPeriodRule)
33
33
  end
34
34
  end
35
35
  end
@@ -30,57 +30,53 @@ module PragmaticSegmenter
30
30
  reformatted_text = replace_abbreviations(reformatted_text)
31
31
  reformatted_text = replace_numbers(reformatted_text)
32
32
  reformatted_text = reformatted_text.apply(GeoLocationRule)
33
- split_lines(reformatted_text)
33
+ split_into_segments(reformatted_text)
34
34
  end
35
35
 
36
36
  private
37
37
 
38
- def split_lines(txt)
39
- segments = txt.split("\r")
38
+ def split_into_segments(txt)
39
+ txt.split("\r")
40
+ .map! { |segment| segment.apply(SingleNewLineRule, EllipsisRules::All, EmailRule) }
41
+ .map { |segment| check_for_punctuation(segment) }.flatten
42
+ .map! { |segment| segment.apply(SubSymbolsRules::All) }
43
+ .map { |segment| post_process_segments(segment) }
44
+ .flatten.compact.delete_if(&:empty?)
45
+ end
40
46
 
41
- segments.map! do |line|
42
- line.apply(SingleNewLineRule, EllipsisRules::All, EmailRule)
47
+ def post_process_segments(txt)
48
+ return if consecutive_underscore?(txt) || txt.length < 2
49
+ txt.apply(ReinsertEllipsisRules::All).apply(ExtraWhiteSpaceRule)
50
+ if txt =~ QUOTATION_AT_END_OF_SENTENCE_REGEX
51
+ txt.split(SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX)
52
+ else
53
+ txt.tr("\n", '').strip
43
54
  end
55
+ end
44
56
 
45
- segments = segments.map { |line| analyze_lines(line) }.flatten
46
-
47
- segments.map! {|segment| sub_symbols(segment) }
48
-
49
- sentence_array = []
50
- segments.each_with_index do |line|
51
- next if line.gsub(/_{3,}/, '').length.eql?(0) || line.length < 2
52
- line = reinsert_ellipsis(line)
53
- line = line.apply(ExtraWhiteSpaceRule)
54
- if line =~ QUOTATION_AT_END_OF_SENTENCE_REGEX
55
- subline = line.split(SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX)
56
- subline.each do |s|
57
- sentence_array << s
58
- end
59
- else
60
- sentence_array << line.tr("\n", '').strip
61
- end
62
- end
63
- sentence_array.reject(&:empty?)
57
+ def consecutive_underscore?(txt)
58
+ # Rubular: http://rubular.com/r/fTF2Ff3WBL
59
+ txt.gsub(/_{3,}/, '').length.eql?(0)
64
60
  end
65
61
 
66
- def analyze_lines(line)
67
- if punctuation_array.any? { |p| line.include?(p) }
68
- process_text(line)
62
+ def check_for_punctuation(txt)
63
+ if punctuation_array.any? { |p| txt.include?(p) }
64
+ process_text(txt)
69
65
  else
70
- line
66
+ txt
71
67
  end
72
68
  end
73
69
 
74
- def process_text(line)
75
- line << 'ȸ' unless punctuation_array.any? { |p| line[-1].include?(p) }
76
- PragmaticSegmenter::ExclamationWords.apply_rules(line)
77
- between_punctutation(line)
78
- line = line.apply(
70
+ def process_text(txt)
71
+ txt << 'ȸ' unless punctuation_array.any? { |p| txt[-1].include?(p) }
72
+ PragmaticSegmenter::ExclamationWords.apply_rules(txt)
73
+ between_punctutation(txt)
74
+ txt = txt.apply(
79
75
  DoublePuctationRules::All,
80
76
  QuestionMarkInQuotationRule,
81
77
  ExclamationPointRules::All
82
78
  )
83
- sentence_boundary_punctuation(line)
79
+ sentence_boundary_punctuation(txt)
84
80
  end
85
81
 
86
82
  def replace_numbers(txt)
@@ -102,17 +98,5 @@ module PragmaticSegmenter
102
98
  def sentence_boundary_punctuation(txt)
103
99
  PragmaticSegmenter::SentenceBoundaryPunctuation.new(text: txt).split
104
100
  end
105
-
106
- def sub_symbols(txt)
107
- txt.gsub(/∯/, '.').gsub(/♬/, '،').gsub(/♭/, ':').gsub(/ᓰ/, '。').gsub(/ᓱ/, '.')
108
- .gsub(/ᓳ/, '!').gsub(/ᓴ/, '!').gsub(/ᓷ/, '?').gsub(/ᓸ/, '?').gsub(/☉/, '?!')
109
- .gsub(/☈/, '!?').gsub(/☇/, '??').gsub(/☄/, '!!').delete('ȸ').gsub(/ȹ/, "\n")
110
- end
111
-
112
- def reinsert_ellipsis(line)
113
- line.gsub(/ƪ/, '...').gsub(/♟/, ' . . . ')
114
- .gsub(/♝/, '. . . .').gsub(/☏/, '..')
115
- .gsub(/∮/, '.')
116
- end
117
101
  end
118
102
  end
@@ -4,6 +4,7 @@ module PragmaticSegmenter
4
4
  # This class replaces punctuation that is typically a sentence boundary
5
5
  # but in this case is not a sentence boundary.
6
6
  class PunctuationReplacer
7
+ include Rules
7
8
  attr_reader :matches_array, :text
8
9
  def initialize(text:, matches_array:)
9
10
  @text = text
@@ -18,45 +19,24 @@ module PragmaticSegmenter
18
19
 
19
20
  def replace_punctuation(array, txt)
20
21
  return if !array || array.empty?
21
- txt.gsub!('(', '\\(')
22
- txt.gsub!(')', '\\)')
23
- txt.gsub!(']', '\\]')
24
- txt.gsub!('[', '\\[')
25
- txt.gsub!('-', '\\-')
22
+ txt.apply(EscapeRegexReservedCharacters::All)
26
23
  array.each do |a|
27
- a.gsub!('(', '\\(')
28
- a.gsub!(')', '\\)')
29
- a.gsub!(']', '\\]')
30
- a.gsub!('[', '\\[')
31
- a.gsub!('-', '\\-')
32
-
33
- sub = a.gsub('.', '')
34
- txt.gsub!(/#{Regexp.escape(a)}/, "#{sub}")
35
-
36
- sub_1 = sub.gsub('。', 'ᓰ')
37
- txt.gsub!(/#{Regexp.escape(sub)}/, "#{sub_1}")
38
-
39
- sub_2 = sub_1.gsub('.', 'ᓱ')
40
- txt.gsub!(/#{Regexp.escape(sub_1)}/, "#{sub_2}")
41
-
42
- sub_3 = sub_2.gsub('!', 'ᓳ')
43
- txt.gsub!(/#{Regexp.escape(sub_2)}/, "#{sub_3}")
44
-
45
- sub_4 = sub_3.gsub('!', 'ᓴ')
46
- txt.gsub!(/#{Regexp.escape(sub_3)}/, "#{sub_4}")
47
-
48
- sub_5 = sub_4.gsub('?', 'ᓷ')
49
- txt.gsub!(/#{Regexp.escape(sub_4)}/, "#{sub_5}")
50
-
51
- sub_6 = sub_5.gsub('?', 'ᓸ')
52
- txt.gsub!(/#{Regexp.escape(sub_5)}/, "#{sub_6}")
24
+ a.apply(EscapeRegexReservedCharacters::All)
25
+ sub = sub_characters(txt, a, '.', '')
26
+ sub_1 = sub_characters(txt, sub, '', '&ᓰ&')
27
+ sub_2 = sub_characters(txt, sub_1, '', '&ᓱ&')
28
+ sub_3 = sub_characters(txt, sub_2, '', '&ᓳ&')
29
+ sub_4 = sub_characters(txt, sub_3, '!', '&ᓴ&')
30
+ sub_5 = sub_characters(txt, sub_4, '?', '&ᓷ&')
31
+ sub_6 = sub_characters(txt, sub_5, '?', '&ᓸ&')
53
32
  end
54
- txt.gsub!('\\(', '(')
55
- txt.gsub!('\\)', ')')
56
- txt.gsub!('\\[', '[')
57
- txt.gsub!('\\]', ']')
58
- txt.gsub!('\\-', '-')
59
- txt
33
+ txt.apply(SubEscapedRegexReservedCharacters::All)
34
+ end
35
+
36
+ def sub_characters(txt, string, char_a, char_b)
37
+ sub = string.gsub(char_a, char_b)
38
+ txt.gsub!(/#{Regexp.escape(string)}/, "#{sub}")
39
+ sub
60
40
  end
61
41
  end
62
42
  end
@@ -11,17 +11,17 @@ module PragmaticSegmenter
11
11
  ExtraWhiteSpaceRule = Rule.new(/\s{3,}/, ' ')
12
12
 
13
13
  # Rubular: http://rubular.com/r/aXPUGm6fQh
14
- QuestionMarkInQuotationRule = Rule.new(/\?(?=(\'|\"))/, '')
14
+ QuestionMarkInQuotationRule = Rule.new(/\?(?=(\'|\"))/, '&ᓷ&')
15
15
 
16
16
  module ExclamationPointRules
17
17
  # Rubular: http://rubular.com/r/XS1XXFRfM2
18
- InQuotationRule = Rule.new(/\!(?=(\'|\"))/, '')
18
+ InQuotationRule = Rule.new(/\!(?=(\'|\"))/, '&ᓴ&')
19
19
 
20
20
  # Rubular: http://rubular.com/r/sl57YI8LkA
21
- BeforeCommaMidSentenceRule = Rule.new(/\!(?=\,\s[a-z])/, '')
21
+ BeforeCommaMidSentenceRule = Rule.new(/\!(?=\,\s[a-z])/, '&ᓴ&')
22
22
 
23
23
  # Rubular: http://rubular.com/r/f9zTjmkIPb
24
- MidSentenceRule = Rule.new(/\!(?=\s[a-z])/, '')
24
+ MidSentenceRule = Rule.new(/\!(?=\s[a-z])/, '&ᓴ&')
25
25
 
26
26
  All = [ InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule ]
27
27
  end
@@ -34,5 +34,66 @@ module PragmaticSegmenter
34
34
 
35
35
  All = [ FirstRule, SecondRule, ThirdRule, ForthRule ]
36
36
  end
37
+
38
+ module ReinsertEllipsisRules
39
+ ThreeConsecutivePeriod = Rule.new(/ƪ/, '...')
40
+ ThreeSpacePeriod = Rule.new(/♟/, ' . . . ')
41
+ FourSpacePeriod = Rule.new(/♝/, '. . . .')
42
+ TwoConsecutivePeriod = Rule.new(/☏/, '..')
43
+ OnePeriod = Rule.new(/∮/, '.')
44
+
45
+ All = [ ThreeConsecutivePeriod, ThreeSpacePeriod,
46
+ FourSpacePeriod, TwoConsecutivePeriod,
47
+ OnePeriod ]
48
+ end
49
+
50
+ module SubSymbolsRules
51
+ Period = Rule.new(/∯/, '.')
52
+ ArabicComma = Rule.new(/♬/, '،')
53
+ SemiColon = Rule.new(/♭/, ':')
54
+ FullWidthPeriod = Rule.new(/&ᓰ&/, '。')
55
+ SpecialPeriod = Rule.new(/&ᓱ&/, '.')
56
+ FullWidthExclamation = Rule.new(/&ᓳ&/, '!')
57
+ ExclamationPoint = Rule.new(/&ᓴ&/, '!')
58
+ QuestionMark = Rule.new(/&ᓷ&/, '?')
59
+ FullWidthQuestionMark = Rule.new(/&ᓸ&/, '?')
60
+ MixedDoubleQE = Rule.new(/☉/, '?!')
61
+ MixedDoubleQQ = Rule.new(/☇/, '??')
62
+ MixedDoubleEQ = Rule.new(/☈/, '!?')
63
+ MixedDoubleEE = Rule.new(/☄/, '!!')
64
+ TemporaryEndingPunctutation = Rule.new('ȸ', '')
65
+ Newline = Rule.new(/ȹ/, "\n")
66
+
67
+ All = [ Period, ArabicComma,
68
+ SemiColon, FullWidthPeriod,
69
+ SpecialPeriod, FullWidthExclamation,
70
+ ExclamationPoint, QuestionMark,
71
+ FullWidthQuestionMark, MixedDoubleQE,
72
+ MixedDoubleQQ, MixedDoubleEQ,
73
+ MixedDoubleEE, TemporaryEndingPunctutation,
74
+ Newline ]
75
+ end
76
+
77
+ module EscapeRegexReservedCharacters
78
+ LeftParen = Rule.new('(', '\\(')
79
+ RightParen = Rule.new(')', '\\)')
80
+ LeftBracket = Rule.new('[', '\\[')
81
+ RightBracket = Rule.new(']', '\\]')
82
+ Dash = Rule.new('-', '\\-')
83
+
84
+ All = [ LeftParen, RightParen,
85
+ LeftBracket, RightBracket, Dash ]
86
+ end
87
+
88
+ module SubEscapedRegexReservedCharacters
89
+ LeftParen = Rule.new('\\(', '(')
90
+ RightParen = Rule.new('\\)', ')')
91
+ LeftBracket = Rule.new('\\[', '[')
92
+ RightBracket = Rule.new('\\]', ']')
93
+ Dash = Rule.new('\\-', '-')
94
+
95
+ All = [ LeftParen, RightParen,
96
+ LeftBracket, RightBracket, Dash ]
97
+ end
37
98
  end
38
99
  end
@@ -1,3 +1,3 @@
1
1
  module PragmaticSegmenter
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.5"
3
3
  end
@@ -868,6 +868,11 @@ RSpec.describe PragmaticSegmenter::Segmenter do
868
868
  ps = PragmaticSegmenter::Segmenter.new(text: "////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////Header starts here\r////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////", language: 'en')
869
869
  expect(ps.segment).to eq(["Header starts here"])
870
870
  end
871
+
872
+ it 'correctly segments text #082' do
873
+ ps = PragmaticSegmenter::Segmenter.new(text: 'Hello World. \r\n Hello.', language: 'en')
874
+ expect(ps.segment).to eq(["Hello World.", "Hello."])
875
+ end
871
876
  end
872
877
  end
873
878
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias