pragmatic_segmenter 0.3.22 → 0.3.23

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 77e3c8da3e5e184a5143dff774dbe851de58c385
4
- data.tar.gz: f9f2803fb49a298b2015e206f7237b792be26e10
2
+ SHA256:
3
+ metadata.gz: 2c66c757c1b4bd8d090e88d7db6c627720f58f6f26e6fab9916a20e8bc15471c
4
+ data.tar.gz: da3a9088f72c90ddde6f0deda67d3e3b4ea3bed317970416deef794e0f594d89
5
5
  SHA512:
6
- metadata.gz: 050ea08ad001c6786f44c936581e4d82d05ebf2cc1ac5265a5b24c3af25d8cad4f562f3d7c241baf48d49a14374559d12e4888f7514f9edb23733645c49999d7
7
- data.tar.gz: d5c39d307836ce2571f8c8b0507110dd907d5150a6efc686136d915e2ecef6bca43a262ac7fc76fa908a9f18e9a1344f59b4da9115806ddc6e24e536bea0bd7d
6
+ metadata.gz: 503c52965b2f98eebbc24e1215204c45307958a0279d56834e0c929d18625e81ac8c5c78779efb1a5946b5fdda5d8496b54a72b009ad6b2a597a70c4ba0fff66
7
+ data.tar.gz: f23773139a3a6d9f45cecaacabb363a7fb825a21eb76b40514abf4d0407191ed3b1afa887a5bc5328626abe2dbac5864895add62a1da036234036984d19a3454
data/NEWS CHANGED
@@ -1,3 +1,7 @@
1
+ 0.3.23 (2021-05-03):
2
+
3
+ * Improvement: Refactor for Ruby 3.0 compatibility
4
+
1
5
  0.3.22 (2018-09-23):
2
6
 
3
7
  * Improvement: Initial support for Kazakh
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Pragmatic Segmenter
2
2
 
3
- [![Gem Version](https://badge.fury.io/rb/pragmatic_segmenter.svg)](http://badge.fury.io/rb/pragmatic_segmenter) [![Code Climate](https://codeclimate.com/github/diasks2/pragmatic_segmenter/badges/gpa.svg)](https://codeclimate.com/github/diasks2/pragmatic_segmenter) [![Build Status](https://travis-ci.org/diasks2/pragmatic_segmenter.png)](https://travis-ci.org/diasks2/pragmatic_segmenter) [![Test Coverage](https://codeclimate.com/github/diasks2/pragmatic_segmenter/badges/coverage.svg)](https://codeclimate.com/github/diasks2/pragmatic_segmenter) [![License](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](https://github.com/diasks2/pragmatic_segmenter/blob/master/LICENSE.txt)
3
+ [![Gem Version](https://badge.fury.io/rb/pragmatic_segmenter.svg)](http://badge.fury.io/rb/pragmatic_segmenter) [![Build Status](https://travis-ci.org/diasks2/pragmatic_segmenter.png)](https://travis-ci.org/diasks2/pragmatic_segmenter) [![License](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](https://github.com/diasks2/pragmatic_segmenter/blob/master/LICENSE.txt)
4
4
 
5
5
  Pragmatic Segmenter is a rule-based sentence boundary detection gem that works out-of-the-box across many languages.
6
6
 
@@ -887,6 +887,9 @@ To test the relative performance of different segmentation tools and libraries I
887
887
  **Version 0.3.22**
888
888
  * Add initial support and tests for Kazakh
889
889
 
890
+ **Version 0.3.23**
891
+ * Refactor for Ruby 3.0 compatibility
892
+
890
893
  ## Contributing
891
894
 
892
895
  If you find a text that is incorrectly segmented using this gem, please submit an issue.
@@ -900,6 +903,7 @@ If you find a text that is incorrectly segmented using this gem, please submit a
900
903
  ## Ports
901
904
 
902
905
  * [C# - PragmaticSegmenterNet](https://github.com/UglyToad/PragmaticSegmenterNet)
906
+ * [Python - pySBD](https://github.com/nipunsadvilkar/pySBD)
903
907
 
904
908
  ## License
905
909
 
@@ -10,18 +10,19 @@ module PragmaticSegmenter
10
10
 
11
11
  attr_reader :text
12
12
  def initialize(text:, language: )
13
- @text = Text.new(text)
13
+ @text = text.dup
14
14
  @language = language
15
15
  end
16
16
 
17
17
  def replace
18
- @text.apply(@language::PossessiveAbbreviationRule,
18
+ Rule.apply(@text,
19
+ @language::PossessiveAbbreviationRule,
19
20
  @language::KommanditgesellschaftRule,
20
21
  @language::SingleLetterAbbreviationRules::All)
21
22
 
22
23
  @text = search_for_abbreviations_in_string(@text)
23
24
  @text = replace_multi_period_abbreviations(@text)
24
- @text.apply(@language::AmPmRules::All)
25
+ Rule.apply(@text, @language::AmPmRules::All)
25
26
  replace_abbreviation_as_sentence_boundary(@text)
26
27
  end
27
28
 
@@ -11,7 +11,7 @@ module PragmaticSegmenter
11
11
 
12
12
  attr_reader :text, :doc_type
13
13
  def initialize(text:, doc_type: nil, language: Languages::Common)
14
- @text = Text.new(text)
14
+ @text = text.dup
15
15
  @doc_type = doc_type
16
16
  @language = language
17
17
  end
@@ -37,10 +37,10 @@ module PragmaticSegmenter
37
37
  replace_newlines
38
38
  replace_escaped_newlines
39
39
 
40
- @text.apply(HTML::All)
40
+ Rule.apply(@text, HTML::All)
41
41
 
42
42
  replace_punctuation_in_brackets
43
- @text.apply(InlineFormattingRule)
43
+ Rule.apply(@text, InlineFormattingRule)
44
44
  clean_quotations
45
45
  clean_table_of_contents
46
46
  check_for_no_space_in_between_sentences
@@ -72,7 +72,7 @@ module PragmaticSegmenter
72
72
  if word =~ regex
73
73
  unless URL_EMAIL_KEYWORDS.any? { |web| word =~ /#{web}/ }
74
74
  unless abbreviations.any? { |abbr| word =~ /#{abbr}/i }
75
- new_word = word.dup.apply(rule)
75
+ new_word = Rule.apply(word.dup, rule)
76
76
  txt.gsub!(/#{Regexp.escape(word)}/, new_word)
77
77
  end
78
78
  end
@@ -92,45 +92,45 @@ module PragmaticSegmenter
92
92
  end
93
93
 
94
94
  def remove_newline_in_middle_of_word
95
- @text.apply NewLineInMiddleOfWordRule
95
+ Rule.apply @text, NewLineInMiddleOfWordRule
96
96
  end
97
97
 
98
98
  def replace_escaped_newlines
99
- @text.apply EscapedNewLineRule, EscapedCarriageReturnRule,
99
+ Rule.apply @text, EscapedNewLineRule, EscapedCarriageReturnRule,
100
100
  TypoEscapedNewLineRule, TypoEscapedCarriageReturnRule
101
101
  end
102
102
 
103
103
  def replace_double_newlines
104
- @text.apply DoubleNewLineWithSpaceRule, DoubleNewLineRule
104
+ Rule.apply @text, DoubleNewLineWithSpaceRule, DoubleNewLineRule
105
105
  end
106
106
 
107
107
  def replace_newlines
108
108
  if doc_type.eql?('pdf')
109
109
  remove_pdf_line_breaks
110
110
  else
111
- @text.apply NewLineFollowedByPeriodRule,
111
+ Rule.apply @text, NewLineFollowedByPeriodRule,
112
112
  ReplaceNewlineWithCarriageReturnRule
113
113
  end
114
114
  end
115
115
 
116
116
  def remove_pdf_line_breaks
117
- @text.apply NewLineFollowedByBulletRule,
117
+ Rule.apply @text, NewLineFollowedByBulletRule,
118
118
 
119
119
  PDF::NewLineInMiddleOfSentenceRule,
120
120
  PDF::NewLineInMiddleOfSentenceNoSpacesRule
121
121
  end
122
122
 
123
123
  def clean_quotations
124
- @text.apply QuotationsFirstRule, QuotationsSecondRule
124
+ Rule.apply @text, QuotationsFirstRule, QuotationsSecondRule
125
125
  end
126
126
 
127
127
  def clean_table_of_contents
128
- @text.apply TableOfContentsRule, ConsecutivePeriodsRule,
128
+ Rule.apply @text, TableOfContentsRule, ConsecutivePeriodsRule,
129
129
  ConsecutiveForwardSlashRule
130
130
  end
131
131
 
132
132
  def clean_consecutive_characters
133
- @text.apply ConsecutivePeriodsRule, ConsecutiveForwardSlashRule
133
+ Rule.apply @text, ConsecutivePeriodsRule, ConsecutiveForwardSlashRule
134
134
  end
135
135
  end
136
136
  end
@@ -47,7 +47,7 @@ module PragmaticSegmenter
47
47
  private
48
48
 
49
49
  def replace_numbers
50
- @text.apply Numbers::All
50
+ Rule.apply @text, Numbers::All
51
51
 
52
52
  replace_period_in_deutsch_dates
53
53
  end
@@ -68,7 +68,8 @@ module PragmaticSegmenter
68
68
  ).freeze
69
69
 
70
70
  def replace
71
- @text = text.apply(
71
+ @text = Rule.apply(
72
+ text,
72
73
  @language::PossessiveAbbreviationRule,
73
74
  @language::SingleLetterAbbreviationRules::All,
74
75
  SingleLowerCaseLetterRule,
@@ -76,7 +77,7 @@ module PragmaticSegmenter
76
77
 
77
78
  @text = search_for_abbreviations_in_string(@text)
78
79
  @text = replace_multi_period_abbreviations(@text)
79
- @text.apply(Languages::Common::AmPmRules::All)
80
+ Rule.apply(@text, Languages::Common::AmPmRules::All)
80
81
  replace_abbreviation_as_sentence_boundary(@text)
81
82
  end
82
83
 
@@ -17,7 +17,7 @@ module PragmaticSegmenter
17
17
  private
18
18
 
19
19
  def remove_newline_in_middle_of_word
20
- @text.apply NewLineInMiddleOfWordRule
20
+ Rule.apply @text, NewLineInMiddleOfWordRule
21
21
  end
22
22
  end
23
23
 
@@ -23,7 +23,7 @@ module PragmaticSegmenter
23
23
 
24
24
  def between_punctuation(txt)
25
25
  super(txt)
26
- txt.apply(QuestionMarkFollowedByDashLowercaseRule, ExclamationMarkFollowedByDashLowercaseRule)
26
+ Rule.apply(txt, QuestionMarkFollowedByDashLowercaseRule, ExclamationMarkFollowedByDashLowercaseRule)
27
27
  end
28
28
  end
29
29
 
@@ -35,7 +35,7 @@ module PragmaticSegmenter
35
35
 
36
36
  def replace
37
37
  super
38
- @text.apply(SingleUpperCaseCyrillicLetterAtStartOfLineRule, SingleUpperCaseCyrillicLetterRule)
38
+ Rule.apply(@text, SingleUpperCaseCyrillicLetterAtStartOfLineRule, SingleUpperCaseCyrillicLetterRule)
39
39
  end
40
40
  end
41
41
  end
@@ -48,7 +48,7 @@ module PragmaticSegmenter
48
48
 
49
49
  attr_reader :text
50
50
  def initialize(text:)
51
- @text = Text.new(text)
51
+ @text = text.dup
52
52
  end
53
53
 
54
54
  def add_line_break
@@ -68,13 +68,13 @@ module PragmaticSegmenter
68
68
  def format_numbered_list_with_parens
69
69
  replace_parens_in_numbered_list
70
70
  add_line_breaks_for_numbered_list_with_parens
71
- @text.apply(ListMarkerRule)
71
+ Rule.apply(@text, ListMarkerRule)
72
72
  end
73
73
 
74
74
  def format_numbered_list_with_periods
75
75
  replace_periods_in_numbered_list
76
76
  add_line_breaks_for_numbered_list_with_periods
77
- @text.apply(SubstituteListPeriodRule)
77
+ Rule.apply(@text, SubstituteListPeriodRule)
78
78
  end
79
79
 
80
80
  def format_alphabetical_lists
@@ -93,7 +93,7 @@ module PragmaticSegmenter
93
93
 
94
94
  def add_line_breaks_for_numbered_list_with_periods
95
95
  if @text.include?('♨') && @text !~ /♨.+\n.+♨|♨.+\r.+♨/ && @text !~ /for\s\d{1,2}♨\s[a-z]/
96
- @text.apply(SpaceBetweenListItemsFirstRule, SpaceBetweenListItemsSecondRule)
96
+ Rule.apply(@text, SpaceBetweenListItemsFirstRule, SpaceBetweenListItemsSecondRule)
97
97
  end
98
98
  end
99
99
 
@@ -105,7 +105,7 @@ module PragmaticSegmenter
105
105
 
106
106
  def add_line_breaks_for_numbered_list_with_parens
107
107
  if @text.include?('☝') && @text !~ /☝.+\n.+☝|☝.+\r.+☝/
108
- @text.apply(SpaceBetweenListItemsThirdRule)
108
+ Rule.apply(@text, SpaceBetweenListItemsThirdRule)
109
109
  end
110
110
  end
111
111
 
@@ -24,9 +24,9 @@ module PragmaticSegmenter
24
24
  replace_numbers
25
25
  replace_continuous_punctuation
26
26
  replace_periods_before_numeric_references
27
- @text.apply(@language::Abbreviations::WithMultiplePeriodsAndEmailRule)
28
- @text.apply(@language::GeoLocationRule)
29
- @text.apply(@language::FileFormatRule)
27
+ Rule.apply(@text, @language::Abbreviations::WithMultiplePeriodsAndEmailRule)
28
+ Rule.apply(@text, @language::GeoLocationRule)
29
+ Rule.apply(@text, @language::FileFormatRule)
30
30
  split_into_segments
31
31
  end
32
32
 
@@ -34,18 +34,19 @@ module PragmaticSegmenter
34
34
 
35
35
  def split_into_segments
36
36
  check_for_parens_between_quotes(@text).split("\r")
37
- .map! { |segment| segment.apply(@language::SingleNewLineRule, @language::EllipsisRules::All) }
37
+ .map! { |segment| Rule.apply(segment, @language::SingleNewLineRule, @language::EllipsisRules::All) }
38
38
  .map { |segment| check_for_punctuation(segment) }.flatten
39
- .map! { |segment| segment.apply(@language::SubSymbolsRules::All) }
39
+ .map! { |segment| Rule.apply(segment, @language::SubSymbolsRules::All) }
40
40
  .map { |segment| post_process_segments(segment) }
41
41
  .flatten.compact.delete_if(&:empty?)
42
- .map! { |segment| segment.apply(@language::SubSingleQuoteRule) }
42
+ .map! { |segment| Rule.apply(segment, @language::SubSingleQuoteRule) }
43
43
  end
44
44
 
45
45
  def post_process_segments(txt)
46
46
  return txt if txt.length < 2 && txt =~ /\A[a-zA-Z]*\Z/
47
47
  return if consecutive_underscore?(txt) || txt.length < 2
48
- txt.apply(
48
+ Rule.apply(
49
+ txt,
49
50
  @language::ReinsertEllipsisRules::All,
50
51
  @language::ExtraWhiteSpaceRule
51
52
  )
@@ -91,7 +92,8 @@ module PragmaticSegmenter
91
92
  txt << 'ȸ' unless @language::Punctuations.any? { |p| txt[-1].include?(p) }
92
93
  ExclamationWords.apply_rules(txt)
93
94
  between_punctuation(txt)
94
- txt = txt.apply(
95
+ txt = Rule.apply(
96
+ txt,
95
97
  @language::DoublePunctuationRules::All,
96
98
  @language::QuestionMarkInQuotationRule,
97
99
  @language::ExclamationPointRules::All
@@ -101,7 +103,7 @@ module PragmaticSegmenter
101
103
  end
102
104
 
103
105
  def replace_numbers
104
- @text.apply @language::Numbers::All
106
+ Rule.apply @text, @language::Numbers::All
105
107
  end
106
108
 
107
109
  def abbreviations_replacer
@@ -129,8 +131,8 @@ module PragmaticSegmenter
129
131
  end
130
132
 
131
133
  def sentence_boundary_punctuation(txt)
132
- txt = txt.apply @language::ReplaceColonBetweenNumbersRule if defined? @language::ReplaceColonBetweenNumbersRule
133
- txt = txt.apply @language::ReplaceNonSentenceBoundaryCommaRule if defined? @language::ReplaceNonSentenceBoundaryCommaRule
134
+ txt = Rule.apply txt, @language::ReplaceColonBetweenNumbersRule if defined? @language::ReplaceColonBetweenNumbersRule
135
+ txt = Rule.apply txt, @language::ReplaceNonSentenceBoundaryCommaRule if defined? @language::ReplaceNonSentenceBoundaryCommaRule
134
136
 
135
137
  txt.scan(@language::SENTENCE_BOUNDARY_REGEX)
136
138
  end
@@ -45,9 +45,9 @@ module PragmaticSegmenter
45
45
 
46
46
  def replace_punctuation(array)
47
47
  return if !array || array.empty?
48
- @text.apply(Rules::EscapeRegexReservedCharacters::All)
48
+ Rule.apply(@text, Rules::EscapeRegexReservedCharacters::All)
49
49
  array.each do |a|
50
- a.apply(Rules::EscapeRegexReservedCharacters::All)
50
+ Rule.apply(a, Rules::EscapeRegexReservedCharacters::All)
51
51
  sub = sub_characters(a, '.', '∯')
52
52
  sub_1 = sub_characters(sub, '。', '&ᓰ&')
53
53
  sub_2 = sub_characters(sub_1, '.', '&ᓱ&')
@@ -59,7 +59,7 @@ module PragmaticSegmenter
59
59
  sub_7 = sub_characters(sub_6, "'", '&⎋&')
60
60
  end
61
61
  end
62
- @text.apply(Rules::SubEscapedRegexReservedCharacters::All)
62
+ Rule.apply(@text, Rules::SubEscapedRegexReservedCharacters::All)
63
63
  end
64
64
 
65
65
  def sub_characters(string, char_a, char_b)
@@ -1,14 +1,14 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module PragmaticSegmenter
4
- Rule = Struct.new(:pattern, :replacement)
5
-
6
- class Text < String
7
- def apply(*rules)
8
- rules.flatten.each do |rule|
9
- self.gsub!(rule.pattern, rule.replacement)
4
+ class Rule < Struct.new(:pattern, :replacement)
5
+ class << self
6
+ def apply(str, *rules)
7
+ rules.flatten.each do |rule|
8
+ str.gsub!(rule.pattern, rule.replacement)
9
+ end
10
+ str
10
11
  end
11
- self
12
12
  end
13
13
  end
14
14
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module PragmaticSegmenter
4
- VERSION = "0.3.22"
4
+ VERSION = "0.3.23"
5
5
  end
@@ -19,8 +19,8 @@ Gem::Specification.new do |spec|
19
19
  spec.require_paths = ["lib"]
20
20
 
21
21
  spec.add_runtime_dependency "unicode"
22
- spec.add_development_dependency "bundler", "~> 1.7"
23
- spec.add_development_dependency "rake", "~> 10.0"
22
+ spec.add_development_dependency "bundler", ">= 1.7"
23
+ spec.add_development_dependency "rake", ">= 12.3.3"
24
24
  spec.add_development_dependency "rspec"
25
25
  spec.add_development_dependency "stackprof"
26
26
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.22
4
+ version: 0.3.23
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-09-23 00:00:00.000000000 Z
11
+ date: 2021-05-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: unicode
@@ -28,30 +28,30 @@ dependencies:
28
28
  name: bundler
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
33
  version: '1.7'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '1.7'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rake
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - "~>"
45
+ - - ">="
46
46
  - !ruby/object:Gem::Version
47
- version: '10.0'
47
+ version: 12.3.3
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - "~>"
52
+ - - ">="
53
53
  - !ruby/object:Gem::Version
54
- version: '10.0'
54
+ version: 12.3.3
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: rspec
57
57
  requirement: !ruby/object:Gem::Requirement
@@ -166,7 +166,7 @@ homepage: https://github.com/diasks2/pragmatic_segmenter
166
166
  licenses:
167
167
  - MIT
168
168
  metadata: {}
169
- post_install_message:
169
+ post_install_message:
170
170
  rdoc_options: []
171
171
  require_paths:
172
172
  - lib
@@ -181,9 +181,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
181
181
  - !ruby/object:Gem::Version
182
182
  version: '0'
183
183
  requirements: []
184
- rubyforge_project:
185
- rubygems_version: 2.6.14
186
- signing_key:
184
+ rubyforge_project:
185
+ rubygems_version: 2.7.6
186
+ signing_key:
187
187
  specification_version: 4
188
188
  summary: A rule-based sentence boundary detection gem that works out-of-the-box across
189
189
  many languages